aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJauhien Piatlicki (jauhien) <piatlicki@gmail.com>2013-08-12 01:33:59 +0200
committerJauhien Piatlicki (jauhien) <piatlicki@gmail.com>2013-08-12 01:33:59 +0200
commitb8e92db3d4cde97bb2ebcd181cba3841cd789432 (patch)
treedb5f9be231a29c21ed9e3df8b03ba9fd1e00a299 /gs_pypi
parentg_sorcery/logger: ProgressBar (diff)
downloadg-sorcery-b8e92db3d4cde97bb2ebcd181cba3841cd789432.tar.gz
g-sorcery-b8e92db3d4cde97bb2ebcd181cba3841cd789432.tar.bz2
g-sorcery-b8e92db3d4cde97bb2ebcd181cba3841cd789432.zip
gs_pypi/pypi_db: parse data
Diffstat (limited to 'gs_pypi')
-rw-r--r--gs_pypi/pypi_db.py148
1 file changed, 88 insertions, 60 deletions
diff --git a/gs_pypi/pypi_db.py b/gs_pypi/pypi_db.py
index af3e65a..44e77a4 100644
--- a/gs_pypi/pypi_db.py
+++ b/gs_pypi/pypi_db.py
@@ -11,72 +11,100 @@
:license: GPL-2, see LICENSE for more details.
"""
-from g_sorcery.compatibility import py2k
+import bs4
-if py2k:
- import xmlrpclib
-else:
- import xmlrpc.client as xmlrpclib
-
-import datetime
-import re
-import sys
-
-from g_sorcery.g_collections import Package, serializable_elist
-from g_sorcery.logger import Logger
+from g_sorcery.g_collections import Package
from g_sorcery.package_db import DBGenerator
class PypiDBGenerator(DBGenerator):
def get_download_uries(self, common_config, config):
- return [config["repo_uri"] + "/pypi"]
-
- def process_uri(self, uri, data):
- url = uri["uri"]
- client = xmlrpclib.ServerProxy(url)
- logger = Logger()
- logger.info("downloading packages data")
- pkg_list = client.list_packages()
-
- number_of_packages = len(pkg_list)
- downloaded_number = 0
-
- for pkg in pkg_list:
- data[pkg] = {}
-
- chars = ['-','\\','|','/']
- show = chars[downloaded_number % 4]
- percent = (downloaded_number * 100)//number_of_packages
- length = 70
- progress = (percent * length)//100
- blank = length - progress
-
- sys.stdout.write("\r %s [%s%s] %s%%" % (show, "#" * progress, " " * blank, percent))
- sys.stdout.flush()
- downloaded_number += 1
-
- versions = []
- while not versions:
- try:
- versions = client.package_releases(pkg)
- except Exception as error:
- logger.warn("Something went wrong: " + str(error))
- logger.info("Trying again...")
- versions = []
-
- for version in versions:
- data[pkg][version] = {}
- while not data[pkg][version]:
- try:
- data[pkg][version] = client.release_data(pkg, version)
- except Exception as error:
- logger.warn("Something went wrong: " + str(error))
- logger.info("Trying again...")
- data[pkg][version] = {}
-
- sys.stdout.write("\r %s [%s] %s%%" % ("-", "#" * length, 100))
- sys.stdout.flush()
- print("")
+ self.repo_uri = config["repo_uri"]
+ return [{"uri": self.repo_uri + "?%3Aaction=index", "output": "packages"}]
+
+ def parse_data(self, data_f):
+ soup = bs4.BeautifulSoup(data_f.read())
+ packages = soup.table
+ data = {}
+ data["index"] = {}
+
+ pkg_uries = []
+
+ for entry in packages.find_all("tr")[1:-1]:
+ package, description = entry.find_all("td")
+
+ if description.contents:
+ description = description.contents[0]
+ else:
+ description = ""
+ package, version = package.a["href"].split("/")[2:]
+ data["index"][(package, version)] = description
+ pkg_uries.append({"uri": self.repo_uri + "pypi/" + package + "/" + version,
+ "parser": self.parse_package_page,
+ "output": package + "-" + version})
+ pkg_uries = self.decode_download_uries(pkg_uries)
+ for uri in pkg_uries:
+ self.process_uri(uri, data)
+
+ return data
+
+ def parse_package_page(self, data_f):
+ soup = bs4.BeautifulSoup(data_f.read())
+ data = {}
+ data["files"] = []
+ data["info"] = {}
+ for table in soup("table")[-1:]:
+ for entry in table("tr")[1:-1]:
+ fields = entry("td")
+
+ FILE = 0
+ URL = 0
+ MD5 = 1
+
+ TYPE = 1
+ PYVERSION = 2
+ UPLOADED = 3
+ SIZE = 4
+
+ file_inf = fields[FILE]("a")[0]["href"].split("#")
+ file_url = file_inf[URL]
+ file_md5 = file_inf[MD5][4:]
+
+ file_type = fields[TYPE].string
+ file_pyversion = fields[PYVERSION].string
+ file_uploaded = fields[UPLOADED].string
+ file_size = fields[SIZE].string
+
+ data["files"].append({"url": file_url,
+ "md5": file_md5,
+ "type": file_type,
+ "pyversion": file_pyversion,
+ "uploaded": file_uploaded,
+ "size": file_size})
+
+ for ul in soup("ul", class_ = "nodot")[:1]:
+ for entry in ul.contents:
+ if not hasattr(entry, "name") or entry.name != "li":
+ continue
+ entry_name = entry("strong")[0].string
+ if not entry_name:
+ continue
+
+ if entry_name == "Categories":
+ data["info"][entry_name] = []
+ for cat_entry in entry("a"):
+ data["info"][entry_name].append(cat_entry.string.split(" :: "))
+ continue
+
+ if entry("span"):
+ data["info"][entry_name] = entry("span")[0].string
+ continue
+
+ if entry("a"):
+ data["info"][entry_name] = entry("a")[0]["href"]
+ continue
+
+ return data
def process_data(self, pkg_db, data, common_config, config):
category = "dev-python"