aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMart Raudsepp <leio@gentoo.org>2016-12-07 03:56:00 +0200
committerMart Raudsepp <leio@gentoo.org>2016-12-07 03:56:00 +0200
commitdde4a3a9c8fbe76897219886f21d046392d65730 (patch)
treed780e88e7e3d48921b46d8db698a48785f86538c
parentmodels: Add package maintainers relationship table and ORM relationships (diff)
downloadgrumpy-dde4a3a9c8fbe76897219886f21d046392d65730.tar.gz
grumpy-dde4a3a9c8fbe76897219886f21d046392d65730.tar.bz2
grumpy-dde4a3a9c8fbe76897219886f21d046392d65730.zip
sync: Add package description and maintainers sync
Maintains a sync timestamp to skip recently synced packages, so if a previous run got stuck, we can skip re-doing it too soon. Saves the DB transaction after every 100 packages, because packages.g.o seems to rate-limit us, so at least we will have things saved into DB periodically to cancel out when we get stuck and restart.
-rw-r--r--backend/lib/sync.py49
1 files changed, 45 insertions, 4 deletions
diff --git a/backend/lib/sync.py b/backend/lib/sync.py
index e53fa9b..567da2d 100644
--- a/backend/lib/sync.py
+++ b/backend/lib/sync.py
@@ -1,8 +1,11 @@
import xml.etree.ElementTree as ET
import requests
+import time
+from datetime import datetime
from .. import app, db
from .models import Category, Maintainer, Package, PackageVersion
+SYNC_BUFFER_SECS = 30*60
proj_url = "https://api.gentoo.org/metastructure/projects.xml"
pkg_url_base = "https://packages.gentoo.org/"
http_session = requests.session()
@@ -144,11 +147,49 @@ def sync_packages():
db.session.commit()
def sync_versions():
- for package in Package.query.all():
+ cnt = 0
+ ts = datetime.utcfromtimestamp(time.time() - SYNC_BUFFER_SECS)
+ now = datetime.utcnow()
+ existing_maintainers = {}
+ for maintainer in Maintainer.query.all():
+ existing_maintainers[maintainer.email] = maintainer
+
+ for package in Package.query.filter(Package.last_sync_ts < ts).all():
+ cnt += 1
data = http_session.get(pkg_url_base + "packages/" + package.full_name + ".json")
if not data:
print("No JSON data for package %s" % package.full_name) # FIXME: Handle better; e.g mark the package as removed if no pkgmove update
continue
- from pprint import pprint
- pprint(data.json())
- break
+
+ pkg = data.json()
+
+ print ("Updating package: %s" % package.full_name)
+ if 'description' in pkg:
+ package.description = pkg['description']
+
+ maintainers = []
+ if 'maintainers' in pkg:
+ for maint in pkg['maintainers']:
+ if 'email' not in maint:
+ print("WARNING: Package %s was told to have a maintainer without an e-mail identifier" % package.full_name)
+ continue
+ if maint['email'] in existing_maintainers: # FIXME: Some proxy-maintainers are using mixed case e-mail address, right now we'd be creating duplicates right now if the case is different across different packages
+ maintainers.append(existing_maintainers[maint['email']])
+ else:
+ is_project = False
+ if 'type' in maint and maint['type'] == 'project':
+ is_project = True
+ print("Adding %s maintainer %s" % ("project" if is_project else "individual", maint['email']))
+ new_maintainer = Maintainer(email=maint['email'], is_project=is_project, name=maint['name'] if 'name' in maint else None)
+ db.session.add(new_maintainer)
+ existing_maintainers[maint['email']] = new_maintainer
+ maintainers.append(new_maintainer)
+
+ # Intentionally outside if 'maintainers' in pkg, because if there are no maintainers in JSON, it's falled to maintainer-needed and we need to clean out old maintainer entries
+ package.maintainers = maintainers # TODO: Retain order to know who is primary; retain description associated with the maintainership
+ package.last_sync_ts = now
+
+ if not cnt % 100:
+ print("%d packages updated, committing DB transaction" % cnt)
+ db.session.commit()
+ now = datetime.utcnow()