#!/usr/bin/python
import contextlib
import collections
import functools
import itertools
import mmap
import multiprocessing
import operator
import os
import re
import subprocess
import sys
import traceback


@contextlib.contextmanager
def mmap_open(path):
    handle = fd = None
    try:
        fd = os.open(path, os.O_RDONLY)
        handle = mmap.mmap(fd, os.fstat(fd).st_size, mmap.MAP_SHARED,
                           mmap.PROT_READ)
        os.close(fd)
        fd = None
        yield handle
    finally:
        if fd:
            os.close(fd)
        if handle:
            handle.close()


def readline_iterate(handle):
    line = handle.readline()
    while line:
        yield line
        line = handle.readline()


mangler = []
mangler.append(functools.partial(
    re.compile(r"^\(paludis (0.1.*)\)$", re.M | re.I).sub,
    r"Package-Manager: paludis-\1/"))
# Special case not covered by the main portage mangler.
mangler.append(functools.partial(
    re.compile(r'^\(Portage (2\.1\.2[^\)]+)\)$', re.M | re.I).sub,
    r'Package-Manager: portage-\1'))
mangler.append(functools.partial(
    re.compile(r' *\((?:manifest +recommit|(?:un)?signed +manifest +commit)\) *$',
               re.M | re.I).sub,
    r''))


def mangle_portage(match, allowed=frozenset('abcdef0123456789')):
    content = match.group()
    assert isinstance(content, (unicode, str))
    content = content.strip()
    assert ('(', ')') == (content[0], content[-1]), content
    content = content[1:-1]
    values = [x.strip() for x in content.split(',')]
    # portage version: blah
    version = values[0].split(':', 1)[1].strip()
    results = ['Package-Manager: portage-' + version]
    values = [x for x in values if 'unsigned manifest' not in x.lower()]
    repoman = [x for x in values if 'repoman options:' in x.lower()]
    assert len(repoman) <= 1, content
    if repoman:
        repoman = ' '.join(repoman[0].split(':', 1)[1].split())
        results.append('RepoMan-Options: ' + repoman)
    values = [x.lower() for x in values]
    signage = [x for x in values
               if 'key' in x and 'signed' in x and 'unsigned' not in x]
    assert len(signage) <= 1, content
    if signage:
        signage = signage[0].rstrip().rsplit(None, 1)[1]
        if signage.startswith('0x'):
            signage = signage[2:]
        if signage in ('key', 'ultrabug'):
            # Known bad keys; this is why portage needs to do basic
            # enforcement...
            signage = None
        elif signage in ('williamh@gentoo.org', 'w.d.hubbs@gmail.com'):
            signage = '30C46538'
        elif signage.endswith('!'):
            assert allowed.issuperset(signage[:-1]), content
        else:
            assert allowed.issuperset(signage), content
        if signage:
            results.append('Manifest-Sign-Key: 0x' + signage.upper())
    return "\n".join(results)

# the TM/R is for crap like this:
# (Portage version: 2.2_pre7/cvs/Linux 2.6.25.4 Intel(R) Core(TM)2 Duo CPU E6750 @ 2.66GHz)
mangler.append(functools.partial(
    re.compile(r'^\(portage version: +(?:\((?:tm|r)\)|[^\)\n])+\)$',
               re.M | re.I).sub,
    mangle_portage))
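
# A minimal sketch of what the mangler stack does to a commit message
# (defined for illustration only; nothing in this script calls it, and the
# sample message is fabricated).
def _example_mangle():
    msg = ("Fix building against glibc-2.10\n"
           "(Portage version: 2.2.0/cvs/Linux x86_64, "
           "signed Manifest commit with key 0xDEADBEEF)\n")
    for func in mangler:
        msg = func(msg)
    # msg now reads:
    #   Fix building against glibc-2.10
    #   Package-Manager: portage-2.2.0/cvs/Linux x86_64
    #   Manifest-Sign-Key: 0xDEADBEEF
    return msg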
known_footers = ('Package-Manager', 'RepoMan-Options', 'Manifest-Sign-Key')
fields = ('author', 'msg', 'files', 'timestamp', 'footerless_msg')
fields_map = dict((attr, idx) for idx, attr in enumerate(fields))
fake_fields = ('footerless_msg', 'timestamp')
file_idx = fields_map['files']


class record(collections.namedtuple('record', fields)):

    def safe_combine(self, other):
        files = self.files.copy()
        assert not set(files).intersection(other.files), (files, other.files)
        files.update(other.files)
        items = list(self)
        items[file_idx] = files
        return self.__class__(*items)

    def update_files(self, other):
        files = self.files.copy()
        files.update(other.files if isinstance(other, record) else other)
        items = list(self)
        items[file_idx] = files
        return self.__class__(*items)

    @staticmethod
    def calculate_footerless_msg(msg):
        return tuple(x for x in msg.splitlines()
                     if x.split(':', 1)[0] not in known_footers)


def deserialize_records(source, blob_idx):
    line = source.readline()
    while line:
        while line.split()[0] in ('reset', 'progress'):
            line = source.readline()
        # First get the free form fields; stop after we get the commit msg.
        assert line.split()[0] == 'commit', line
        d = {}
        while True:
            line = source.readline()
            chunks = line.split(None, 1)
            assert len(chunks) == 2, line
            if chunks[0] in ('from', 'mark'):
                continue
            assert chunks[0] in ('author', 'committer', 'data')
            if chunks[0] != 'data':
                d[chunks[0]] = chunks[1].strip()
                continue
            # Process the commit message...
            size = int(chunks[1])
            data = source.read(size)
            assert len(data) == size, (line, data)
            for func in mangler:
                data = func(data)
            # Throw away the leading/trailing whitespace- some of our manglers
            # unfortunately leave it behind. For fast-export reasons a trailing
            # newline is needed- but that should be it.
            d['msg'] = data.strip() + "\n"
            line = source.readline()
            # Note that cvs2git writes slightly funky data statements; the
            # byte count doesn't necessarily include the trailing newline.
            if line == '\n':
                line = source.readline()
            break
        assert line
        # From can show up here on occasion... annoying.
        if line.split()[0:1] == ['from']:
            line = source.readline()
        files = {}
        while line != '\n':
            # Two types I can spot; M=modify, and D=delete.
            assert line[-1] == '\n'
            line = line[:-1]
            mode = line.split(None, 1)
            assert len(mode) == 2, line
            if mode[0] == 'D':
                files[intern(os.path.normpath(mode[1]))] = (mode[0], line)
            elif mode[0] == 'M':
                # M 100644 e8b9ed651c6209820779382edee2537209aba4ae dev-cpp/gtkmm/ChangeLog
                # if it's not a sha1, but startswith ':'... then it's an index.
                chunks = line.split(None, 4)
                assert len(chunks) == 4, line
                fname = intern(os.path.normpath(chunks[3]))
                if chunks[2][0] == ':':
                    line = ' '.join(chunks[:2] + [blob_idx[int(chunks[2][1:])], fname])
                files[fname] = (mode[0], line)
            else:
                raise AssertionError("got unknown file op: mode=%r, line:\n%r"
                                     % (mode[0], line))
            line = source.readline()
        d['files'] = files
        # Basic sanity check for the code above...
        d.setdefault('author', d.get('committer'))
        assert d['author'] is not None
        assert d['author'] == d['committer'], d
        d.pop('committer')
        # Skank the timestamp out...
        chunks = d['author'].rsplit(None, 1)
        assert len(chunks) == 2 and chunks[1] == '+0000', d['author']
        chunks = chunks[0].rsplit(None, 1)
        d['timestamp'] = long(chunks[1])
        d['author'] = intern(chunks[0])
        d['footerless_msg'] = record.calculate_footerless_msg(d['msg'])
        assert set(fields).issuperset(d), d
        # Bleh... of course namedtuple doesn't make this easy.
        yield record(*[d.get(x) for x in fields])
        line = source.readline()
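
# For reference- the cvs2git dumps consumed above and the stream emitted by
# serialize_records below are both git fast-import format. A single commit
# looks roughly like this (sketch; names, sizes, and sha1s are fabricated):
#
#   commit refs/heads/master
#   mark :42
#   committer Foo Bar <foo@gentoo.org> 1234567890 +0000
#   data 21
#   <21 bytes of message>
#   M 100644 :41 dev-cpp/gtkmm/ChangeLog
#   D dev-cpp/gtkmm/stale-file
#
# The ':41' form is a cvs2git blob index reference; deserialize_records
# resolves it to a real sha1 via the mapping built by deserialize_blob_map
# below.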
def serialize_records(records, handle, target='refs/heads/master', progress=100):
    write = handle.write
    write('reset %s\n' % target)
    total = len(records)
    total_len = len(str(total))
    progress_interval = max(1, total // progress)
    for idx, record in enumerate(
            sorted(records, key=lambda x: (x.timestamp, sorted(x.files),
                                           x.author, x.footerless_msg)), 1):
        if idx % progress_interval == 0:
            write('progress %s%%: %s of %i commits\n'
                  % (str(int(100 * (float(idx) / total))).rjust(2),
                     str(idx).rjust(total_len), total))
        write('commit %s\n' % target)
        write('mark :%i\n' % idx)
        # fields = ('mark', 'author', 'committer', 'msg', 'files')
        for name, value in zip(fields, record):
            if name in fake_fields:
                continue
            elif name == 'mark':
                write("%s %s\n" % (name, value))
            elif name == 'author':
                val = "%s %i +0000" % (value, record.timestamp)
                write('author %s\ncommitter %s\n' % (val, val))
            elif name == 'msg':
                write("data %i\n%s" % (len(value), value))
            elif name == 'files':
                for filename in sorted(value):
                    write("%s\n" % (value[filename][1],))
            else:
                raise AssertionError(
                    "serialize is out of sync; don't know field %s" % name)
        write("\n")


def deserialize_blob_map(path):
    with mmap_open(path) as handle:
        source = (x.strip().split() for x in readline_iterate(handle))
        return dict((int(x[0].lstrip(':')), x[1]) for x in source)


def simple_dedup(records):
    # dedup via timestamp/author/msg
    dupes = collections.defaultdict(list)
    for idx, record in enumerate(records):
        dupes[(record.timestamp, record.author,
               record.footerless_msg)].append((idx, record))
    mangled = []
    for key, value in dupes.iteritems():
        if len(value) == 1:
            continue
        value.sort(key=operator.itemgetter(0))
        combined = value[0][1]
        for idx, item in value[1:]:
            combined = combined.safe_combine(item)
        value[:] = [(value[0][0], combined)]
        mangled.append((key, value))
    l = itertools.imap(operator.itemgetter(0), dupes.itervalues())
    return itertools.imap(operator.itemgetter(1),
                          sorted(l, key=operator.itemgetter(0)))


def manifest_dedup(records, backwards=(5 * 60)):
    # While searching back 5 minutes is a bit much... it's happened more often
    # than one might think, sadly.
    slots = collections.defaultdict(list)
    for idx, record in enumerate(records):
        if len(record.files) != 1:
            slots[record.timestamp].append((idx, record))
            continue
        manifest = record.files.items()[0]
        # if it's a deletion, we don't care...
        if not manifest[0].endswith('/Manifest') or manifest[1][0] == 'D':
            slots[record.timestamp].append((idx, record))
            continue
        manifest_dir = os.path.dirname(manifest[0])
        update = True
        for timestamp in xrange(record.timestamp,
                                max(0, record.timestamp - backwards), -1):
            potential = slots.get(timestamp)
            if potential is None:
                continue
            for update_pos, (idx, target) in enumerate(reversed(potential), 1):
                # While intersecting pathways first is slower... we do it this
                # way so that we can spot if another author stepped in for a
                # directory- if that occurs, manifest recommit or not, we
                # shouldn't mangle that history.
                if all(manifest_dir != os.path.dirname(x)
                       for x in target.files):
                    continue
                if (target.author == record.author
                        and target.footerless_msg == record.footerless_msg):
                    # same author/msg; allow the combination.
                    potential[0 - update_pos] = (idx, target.update_files(record))
                    update = False
                # note if author/msg didn't match, this becomes a forced
                # injection.
                break
        if update:
            slots[record.timestamp].append((idx, record))
    # And... do the collapse.
    l = []
    for value in slots.itervalues():
        l.extend(value)
    # Sort by idx, but strip idx on the way out.
    return itertools.imap(operator.itemgetter(1),
                          sorted(l, key=operator.itemgetter(0)))
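
# Minimal sketch of the combination the dedup passes rely on (defined for
# illustration only; nothing calls it). The records are fabricated, but show
# the shape: identical author/timestamp/footerless_msg, disjoint file sets.
def _example_combine():
    a = record('Foo Bar <foo@gentoo.org>', 'fix compilation\n',
               {'cat/pkg/pkg-1.ebuild':
                    ('M', 'M 100644 <sha1> cat/pkg/pkg-1.ebuild')},
               1234567890, ('fix compilation',))
    b = record('Foo Bar <foo@gentoo.org>', 'fix compilation\n',
               {'cat/pkg/Manifest':
                    ('M', 'M 100644 <sha1> cat/pkg/Manifest')},
               1234567890, ('fix compilation',))
    # One merged commit touching both files; safe_combine asserts the file
    # sets don't overlap first.
    return a.safe_combine(b)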
def get_blob(sha1):
    return subprocess.check_output(['git', 'show', sha1], cwd='git')


def process_record(data):
    try:
        return _process_record(data)
    except Exception:
        return traceback.format_exc()


def _process_record(data):
    idx, manifests, record = data
    rewritten_record = record
    for fname, data in manifests:
        # Hacky, but it's just a test..
        chunked = data[1].split()
        sha1 = chunked[2]
        blob = get_blob(sha1)
        if '-----BEGIN PGP SIGNATURE-----' in blob:
            continue
        # Don't touch any old v1 manifests...
        blob = [x for x in blob.splitlines() if x]
        if not blob:
            # Empty manifest? The hell?
            continue
        if any(x.startswith('MD5') for x in blob):
            continue
        blob2 = [x for x in blob if x.startswith('DIST')]
        if not blob or blob2 != blob:
            if blob2:
                p = subprocess.Popen(
                    ['git', 'hash-object', '-w', '--stdin', '--path', fname],
                    cwd='git', stdout=subprocess.PIPE, stdin=subprocess.PIPE)
                stdout, _ = p.communicate("\n".join(blob2))
                assert p.wait() == 0
                new_sha1 = stdout.strip()
                assert len(new_sha1) == 40, new_sha1
                rewritten_record = rewritten_record.update_files(
                    {fname: (data[0], " ".join(chunked[:2] + [new_sha1, fname]))})
            else:
                rewritten_record = rewritten_record.update_files({})
                del rewritten_record.files[fname]
    if rewritten_record is not record:
        return (idx, rewritten_record)
    return None


def thin_manifest_conversion(records, processing_pool):
    potentials = []
    for idx, record in enumerate(records):
        manifests = [(fname, data) for fname, data in record.files.iteritems()
                     if fname.endswith('/Manifest') and data[0] != 'D']
        if manifests:
            potentials.append((idx, manifests, record))
    rewrites = deletes = 0
    for result in processing_pool.imap_unordered(
            process_record, potentials, chunksize=30):
        if result is not None:
            if not isinstance(result, tuple):
                raise Exception(result)
            idx, value = result
            if not value.files:
                # Just drop the commit.
                value = None
                deletes += 1
            else:
                rewrites += 1
            records[idx] = value
    sys.stderr.write("potential:%i, deletes:%i, rewrites:%i\n"
                     % (len(potentials), deletes, rewrites))
    return itertools.ifilter(None, records)


def process_directory(paths):
    commit_path, idx_path = paths
    with mmap_open(commit_path) as data:
        return tuple(manifest_dedup(
            deserialize_records(data, deserialize_blob_map(idx_path))))
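
# A minimal sketch of the filtering _process_record applies to a manifest
# blob (defined for illustration only; never called- the entries below are
# fabricated). Only DIST lines survive; the other entry types are
# recomputable from the tree, which is what makes the result "thin".
def _example_thin_manifest():
    blob = ("AUX fix-build.patch 1024 SHA256 <hash>\n"
            "DIST pkg-1.0.tar.gz 567890 SHA256 <hash>\n"
            "EBUILD pkg-1.0.ebuild 2048 SHA256 <hash>\n")
    lines = [x for x in blob.splitlines() if x]
    return [x for x in lines if x.startswith('DIST')]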
def main(argv):
    # allocate the pool now, before we start getting memory abusive; this is
    # used for thin-manifest conversion if active/enabled.
    #clean_pool = multiprocessing.Pool()
    # Be careful here to just iterate over source; doing so allows this script
    # to do basic processing as it goes (specifically while it's being fed from
    # the mainline cvs2git parallelized repo creator).
    source = argv
    if not argv:
        # See the python manpage for details; stdin buffers if you iterate
        # over it, and we want each line as it becomes available- thus use
        # this form.
        source = readline_iterate(sys.stdin)

    def consumable():
        for directory in source:
            directory = directory.strip()
            tmp = os.path.join(directory, 'cvs2svn-tmp')
            commits = os.path.join(tmp, 'git-dump.dat')
            if not os.path.exists(commits):
                sys.stderr.write("skipping %s; no commit data\n" % directory)
                sys.stderr.flush()
                continue
            yield (commits, os.path.join(tmp, 'git-blob.idx'))

    records = []
    record_generator = multiprocessing.Pool()
    for result in record_generator.imap_unordered(process_directory, consumable()):
        records.extend(result)
    record_generator.close()
    record_generator.join()
    del record_generator
    sys.stderr.write("All commits loaded... starting dedup runs\n")
    sys.stderr.flush()
    sorter = operator.attrgetter('timestamp')
    # Get them into timestamp order first; this is abusing python's stable
    # sort, pretty much- any commits to the same repo w/ the same timestamp
    # keep their original relative ordering (just that chunk gets moved).
    # This allows us to combine the histories w/out losing the per-repo
    # ordering.
    records.sort(key=sorter)
    records[:] = simple_dedup(records)
    # records[:] = manifest_dedup(records)
    # records[:] = thin_manifest_conversion(records, clean_pool)
    serialize_records(records, sys.stdout)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
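
# Usage sketch (the script name, paths, and target repo are illustrative, not
# prescriptive): each argument- or each stdin line, when run with no
# arguments- names a cvs2git output directory containing
# cvs2svn-tmp/git-dump.dat and cvs2svn-tmp/git-blob.idx. The merged,
# deduplicated stream lands on stdout, ready for git fast-import, e.g.:
#
#   ls -d /var/tmp/conversion/* | ./rewrite-commit-dump.py \
#       | (cd git && git fast-import)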