#!/usr/bin/env python
"""
changes.py - Show Site Changes
Author: Sean B. Palmer, inamidst.com
"""

# cgitb is enabled first so that any later failure is reported by it.
import cgitb
cgitb.enable()

import sys, os, re, bz2
from contextlib import closing

docroot = os.environ.get('DOCUMENT_ROOT', '.')
sys.path.append(os.path.join(docroot, 'dev/code'))
import inventory

# odds/.htlogdir names the log directory; the text may contain the
# placeholder $DOCUMENT_ROOT, which is expanded here.
htlogdir = os.path.join(docroot, 'odds/.htlogdir')
with open(htlogdir) as _htlogdir_file:
    logdir = _htlogdir_file.read()
del _htlogdir_file
logdir = logdir.replace('$DOCUMENT_ROOT', docroot)
logdir = logdir.strip()

# Matches month names of the form "yyyy-mm".
r_month = re.compile(r'^\d{4}-\d{2}$')


def cgipath(default='/changes/'):
    """Return REQUEST_URI with a trailing PATH_INFO removed.

    `default` is used when REQUEST_URI is not set in the environment.
    """
    pathinfo = os.environ.get('PATH_INFO', '')
    requri = os.environ.get('REQUEST_URI', default)
    # An empty PATH_INFO must be skipped: every string ends with '' and
    # requri[:-0] is the empty string, not the whole URI.
    if pathinfo and requri.endswith(pathinfo):
        return requri[:-len(pathinfo)]
    return requri


def dateToFilename(date):
    """Convert a 'yyyy-mm-dd hh:mm:ss' date to its bzip2'd diff filename.

    The result is <logdir>/<yyyy-mm>/diff.<yyyy-mm-dd>.<hhmmss>.bz2
    """
    day, t = date.split(' ', 1)
    norm = day + '.' + t.replace(':', '')
    return os.path.join(logdir, day[:7], 'diff.%s.bz2' % norm)


def diffname(date, path):
    """Return the bz2 diff filename for date if its log mentions path.

    Returns None when the log file is missing or no line of it ends with
    ' www' + path.
    """
    # Parsing from bzip2'd files was too slow, so now this uses logs.
    # (The log files are compiled in bin/changes, i.e. made automatically.)
    #
    # Before: 700 8.580 0.012 13.000 0.019 changes.py:33(diffname)
    # After:  700 0.050 0.000  0.130 0.000 changes.py:33(diffname)
    #    - profile.run('changes.latestAsHTML(15)')
    bz2fn = dateToFilename(date)
    logfn = bz2fn[:-4] + '.log'
    if not os.path.isfile(logfn):
        return None

    suffix = ' www' + path
    with open(logfn) as f:
        for line in f:
            if line.strip('\r\n').endswith(suffix):
                return bz2fn
    return None


def pathToFilename(path):
    """Convert a path to a diff escaped path.

    >>> pathToFilename('/phenny/modules/codepoint.py')
    'phenny_modules_codepoint.py'
    """
    path = path.lstrip('/')
    # Double existing underscores first so they stay distinguishable from
    # the underscores that replace slashes.
    path = path.replace('_', '__')
    return path.replace('/', '_')


def infoToFilename(date, path):
    """Convert a date and path to a diff filename.

    >>> infoToFilename('2006-03-05 00:30:56', '/phenny/modules/codepoint.py')
    '2006-03-05-00:30:56-phenny_modules_codepoint.py.diff'
    """
    date = date.replace(' ', '-')
    path = pathToFilename(path)
    return date + '-' + path + '.diff'


def getdiff(filename):
    """Get a diff using an escaped diff filename, returning lines.

    Returns an iterator over the lines of the diff, or False when the
    monthly archive holds no such (non-empty) member. The archive is
    looked up as diffs/<yyyy-mm>.tar.bz2 relative to the working directory.

    >>> diff = '2006-03-05-00:30:56-phenny_modules_codepoint.py.diff'
    >>> lines = getdiff(diff)
    >>> len(list(lines))
    28
    """
    import tarfile, itertools

    def generator():
        # closing() releases the archive even if the consumer stops early.
        archive = 'diffs/' + filename[:7] + '.tar.bz2'
        with closing(tarfile.open(archive, 'r:bz2')) as tar:
            for tarinfo in tar:
                name = tarinfo.name[8:]  # minus leading "yyyy-mm/"
                if tarinfo.isreg() and (name == filename):
                    with closing(tar.extractfile(tarinfo)) as diff:
                        while True:
                            line = diff.readline()
                            if not line:
                                break
                            yield line
                    break

    # Peek at the first line to tell "no diff" apart from "diff", then
    # put it back in front of the remaining lines.
    lines = generator()
    try:
        first = next(lines)
    except StopIteration:
        return False
    return itertools.chain([first], lines)


def diffsFromArchive(month, epath, start=None, finish=None, files=True):
    """Get diffs from a month and escaped path, with optional bounds.

    Generates (name, file) pairs, newest archive member first. `epath` is
    used as a regular expression that must match the whole escaped path.
    `start` and `finish` are inclusive datetime.date bounds, defaulting to
    1970-01-01 and today. With files=False the second item of each pair
    is None. Each yielded file is closed when the generator is resumed.

    >>> diffs = diffsFromArchive('2006-03', 'phenny_modules_codepoint.py')
    >>> for name, f in diffs:
    ...     print(name)
    ...
    2006-03-27-09:50:45-phenny_modules_codepoint.py.diff
    2006-03-05-00:30:56-phenny_modules_codepoint.py.diff
    """
    import datetime, tarfile  # @@ , fnmatch

    start = start or datetime.date(1970, 1, 1)
    finish = finish or datetime.date.today()
    r_epath = re.compile(r'^%s$' % epath)

    tarfn = 'diffs/' + month + '.tar.bz2'
    if not os.path.exists(tarfn):
        return

    # closing() shuts the archive on every exit path, including the early
    # break below and a generator that is abandoned part-way.
    with closing(tarfile.open(tarfn, 'r:bz2')) as tar:
        for tarinfo in reversed(list(tar)):
            name = tarinfo.name[8:]  # minus leading "yyyy-mm/"
            tardate = datetime.date(*[int(arg) for arg in name[:10].split('-')])

            # Members are visited newest first: skip those after the window
            # and stop at the first one before it.
            if tardate > finish:
                continue
            if tardate < start:
                break

            # if fnmatch.fnmatch(name[20:-5], epath):
            # Strip date and extension
            if not r_epath.match(name[20:-5]):
                continue

            if files:
                with closing(tar.extractfile(tarinfo)) as f:
                    yield name, f
            else:
                yield name, None


def diffExists(filename):
    """Return True if filename is a key of its month's diffs.db database.

    The database is <logdir>/<yyyy-mm>/diffs.db; a missing database means
    the diff does not exist.
    """
    try:
        import anydbm  # Python 2 name
    except ImportError:
        import dbm as anydbm  # the same module under its Python 3 name

    fn = os.path.join(logdir, filename[:7] + '/diffs.db')
    if not os.path.exists(fn):
        return False

    with closing(anydbm.open(fn, 'r')) as db:
        # has_key() is the membership test anydbm documents; objects
        # without it are tested with "in" instead.
        if hasattr(db, 'has_key'):
            return bool(db.has_key(filename))
        return filename in db


def public(exclusions, path):
    """Return False if path starts with any space-separated exclusion.

    Note that an empty token (from an empty string or a doubled space) is
    a prefix of every path and therefore excludes everything.
    """
    return not any(path.startswith(exclude)
                   for exclude in exclusions.split(' '))


class Changeset(object):
    """One changeset parsed from the lines of a change log.

    Header lines ("# Name: value") become lower-cased attributes of the
    instance; the code below relies on date, message and robots being set
    that way. Other non-blank lines are status lines ("M path", "A path",
    "D path") collected into the modified, added and deleted lists.
    """

    def __init__(self, fn, lines):
        self.filename = fn
        self.lines = lines
        self.parse()

    def __str__(self):
        return self.format('html')

    def format(self, media):
        """Return the message followed by the time in parentheses.

        NOTE(review): `media` is not consulted, and the empty trailing
        line looks like the remains of stripped markup -- confirm against
        the canonical source.
        """
        _day, t = self.date.split(' ', 1)
        return '\n'.join([self.message + ' (%s)' % t, ''])

    def parse(self):
        """Populate the header attributes and the three path lists."""
        self.modified = []
        self.added = []
        self.deleted = []
        buckets = {'M': self.modified, 'A': self.added, 'D': self.deleted}
        seenpaths = set()

        for line in self.lines:
            line = line.strip(' \t\r\n')
            if line.startswith('#'):
                attr, value = line.lstrip('# ').split(': ', 1)
                setattr(self, attr.lower(), value)
            elif line:
                rawpath = line.split(' ', 1)[1].strip(' \t\r\n')
                if not rawpath.startswith('/'):
                    rawpath = '/' + rawpath

                # Normalise the path, and filter out .htacces, .pyc, etc.
                path = inventory.pathstrip(rawpath)
                if inventory.ignorable(path):
                    continue

                # Remove duplicates, e.g. /newdir{/} and /newdir/{index.html}
                if path in seenpaths:
                    continue
                seenpaths.add(path)

                # Schema: (rawpath, path, public). A fourth "diff" column
                # via diffname() was disabled in the original code.
                bucket = buckets.get(line[0])
                if bucket is not None:
                    bucket.append((rawpath, path, public(self.robots, path)))

        self.modified.sort()
        self.added.sort()
        self.deleted.sort()

    def public(self):
        """Return True if at least one recorded path is public."""
        entries = self.modified + self.added + self.deleted
        return any(ispublic for (_rawpath, _path, ispublic) in entries)


class Changelog(object):
    """chlog = Changelog(filename, [reverse,] [maximum]) -> new change log.

    chlog.changesets -> the changesets
    chlog.days -> days to changesets mapping, order as reverse karg
    """

    def __init__(self, filename, reverse=False, maximum=None):
        self.filename = filename
        self.reverse = reverse
        self.maximum = maximum
        self.changesets = []
        self.parse()

    def __len__(self):
        """Return the number of changesets with at least one public path."""
        return sum(1 for c in self.changesets if c.public())

    def __str__(self):
        """Return one entry per day that has a public changeset.

        NOTE(review): the original format string had a single %s but was
        given (day, day), which raises TypeError. It reads like a heading
        whose markup was stripped; the day is formatted once here, and the
        per-day markup should be restored from the canonical source.
        """
        lines = []
        for day in sorted(self.days, reverse=bool(self.reverse)):
            if any(changeset.public() for changeset in self.days[day]):
                lines.append('\n\n%s\n\n' % day)
                lines.append('')
        return '\n'.join(lines)

    def parse(self):
        """Read the log file and build self.changesets and self.days."""
        # Each "# Date" line starts a new changeset; anything before the
        # first one is ignored.
        changeLines = []
        with open(self.filename) as changelog:
            for line in changelog:
                if line.startswith('# Date'):
                    changeLines.append([])
                if changeLines:
                    changeLines[-1].append(line)

        for changesetLines in changeLines:
            self.changesets.append(Changeset(self.filename, changesetLines))

        if self.reverse:
            self.changesets.reverse()
        if self.maximum is not None:
            self.changesets = self.changesets[:self.maximum]

        # Group by the "yyyy-mm-dd" prefix of each changeset's date.
        self.days = {}
        for changeset in self.changesets:
            self.days.setdefault(changeset.date[:10], []).append(changeset)


def changesByMonth(month):
    """Return the formatted change log of the given "yyyy-mm" month."""
    changelog = os.path.join(logdir, month + '.log')
    return str(Changelog(changelog))


def monthsBackwards():
    """Generate "yyyy-mm" strings from the current UTC month back to 1900."""
    import datetime

    # Shame that datetime.date can't do UTC
    # Cf. http://swhack.com/logs/2006-05-01#T03-37-14
    today = datetime.datetime.utcnow()
    first = today.replace(day=1)
    twoDays = datetime.timedelta(days=2)
    while True:
        yield first.strftime('%Y-%m')
        # Two days before the first of a month always lies in the previous
        # month; snap back to that month's first day.
        first = (first - twoDays).replace(day=1)
        if first.year < 1900:
            break


def monthLogsBackwards():
    """Generate the months that have a log file, newest month first."""
    logs = set()
    for filename in os.listdir(logdir):
        if filename.endswith('.log') and r_month.match(filename[:-4]):
            logs.add(filename)
    if not logs:
        return

    # Iterating (rather than calling next() by hand) ends cleanly if a log
    # is dated outside the range monthsBackwards() covers.
    for month in monthsBackwards():
        log = month + '.log'
        if log in logs:
            logs.remove(log)
            yield month
            if not logs:
                return


def monthExists(month):
    """Return True if the month's log exists and has a public changeset."""
    changelog = os.path.join(logdir, month + '.log')
    if not os.path.isfile(changelog):
        return False
    return len(Changelog(changelog)) > 0


def latest(n):
    """Return newest-first Changelogs covering about n public changesets.

    Months are read newest first until the public changesets collected
    number at least n or the logs run out.
    """
    changelogs = []
    total = 0  # public changesets collected so far
    for month in monthLogsBackwards():  # was monthsBackwards()
        changelog = os.path.join(logdir, month + '.log')
        if os.path.isfile(changelog):
            chlog = Changelog(changelog, reverse=True, maximum=n - total)
            changelogs.append(chlog)
            total += len(chlog)
        elif changelogs:
            break
        if total >= n:
            break
    return changelogs


def latestAsChangesets(n):
    """Generate the changesets of latest(n), newest day first."""
    for chlog in latest(n):
        for day in sorted(chlog.days, reverse=True):
            for changeset in chlog.days[day]:
                yield changeset


def latestAsHTML(n):
    """Return the formatted change logs of latest(n), joined by newlines."""
    return '\n'.join(str(chlog) for chlog in latest(n))


def genchanges(n=15):
    """Generator of (day, time, message)."""
    for changeset in latestAsChangesets(n):
        if not changeset.public():
            continue
        day, time = changeset.date.split(' ', 1)
        yield day, time, changeset.message


if __name__ == "__main__":
    print(__doc__)