#!/usr/bin/env python
"""
changes.py - Show Site Changes
Author: Sean B. Palmer, inamidst.com
"""

import cgitb; cgitb.enable()
import sys, os, re, bz2

# Site root as reported by the web server; falls back to the current
# directory when DOCUMENT_ROOT is not set.
docroot = os.environ.get('DOCUMENT_ROOT', '.')
# The module search path is extended before importing inventory
# (presumably inventory.py lives under dev/code -- verify).
sys.path.append(os.path.join(docroot, 'dev/code'))
import inventory

# odds/.htlogdir contains the path of the log directory. A literal
# "$DOCUMENT_ROOT" in that file is replaced with docroot, and the
# surrounding whitespace is stripped.
htlogdir = os.path.join(docroot, 'odds/.htlogdir')
_htlogfile = open(htlogdir)
# Close the handle explicitly rather than relying on garbage collection.
try: logdir = _htlogfile.read()
finally: _htlogfile.close()
logdir = logdir.replace('$DOCUMENT_ROOT', docroot)
logdir = logdir.strip()

# Matches a bare month name such as "2006-03".
r_month = re.compile(r'^\d{4}-\d{2}$')

def cgipath(default='/changes/'):
   """Return the request URI with any trailing PATH_INFO removed.

   REQUEST_URI is read from the environment, falling back to default
   when it is unset. If PATH_INFO is non-empty and the URI ends with
   it, that suffix is cut off; otherwise the URI is returned whole.
   """
   pathinfo = os.environ.get('PATH_INFO', '')
   requri = os.environ.get('REQUEST_URI', default)
   # An empty PATH_INFO must be skipped: every string ends with '', and
   # requri[:-0] is requri[:0], which would return '' instead of the URI.
   if pathinfo and requri.endswith(pathinfo):
      return requri[:-len(pathinfo)]
   return requri

def dateToFilename(date):
   """Map a date such as '2006-03-05 00:30:56' to its bzip2'd diff path.

   The date is split at its first space into a day and a time. The
   result is <logdir>/<first 7 chars of day>/diff.<day>.<time>.bz2,
   where the time has its colons removed.
   """
   day, clock = date.split(' ', 1)
   stamp = '%s.%s' % (day, clock.replace(':', ''))
   month = day[:7]
   return os.path.join(logdir, month, 'diff.%s.bz2' % stamp)

def diffname(date, path):
   """Return the bz2 diff filename for date if its log mentions path.

   The .log file sitting next to the .bz2 archive is scanned for a line
   ending in ' www' + path. Returns the archive filename on a match,
   and None when the log is missing or has no such line.
   """
   # Parsing from bzip2'd files was too slow, so now this uses logs.
   # (The log files are compiled in bin/changes, i.e. made automatically.)
   #
   # Before: 700 8.580 0.012 13.000 0.019 changes.py:33(diffname)
   # After: 700 0.050 0.000 0.130 0.000 changes.py:33(diffname)
   # - profile.run('changes.latestAsHTML(15)')

   archive = dateToFilename(date)
   logname = archive[:-4] + '.log' # swap the ".bz2" suffix for ".log"

   if not os.path.isfile(logname):
      return None

   suffix = ' www' + path
   result = None
   log = open(logname)
   for entry in log:
      if entry.strip('\r\n').endswith(suffix):
         result = archive
         break
   log.close()
   return result

def pathToFilename(path):
   """Escape a site path for use inside a diff filename.

   Leading slashes are dropped, each underscore is doubled, and the
   remaining slashes then become single underscores.

   >>> pathToFilename('/phenny/modules/codepoint.py')
   'phenny_modules_codepoint.py'

   """
   relative = path.lstrip('/')
   return relative.replace('_', '__').replace('/', '_')

def infoToFilename(date, path):
   """Build the diff filename for a change to path made at date.

   Spaces in the date become hyphens, the path is escaped with
   pathToFilename, and the two are joined as <date>-<path>.diff.

   >>> infoToFilename('2006-03-05 00:30:56', '/phenny/modules/codepoint.py')
   '2006-03-05-00:30:56-phenny_modules_codepoint.py.diff'

   """
   return '%s-%s.diff' % (date.replace(' ', '-'), pathToFilename(path))

def getdiff(filename):
   """Get a diff using an escaped diff filename, returning lines.

   The diff is looked up in diffs/<yyyy-mm>.tar.bz2, relative to the
   current directory, where <yyyy-mm> is the first seven characters of
   filename. Returns an iterator over the lines of the matching
   regular archive member, or False when there is no such member.
   Errors raised while opening the archive propagate to the caller.

   >>> diff = '2006-03-05-00:30:56-phenny_modules_codepoint.py.diff'
   >>> lines = getdiff(diff)
   >>> len(list(lines))
   28

   """
   import tarfile, itertools

   def generator():
      tar = tarfile.open('diffs/' + filename[:7] + '.tar.bz2', 'r:bz2')
      # try/finally so that the archive and the member are released
      # even when the consumer abandons the iterator part-way through.
      try:
         for tarinfo in tar:
            name = tarinfo.name[8:] # minus leading "yyyy-mm/"
            if tarinfo.isreg() and (name == filename):
               diff = tar.extractfile(tarinfo)
               try:
                  while True:
                     line = diff.readline()
                     if not line: break
                     yield line
               finally: diff.close()
               break
      finally: tar.close()

   # Peek at the first line to tell "no such diff" apart from a real
   # one, then hand back an iterator that still starts with that line.
   lines = generator()
   for first in lines:
      return itertools.chain([first], lines)
   return False

def diffsFromArchive(month, epath, start=None, finish=None, files=True):
   """Get diffs from a month and escaped path, with optional bounds.

   Generator over (name, file) pairs taken from diffs/<month>.tar.bz2,
   relative to the current directory, walking the archive from its
   last member to its first. Nothing is yielded if the archive does
   not exist.

   month  -- archive name, e.g. '2006-03'
   epath  -- escaped path; it is interpolated into a regular expression
             anchored at both ends, so regex metacharacters in it are live
   start  -- earliest datetime.date to include (default 1970-01-01)
   finish -- latest datetime.date to include (default today)
   files  -- when true, yield an open file object for each member (it
             is closed once the consumer asks for the next item);
             when false, yield None in its place

   Members dated after finish are skipped. The first member dated
   before start ends the scan, which presumes that the archive stores
   its members in chronological order. Non-regular members, such as
   directory entries, are ignored.

   >>> diffs = diffsFromArchive('2006-03', 'phenny_modules_codepoint.py')
   >>> for name, f in diffs: 
   ...    print name
   ...
   2006-03-27-09:50:45-phenny_modules_codepoint.py.diff
   2006-03-05-00:30:56-phenny_modules_codepoint.py.diff

   """
   import re, datetime, tarfile # @@ , fnmatch

   start = start or datetime.date(1970, 1, 1)
   finish = finish or datetime.date.today()

   r_epath = re.compile(r'^%s$' % epath)
   tarfn = 'diffs/' + month + '.tar.bz2'
   if not os.path.exists(tarfn): return
   tar = tarfile.open(tarfn, 'r:bz2')
   # try/finally: the archive is closed however the generator ends,
   # including the early break below and an abandoned iteration.
   try:
      for tarinfo in reversed(list(tar)):
         # Directory entries and the like carry no date to parse.
         if not tarinfo.isreg(): continue

         name = tarinfo.name[8:] # minus leading "yyyy-mm/"
         args = tuple(int(arg) for arg in name[:10].split('-'))
         tardate = datetime.date(*args)

         # Walking newest to oldest: too-new members are stepped over,
         # and the first too-old member means nothing further can match.
         if tardate > finish: continue
         if tardate < start: break

         # if fnmatch.fnmatch(name[20:-5], epath): # Strip date and extension
         if r_epath.match(name[20:-5]):
            if files:
               f = tar.extractfile(tarinfo)
               try: yield name, f
               finally: f.close()
            else: yield name, None
   finally: tar.close()

def diffExists(filename):
   """Report whether filename is a key in its month's diffs.db.

   The database is <logdir>/<first 7 chars of filename>/diffs.db.
   Returns False when that file does not exist or lacks the key, and
   True otherwise.
   """
   import anydbm
   dbname = os.path.join(logdir, filename[:7] + '/diffs.db')
   if not os.path.exists(dbname):
      return False

   # Opened read-only, and closed again before the answer is returned.
   db = anydbm.open(dbname, 'r')
   found = db.has_key(filename)
   db.close()
   if found:
      return True
   return False

def public(exclusions, path):
   """Return True unless path starts with one of the excluded prefixes.

   exclusions is a string of path prefixes separated by single spaces.
   Empty entries, which arise from an empty string or from consecutive
   spaces, are ignored: an empty prefix matches every path and would
   otherwise mark everything as non-public.
   """
   for exclude in exclusions.split(' '):
      if exclude and path.startswith(exclude):
         return False
   return True

class Changeset(object):
   """One changelog entry: its header fields and the paths it touched.

   Header lines of the form '# Name: value' become lower-cased
   attributes on the instance; the methods below read `date`,
   `message` and `robots` from among them. Every other non-blank line
   is read as a status word followed by a path, and is filed under
   `modified` (M), `added` (A) or `deleted` (D) as a
   (rawpath, path, public) tuple.
   """

   def __init__(self, fn, lines):
      self.filename = fn
      self.lines = lines
      self.parse()

   def __str__(self):
      return self.format('html')

   def format(self, media):
      """Render the changeset as an HTML fragment.

      The message and time come first, then a <ul> listing the public
      modified, added and deleted paths, in that order. The media
      argument is accepted but not consulted.
      """
      day, clock = self.date.split(' ', 1)
      out = [self.message + ' (%s)' % clock]
      out.append('<ul>')

      # Per-file diff links are switched off; the test used to be
      # diffExists(fn).
      linkDiffs = False
      for (rawpath, path, visible) in self.modified:
         fn = infoToFilename(self.date, rawpath)
         diff = ''
         if linkDiffs:
            diff = ' (<a rel="nofollow" href="%s">diff</a>)' % fn
         if visible:
            out.append(' <li><a href="%s">%s</a>%s</li>' % (path, path, diff))

      for (rawpath, path, visible) in self.added:
         if visible:
            item = ' <li><a href="%s">%s</a> <strong>New!</strong></li>'
            out.append(item % (path, path))

      for (rawpath, path, visible) in self.deleted:
         if visible:
            out.append(' <li>%s Removed</li>' % path)

      out.append('</ul>')
      return '\n'.join(out)

   def parse(self):
      """Populate the header attributes and the three path lists."""
      self.modified = []
      self.added = []
      self.deleted = []
      # First character of a path line -> the list that receives it.
      buckets = {'M': self.modified, 'A': self.added, 'D': self.deleted}

      seenpaths = set()
      for raw in self.lines:
         line = raw.strip(' \t\r\n')
         if not line:
            continue

         if line.startswith('#'):
            # Header: "# Attr: value" is stored as self.attr = value
            attr, value = line.lstrip('# ').split(': ', 1)
            setattr(self, attr.lower(), value)
            continue

         # Schema: (rawpath, path, public, diff)
         rawpath = line.split(' ', 1)[1].strip(' \t\r\n')
         if not rawpath.startswith('/'):
            rawpath = '/' + rawpath

         # Normalise the path, and filter out .htaccess, .pyc, etc.
         path = inventory.pathstrip(rawpath)
         if inventory.ignorable(path):
            continue

         # Remove duplicates, e.g. /newdir{/} and /newdir/{index.html}
         if path in seenpaths:
            continue
         seenpaths.add(path)

         # Here "public" is the module-level function, not the method.
         bucket = buckets.get(line[:1])
         if bucket is not None:
            bucket.append((rawpath, path, public(self.robots, path)))

      for bucket in (self.modified, self.added, self.deleted):
         bucket.sort()

   def public(self):
      """Return True if at least one listed path is flagged public."""
      everything = self.modified + self.added + self.deleted
      for entry in everything:
         if entry[2]:
            return True
      return False

class Changelog(object):
   """chlog = Changelog(filename, [reverse,] [maximum]) -> new change log.
      chlog.changesets -> the changesets
      chlog.days -> days to changesets mapping, order as reverse karg

   The file is read once, at construction. len(chlog) counts only the
   changesets that report themselves public, and str(chlog) renders
   one HTML section per day that has at least one public changeset.
   """

   def __init__(self, filename, reverse=False, maximum=None):
      self.filename = filename
      self.reverse = reverse
      self.maximum = maximum
      self.changesets = []
      self.parse()

   def __len__(self):
      return sum(1 for c in self.changesets if c.public())

   def __str__(self):
      lines = []
      # Sorting the dict itself sorts its keys; this avoids the
      # Python-2-only iterkeys().
      for day in sorted(self.days, reverse=bool(self.reverse)):
         shown = [c for c in self.days[day] if c.public()]
         if not shown:
            continue # a day with nothing public gets no heading

         lines.append('<h2 id="T%s">%s</h2>' % (day, day))
         lines.append('<ul>')
         for changeset in shown:
            lines.append(' <li>%s</li>' % changeset)
         lines.append('</ul>')
      return '\n'.join(lines)

   def parse(self):
      """Read the file and rebuild self.changesets and self.days.

      Each line starting with '# Date' opens a new changeset; lines
      before the first such line are discarded. The changeset list is
      reversed when self.reverse is set, then cut to self.maximum
      entries (counted before any public filtering) when that is not
      None. Calling parse() again starts from scratch instead of
      appending duplicates.
      """
      changeLines = []
      changelog = open(self.filename)
      # try/finally so the handle is released even if reading fails.
      try:
         for line in changelog:
            if line.startswith('# Date'):
               changeLines.append([])
            if changeLines:
               changeLines[-1].append(line)
      finally: changelog.close()

      self.changesets = [
         Changeset(self.filename, changesetLines)
         for changesetLines in changeLines
      ]
      if self.reverse:
         self.changesets.reverse()
      if self.maximum is not None:
         self.changesets = self.changesets[:self.maximum]

      # The first ten characters of a changeset's date are its day.
      self.days = {}
      for changeset in self.changesets:
         self.days.setdefault(changeset.date[:10], []).append(changeset)

def changesByMonth(month):
   """Return the HTML rendering of <logdir>/<month>.log."""
   return str(Changelog(os.path.join(logdir, month + '.log')))

def monthsBackwards():
   """Yield 'YYYY-MM' strings from the current UTC month backwards.

   The sequence is finite: its last item is '1900-01'.
   """
   import datetime
   # Shame that datetime.date can't do UTC
   # Cf. http://swhack.com/logs/2006-05-01#T03-37-14
   now = datetime.datetime.utcnow()
   year, month = now.year, now.month

   while True:
      yield '%04d-%02d' % (year, month)
      # Step to the previous month, wrapping January into December.
      month -= 1
      if month == 0:
         year, month = year - 1, 12
      if year < 1900: break

def monthLogsBackwards():
   """Yield, newest first, each month that has a YYYY-MM.log in logdir.

   Candidate months come from monthsBackwards(), and the generator
   stops as soon as every log file found has been yielded. A log whose
   month monthsBackwards() never produces (one dated in the future,
   for instance) is never yielded; the generator simply ends when the
   month iterator runs out.
   """
   logs = set()
   for filename in os.listdir(logdir):
      if filename.endswith('.log') and r_month.match(filename[:-4]):
         logs.add(filename)

   if not logs: return
   # A plain for loop ends cleanly when the month iterator is exhausted,
   # rather than depending on StopIteration escaping from a .next() call.
   for month in monthsBackwards():
      log = month + '.log'
      if log in logs:
         logs.remove(log)
         yield month
         if not logs: return

def monthExists(month):
   """Return True if the month's log exists and has a public changeset."""
   changelog = os.path.join(logdir, month + '.log')
   if os.path.isfile(changelog):
      # len() of a Changelog counts only its public changesets.
      return len(Changelog(changelog)) > 0
   return False

def latest(n):
   """Return the newest Changelogs, gathered month by month.

   Months are taken from monthLogsBackwards(), and collection stops
   once the gathered logs hold n or more public changesets. Each
   Changelog is loaded newest first and capped at however many
   changesets were still wanted when it was loaded.
   """
   def publicCount():
      return sum(len(c) for c in changelogs)

   changelogs = []
   for month in monthLogsBackwards(): # was monthsBackwards()
      remaining = n - publicCount()

      changelog = os.path.join(logdir, month + '.log')
      if os.path.isfile(changelog):
         changelogs.append(Changelog(changelog, reverse=True, maximum=remaining))
      elif changelogs:
         # A missing month after something was collected ends the run.
         break

      if publicCount() >= n:
         break

   return changelogs

def latestAsChangesets(n):
   """Yield the changesets of latest(n), walking each log's days newest first."""
   for chlog in latest(n):
      for day in sorted(chlog.days, reverse=True):
         for changeset in chlog.days[day]:
            yield changeset

def latestAsHTML(n):
   """Return the changelogs from latest(n) rendered as HTML, one per line group."""
   rendered = [str(chlog) for chlog in latest(n)]
   return '\n'.join(rendered)

def genchanges(n=15):
   """Generator of (day, time, message).

   Only changesets that report themselves public are included; the day
   and time are the two halves of the changeset date, split at its
   first space.
   """
   for changeset in latestAsChangesets(n):
      if changeset.public():
         day, clock = changeset.date.split(' ', 1)
         yield day, clock, changeset.message

if __name__=="__main__":
   # Run as a script, just show the module docstring. The parenthesised
   # form prints the same text whether print is a statement or a function.
   print(__doc__)