([^<]+?)

#!/usr/bin/env python
"""
inventory.py - An Elegant Directory Lister
Author: Sean B. Palmer, inamidst.com
@@ dirlist.py?
setOptions, updateOptions
"""

import cgitb; cgitb.enable()
import sys, os, re, time, itertools
from robotparser import RobotFileParser
from optparse import OptionParser

# Suffixes that pathstrip() removes from site paths. Entries starting with
# '.' match any path ending in them; the others must equal the whole final
# path segment. Order matters: pathstrip() stops at the first entry it
# strips, so 'index.html' is tried before '.html'.
stripable = ['index.html', '.html', 'index.cgi', '.cgi', '.txt', 
             '.jpg', '.png', '.gif', '.php', '.rss', '.sh', '.draft']
# File extensions (as returned by os.path.splitext) that ignorable() rejects.
ignore = frozenset(['.pyc', '.pyo', '.cgic']) # @@ .htaccess

# Case-insensitive match for the text of an HTML <title> element; used by
# title().
r_title = re.compile(r'(?i)<title>([^<]+?)</title>')
# Size of each read() that title() performs while looking for the <title>.
bufsize = 2048

def formatdate(date, template=None):
   """Format a Unix timestamp as a UTC (gmtime) string.

   date: seconds since the epoch.
   template: a time.strftime format; None selects 'YYYY-MM-DD HH:MM:SS'.
   """
   fmt = template
   if fmt is None:
      fmt = '%Y-%m-%d %H:%M:%S'
   return time.strftime(fmt, time.gmtime(date))

def formatnumber(n): 
   """Return str(n) with commas separating the integer digits in threes.

   Only the leading run of digits is grouped. A leading sign is kept in
   front of the result and anything after the digits (a fractional part,
   an exponent) is appended unchanged, so formatnumber(-1234567) gives
   '-1,234,567' and formatnumber(1234.5) gives '1,234.5'.
   """
   text = str(n)

   # Peel off the sign so it is never counted as a digit position.
   sign = ''
   if text[:1] in ('-', '+'): 
      sign, text = text[0], text[1:]

   # Find where the integer digits end.
   end = 0
   while end < len(text) and text[end].isdigit(): 
      end += 1
   digits, rest = list(text[:end]), text[end:]

   # Insert from the right so earlier insertions don't shift later ones.
   for i in range(len(digits) - 3, 0, -3): 
      digits.insert(i, ',')
   return sign + ''.join(digits) + rest

def title(path): 
   """Get the title of an HTML document.

   Returns the text of the first <title> element found (per r_title), or
   None when the path does not end in '.html', the file cannot be opened,
   or no title is present. The file is read bufsize bytes at a time and
   reading stops as soon as a title has been seen.
   """
   if not path.endswith('.html'): 
      return None

   # On Python 2 open() raises IOError, which is not an OSError there;
   # catch both so an unreadable file yields None on any version.
   try: f = open(path)
   except (IOError, OSError): 
      return None

   # try/finally guarantees the handle is closed on every exit path,
   # including an error raised by read().
   try: 
      content = ''
      while True: 
         chunk = f.read(bufsize)
         if not chunk: 
            return None
         content += chunk
         # Search the accumulated text: a title may straddle two reads.
         m = r_title.search(content)
         if m: 
            return m.group(1)
   finally: 
      f.close()

def publicpath(path, rp=None): 
   """Return rp.readable(path), building a parser when none is given.

   With rp=None a RobotsTextParser is created for $DOCUMENT_ROOT
   (default '.'), but only if that directory contains a robots.txt;
   otherwise ValueError is raised.
   """
   if rp is not None: 
      return rp.readable(path)

   docroot = os.environ.get('DOCUMENT_ROOT', '.')
   robots = os.path.join(docroot, 'robots.txt')
   if not os.path.isfile(robots): 
      raise ValueError("rp cannot be None")
   return RobotsTextParser(docroot).readable(path)

def pathstrip(path, exceptions=None): 
   """Strip the first applicable suffix from the stripable list off path.

   exceptions: suffixes that protect a path from stripping; a path ending
      in any of them is returned unchanged. Defaults to ['robots.txt'].

   An entry of stripable applies when it starts with '.' and path ends
   with it, or when it equals the whole final segment of path. A strip is
   skipped (and the next entry tried) when the shortened path names an
   existing directory and does not end in '/'. At most one suffix is
   removed.
   """
   if exceptions is None: 
      exceptions = ['robots.txt']
   for protected in exceptions: 
      if path.endswith(protected): 
         return path

   # rfind returns -1 when there is no slash, so this is 0 in that case.
   start = path.rfind('/') + 1
   basename = path[start:]

   for ext in stripable: 
      applies = (ext.startswith('.') and path.endswith(ext)) or (basename == ext)
      if not applies: 
         continue
      candidate = path[:-len(ext)]
      if ((candidate != path) and 
          os.path.isdir(candidate) and 
          (not candidate.endswith('/'))): 
         continue
      return candidate
   return path

def ignorable(path, ignore=ignore): 
   """Return True when path's extension (per os.path.splitext) is in ignore."""
   extension = os.path.splitext(path)[1]
   return extension in ignore

class FileType(object): 
   """A named marker for one kind of filesystem entry.

   No equality method is defined, so two markers compare equal only when
   they are the same object.
   """
   def __init__(self, name): 
      self.name = name

   def __repr__(self): 
      # Readable in tracebacks and debug output instead of an address.
      return '%s(%r)' % (self.__class__.__name__, self.name)

# The three markers that filetype() returns.
File = FileType('File')
Directory = FileType('Directory')
Other = FileType('Other')

def filetype(path): 
   """Classify path as File, Directory, or Other (checked in that order)."""
   checks = ((os.path.isfile, File), (os.path.isdir, Directory))
   for test, kind in checks: 
      if test(path): 
         return kind
   return Other

def datetime(path): 
   """Return path's modification time as a UTC 'YYYY-MM-DD HH:MM:SS' string."""
   modified = os.path.getmtime(path)
   stamp = time.gmtime(modified)
   return time.strftime('%Y-%m-%d %H:%M:%S', stamp)

def sort(property): 
   """Make a post-processor that sorts a sitelist by the named attribute.

   The returned sorter(sitelist) gives a new list in ascending order of
   getattr(item, property). The sort is stable. ValueError is raised if
   any item lacks the attribute.

   A key function is used rather than a cmp function: sorted(key=...)
   has existed as long as sorted() itself, fetches the attribute once per
   item, and does not depend on the cmp builtin.
   """
   def sorter(sitelist, property=property): 
      def key(fileinfo, property=property): 
         if not hasattr(fileinfo, property): 
            msg = "Can't sort %s: no %s"
            raise ValueError(msg % (fileinfo, property))
         return getattr(fileinfo, property)
      return sorted(sitelist, key=key)
   return sorter

def reverse(sitelist): 
   """Post-processor: return an iterator over sitelist in reverse order.

   The input is copied into a list first, so any iterable is accepted.
   """
   snapshot = list(sitelist)
   return reversed(snapshot)

def after(unixtime): 
   """Make a post-processor keeping items with mtime >= unixtime.

   The returned proc(sitelist) is a generator; items need an mtime
   attribute.
   """
   def proc(sitelist, unixtime=unixtime): 
      for entry in sitelist: 
         if not (entry.mtime >= unixtime): 
            continue
         yield entry
   return proc

def before(unixtime): 
   """Make a post-processor keeping items with mtime <= unixtime.

   The returned proc(sitelist) is a generator; items need an mtime
   attribute.
   """
   def proc(sitelist, unixtime=unixtime): 
      for entry in sitelist: 
         if not (entry.mtime <= unixtime): 
            continue
         yield entry
   return proc

def first(n): 
   """Make a post-processor yielding the items of sitelist[:n].

   The returned proc(sitelist) is a generator. Sequences are sliced
   directly; other iterables (such as the generators produced by other
   post-processors) are copied into a list first, since they cannot be
   sliced.
   """
   def proc(sitelist, n=n): 
      if not hasattr(sitelist, '__getitem__'): 
         sitelist = list(sitelist)
      for fileinfo in sitelist[:n]: 
         yield fileinfo
   return proc

def last(n): 
   """Make a post-processor yielding the items of sitelist[-n:].

   The returned proc(sitelist) is a generator. Sequences are sliced
   directly; other iterables (such as the generators produced by other
   post-processors) are copied into a list first, since they cannot be
   sliced.

   NOTE: slice semantics are kept as-is, so n == 0 yields every item
   (seq[-0:] is the whole sequence).
   """
   def proc(sitelist, n=n): 
      if not hasattr(sitelist, '__getitem__'): 
         sitelist = list(sitelist)
      for fileinfo in sitelist[-n:]: 
         yield fileinfo
   return proc

def files(sitelist): 
   """Post-processor: yield only the entries whose type is File.

   An entry's own 'type' attribute is used when present; otherwise the
   type is looked up from its relpath with filetype().
   """
   for entry in sitelist: 
      if not hasattr(entry, 'type'): 
         kind = filetype(entry.relpath)
      else: 
         kind = entry.type
      if not (kind == File): 
         continue
      yield entry

def dirs(sitelist): 
   """Post-processor: yield only the entries whose type is Directory.

   An entry's own 'type' attribute is used when present; otherwise the
   type is looked up from its relpath with filetype().
   """
   for entry in sitelist: 
      if not hasattr(entry, 'type'): 
         kind = filetype(entry.relpath)
      else: 
         kind = entry.type
      if not (kind == Directory): 
         continue
      yield entry

class Options(set): 
   """An options set, for passing to the Inventory class.

      Options(optlist) -> new options set from optlist
      Options.options contains the options' documentation

      This is a plain set subclass: membership is all that Inventory
      tests, and nothing here validates that a name added to the set
      appears in Options.options.
   """
   # Option name -> one-line description. Documentation only; the
   # behaviour for each name lives in Inventory.fileinfo and
   # Inventory.getlist.
   options = {
      'atime': 'get access time', 
      'abspath': 'add the absolute path', 
      'basename': 'get the basename from the relpath', 
      'ctime': 'get change time', 
      'datetime': 'add an iso-like datetime', 
      'mtime': 'get modification time', 
      'ignore': 'ignore global ignores list entries', 
      'non-recursive': "don't visit child directories", 
      'plain': "don't add a trailing slash to directories", 
      'raw': "don't normalize file paths", 
      'robots': 'obey robots.txt', 
      'sitepath': 'absolute path of the file on-server', 
      'size': 'file size in bytes', 
      'strip': 'strip superfluous suffixes', 
      'title': 'HTML title', 
      'type': 'file type', 
      'uniq': "don't repeat directories; prefer indexes", 
      'uniq-directory': "don't repeat indexes; prefer directories"
   }

# Default options; just enough to deliver a sitemap.
# Inventory.__init__ picks these up when it is constructed.
options = Options(['ignore', 'robots', 'sitepath', 'strip', 'uniq'])

class RobotsTextParser(object): 
   """Decides whether a site path may be listed, per <docroot>/robots.txt.

   RobotsTextParser(docroot) loads docroot/robots.txt if it exists.
   Without a docroot, or without a robots.txt file, every path is
   treated as readable.
   """
   def __init__(self, docroot=False): 
      self.docroot = docroot
      # False until makeparser() has loaded a robots.txt.
      self.parser = False
      if docroot is not False: 
         self.makeparser()

   def makeparser(self): 
      """Load self.docroot/robots.txt into self.parser, if the file exists."""
      robots = os.path.join(self.docroot, 'robots.txt')
      if not os.path.isfile(robots): 
         return

      # robots.txt is a local file, so read it directly and hand the
      # lines to parse() rather than pushing a filesystem path through
      # the URL-fetching read().
      f = open(robots)
      try: lines = f.read().splitlines()
      finally: f.close()

      parser = RobotFileParser()
      parser.set_url(robots)
      parser.parse(lines) # @@ reads broken files too
      self.parser = parser

   def readable(self, path, useragent=None): 
      """Return whether useragent (default '*') may fetch path.

      Always True when no robots.txt was loaded.
      """
      if useragent is None: 
         useragent = '*'
      if not self.parser: 
         return True
      return self.parser.can_fetch(useragent, path)

class FileInfo(object): 
   """A bare record for one filesystem entry.

   Inventory.fileinfo() always sets 'relpath' on it and then adds one
   attribute per enabled option (for example 'mtime' or 'sitepath').
   """
   pass

# Module-level default list of post-processors; Inventory.__init__ falls
# back to a fresh empty list while this one is empty.
postproc = []

class Inventory(object): 
   """A filesystem inventory.

      Walks a directory and yields FileInfo records, filtered and
      decorated according to self.options and then passed through the
      callables in self.postproc.

      import sys, os
      docroot = os.environ.get('DOCUMENT_ROOT', '.')
      code = os.path.join(docroot, 'dev/code')
      sys.path.append(code)
      import inventory
   """

   # @@ exclusions etc.

   def __init__(self, path, docroot=False): 
      """path: directory to list ('~' is expanded).
         docroot: directory that site paths are relative to; defaults to
            the expanded path.
      """
      self.path = os.path.expanduser(path)
      if docroot is not False: 
         self.docroot = os.path.expanduser(docroot)
      else: 
         # Use the expanded form so a '~' path also works as the docroot.
         self.docroot = self.path
      # Copy the module-level defaults so that changing one instance's
      # options or postproc never alters the shared objects.
      self.options = Options(options or ())
      self.postproc = list(postproc or [])
      self.rp = RobotsTextParser(self.docroot)

   def walk(self, recursive=True): 
      """Yield the paths under self.path, unsorted.

      When recursive, every file below self.path is yielded, and each
      directory (self.path included) is yielded after its own files.
      Otherwise only the direct children of self.path are yielded.
      """
      if recursive: 
         for root, dirnames, filenames in os.walk(self.path): 
            for name in filenames: 
               yield os.path.join(root, name)
            yield root
      else: 
         for name in os.listdir(self.path): 
            yield os.path.join(self.path, name)

   def sitepath(self, filename, plain=False): 
      """Return filename's path relative to the docroot.

      Directories get a trailing '/' unless plain is true. Raises
      ValueError when filename does not lie inside the docroot.
      """
      # os.path.abspath automatically normalises
      abspath = os.path.abspath(filename)
      docroot = os.path.abspath(self.docroot)

      # Require a separator after the docroot so that a sibling such as
      # /var/www2 is not accepted for docroot /var/www.
      inside = (abspath == docroot) or \
               abspath.startswith(docroot.rstrip(os.sep) + os.sep)
      if not inside: 
         raise ValueError("%s doesn't start with %s" % (abspath, docroot))

      # @@ AaronSw's strip function
      path = abspath[len(docroot):]
      if (not plain) and (filetype(filename) == Directory): 
         # @@ if not path.endswith('/'): 
         path += '/'
      return path

   def public(self, filename): 
      """Return whether robots.txt allows filename's site path."""
      path = self.sitepath(filename)
      return publicpath(path, rp=self.rp)

   def notable(self, filename, ignore=ignore): 
      """Return True unless filename's extension is in ignore."""
      return not ignorable(filename, ignore=ignore)

   def strip(self, fileinfo, stripable=stripable, exceptions=None): 
      """Set fileinfo.sitepath to its stripped form and return fileinfo.

      exceptions is passed on to pathstrip(). stripable is kept in the
      signature for compatibility but is not consulted here: pathstrip()
      reads the module-level list itself.
      """
      # @@ this method needs cleaning up a bit
      if hasattr(fileinfo, 'sitepath'): 
         path = fileinfo.sitepath
      else: 
         path = self.sitepath(fileinfo.relpath)
      fileinfo.sitepath = pathstrip(path, exceptions=exceptions)
      return fileinfo

   def uniq(self, prefer): 
      """Make a post-processor that keeps one entry per site path.

      When several entries share a site path, an entry whose filetype is
      prefer replaces the one already kept. The result is an iterator
      over the kept entries.
      """
      def proc(sitelist, prefer=prefer): 
         seen = {}
         for fileinfo in sitelist: 
            if hasattr(fileinfo, 'sitepath'): 
               path = fileinfo.sitepath
            else: 
               path = self.sitepath(fileinfo.relpath)

            # A first sighting is always kept; a repeat only replaces
            # the kept entry when it has the preferred type.
            if (path not in seen) or (filetype(fileinfo.relpath) == prefer): 
               seen[path] = fileinfo
         return iter(seen.values())
      return proc

   def fileinfo(self, filename): 
      """Build the FileInfo for filename, per self.options.

      Every option that names a per-file property adds an attribute of
      the same name. relpath is normalised unless 'raw' is set, and
      directory paths get a trailing '/' unless 'plain' is set.
      """
      # 'robots': 'obey robots.txt', 
      f = FileInfo()
      f.relpath = filename

      # @@ don't force continual iteration over this
      optmap = {'abspath': os.path.abspath, 
                'atime': os.path.getatime, 
                'basename': os.path.basename, 
                'ctime': os.path.getctime, 
                'datetime': datetime, 
                'mtime': os.path.getmtime, 
                'sitepath': self.sitepath, 
                'size': os.path.getsize, 
                'title': title, 
                'type': filetype}
      for opt in optmap: 
         if opt in self.options: 
            setattr(f, opt, optmap[opt](f.relpath))

      if 'raw' not in self.options: 
         # both abspath and sitepath are normalised by necessity
         f.relpath = os.path.normpath(f.relpath)

      # Note: normalization removes the trailing slash
      if ('plain' not in self.options) and (filetype(f.relpath) == Directory): 
         for attr in ('relpath', 'abspath', 'sitepath'): 
            if hasattr(f, attr): 
               path = getattr(f, attr)
               if not path.endswith('/'): 
                  setattr(f, attr, path + '/')
      return f

   def getlist(self): 
      """Yield FileInfo instances, per options.

      This is a generator, so the ValueError for a self.path that is not
      a directory is raised when iteration starts, not at call time.
      """
      if not os.path.isdir(self.path): 
         raise ValueError("Not a directory: %s" % self.path)

      filters = []
      if 'robots' in self.options: 
         filters.append(self.public)

      if 'ignore' in self.options: 
         filters.append(self.notable)

      def include(filename, filters=filters): 
         for pred in filters: 
            if not pred(filename): 
               return False
         return True

      modifiers = []
      if 'strip' in self.options: 
         modifiers.append(self.strip)

      def lister(): 
         recursive = ('non-recursive' not in self.options)
         for filename in self.walk(recursive=recursive): 
            if not include(filename): 
               continue
            fileinfo = self.fileinfo(filename)
            for modifier in modifiers: 
               fileinfo = modifier(fileinfo)
            yield fileinfo
      sitelist = lister()

      # Work on a copy: inserting the uniq stage into self.postproc
      # would alter a list the caller may own, and would stack another
      # uniq stage on every call.
      pipeline = list(self.postproc)
      if 'uniq' in self.options: 
         pipeline.insert(0, self.uniq(File))
      elif 'uniq-directory' in self.options: 
         pipeline.insert(0, self.uniq(Directory))
      for proc in pipeline: 
         sitelist = proc(sitelist)

      for fileinfo in sitelist: 
         yield fileinfo

def inventory(path, docroot=False, options=False, postproc=False): 
   """Build an Inventory for path and return its getlist() generator.

   options and postproc, when truthy, replace the Inventory's defaults.
   """
   inv = Inventory(path, docroot=docroot)
   overrides = (('options', options), ('postproc', postproc))
   for attr, value in overrides: 
      if value: 
         setattr(inv, attr, value)
   return inv.getlist()

def sitelist(env=None): 
   """Yield the site paths under DOCUMENT_ROOT + PATH_INFO, sorted.

   env: a mapping read for DOCUMENT_ROOT (default '.') and PATH_INFO
      (default '/'); os.environ is used when env is empty or None.
   """
   if not env: 
      env = os.environ
   docroot = env.get('DOCUMENT_ROOT', '.')
   pathinfo = env.get('PATH_INFO', '/')

   listing = inventory(docroot + pathinfo, 
                       options=Options(['sitepath']), 
                       postproc=[sort('sitepath')])
   for entry in listing: 
      yield entry.sitepath

def siteupdates(path, **kargs): 
   """Write an HTML list of recent modifications under <path> to <out>.

      List is performed with recursion, and mtime.
      Sorting is done by mtime, newest first, grouped under one
      <h2>/<ul> per day.

      Keyword arguments:
      since: only include files modified within this many seconds.
      maxnum: keep at most this many of the newest files.
      out: a file-like object (default sys.stdout). A string is not
         treated as a filename: it seeds an in-memory StringIO buffer,
         which is what gets written to and returned.

      Returns the object that was written to.
   """
   from xml.sax.saxutils import escape

   since = kargs.get('since')
   maxnum = kargs.get('maxnum')
   out = kargs.get('out', sys.stdout)
   if isinstance(out, basestring): 
      from StringIO import StringIO
      out = StringIO(out)

   day = None
   opt = 'datetime ignore mtime robots sitepath strip uniq'.split()

   postproc = [files]
   if since: postproc.append(after(time.time() - since))
   postproc.append(sort('mtime'))
   if maxnum: postproc.append(last(maxnum))
   postproc.append(reverse)

   line = '<li><a href="%s">%s</a> (%s)</li>'
   for f in inventory(path, options=Options(opt), postproc=postproc): 
      if f.datetime[:10] != day: 
         if day is not None: 
            print >> out, '</ul>'
            print >> out
         day = f.datetime[:10]
         print >> out, '<h2 id="T%s">%s</h2>' % (day, day)
         print >> out, '<ul>'   
      # File names come from the filesystem and may contain &, <, > or
      # quotes; escape them so they cannot break the markup.
      safe = escape(f.sitepath, {'"': '&quot;'})
      print >> out, line % (safe, safe, f.datetime[-8:])
   # Only close a list that was opened; with no results nothing is written.
   if day is not None: 
      print >> out, '</ul>'
   return out

def updates(path=None, since=(3600 * 24 * 50), maxnum=250): 
   """Return the siteupdates() HTML for path as a string.

   path defaults to $DOCUMENT_ROOT, or '.' when that is unset.
   """
   if path is None: 
      path = os.environ.get('DOCUMENT_ROOT', '.')
   # out='' makes siteupdates() write into an in-memory buffer.
   buf = siteupdates(path, since=since, maxnum=maxnum, out='')
   buf.seek(0)
   html = buf.read()
   return html

def genupdates(path=None, since=(3600 * 24 * 50), maxnum=250): 
   """Generator of (day, time, sitepath).

   Covers files only, newest first. since (seconds) and maxnum limit the
   results when truthy. path defaults to $DOCUMENT_ROOT, or '.'.
   """
   if path is None: 
      path = os.environ.get('DOCUMENT_ROOT', '.')

   names = 'datetime ignore mtime robots sitepath strip uniq'
   opts = Options(names.split(' '))

   pipeline = [files] # no directories
   if since: 
      pipeline.append(after(time.time() - since))
   pipeline.append(sort('mtime')) # sort by modification time
   if maxnum: 
      pipeline.append(last(maxnum))
   pipeline.append(reverse) # reverse chronological

   for f in inventory(path, options=opts, postproc=pipeline): 
      stamp = f.datetime
      yield (stamp[:10], stamp[-8:], f.sitepath)

def run(argv=None): 
   """Command-line entry: print the updates listing for one <path>.

   -t/--time is passed to siteupdates() as 'since' (seconds before now);
   -n/--number as 'maxnum' (default 250). A wrong argument count or a
   non-integer option value ends with a usage error via parser.error().
   """
   parser = OptionParser(usage='%prog [options] <path>')
   parser.add_option("-t", "--time", dest="time", default=False, 
                     help="cut off last modification date for inclusion")
   parser.add_option("-n", "--number", dest="number", default=False, 
                     help="maximum number of results to be output")
   options, args = parser.parse_args(argv)

   if len(args) != 1: 
      parser.error("Incorrect number of arguments.")

   def integer(value, flag, default): 
      # An unset (or empty) option falls back to the default; a bad
      # value becomes a usage error rather than a ValueError traceback.
      if not value: 
         return default
      try: return int(value)
      except ValueError: 
         parser.error("%s expects an integer, got %r" % (flag, value))

   t = integer(options.time, '--time', None)
   n = integer(options.number, '--number', 250)
   siteupdates(args[0], since=t, maxnum=n)

def runcgi(env): 
   """CGI entry: write the HTTP header, a blank line, then updates().

   env is accepted for symmetry with main() but is not read here.
   """
   emit = sys.stdout.write
   emit("Content-Type: text/html; charset=utf-8" + "\n")
   emit("\n")
   emit(str(updates()) + "\n")

def main(env=None): 
   """Run as CGI when SCRIPT_NAME is in env, else as a command-line tool.

   env: a mapping of environment variables; defaults to os.environ.
   """
   if env is None: 
      env = os.environ
   # 'in' works for any mapping, including ones without has_key().
   if 'SCRIPT_NAME' in env: 
      runcgi(env)
   else: 
      run()

# Run main() only when executed as a script, not when imported.
if __name__=="__main__": 
   main()