#!/usr/bin/env python
"""
inventory.py - An Elegant Directory Lister
Author: Sean B. Palmer, inamidst.com

@@ dirlist.py? setOptions, updateOptions
"""

import itertools
import os
import re
import sys
import time
from functools import cmp_to_key
from optparse import OptionParser

try:
    import cgitb
except ImportError:
    # cgitb is not available on every interpreter; tracebacks then go
    # through the default excepthook.
    cgitb = None
else:
    cgitb.enable()

try:
    from robotparser import RobotFileParser
except ImportError:
    from urllib.robotparser import RobotFileParser

try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

try:
    from html import escape as _escape
except ImportError:
    from cgi import escape as _escape

try:
    _string_types = basestring
except NameError:
    _string_types = str

# Suffixes (or whole basenames, for the entries without a leading dot)
# that pathstrip() removes from site paths.
stripable = ['index.html', '.html', 'index.cgi', '.cgi', '.txt', '.jpg',
             '.png', '.gif', '.php', '.rss', '.sh', '.draft']

# File extensions skipped when the 'ignore' option is set.
ignore = frozenset(['.pyc', '.pyo', '.cgic'])  # @@ .htaccess

# Case-insensitive match of an HTML <title> element's text.
r_title = re.compile(r'(?i)<title>([^<]+?)</title>')

# Chunk size, in bytes, used by title() when scanning a document.
bufsize = 2048

# Options used by the "recent updates" listings.
_UPDATE_OPTIONS = 'datetime ignore mtime robots sitepath strip uniq'.split()


def formatdate(date, template=None):
    """Format the Unix timestamp *date* in UTC.

    *template* is a time.strftime() format; it defaults to
    '%Y-%m-%d %H:%M:%S'.
    """
    if template is None:
        template = '%Y-%m-%d %H:%M:%S'
    return time.strftime(template, time.gmtime(date))


def formatnumber(n):
    """Return the integer *n* as a string with comma thousands separators.

    A leading minus sign is kept outside the grouping, so -123 gives
    '-123' rather than '-,123'.
    """
    text = str(n)
    sign = ''
    if text.startswith('-'):
        sign, text = '-', text[1:]
    parts = list(text)
    for i in range(len(parts) - 3, 0, -3):
        parts.insert(i, ',')
    return sign + ''.join(parts)


def title(path):
    """Get the title of an HTML document.

    Returns the text of the first <title> element, or None when *path*
    does not end in '.html', cannot be opened, or has no title.
    """
    if not path.endswith('.html'):
        return None
    try:
        f = open(path, 'rb')
    except (IOError, OSError):
        return None

    raw = b''
    with f:
        while True:
            chunk = f.read(bufsize)
            if not chunk:
                break
            raw += chunk
            # Decode the whole buffer each time so that a multi-byte
            # character split across two chunks is not corrupted.
            if isinstance(raw, str):
                text = raw
            else:
                text = raw.decode('utf-8', 'replace')
            m = r_title.search(text)
            if m:
                return m.group(1)
    return None


def publicpath(path, rp=None):
    """Return whether robots.txt permits access to the site path *path*.

    *rp* is a RobotsTextParser. When omitted, one is built from
    $DOCUMENT_ROOT (default '.'); ValueError is raised if that directory
    has no robots.txt file.
    """
    if rp is None:
        docroot = os.environ.get('DOCUMENT_ROOT', '.')
        if not os.path.isfile(os.path.join(docroot, 'robots.txt')):
            raise ValueError("rp cannot be None")
        rp = RobotsTextParser(docroot)
    return rp.readable(path)


def pathstrip(path, exceptions=None, stripable=stripable):
    """Strip a superfluous suffix from *path*.

    Paths ending with any of *exceptions* (default ['robots.txt']) are
    returned untouched. Otherwise the first entry of *stripable* that
    matches is removed: entries starting with '.' match as suffixes, the
    others must equal the whole basename. A strip is rejected when the
    result would name an existing directory without a trailing slash.
    """
    if exceptions is None:
        exceptions = ['robots.txt']
    for exc in exceptions:
        if path.endswith(exc):
            return path

    basename = path[path.rfind('/') + 1:]
    for ext in stripable:
        if not ext:
            # path[:-0] would wipe the whole path.
            continue
        if (ext.startswith('.') and path.endswith(ext)) or basename == ext:
            newpath = path[:-len(ext)]
            # NOTE(review): isdir() is applied to the path as given, which
            # callers in this file pass as a site path -- confirm intent.
            collides = ((newpath != path) and os.path.isdir(newpath)
                        and (not newpath.endswith('/')))
            if not collides:
                return newpath
    return path


def ignorable(path, ignore=ignore):
    """Return True if the extension of *path* is in *ignore*."""
    ext = os.path.splitext(path)[1]
    return ext in ignore


class FileType(object):
    """A named marker for a kind of filesystem entry."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return 'FileType(%r)' % (self.name,)


File = FileType('File')
Directory = FileType('Directory')
Other = FileType('Other')


def filetype(path):
    """Classify *path* as File, Directory or Other."""
    if os.path.isfile(path):
        return File
    elif os.path.isdir(path):
        return Directory
    return Other


def datetime(path):
    """Return the modification time of *path* as 'YYYY-MM-DD HH:MM:SS' UTC."""
    return formatdate(os.path.getmtime(path))


def sort(property):
    """Return a postprocessor sorting FileInfo objects by *property*.

    The postprocessor raises ValueError when two entries being compared
    do not both carry the attribute.
    """
    def sorter(sitelist, property=property):
        def compare(p, q):
            if hasattr(p, property) and hasattr(q, property):
                a, b = getattr(p, property), getattr(q, property)
                return (a > b) - (a < b)
            msg = "Can't compare %s and %s: no %s"
            raise ValueError(msg % (p, q, property))
        return sorted(sitelist, key=cmp_to_key(compare))
    return sorter


def reverse(sitelist):
    """Postprocessor: yield the entries of *sitelist* in reverse order."""
    return reversed(list(sitelist))


def after(unixtime):
    """Return a postprocessor keeping entries with mtime >= *unixtime*."""
    def proc(sitelist, unixtime=unixtime):
        for fileinfo in sitelist:
            if fileinfo.mtime >= unixtime:
                yield fileinfo
    return proc


def before(unixtime):
    """Return a postprocessor keeping entries with mtime <= *unixtime*."""
    def proc(sitelist, unixtime=unixtime):
        for fileinfo in sitelist:
            if fileinfo.mtime <= unixtime:
                yield fileinfo
    return proc


def first(n):
    """Return a postprocessor keeping the slice [:n] of its input.

    Any iterable is accepted, not only sequences.
    """
    def proc(sitelist, n=n):
        if n is None or n >= 0:
            selected = itertools.islice(sitelist, n)
        else:
            # A negative bound needs the total length.
            selected = list(sitelist)[:n]
        for fileinfo in selected:
            yield fileinfo
    return proc


def last(n):
    """Return a postprocessor keeping the slice [-n:] of its input.

    Any iterable is accepted, not only sequences.
    """
    def proc(sitelist, n=n):
        for fileinfo in list(sitelist)[-n:]:
            yield fileinfo
    return proc


def _typeof(fileinfo):
    """Return the entry's recorded type, or look it up from its relpath."""
    if hasattr(fileinfo, 'type'):
        return fileinfo.type
    return filetype(fileinfo.relpath)


def files(sitelist):
    """Postprocessor: keep only regular files."""
    for fileinfo in sitelist:
        if _typeof(fileinfo) == File:
            yield fileinfo


def dirs(sitelist):
    """Postprocessor: keep only directories."""
    for fileinfo in sitelist:
        if _typeof(fileinfo) == Directory:
            yield fileinfo


class Options(set):
    """An options set, for passing to the Inventory class.

    Options(optlist) -> new options set from optlist
    Options.options contains the options' documentation
    """

    options = {
        'atime': 'get access time',
        'abspath': 'add the absolute path',
        'basename': 'get the basename from the relpath',
        'ctime': 'get change time',
        'datetime': 'add an iso-like datetime',
        'mtime': 'get modification time',
        'ignore': 'ignore global ignores list entries',
        'non-recursive': "don't visit child directories",
        'plain': "don't add a trailing slash to directories",
        'raw': "don't normalize file paths",
        'robots': 'obey robots.txt',
        'sitepath': 'absolute path of the file on-server',
        'size': 'file size in bytes',
        'strip': 'strip superfluous suffixes',
        'title': 'HTML title',
        'type': 'file type',
        'uniq': "don't repeat directories; prefer indexes",
        'uniq-directory': "don't repeat indexes; prefer directories"
    }


# Default options; just enough to deliver a sitemap
options = Options(['ignore', 'robots', 'sitepath', 'strip', 'uniq'])


class RobotsTextParser(object):
    """Answers robots.txt queries for a document root.

    When *docroot* has no robots.txt (or is False) every path is
    reported as readable.
    """

    def __init__(self, docroot=False):
        self.docroot = docroot
        self.parser = False
        if docroot is not False:
            self.makeparser()

    def makeparser(self):
        """Load <docroot>/robots.txt into self.parser, if the file exists."""
        robots = os.path.join(self.docroot, 'robots.txt')
        if not os.path.isfile(robots):
            return
        # Read the file directly instead of RobotFileParser.read(), which
        # goes through a URL opener and needs a URL, not a bare path.
        with open(robots, 'rb') as f:
            data = f.read()
        if not isinstance(data, str):
            data = data.decode('utf-8', 'replace')
        parser = RobotFileParser()
        parser.set_url(robots)
        parser.parse(data.splitlines())  # @@ reads broken files too
        self.parser = parser

    def readable(self, path, useragent=None):
        """Return whether *useragent* (default '*') may fetch *path*."""
        if useragent is None:
            useragent = '*'
        if not self.parser:
            return True
        return self.parser.can_fetch(useragent, path)


class FileInfo(object):
    """Plain attribute bag describing one filesystem entry."""
    pass


# Default postprocessors applied by Inventory.getlist().
postproc = []


class Inventory(object):
    """A filesystem inventory.

    import sys, os
    docroot = os.environ.get('DOCUMENT_ROOT', '.')
    code = os.path.join(docroot, 'dev/code')
    sys.path.append(code)
    import inventory
    """

    # @@ exclusions etc.

    def __init__(self, path, docroot=False):
        """Inventory the directory *path*, served from *docroot*.

        *docroot* defaults to *path* itself. '~' is expanded in both.
        """
        self.path = os.path.expanduser(path)
        if docroot is not False:
            self.docroot = os.path.expanduser(docroot)
        else:
            self.docroot = self.path
        # Copy the module-level defaults so that changing one inventory's
        # settings cannot leak into the next one.
        self.options = Options(options) if options else Options()
        self.postproc = list(postproc) if postproc else []
        self.rp = RobotsTextParser(self.docroot)

    def walk(self, recursive=True):
        """Returns an unsorted list of self.path, perhaps recursively."""
        if recursive:
            for root, _dirnames, filenames in os.walk(self.path):
                for name in filenames:
                    yield os.path.join(root, name)
                yield root
        else:
            for name in os.listdir(self.path):
                yield os.path.join(self.path, name)

    def sitepath(self, filename, plain=False):
        """Return the path of *filename* relative to the document root.

        Directories get a trailing slash unless *plain* is true. Raises
        ValueError when *filename* is not inside the document root.
        """
        # os.path.abspath automatically normalises
        abspath = os.path.abspath(filename)
        docroot = os.path.abspath(self.docroot)
        # Compare on a path-component boundary: '/var/www2' must not be
        # accepted as living under '/var/www'.
        base = docroot.rstrip(os.sep)
        if abspath != docroot and not abspath.startswith(base + os.sep):
            raise ValueError("%s doesn't start with %s" % (abspath, docroot))

        # @@ AaronSw's strip function
        path = abspath[len(base):]
        if (not plain) and (filetype(filename) == Directory):
            if not path.endswith('/'):
                path += '/'
        return path

    def public(self, filename):
        """Filter: True if robots.txt permits access to *filename*."""
        path = self.sitepath(filename)
        return publicpath(path, rp=self.rp)

    def notable(self, filename, ignore=ignore):
        """Filter: True unless the extension of *filename* is ignored."""
        return not ignorable(filename, ignore=ignore)

    def strip(self, fileinfo, stripable=stripable, exceptions=None):
        """Modifier: set fileinfo.sitepath to its suffix-stripped form."""
        # @@ this method needs cleaning up a bit
        if hasattr(fileinfo, 'sitepath'):
            path = fileinfo.sitepath
        else:
            path = self.sitepath(fileinfo.relpath)
        fileinfo.sitepath = pathstrip(path, exceptions, stripable)
        return fileinfo

    def uniq(self, prefer):
        """Return a postprocessor keeping one entry per site path.

        When several entries share a site path, a later entry replaces
        the stored one only if its file type is *prefer*.
        """
        def proc(sitelist, prefer=prefer):
            seen = {}
            for fileinfo in sitelist:
                if hasattr(fileinfo, 'sitepath'):
                    path = fileinfo.sitepath
                else:
                    path = self.sitepath(fileinfo.relpath)
                if path not in seen:
                    seen[path] = fileinfo
                elif filetype(fileinfo.relpath) == prefer:
                    seen[path] = fileinfo
            return iter(seen.values())
        return proc

    def fileinfo(self, filename):
        """Build the FileInfo for *filename* according to self.options."""
        # 'robots': 'obey robots.txt',
        f = FileInfo()
        f.relpath = filename
        plain = 'plain' in self.options

        optmap = {
            'abspath': os.path.abspath,
            'atime': os.path.getatime,
            'basename': os.path.basename,
            'ctime': os.path.getctime,
            'datetime': datetime,
            'mtime': os.path.getmtime,
            # Honour 'plain' here as well, otherwise directories always
            # get a trailing slash on their site path.
            'sitepath': lambda p: self.sitepath(p, plain=plain),
            'size': os.path.getsize,
            'title': title,
            'type': filetype
        }
        for opt in self.options:
            getter = optmap.get(opt)
            if getter is not None:
                setattr(f, opt, getter(f.relpath))

        if 'raw' not in self.options:
            # both abspath and sitepath are normalised by necessity
            f.relpath = os.path.normpath(f.relpath)

        # Note: normalization removes the trailing slash
        if (not plain) and (filetype(f.relpath) == Directory):
            for attr in ('relpath', 'abspath', 'sitepath'):
                if hasattr(f, attr):
                    path = getattr(f, attr)
                    if not path.endswith('/'):
                        setattr(f, attr, path + '/')
        return f

    def getlist(self):
        """Returns FileInfo instances, per options."""
        if not os.path.isdir(self.path):
            raise ValueError("Not a directory: %s" % self.path)

        filters = []
        if 'robots' in self.options:
            filters.append(self.public)
        if 'ignore' in self.options:
            filters.append(self.notable)

        modifiers = []
        if 'strip' in self.options:
            modifiers.append(self.strip)

        def lister():
            recursive = ('non-recursive' not in self.options)
            for filename in self.walk(recursive=recursive):
                if not all(pred(filename) for pred in filters):
                    continue
                info = self.fileinfo(filename)
                for modifier in modifiers:
                    info = modifier(info)
                yield info

        # Work on a copy: inserting into self.postproc would stack another
        # uniq step on every call and modify the caller's list.
        procs = list(self.postproc)
        if 'uniq' in self.options:
            procs.insert(0, self.uniq(File))
        elif 'uniq-directory' in self.options:
            procs.insert(0, self.uniq(Directory))

        entries = lister()
        for proc in procs:
            entries = proc(entries)
        for info in entries:
            yield info


def inventory(path, docroot=False, options=False, postproc=False):
    """Return an iterator of FileInfo objects for the directory *path*.

    *options* and *postproc*, when true, replace the Inventory defaults.
    """
    inv = Inventory(path, docroot=docroot)
    if options:
        inv.options = options
    if postproc:
        inv.postproc = postproc
    return inv.getlist()


def sitelist(env=None):
    """Yield the sorted site paths under DOCUMENT_ROOT + PATH_INFO.

    Both values are read from *env* (default os.environ).
    """
    env = env or os.environ
    docroot = env.get('DOCUMENT_ROOT', '.')
    pathinfo = env.get('PATH_INFO', '/')
    kargs = {'options': Options(['sitepath']),
             'postproc': [sort('sitepath')]}
    for f in inventory(docroot + pathinfo, **kargs):
        yield f.sitepath


def _recentfiles(path, since, maxnum):
    """Iterate files under *path*, most recently modified first.

    *since*, when true, is a maximum age in seconds; *maxnum*, when true,
    caps the number of entries (the most recent ones are kept).
    """
    procs = [files]  # no directories
    if since:
        procs.append(after(time.time() - since))
    procs.append(sort('mtime'))  # sort by modification time
    if maxnum:
        procs.append(last(maxnum))
    procs.append(reverse)  # reverse chronological
    return inventory(path, options=Options(_UPDATE_OPTIONS), postproc=procs)


def siteupdates(path, **kargs):
    """Write to a file <out> modifications under <path>.

    List is performed with recursion, and mtime. Sorting is done by mtime.

    Keyword arguments: since (maximum age in seconds), maxnum (maximum
    number of entries) and out (a writable file object, default
    sys.stdout; a string is wrapped in a StringIO). Returns the file
    object written to.
    """
    since = kargs.get('since')
    maxnum = kargs.get('maxnum')
    out = kargs.get('out', sys.stdout)
    if isinstance(out, _string_types):
        out = StringIO(out)

    # NOTE(review): the markup below is reconstructed -- the heading is
    # formatted with (day, day) so it needs two placeholders, and the
    # surrounding literals were empty. Confirm against the intended HTML.
    day = None
    for f in _recentfiles(path, since, maxnum):
        if f.datetime[:10] != day:
            if day is not None:
                out.write('</ul>\n\n')
            day = f.datetime[:10]
            out.write('<h2 id="%s">%s</h2>\n' % (day, day))
            out.write('<ul>\n')
        # File names are not trusted to be HTML-safe.
        href = _escape(f.sitepath, True)
        out.write('<li>%s: <a href="%s">%s</a></li>\n'
                  % (f.datetime[-8:], href, href))
    if day is not None:
        out.write('</ul>\n')
    return out


def updates(path=None, since=(3600 * 24 * 50), maxnum=250):
    """Return the siteupdates() listing for *path* as a string.

    *path* defaults to $DOCUMENT_ROOT, or '.' when that is unset.
    """
    if path is None:
        path = os.environ.get('DOCUMENT_ROOT', '.')
    f = siteupdates(path, since=since, maxnum=maxnum, out='')
    f.seek(0)
    return f.read()


def genupdates(path=None, since=(3600 * 24 * 50), maxnum=250):
    """Generator of (day, time, sitepath)."""
    if path is None:
        path = os.environ.get('DOCUMENT_ROOT', '.')
    for f in _recentfiles(path, since, maxnum):
        yield (f.datetime[:10], f.datetime[-8:], f.sitepath)


def run(argv=None):
    """Command-line entry point: write the updates for <path> to stdout."""
    parser = OptionParser(usage='%prog [options] <path>')
    parser.add_option("-t", "--time", dest="time", type="int", default=None,
                      help="cut off last modification date for inclusion")
    parser.add_option("-n", "--number", dest="number", type="int",
                      default=None,
                      help="maximum number of results to be output")
    opts, args = parser.parse_args(argv)

    if len(args) != 1:
        parser.error("Incorrect number of arguments.")

    since = opts.time
    # An explicit 0 means "no limit"; only a missing option gets 250.
    maxnum = 250 if opts.number is None else opts.number
    siteupdates(args[0], since=since, maxnum=maxnum)


def runcgi(env):
    """CGI entry point: print the updates for the document root in *env*."""
    sys.stdout.write("Content-Type: text/html; charset=utf-8\n")
    sys.stdout.write("\n")
    sys.stdout.write(updates(env.get('DOCUMENT_ROOT', '.')) + "\n")


def main(env=None):
    """Run as CGI when SCRIPT_NAME is set in *env*, else as a CLI tool."""
    if env is None:
        env = os.environ
    if 'SCRIPT_NAME' in env:
        runcgi(env)
    else:
        run()


if __name__ == "__main__":
    main()