#!/usr/bin/env python
"""
inventory.py - An Elegant Directory Lister
Author: Sean B. Palmer, inamidst.com
@@ dirlist.py?
setOptions, updateOptions
"""
import cgitb; cgitb.enable()
import sys, os, re, time, itertools
from robotparser import RobotFileParser
from optparse import OptionParser
# Suffixes (or whole index filenames) that pathstrip() removes from paths.
stripable = ['index.html', '.html', 'index.cgi', '.cgi', '.txt',
             '.jpg', '.png', '.gif', '.php', '.rss', '.sh', '.draft']

# File extensions that ignorable() reports as ignorable by default.
ignore = frozenset(['.pyc', '.pyo', '.cgic']) # @@ .htaccess

# Case-insensitive match for the text of an HTML <title> element.
# NOTE(review): the literal had been split across two lines with its markup
# missing; the <title>...</title> delimiters are reconstructed from the way
# title() uses group(1) of this pattern -- confirm against the original.
r_title = re.compile(r'(?i)<title>([^<]+?)</title>')

# Number of characters title() reads from a file per chunk.
bufsize = 2048
def formatdate(date, template=None):
    """Format a Unix timestamp as a UTC date string.

    date is seconds since the epoch; template is a time.strftime format
    and defaults to '%Y-%m-%d %H:%M:%S'.
    """
    fmt = '%Y-%m-%d %H:%M:%S' if template is None else template
    return time.strftime(fmt, time.gmtime(date))
def formatnumber(n):
    """Return str(n) with commas between groups of three digits.

    Only the digits before any decimal point are grouped. A leading sign
    is kept in front of the first group, so -123 gives '-123' rather
    than '-,123'.
    """
    text = str(n)
    sign = ''
    if text[:1] in ('-', '+'):
        sign, text = text[0], text[1:]
    # Leave any fractional part untouched.
    whole, point, fraction = text.partition('.')
    digits = list(whole)
    # Walk from the right-hand end so earlier insertions don't shift
    # the positions still to be visited.
    for i in range(len(digits) - 3, 0, -3):
        digits.insert(i, ',')
    return sign + ''.join(digits) + point + fraction
def title(path):
    """Get the title of an HTML document.

    Returns the first r_title match found in the file, or None when
    path does not end in '.html', the file cannot be opened, or no
    title is found.
    """
    if not path.endswith('.html'):
        return None
    try:
        f = open(path)
    except (IOError, OSError):
        # Python 2's open() raises IOError, which is not an OSError there.
        return None
    try:
        content = ''
        while True:
            # Read in bufsize pieces so the scan can stop at the first match.
            chunk = f.read(bufsize)
            if not chunk:
                return None
            content += chunk
            m = r_title.search(content)
            if m:
                return m.group(1)
    finally:
        # Close the file even when read() raises.
        f.close()
def publicpath(path, rp=None):
    """Return rp.readable(path).

    When rp is None, a RobotsTextParser is built for $DOCUMENT_ROOT
    (default '.') provided a robots.txt file exists there; otherwise
    ValueError is raised.
    """
    if rp is not None:
        return rp.readable(path)
    docroot = os.environ.get('DOCUMENT_ROOT', '.')
    if not os.path.isfile(os.path.join(docroot, 'robots.txt')):
        raise ValueError("rp cannot be None")
    return RobotsTextParser(docroot).readable(path)
def pathstrip(path, exceptions=None):
    """Strip the first matching entry of stripable from the end of path.

    Paths ending with any of exceptions (default ['robots.txt']) are
    returned unchanged. An entry matches when it starts with '.' and
    path ends with it, or when it equals the whole final path component.
    A match is skipped when the stripped result is an existing directory
    that does not end in '/'.
    """
    if exceptions is None:
        exceptions = ['robots.txt']
    for protected in exceptions:
        if path.endswith(protected):
            return path

    # rfind() gives -1 when there is no '/', so this is 0 in that case.
    start = path.rfind('/') + 1

    for suffix in stripable:
        is_extension = suffix.startswith('.') and path.endswith(suffix)
        if not (is_extension or path[start:] == suffix):
            continue
        candidate = path[:-len(suffix)]
        if ((candidate != path) and
                os.path.isdir(candidate) and
                (not candidate.endswith('/'))):
            # Stripping would collide with a directory; try the next entry.
            continue
        return candidate
    return path
def ignorable(path, ignore=ignore):
    """Return True when the extension of path is a member of ignore."""
    extension = os.path.splitext(path)[1]
    return extension in ignore
class FileType(object):
    """A named marker for a kind of filesystem entry.

    Instances define no __eq__, so they compare by identity; the module
    creates one shared instance per kind below.
    """

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        # Readable form for debugging and error messages.
        return 'FileType(%r)' % (self.name,)

# The three markers returned by filetype().
File = FileType('File')
Directory = FileType('Directory')
Other = FileType('Other')
def filetype(path):
    """Classify path as File, Directory or Other.

    The os.path.isfile check is tried first, then os.path.isdir; Other
    is returned when neither holds.
    """
    checks = ((os.path.isfile, File), (os.path.isdir, Directory))
    for matches, kind in checks:
        if matches(path):
            return kind
    return Other
def datetime(path):
    """Return the modification time of path as 'YYYY-MM-DD HH:MM:SS' (UTC)."""
    modified = os.path.getmtime(path)
    return time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(modified))
def sort(property):
    """Return a post-processor that sorts a sitelist by an attribute.

    The returned function takes an iterable of objects and returns a new
    list ordered ascending by getattr(item, property). ValueError is
    raised when a compared pair includes an item lacking the attribute.
    """
    # The cmp builtin and sorted(cmp=...) do not exist in Python 3;
    # cmp_to_key (Python 2.7+/3.2+) keeps the pairwise comparison and its
    # error message while working on both.
    from functools import cmp_to_key

    def sorter(sitelist, property=property):
        def compare(p, q, property=property):
            if hasattr(p, property) and hasattr(q, property):
                a, b = getattr(p, property), getattr(q, property)
                # Equivalent of the removed cmp(a, b).
                return (a > b) - (a < b)
            msg = "Can't compare %s and %s: no %s"
            raise ValueError(msg % (p, q, property))
        return sorted(sitelist, key=cmp_to_key(compare))
    return sorter
def reverse(sitelist):
    """Return an iterator over the items of sitelist in reverse order.

    The input is copied into a list first, so any iterable is accepted.
    """
    snapshot = list(sitelist)
    return reversed(snapshot)
def after(unixtime):
    """Return a post-processor keeping items with mtime >= unixtime."""
    def proc(sitelist, unixtime=unixtime):
        for entry in sitelist:
            if entry.mtime < unixtime:
                continue
            yield entry
    return proc
def before(unixtime):
    """Return a post-processor keeping items with mtime <= unixtime."""
    def proc(sitelist, unixtime=unixtime):
        for entry in sitelist:
            if entry.mtime > unixtime:
                continue
            yield entry
    return proc
def first(n):
    """Return a post-processor yielding the first n items of a sitelist.

    Any iterable is accepted, including the generators produced by the
    other post-processors. A negative n keeps the slice meaning of
    sitelist[:n] (everything except the last -n items).
    """
    def proc(sitelist, n=n):
        if n is None or n >= 0:
            # islice works on generators and stops after n items.
            selection = itertools.islice(sitelist, n)
        else:
            # A negative bound needs the full length, so materialise.
            selection = list(sitelist)[:n]
        for fileinfo in selection:
            yield fileinfo
    return proc
def last(n):
    """Return a post-processor yielding the last n items of a sitelist.

    Any iterable is accepted; it is copied into a list because the tail
    cannot be known without consuming the input. last(0) yields nothing.
    """
    def proc(sitelist, n=n):
        if n == 0:
            # items[-0:] would be the whole list, not an empty tail.
            return
        items = list(sitelist)
        for fileinfo in items[-n:]:
            yield fileinfo
    return proc
def files(sitelist):
    """Yield only the items of sitelist whose type is File.

    An item's own 'type' attribute is used when present; otherwise the
    type is looked up from its relpath with filetype().
    """
    for entry in sitelist:
        kind = (entry.type if hasattr(entry, 'type')
                else filetype(entry.relpath))
        if kind == File:
            yield entry
def dirs(sitelist):
    """Yield only the items of sitelist whose type is Directory.

    An item's own 'type' attribute is used when present; otherwise the
    type is looked up from its relpath with filetype().
    """
    for entry in sitelist:
        kind = (entry.type if hasattr(entry, 'type')
                else filetype(entry.relpath))
        if kind == Directory:
            yield entry
class Options(set):
    """An options set, for passing to the Inventory class.

    Options(optlist) -> new options set from optlist
    Options.options contains the options' documentation
    """

    # Option name -> one-line description.
    options = dict([
        ('atime', 'get access time'),
        ('abspath', 'add the absolute path'),
        ('basename', 'get the basename from the relpath'),
        ('ctime', 'get change time'),
        ('datetime', 'add an iso-like datetime'),
        ('mtime', 'get modification time'),
        ('ignore', 'ignore global ignores list entries'),
        ('non-recursive', "don't visit child directories"),
        ('plain', "don't add a trailing slash to directories"),
        ('raw', "don't normalize file paths"),
        ('robots', 'obey robots.txt'),
        ('sitepath', 'absolute path of the file on-server'),
        ('size', 'file size in bytes'),
        ('strip', 'strip superfluous suffixes'),
        ('title', 'HTML title'),
        ('type', 'file type'),
        ('uniq', "don't repeat directories; prefer indexes"),
        ('uniq-directory', "don't repeat indexes; prefer directories"),
    ])

# Default options; just enough to deliver a sitemap
options = Options('ignore robots sitepath strip uniq'.split())
class RobotsTextParser(object):
    """Answers whether a path may be read, per docroot's robots.txt.

    self.parser stays False until makeparser() finds a robots.txt file;
    while it is False every path is reported readable.
    """

    def __init__(self, docroot=False):
        self.docroot = docroot
        self.parser = False
        if docroot is False:
            return
        self.makeparser()

    def makeparser(self):
        """Load <docroot>/robots.txt into self.parser if the file exists."""
        robots = os.path.join(self.docroot, 'robots.txt')
        if not os.path.isfile(robots):
            return
        self.parser = RobotFileParser()
        self.parser.set_url(robots)
        self.parser.read() # @@ reads broken files too

    def readable(self, path, useragent=None):
        """Return True if useragent (default '*') may fetch path."""
        if not self.parser:
            return True
        agent = '*' if useragent is None else useragent
        return self.parser.can_fetch(agent, path)
class FileInfo(object):
    """Plain attribute holder for one inventory entry.

    Inventory.fileinfo() sets relpath plus one attribute per enabled
    option.
    """

# Default post-processing chain used by Inventory: empty.
postproc = list()
class Inventory(object):
    """A filesystem inventory.

    Walks self.path, builds a FileInfo per entry according to
    self.options, and pipes the result through self.postproc.

    Setup example from the original documentation:

       import sys, os
       docroot = os.environ.get('DOCUMENT_ROOT', '.')
       code = os.path.join(docroot, 'dev/code')
       sys.path.append(code)
       import inventory
    """
    # @@ exclusions etc.

    def __init__(self, path, docroot=False):
        """Inventory path; site paths are computed relative to docroot.

        docroot defaults to path. A leading '~' is expanded in both.
        """
        self.path = os.path.expanduser(path)
        if docroot is not False:
            self.docroot = os.path.expanduser(docroot)
        else:
            # Reuse the expanded path so '~' is resolved here as well.
            self.docroot = self.path
        # Copy the module-level defaults so that changing one instance
        # never alters them for every other instance.
        self.options = Options(options or ())
        self.postproc = list(postproc or [])
        self.rp = RobotsTextParser(self.docroot)

    def walk(self, recursive=True):
        """Yield paths under self.path, unsorted, perhaps recursively.

        Recursively, each directory's files are yielded and then the
        directory itself. Otherwise the direct entries of self.path are
        yielded.
        """
        if recursive:
            for root, dirs, files in os.walk(self.path):
                for name in files:
                    yield os.path.join(root, name)
                yield root
        else:
            for name in os.listdir(self.path):
                yield os.path.join(self.path, name)

    def sitepath(self, filename, plain=False):
        """Return filename's path relative to the docroot.

        Directories get a trailing '/' unless plain is true. Raises
        ValueError when filename is not inside the docroot.
        """
        # os.path.abspath automatically normalises
        abspath = os.path.abspath(filename)
        docroot = os.path.abspath(self.docroot)
        # Compare on a separator boundary so that a sibling such as
        # /var/www2 is not accepted for docroot /var/www.
        prefix = docroot.rstrip(os.sep) + os.sep
        if abspath == docroot or abspath.startswith(prefix):
            # @@ AaronSw's strip function
            path = abspath[len(docroot):]
            if (not plain) and (filetype(filename) == Directory):
                # @@ if not path.endswith('/'):
                path += '/'
            return path
        raise ValueError("%s doesn't start with %s" % (abspath, docroot))

    def public(self, filename):
        """Return whether robots.txt allows filename's site path."""
        path = self.sitepath(filename)
        return publicpath(path, rp=self.rp)

    def notable(self, filename, ignore=ignore):
        """Return True unless filename's extension is in ignore."""
        return not ignorable(filename, ignore=ignore)

    def strip(self, fileinfo, stripable=stripable, exceptions=None):
        """Replace fileinfo.sitepath with its pathstrip()ped form.

        exceptions is forwarded to pathstrip(). The stripable parameter
        is kept for interface compatibility only; pathstrip() reads the
        module-level list.
        """
        # @@ this method needs cleaning up a bit
        if hasattr(fileinfo, 'sitepath'):
            path = fileinfo.sitepath
        else:
            path = self.sitepath(fileinfo.relpath)
        fileinfo.sitepath = pathstrip(path, exceptions=exceptions)
        return fileinfo

    def uniq(self, prefer):
        """Return a post-processor keeping one entry per site path.

        When two entries share a site path, the later one replaces the
        earlier only if its file type is prefer.
        """
        def proc(sitelist, prefer=prefer):
            seen = {}
            for fileinfo in sitelist:
                if hasattr(fileinfo, 'sitepath'):
                    path = fileinfo.sitepath
                else:
                    path = self.sitepath(fileinfo.relpath)
                if path not in seen:
                    seen[path] = fileinfo
                elif filetype(fileinfo.relpath) == prefer:
                    seen[path] = fileinfo
            return iter(seen.values())
        return proc

    def fileinfo(self, filename):
        """Build a FileInfo for filename with one attribute per option."""
        # 'robots': 'obey robots.txt',
        f = FileInfo()
        f.relpath = filename

        # @@ don't force continual iteration over this
        optmap = {'abspath': os.path.abspath,
                  'atime': os.path.getatime,
                  'basename': os.path.basename,
                  'ctime': os.path.getctime,
                  'datetime': datetime,
                  'mtime': os.path.getmtime,
                  'sitepath': self.sitepath,
                  'size': os.path.getsize,
                  'title': title,
                  'type': filetype}
        for opt, getter in optmap.items():
            if opt in self.options:
                setattr(f, opt, getter(f.relpath))

        if 'raw' not in self.options:
            # both abspath and sitepath are normalised by necessity
            f.relpath = os.path.normpath(f.relpath)

        # Note: normalization removes the trailing slash
        # NOTE(review): the source had lost its indentation; this block
        # is assumed to sit outside the 'raw' check above -- confirm.
        if ('plain' not in self.options) and (filetype(f.relpath) == Directory):
            for prop in ('relpath', 'abspath', 'sitepath'):
                if hasattr(f, prop):
                    path = getattr(f, prop)
                    if not path.endswith('/'):
                        setattr(f, prop, path + '/')
        return f

    def getlist(self):
        """Returns FileInfo instances, per options.

        This is a generator: ValueError for a non-directory self.path is
        raised when iteration starts, not when getlist() is called.
        """
        if not os.path.isdir(self.path):
            raise ValueError("Not a directory: %s" % self.path)

        filters = []
        if 'robots' in self.options:
            filters.append(self.public)
        if 'ignore' in self.options:
            filters.append(self.notable)

        def include(filename, filters=filters):
            for pred in filters:
                if not pred(filename):
                    return False
            return True

        modifiers = []
        if 'strip' in self.options:
            modifiers.append(self.strip)

        def lister():
            recursive = ('non-recursive' not in self.options)
            for filename in self.walk(recursive=recursive):
                if not include(filename):
                    continue
                fileinfo = self.fileinfo(filename)
                for modifier in modifiers:
                    fileinfo = modifier(fileinfo)
                yield fileinfo

        sitelist = lister()

        # Build the chain on a copy: inserting into self.postproc would
        # add another uniq stage on every call and would modify the
        # list object supplied by the caller.
        procs = list(self.postproc)
        if 'uniq' in self.options:
            procs.insert(0, self.uniq(File))
        elif 'uniq-directory' in self.options:
            procs.insert(0, self.uniq(Directory))

        for proc in procs:
            sitelist = proc(sitelist)

        for fileinfo in sitelist:
            yield fileinfo
def inventory(path, docroot=False, options=False, postproc=False):
    """Build an Inventory for path and return its getlist() generator.

    options and postproc replace the Inventory defaults only when they
    are truthy.
    """
    inv = Inventory(path, docroot=docroot)
    overrides = (('options', options), ('postproc', postproc))
    for attribute, value in overrides:
        if value:
            setattr(inv, attribute, value)
    return inv.getlist()
def sitelist(env=None):
    """Yield the site paths under DOCUMENT_ROOT + PATH_INFO, sorted.

    env defaults to os.environ; DOCUMENT_ROOT defaults to '.' and
    PATH_INFO to '/'.
    """
    environ = env or os.environ
    root = environ.get('DOCUMENT_ROOT', '.')
    subpath = environ.get('PATH_INFO', '/')
    wanted = Options(['sitepath'])
    pipeline = [sort('sitepath')]
    for entry in inventory(root + subpath, options=wanted, postproc=pipeline):
        yield entry.sitepath
def siteupdates(path, **kargs):
    """Write an HTML list of the modifications under path.

    Listing is recursive and covers files only, newest first, grouped
    under one heading per day.

    Keyword arguments:
      since  -- only include files modified in the last `since` seconds
      maxnum -- keep at most this many of the most recent files
      out    -- object with write(), or a string, which is wrapped in a
                StringIO; defaults to sys.stdout

    Returns the output object.

    NOTE(review): the HTML markup in the string literals had been lost
    (broken literals, fewer %s than arguments). The tags below are
    reconstructed from the argument counts and from runcgi() serving
    this output as text/html -- confirm against the original.
    NOTE(review): sitepath values are interpolated without HTML
    escaping.
    """
    since = kargs.get('since')
    maxnum = kargs.get('maxnum')
    out = kargs.get('out', sys.stdout)

    try:
        string_types = basestring
    except NameError:
        # Python 3 has no basestring.
        string_types = str
    if isinstance(out, string_types):
        try:
            from StringIO import StringIO
        except ImportError:
            from io import StringIO
        out = StringIO(out)

    def emit(text=''):
        # Same output as "print >> out, text" without Python 2 syntax.
        out.write(text + '\n')

    day = None
    opt = 'datetime ignore mtime robots sitepath strip uniq'.split()

    postproc = [files]
    if since: postproc.append(after(time.time() - since))
    postproc.append(sort('mtime'))
    if maxnum: postproc.append(last(maxnum))
    postproc.append(reverse)

    for f in inventory(path, options=Options(opt), postproc=postproc):
        if f.datetime[:10] != day:
            # A new day starts: close the previous list, open a new one.
            if day is not None:
                emit('</ul>')
                emit()
            day = f.datetime[:10]
            emit('<h2 id="%s">%s</h2>' % (day, day))
            emit('<ul>')
        line = '<li><a href="%s">%s</a> (%s)</li>'
        emit(line % (f.sitepath, f.sitepath, f.datetime[-8:]))
    emit('</ul>')
    return out
def updates(path=None, since=(3600 * 24 * 50), maxnum=250):
    """Return the siteupdates() output for path as a string.

    path defaults to $DOCUMENT_ROOT, or '.' when that is unset.
    """
    if path is None:
        path = os.environ.get('DOCUMENT_ROOT', '.')
    # Passing out='' makes siteupdates() write into an in-memory buffer.
    buf = siteupdates(path, out='', maxnum=maxnum, since=since)
    buf.seek(0)
    return buf.read()
def genupdates(path=None, since=(3600 * 24 * 50), maxnum=250):
    """Generator of (day, time, sitepath)."""
    if path is None:
        path = os.environ.get('DOCUMENT_ROOT', '.')

    names = 'datetime ignore mtime robots sitepath strip uniq'
    wanted = Options(names.split(' '))

    pipeline = [files]                      # no directories
    if since:
        pipeline.append(after(time.time() - since))
    pipeline.append(sort('mtime'))          # sort by modification time
    if maxnum:
        pipeline.append(last(maxnum))
    pipeline.append(reverse)                # reverse chronological

    for f in inventory(path, options=wanted, postproc=pipeline):
        stamp = f.datetime
        yield (stamp[:10], stamp[-8:], f.sitepath)
def run(argv=None):
    """Command-line entry point: write the updates for one path.

    argv is passed to OptionParser.parse_args. Exactly one positional
    argument is required. -t/--time is handed to siteupdates() as
    `since`, and -n/--number as `maxnum` (default 250).
    """
    # NOTE(review): the usage string ended in a bare space; '<path>' is
    # assumed to be the placeholder that was lost -- confirm.
    parser = OptionParser(usage='%prog [options] <path>')
    # type="int" lets optparse reject non-numeric values with a usage
    # error instead of an uncaught ValueError from int().
    parser.add_option("-t", "--time", dest="time", type="int", default=None,
                      help="cut off last modification date for inclusion")
    parser.add_option("-n", "--number", dest="number", type="int", default=None,
                      help="maximum number of results to be output")
    options, args = parser.parse_args(argv)

    if len(args) != 1:
        parser.error("Incorrect number of arguments.")

    t = options.time
    n = options.number
    if n is None:
        n = 250
    siteupdates(args[0], since=t, maxnum=n)
def runcgi(env):
    """Write a CGI response: header, blank line, then updates().

    env is accepted but not used.
    """
    out = sys.stdout
    out.write("Content-Type: text/html; charset=utf-8")
    out.write("\n")
    # Blank line separating the header from the body.
    out.write("\n")
    out.write(updates())
    out.write("\n")
def main(env=None):
    """Run as CGI when SCRIPT_NAME is in env, else as a command line.

    env defaults to os.environ.
    """
    environ = os.environ if env is None else env
    if 'SCRIPT_NAME' in environ:
        runcgi(environ)
    else:
        run()

if __name__ == "__main__":
    main()