#!/usr/bin/env python
"""
Site Metadata Producer
Author: Sean B. Palmer, inamidst.com

The basic principle is that /meta/ is a mirror of / with the same
filenames, except that the files under /meta/ are RFC 822-like metadata
entries describing the files under the root. All requests to /meta/path
are rewritten by mod_rewrite to this CGI, which reads the metadata and
converts it to RDF/XML, Notation3, or HTML depending on the request's
Accept header, and possibly on the QUERY_STRING, a la /list.

Annoyances: all the meta scripts and databases have to be segregated
under /meta/meta, and the metadata filenames carry misleading
extensions; thankfully URIs are opaque anyway, so it doesn't really
matter.

Field names to use:

   keywords: dc:keywords? nope: dc:subject
   title: dc:title
   description: dc:description
   source: dc:source
   type: dc:type? license?

Raw files are UTF-8. Convert to N3, RDF/XML, and HTML. Also get the
modification time, size, and dc:format. Might want to use dev/ls.py

Keywords get compiled into... databases? slocate style? marshal? The
next step is an interface for searching the database, such that
/database/keyword1.keyword2 looks up files carrying those keywords.
Since there should be canonical URIs for all of this, the idea is to
issue redirects whenever the keywords are out of alphabetical order.
"""
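# Illustrative only: a metadata file such as /meta/notes/example (the path
# and the values here are made up) contains RFC 822-like "name: value"
# fields of the kind getall() below parses, one per line:
#
#   title: An example document
#   keywords: metadata, rdf, notation3
#   description: A short description of the document.
#   source: http://example.org/notes/example
#
# Blank lines and lines starting with '#' are ignored; each field name is
# mapped onto a namespaced property through the `fieldnames` table below.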
import cgitb; cgitb.enable()
import sys, os, cgi

class Bidict(dict):
    """A dict that also maps each value back to its key."""
    def __init__(self, *args, **kargs):
        super(Bidict, self).__init__(*args, **kargs)
        dict.update(self, dict((v, k) for k, v in self.iteritems()))

    def __setitem__(self, k, v):
        dict.__setitem__(self, k, v)
        dict.__setitem__(self, v, k)

class Env(dict):
    """A lazy, memoising view of os.environ."""
    def __getitem__(self, field):
        if self.has_key(field):
            return dict.__getitem__(self, field)
        result = os.environ.get(field)
        self[field] = result
        return result

env = Env()

ns = Bidict({'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
             'dc': 'http://purl.org/dc/elements/1.1/',
             'dct': 'http://purl.org/dc/terms/',
             'list': 'http://crschmidt.net/ns/list#'})

indexes = ['index.html', 'index.cgi', 'index']

fieldnames = {'keywords': (ns['dc'], 'subject'),
              'title': (ns['dc'], 'title'),
              'description': (ns['dc'], 'description'),
              'source': (ns['dc'], 'source'),
              'format': (ns['dc'], 'format'),
              'modified': (ns['dct'], 'modified'),
              'size': (ns['list'], 'size')}

def serve(status, headers, body):
    sys.stdout.write("Status: %s\r\n" % status)
    for header in headers.iteritems():
        sys.stdout.write("%s: %s\r\n" % header)
    sys.stdout.write("\r\n")
    sys.stdout.write(str(body))
    sys.exit()

def ok(mime, body):
    serve(200, {'Content-Type': mime}, body)

def notfound(mime, body):
    serve(404, {'Content-Type': mime}, body)

def redirect(uri, temp=False):
    if not uri.startswith('http:'):
        from urlparse import urljoin
        domain = os.environ.get('SERVER_NAME', 'localhost')
        uri = urljoin('http://' + domain + env['REQUEST_URI'], uri)
    status = (301, 307)[temp]
    headers = {'Location': uri, 'Content-Type': 'text/html'}
    body = '<a href="%s">go!</a>' % uri
    serve(status, headers, body)

def error(msg):
    body = '''<!DOCTYPE html>
<title>Error</title>
<pre>
%s
</pre>
''' % cgi.escape(msg)
    serve(501, {'Content-Type': 'text/html'}, body)
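# Illustrative sketch only; nothing in this script calls it. The docstring's
# /database/keyword1.keyword2 idea wants canonical URIs, so a request whose
# keywords are out of alphabetical order would be redirected to the sorted
# form using redirect() above. The function name is an assumption.
def canonical_keywords(path):
    base, keywords = path.rsplit('/', 1)
    ordered = '.'.join(sorted(keywords.split('.')))
    if ordered != keywords:
        redirect(base + '/' + ordered)   # 301 to the canonical ordering
    return keywords.split('.')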
class Accept(object):
    def __init__(self, accept):
        self.accept = accept
        self.types = {}
        self.parse()

    def parse(self):
        for mime in self.accept.split(','):
            mime = mime.strip()
            if ';' in mime:
                q = 1.0
                mime, params = mime.split(';', 1)
                for param in params.split(';'):
                    param = param.strip()
                    if param.startswith('q='):
                        try:
                            q = float(param[2:])
                        except ValueError:
                            continue
                        else:
                            break
            else: q = 1.0
            self.types[mime.strip()] = q

    def test(self, mime):
        """Return the highest (q, specificity) value that mime matches, or None."""
        if not self.types: return None
        matches = {'*/*': lambda q: (q, 0),
                   mime.split('/')[0] + '/*': lambda q: (q, 1),
                   mime: lambda q: (q, 2)}
        return max(matches.get(testtype, lambda q: None)(q)
                   for testtype, q in self.types.iteritems())

    def preferences(self, first, *rest):
        prefs, maxQ = set([first]), self.test(first)
        for arg in rest:
            newQ = self.test(arg)
            if (newQ is not None) and (newQ >= maxQ):
                if newQ == maxQ:
                    prefs.add(arg)
                else:
                    prefs = set([arg])
                    maxQ = newQ
        if maxQ is not None:
            return prefs
        return None

def get(fn, field):
    fieldmarker = field + ': '
    f = open(fn)
    for line in f:
        if line.startswith(fieldmarker):
            f.close()
            result = line[len(fieldmarker):].rstrip('\r\n')
            break
    else:
        f.close()
        result = None
    return result

def getall(path):
    fileinfo = {}
    fn = metafile(path)
    f = open(fn)
    for line in f:
        line = line.rstrip('\r\n')
        if line.startswith('#') or (not line.strip(' \t')):
            continue
        else:
            fieldname, value = line.split(': ', 1)
            property = fieldnames[fieldname]
            fileinfo[property] = value
    f.close()
    return fileinfo

def absolute(path):
    import urlparse
    if path == '/index':
        path = '/'
    return urlparse.urljoin('http://%s/' % env['SERVER_NAME'], path)

def isotime(t):
    import time, datetime
    try:
        timestamp = time.mktime(time.strptime(t, '%a, %d %b %Y %H:%M:%S %Z'))
    except (TypeError, ValueError):
        raise ValueError("Unparseable time %r (input might have been None)" % (t,))
    tm = datetime.datetime.fromtimestamp(timestamp)
    return tm.isoformat() + 'Z'

def metafile(path):
    if path.endswith('/'):
        path += 'index'
    fn = os.path.join(env['DOCUMENT_ROOT'], '.' + path)
    return os.path.normpath(fn)

def info(path):
    import urllib
    # Local (to /meta/) metadata
    fileinfo = getall(path)

    # Other metadata; *now* munge the path
    uri = absolute(path.lstrip('./')[len('meta'):])
    u = urllib.urlopen(uri)
    headers = u.info()
    u.close()

    format = headers.get('Content-Type')
    if format is not None:
        format = format.split(';')[0]
    try:
        mtime = isotime(headers.get('Last-Modified'))
    except ValueError:
        mtime = None
    size = headers.get('Content-Length')

    if mtime is not None:
        fileinfo[(ns['dct'], 'modified')] = mtime
    if size is not None:
        fileinfo[(ns['list'], 'size')] = size
    if format is not None:
        fileinfo[(ns['dc'], 'format')] = format
    return uri, fileinfo

def namespaces(fileinfo):
    prefixes = set([])
    for (racine, term) in fileinfo.iterkeys():
        prefixes.add((ns[racine], racine))
    return sorted(prefixes)

def turtle(path):
    # print 'Content-Type: application/x-turtle'
    print 'Content-Type: text/plain'
    print

    uri, fileinfo = info(path)
    for (prefix, racine) in namespaces(fileinfo):
        print '@prefix %s: <%s> .' % (prefix, racine)
    print

    # print '<%s> # a foaf:Document; ' % uri
    for ((racine, term), value) in fileinfo.iteritems():
        print '<%s> %s:%s "%s" .' % (uri, ns[racine], term, value)
    print
    print '# [EOF]'
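# For illustration (hypothetical URI and values), the Turtle that turtle()
# emits looks roughly like this:
#
#   @prefix dc: <http://purl.org/dc/elements/1.1/> .
#   @prefix list: <http://crschmidt.net/ns/list#> .
#
#   <http://example.org/notes/example> dc:title "An example document" .
#   <http://example.org/notes/example> list:size "1234" .
#
#   # [EOF]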
def rdfxml(path):
    # print 'Content-Type: application/rdf+xml'
    print 'Content-Type: text/xml'
    print

    uri, fileinfo = info(path)
    print '<rdf:RDF xmlns:rdf="%s" xmlns:dc="%s" xmlns:dct="%s" xmlns:list="%s">' % \
          (ns['rdf'], ns['dc'], ns['dct'], ns['list'])
    print '<rdf:Description rdf:about="%s">' % uri
    for ((racine, term), value) in fileinfo.iteritems():
        tag, value = ns[racine] + ':' + term, cgi.escape(value)
        print '   <%s>%s</%s>' % (tag, value, tag)
    print '</rdf:Description>'
    print '</rdf:RDF>'

def xhtml(path):
    import textwrap
    print 'Content-Type: text/html; charset=utf-8'
    print

    filepath = path.lstrip('./')[len('meta'):]
    if filepath == '/index':
        filepath = '/'
    uri, fileinfo = info(path)   # fileinfo isn't rendered in this HTML view
    print textwrap.dedent(("""\
    <html>
    <head><title>Metadata for """ + filepath + """</title></head>
    <body>
    <h1>Metadata for """ + filepath + """</h1>

    <address>Sean B. Palmer</address>
    </body></html>
    """).lstrip('\n'))
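# Dispatch notes for data() below (hypothetical host and path, purely for
# illustration):
#
#   GET http://example.org/meta/notes/example?turtle  -> turtle()
#   GET http://example.org/meta/notes/example?rdf     -> rdfxml()
#
# When QUERY_STRING is absent from the environment, the Accept header is
# negotiated instead; anything unrecognised falls through to xhtml().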
def data(path, robots=True):
    if os.environ.has_key('QUERY_STRING'):
        mtype = env['QUERY_STRING']
    elif os.environ.has_key('HTTP_ACCEPT'):
        types = ('application/x-turtle', 'application/rdf+xml', 'text/html')
        a = Accept(env['HTTP_ACCEPT'])
        prefs = a.preferences(*types) or set()   # None when nothing matches
        if 'application/x-turtle' in prefs:
            mtype = 'turtle'
        elif 'application/rdf+xml' in prefs:
            mtype = 'rdf'
        else: mtype = 'xhtml'
    else: mtype = 'xhtml'
    # @@ robots.txt check here?

    if '?' in path:
        path, query = path.rsplit('?', 1)
        query = '?' + query
    else: query = ''

    filename = path.split('/').pop()
    if filename in indexes:
        if path.count('/') > 2:
            redirect('./' + query)
        redirect('./' + os.path.splitext(filename)[0] + query)

    if not os.path.isfile(metafile(path)):
        notfound('text/html', "Couldn't find <%s>." % path)

    opts = {'turtle': turtle, 'rdf': rdfxml}
    opts.get(mtype, xhtml)(path)

def tests():
    def test(a, input, expected):
        if expected is not None:
            expected = set(expected)
        result = a.preferences(*input)
        assert result == expected, result

    a = Accept('audio/*; q=0.2, audio/basic')
    print a.accept, a.types
    test(a, ['audio/basic', 'audio/rich'], ['audio/basic'])
    test(a, ['audio/rich', 'text/plain'], ['audio/rich'])
    test(a, ['text/plain'], None)

    a = Accept('text/plain; q=0.5, text/html, text/x-dvi; q=0.8, text/x-c')
    print a.accept, a.types
    test(a, ['text/html', 'text/x-c'], ['text/html', 'text/x-c'])
    test(a, ['text/html', 'text/x-dvi', 'text/xml'], ['text/html'])
    test(a, ['text/plain', 'text/x-dvi'], ['text/x-dvi'])
    test(a, ['text/xml'], None)

    a = Accept('text/*, text/html, */*')
    print a.accept, a.types
    test(a, ['text/plain', 'text/html'], ['text/html'])
    test(a, ['text/plain', 'application/xml'], ['text/plain'])
    test(a, ['text/xml', 'text/plain'], ['text/plain', 'text/xml'])
    test(a, ['application/xml'], ['application/xml'])

    a = Accept('')
    test(a, ['media/type'], None)

    print 'Tests pass'

def main():
    if os.environ.has_key('SCRIPT_NAME'):
        scriptpath, ext = os.path.splitext(env['SCRIPT_NAME'])
        if env['REQUEST_URI'] == scriptpath:
            redirect('./src')
        else: data(env['REQUEST_URI'])
    else:
        print "Content-Type: text/plain"
        print
        print __doc__
        print
        tests()

if __name__ == "__main__":
    main()