#!/usr/bin/env python """ sitedb.py - Create a searchable site database License: GPL 2; share and enjoy! Author: Sean B. Palmer, inamidst.com Usage: ./sitedb.py - create database at dbpath ./sitedb.py --help - help and usage notes Bugs: * Duplicates entries for words appearing more than once in a line * The robots.txt parsing doesn't appear to be working properly """ import sys, re, os, dbhash from robotparser import RobotFileParser r_word = re.compile(r"(?> sys.stderr, "Won't overwrite %s" % dbpath sys.exit(1) rp = mkrparser(rootpath) db = dbhash.open(dbpath, 'n') def indexable(fn): for ext in indexables: if fn.endswith(ext): return True return False indexed = 0 print "Indexing files... this may take a while!" for (dirpath, dirnames, filenames) in os.walk(rootpath): for name in filenames: fn = os.path.join(dirpath, name) if rp.readable(fn): if indexable(fn): index(fn, db) sys.stderr.write('.') indexed += 1 db.close() print "Success! Indexed %s files." % indexed return True def findword(dbpath, word, lines=True): result = [] db = dbhash.open(dbpath, 'r') if db.has_key(word): for lineid in db[word].split('\n'): if not lines: result.append(lineid) else: result.append((lineid, db[lineid])) db.close() return result r_prent = re.compile(r'&(?!amp;)') def condquot(text): text = text.replace('<', '<') return r_prent.sub('&', text) def mklinks(html, query): def replacement(m): word = m.group(0) return '%s' % (query + word, word) return r_word.sub(replacement, html) def fwhtml(dbpath, word, strip=None, query=None): result = ['') return '\n'.join(result) def main(args=None): if args is None: args = sys.argv[1:] if len(args) == 2: rootpath, dbpath = tuple(args) makedb(rootpath, dbpath) else: print __doc__.lstrip() if __name__=="__main__": main()