#!/usr/bin/env python """ rdfdb.py - RDF Database Author: Sean B. Palmer License: GPL 2; share and enjoy! Status: Prototype Requirements: Python2.4+ - http://www.python.org/ ntriples.py - http://inamidst.com/proj/rdf/ntriples.py Example: $ ./rdfdb.py /tmp/$(date +%s) http://inamidst.com/sbp/foaf object \\ 'URI("http://inamidst.com/sbp/foaf#Sean")' \\ 'URI("http://xmlns.com/foaf/0.1/name")' """ import sys, os, re, urllib import operator, itertools, shelve import ntriples class Node(unicode): pass class URI(Node): pass class bNode(Node): pass class Literal(Node): def __new__(cls, lit, lang, dtype): node = Node.__new__(cls, lit) node.lang = lang node.dtype = dtype return node def nodeToDB(node): types = {URI: '>', bNode: '_'} for cls in types.iterkeys(): if isinstance(node, cls): return types[cls] + ' ' + node.encode('utf-8') if isinstance(node, Literal): return '" %s %s %s' % (node.lang, node.dtype, node.encode('utf-8')) raise ValueError("Unknown node type for: %r" % node) r_xml = re.compile(r'^[\t\r\n ]*(<[?!]|<[^ >]+ )') class Abstract(Exception): pass class BaseGraph(object): def __init__(self, **kargs): methods = {'uri': self.feedURI, 'ntriples': self.feedNTriples} for key in methods.iterkeys(): if kargs.has_key(key): methods[key](kargs[key]) def __len__(self): length = 0 for triple in self.itertriples(): length += 1 return length def feedURI(self, *args): raise Abstract("Error: no subclass implementation") def feedNTriples(self, *args): raise Abstract("Error: no subclass implementation") def append(self, *args): raise Abstract("Error: no subclass implementation") def itertriples(self, *args): raise Abstract("Error: no subclass implementation") def get(self, *args): raise Abstract("Error: no subclass implementation") def the(self, subj, pred, objt): iter = self.get(subj=subj, pred=pred, objt=objt) triple = iter.next() try: iter.next() except StopIteration: return triple else: raise ValueError("Query returned more than one triple") def subjects(self, pred, objt): for (s, p, o) in self.get(subj=None, pred=pred, objt=objt): yield s def predicates(self, subj, objt): for (s, p, o) in self.get(subj=subj, pred=None, objt=objt): yield p def objects(self, subj, pred): for (s, p, o) in self.get(subj=subj, pred=pred, objt=None): yield o def subject(self, pred, objt): return self.the(subj=None, pred=pred, objt=objt)[0] def predicate(self, subj, objt): return self.the(subj=subj, pred=None, objt=objt)[1] def object(self, subj, pred): return self.the(subj=subj, pred=pred, objt=None)[2] class ShelfGraph(BaseGraph): def __init__(self, database, **kargs): self.counter = 0 self.mkdatabase(database) super(ShelfGraph, self).__init__(**kargs) def mkdatabase(self, db): if not os.path.isdir(db): print >> sys.stderr, "Warning: Making directory %s" % db os.mkdir(db) self.subjDB = os.path.join(db, 'subjects.db') self.predDB = os.path.join(db, 'predicates.db') self.objtDB = os.path.join(db, 'objects.db') self.tripDB = os.path.join(db, 'triples.db') for attr in ('subjectDB', 'predicateDB', 'objectDB', 'tripleDB'): self.__setattr__(attr, None) self.mode('c') self.sync() self.mode('r') def mode(self, flag): if self.subjectDB: self.subjectDB.close() if self.predicateDB: self.predicateDB.close() if self.objectDB: self.objectDB.close() if self.tripleDB: self.tripleDB.close() wback = (flag != 'r') self.subjectDB = shelve.open(self.subjDB, flag, writeback=wback) self.predicateDB = shelve.open(self.predDB, flag, writeback=wback) self.objectDB = shelve.open(self.objtDB, flag, writeback=wback) self.tripleDB = shelve.open(self.tripDB, flag, writeback=wback) def sync(self): self.subjectDB.sync() self.predicateDB.sync() self.objectDB.sync() self.tripleDB.sync() def tripleID(self): self.counter += 1 return 't ' + str(self.counter) def feedURI(self, uri): u = urllib.urlopen(uri) data = u.read() u.close() if r_xml.match(data): xml2nt = 'http://crschmidt.net/semweb/xml2nt.cgi?uri=' u = urllib.urlopen(xml2nt + urllib.quote(uri)) data = u.read() u.close() self.feedNTriples(data) def feedNTriples(self, nt): ntriples.URI = URI ntriples.bNode = bNode ntriples.Literal = Literal class NTriplesSink(object): def triple(sink, s, p, o): self.append(s, p, o) p = ntriples.NTriplesParser(sink=NTriplesSink()) self.mode('w') p.parsestring(nt) self.sync() self.mode('r') def append(self, s, p, o): pairs = {nodeToDB(s): self.subjectDB, nodeToDB(p): self.predicateDB, nodeToDB(o): self.objectDB} triple = self.tripleID() for (term, database) in pairs.iteritems(): if database.has_key(term): database[term].add(triple) else: database[term] = set([triple]) self.tripleDB[triple] = (s, p, o) def itertriples(self): return self.tripleDB.itervalues() def get(self, subj=None, pred=None, objt=None): if not (subj or pred or objt): for triple in self.itertriples(): yield triple return # http://swhack.com/logs/2004-12-30#T01-01-02 sets = (db.get(nodeToDB(node)) for (db, node) in ((self.subjectDB, subj), (self.predicateDB, pred), (self.objectDB, objt)) if node) for t in reduce(operator.__and__, itertools.ifilter(bool, sets)): yield self.tripleDB[t] Graph = ShelfGraph def barf(msg): print >> sys.stderr, msg.strip() sys.exit(1) def main(args=None): if args is None: args = sys.argv[1:] if args: # e.g. ./rdfdb.py /tmp/rdfdb http://example.org/rdf subjects ... ... try: database, uri, method, arg = args[0], args[1], args[2], args[3:] except IndexError: barf(__doc__) G = Graph(database, uri=uri) result = getattr(G, method)(*map(eval, arg)) if hasattr(result, 'next'): for item in result: print item else: print result else: print __doc__.strip() if __name__=="__main__": main()