#!/usr/bin/env python
"""
juno.py - Trio's Juno Parser
Copyright 2007, Sean B. Palmer, inamidst.com
Licensed under the Eiffel Forum License 2.

Package: http://inamidst.com/sw/trio/
"""

import itertools
import urllib
import urlparse
import xml.dom.minidom

# NOTE(review): when xml.xpath cannot be imported the name ``xml`` is rebound
# to None, so every later use of ``xml.dom`` / ``xml.xpath`` in this module
# fails with AttributeError rather than ImportError. Kept as-is because other
# code may test ``juno.xml is None``.
try:
    import xml.xpath
except ImportError:
    xml = None

import rdf

n3 = rdf.n3
n3.prefix('', 'http://example.org/juno#')

# Source of the numbers used to label blank nodes minted by Juno.resolve.
counter = itertools.count(1)

# Default prefix -> namespace bindings handed to the XPath evaluator.
namespaces = {
    'h': 'http://www.w3.org/1999/xhtml'
}


def evaluate(node, xpath, xmlns=None):
    """Evaluate the expression *xpath* with *node* as the context node.

    *xmlns* is passed to ``xml.xpath.Context.Context`` as ``processorNss``;
    when it is None the module-level ``namespaces`` mapping is used.
    Returns whatever ``xml.xpath.Evaluate`` returns.
    """
    if xmlns is None:
        xmlns = namespaces
    context = xml.xpath.Context.Context(node, processorNss=xmlns)
    return xml.xpath.Evaluate(xpath, context=context)


def join(base, ref):
    """Resolve *ref* against *base* using ``urlparse.urljoin``."""
    return urlparse.urljoin(base, ref)


def _node_text(vnode):
    """Return ``vnode.value`` when present, otherwise ``vnode.data``."""
    # presumably attribute nodes expose .value and text nodes .data;
    # verify against the DOM implementation in use.
    try:
        return vnode.value
    except AttributeError:
        return vnode.data


class InputRule(object):
    """Output term that stands for the URI of the document being parsed."""

    def process(self, input_uri, node):
        """Return *input_uri* unchanged; *node* is ignored."""
        return input_uri


class URIRule(unicode):
    """Output term whose string value is an XPath yielding a URI reference."""

    def process(self, input_uri, node):
        """Return an ``rdf.URI`` built from the first XPath result.

        The expression (this string) is evaluated relative to *node*; the
        first result's text is joined against ``input_uri.value``.

        Unlike LiteralRule.process, an empty result is not tolerated: the
        ``[0]`` lookup is unguarded and raises IndexError on an empty list.
        """
        vnode = evaluate(node, self)[0]
        val = unicode(join(input_uri.value, _node_text(vnode)))
        return rdf.URI(val)


class LiteralRule(unicode):
    """Output term whose string value is an XPath yielding literal text."""

    def process(self, input_uri, node):
        """Return an ``rdf.PlainLiteral`` for the first XPath result.

        Returns None when taking the first result raises IndexError, which
        makes Rule.process skip the triple. *input_uri* is not used.
        """
        try:
            vnode = evaluate(node, self)[0]
        except IndexError:
            return None  # @@ if opt
        return rdf.PlainLiteral(_node_text(vnode), None)


# Memo of terms already handled by Juno.resolve (module-wide, shared by all
# Juno instances).
cache = {}


class Rule(object):
    """A match path plus the triple templates and subrules attached to it.

    Instances are shared per path: ``Rule(p)`` returns the object already
    stored in ``Rule.rules`` for ``p`` when there is one.

    NOTE(review): Python still calls ``__init__`` on the shared instance each
    time ``Rule(p)`` is evaluated, which empties ``outputs`` and ``subrules``
    again. Two definitions using the same path therefore share one object
    holding only the most recently built contents. Left unchanged because the
    intended semantics are not clear from this file.
    """

    rules = {}

    def __new__(cls, xpath):
        """Return the Rule registered for *xpath*, creating it on first use."""
        if xpath not in Rule.rules:
            # object.__new__ takes no extra arguments: passing xpath through
            # warns on Python 2.6+ and is a TypeError on Python 3.
            Rule.rules[xpath] = object.__new__(cls)
        return Rule.rules[xpath]

    def __init__(self, xpath):
        """Store *xpath* and start with no outputs and no subrules."""
        self.xpath = xpath
        self.outputs = []
        self.subrules = []

    def process(self, input_uri, dom):
        """Yield ``rdf.Triple`` objects for every node the path selects.

        ``self.xpath.lexical`` is evaluated against *dom*. For each resulting
        node, every ``(s, p, o)`` template in ``outputs`` is instantiated:
        a term with a ``process`` method is replaced by the result of calling
        it, and the template is skipped as soon as one such call returns
        None. Each subrule is then run with that node as its context.
        """
        nodes = evaluate(dom, self.xpath.lexical)
        for node in nodes:
            for s, p, o in self.outputs:
                if hasattr(s, 'process'):
                    s = s.process(input_uri, node)
                    if s is None:
                        continue
                if hasattr(p, 'process'):
                    p = p.process(input_uri, node)
                    if p is None:
                        continue
                if hasattr(o, 'process'):
                    o = o.process(input_uri, node)
                    if o is None:
                        continue
                yield rdf.Triple(s, p, o)

            for subrule in self.subrules:
                for triple in subrule.process(input_uri, node):
                    yield triple


class Juno(object):
    """Reads rule definitions from an RDF graph and applies them to XML."""

    def __init__(self, uri):
        """Build ``self.graph`` as ``rdf.Graph(uri)`` from the stylesheet URI."""
        self.uri = uri
        self.graph = rdf.Graph(self.uri)

    def resolve(self, term):
        """Map a stylesheet term to the object used in an output template.

        Checked in this order:

        * a term already in the module-level ``cache`` returns the cached
          object;
        * a term with a ``:uri`` property becomes a URIRule over that
          object's ``lexical`` value (cached);
        * a term with a ``:lit`` property becomes a LiteralRule (cached);
        * an ``rdf.BlankNode`` becomes a fresh blank node labelled ``b<n>``
          from the module counter (cached);
        * the ``:input`` term becomes a new InputRule (not cached);
        * anything else is returned unchanged.
        """
        if term in cache:
            return cache[term]

        triple = self.graph.the(term, n3['uri'])
        if triple is not None:
            rule = URIRule(triple.object.lexical)
            cache[term] = rule
            return rule

        triple = self.graph.the(term, n3['lit'])
        if triple is not None:
            rule = LiteralRule(triple.object.lexical)
            cache[term] = rule
            return rule

        if isinstance(term, rdf.BlankNode):
            bnode = rdf.BlankNode('b' + str(next(counter)))
            cache[term] = bnode
            return bnode

        if term == n3['input']:
            return InputRule()
        return term

    def _build_rule(self, definition, path):
        """Create the Rule for *path* and fill it from *definition*.

        Shared by get_rules and get_subrule, which previously carried two
        copies of this body.
        """
        rule = Rule(path)

        for out in self.graph.get(definition, n3['out']):
            # The subject is the :subj value when one is given, otherwise
            # the output node itself.
            subj = self.graph.the(out.object, n3['subj'])
            if subj is not None:
                subject = self.resolve(subj.object)
            else:
                subject = self.resolve(out.object)

            for output in self.graph.get(out.object):
                # :subj and :uri describe the output node; they are not
                # properties to emit.
                if output.predicate == n3['subj']:
                    continue
                if output.predicate == n3['uri']:
                    continue
                predicate = self.resolve(output.predicate)
                obj = self.resolve(output.object)
                rule.outputs.append((subject, predicate, obj))

        for sub in self.graph.get(definition, n3['subrule']):
            rule.subrules.append(self.get_subrule(sub.object))

        return rule

    def get_rules(self):
        """Yield one Rule per ``:match`` triple in the stylesheet graph."""
        for triple in self.graph.get(predicate=n3['match']):
            yield self._build_rule(triple.subject, triple.object)

    def get_subrule(self, definition):
        """Return the Rule for the subrule node *definition*.

        The path is the object of the node's ``:path`` triple; nested
        ``:subrule`` nodes are built recursively.
        """
        path = self.graph.the(definition, n3['path']).object
        return self._build_rule(definition, path)

    def transform(self, uri, f):
        """Parse XML from *f* and return an ``rdf.Graph`` of produced triples.

        *uri* is wrapped as the input URI handed to every rule; *f* is given
        to ``xml.dom.minidom.parse``.
        """
        input_uri = rdf.URI(unicode(uri))
        dom = xml.dom.minidom.parse(f)

        G = rdf.Graph()
        for rule in self.get_rules():
            for t in rule.process(input_uri, dom):
                G.store.add(t)
        return G


def transform(uri, f, styleuri):
    """Apply the stylesheet at *styleuri* to the document *f* named *uri*."""
    j = Juno(styleuri)
    return j.transform(uri, f)


def main():
    """Command line entry point: ``juno.py <document-uri> <stylesheet-uri>``.

    Fetches the document, transforms it and prints the result as N-Triples.
    """
    import sys

    uri, styleuri = unicode(sys.argv[1]), unicode(sys.argv[2])
    u = urllib.urlopen(uri)
    try:
        G = transform(uri, u, styleuri)
    finally:
        # Release the connection even when parsing or a rule fails.
        u.close()

    for line in G.serialiseToLines('ntriples'):
        # Parenthesised form behaves exactly like the Python 2 print statement.
        print(line)


if __name__ == '__main__':
    main()