#!/usr/bin/env python
"""
grddl.py - GRDDL Processor
Copyright 2007, Sean B. Palmer, inamidst.com
Licensed under the Eiffel Forum License 2.

Package: http://inamidst.com/sw/trio/

@@ html5lib
"""

import subprocess
import sys
import urlparse
import xml
# "import xml" on its own does not load the subpackages used below.
import xml.dom.minidom
import xml.parsers.expat

import rdf
import web

_GRDDL_NS = 'http://www.w3.org/2003/g/data-view#'
_GRDDL_PROFILE = 'http://www.w3.org/2003/g/data-view'
_XHTML_NS = 'http://www.w3.org/1999/xhtml'


def xpath_root(uri):
    """Fetch *uri* and return the document element of the parsed XML.

    The object returned by ``web.doc`` is always closed, including when
    parsing fails. On a parse error the URI is printed to stdout and the
    original ``ExpatError`` propagates with its traceback intact.
    """
    doc = web.doc(uri)
    try:
        try:
            dom = xml.dom.minidom.parse(doc)
        except xml.parsers.expat.ExpatError:
            print('URI: %s' % uri)
            raise
    finally:
        doc.close()
    return dom.documentElement


def fn_normalize_space(string):
    """Return *string* with its whitespace normalised.

    Leading and trailing space, tab, CR and LF characters are removed and
    every internal run of them is collapsed to a single space.
    """
    for char in '\t\r\n':
        string = string.replace(char, ' ')
    # Splitting on a single space and dropping the empty pieces both trims
    # the ends and collapses runs, without looping until stable.
    return ' '.join([part for part in string.split(' ') if part])


def fn_tokenize(string):
    """Split a normalised string on single spaces.

    An empty string yields an empty list rather than ``['']``, so that a
    whitespace-only attribute value does not produce an empty reference
    (which would resolve to the base URI itself).
    """
    if not string:
        return []
    return string.split(' ')


def fn_resolve_uri(base, relative):
    """Resolve *relative* against *base* using ``urlparse.urljoin``."""
    return urlparse.urljoin(base, relative)


def xslt(input_bytes, stylesheet_uri):
    """Apply the stylesheet at *stylesheet_uri* to a document via xsltproc.

    Despite the parameter name, *input_bytes* is text: it is encoded as
    UTF-8 and piped to the ``xsltproc`` process on stdin. Returns whatever
    xsltproc wrote to stdout; the exit status is not inspected.
    """
    # NOTE(review): stylesheet_uri is taken from the document being
    # processed and handed straight to xsltproc; confirm that running
    # arbitrary remote stylesheets is acceptable for this deployment.
    command = ['xsltproc', stylesheet_uri, '-']
    process = subprocess.Popen(command, stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    output = process.communicate(input_bytes.encode('utf-8'))[0]
    return output


def grddl_result(uri, root, transform_uri):
    """Apply the transformation at *transform_uri* to the element *root*.

    When the transformation is served as ``application/xml`` or
    ``text/xsl`` it is run as XSLT and the output is parsed as RDF/XML into
    an ``rdf.Graph`` whose base URI is *uri*. Any other media type is
    handed to ``juno.transform``.
    """
    source = root.toxml()
    resp = web.doc(transform_uri)
    try:
        mediatype = resp.mediatype
    finally:
        # Only the media type is needed; do not leave the response open.
        resp.close()

    if mediatype in ('application/xml', 'text/xsl'):
        rdfxml = xslt(source, transform_uri)
        G = rdf.Graph(baseURI=uri)  # @@!
        G.parseText(rdfxml, format='rdfxml')
        return G

    # Imported lazily so these modules are only needed for non-XSLT
    # transformations.
    import StringIO
    import juno
    return juno.transform(uri, StringIO.StringIO(source), transform_uri)


def elements(element):
    """Yield *element*, then every descendant Element in document order."""
    yield element
    for child in element.childNodes:
        if isinstance(child, xml.dom.minidom.Element):
            for descendant in elements(child):
                yield descendant


def debug(*args):
    """Write *args*, space separated, to stderr when ``debug.on`` is true."""
    if debug.on:
        sys.stderr.write(' '.join([str(arg) for arg in args]) + '\n')
debug.on = False


class GRDDL(object):
    """Collects the transformations for a document and merges their results.

    The numeric suffixes of the method names match the section numbers used
    in the debug messages and in the "Section N" comments in ``parse``.
    """

    def __init__(self):
        self.transforms = {}  # root: set(txuri)
        self.results = {}  # root: Graph
        self.profiles = {}  # root: set(profile URI)
        # Populated by parse(); initialised here so that the helper methods
        # do not fail with AttributeError if called before it.
        self.uri = None
        self.root = None
        self.results_2_2_done = set()
        self.transforms_3_1_done = set()
        self.transforms_5_1_done = set()

    def parse(self, uri):
        """Process the document at *uri* and return its merged result graph.

        Raises ``KeyError`` when no transformation produced a result for
        the document's root element.
        """
        self.uri = uri
        self.root = xpath_root(self.uri)
        self.results_2_2_done = set()
        self.transforms_3_1_done = set()
        self.transforms_5_1_done = set()

        # Section 2
        self.transforms_2_1()
        self.results_2_2()

        # Section 4
        self.profiles_4_2()
        self.transforms_4_1()
        self.results_2_2()

        # Section 3 - namespaces
        # Repeats for as long as a pass adds entries to self.results.
        oldlen = len(self.results)
        while True:
            self.transforms_3_1()  # recursive
            self.results_2_2()
            newlen = len(self.results)
            if newlen <= oldlen:
                break
            oldlen = newlen

        # Section 5 - profiles
        # NOTE(review): self.profiles is only written by profiles_4_2, so
        # this loop body currently runs exactly once.
        oldlen = len(self.profiles)
        while True:
            self.transforms_5_1()  # recursive
            self.results_2_2()
            newlen = len(self.profiles)
            if newlen <= oldlen:
                break
            oldlen = newlen

        return self.results[self.root]

    def transforms_2_1(self):
        """Register transformations named by grddl:transformation attributes."""
        root = self.root
        for element in elements(root):
            value = element.getAttributeNS(_GRDDL_NS, 'transformation')
            if not value:
                continue
            for ref in fn_tokenize(fn_normalize_space(value)):
                txuri = fn_resolve_uri(self.uri, ref)
                debug('2.1 Transform:', txuri)
                self.transforms.setdefault(root, set()).add(txuri)

    def results_2_2(self):
        """Apply every registered transformation that has not yet been run."""
        for (root, uris) in self.transforms.items():
            for txuri in uris:
                if (root, txuri) in self.results_2_2_done:
                    continue
                debug('2.2 Apply-Transform:', txuri)
                G = grddl_result(self.uri, root, txuri)
                self.merge_2_3(root, G)
                self.results_2_2_done.add((root, txuri))

    def merge_2_3(self, root, G):
        """Store *G* as the result for *root*, merging with any existing one."""
        result = self.results.get(root)
        if result:
            debug('2.3 Merge-Result:', result, G)
            self.results[root] = result + G
        else:
            debug('2.3 Result:', G)
            self.results[root] = G

    def transforms_3_1(self):
        """Register transformations found via grddl:namespaceTransformation.

        Each root's result graph is queried at most once per parse() call.
        """
        Q = rdf.n3('[ grddl:namespaceTransformation ?tx ]')
        for (root, G) in self.results.items():
            if root in self.transforms_3_1_done:
                continue
            for bindings in G.query(Q):
                debug('3.1 Namespace-Transform:', bindings.tx)
                self.transforms.setdefault(root, set()).add(bindings.tx)
            self.transforms_3_1_done.add(root)

    # @@ 3.2

    def profiles_4_2(self):
        """Record the profile URIs of the first XHTML head that has any."""
        root = self.root
        for element in elements(root):
            # @@ check for
            if element.namespaceURI != _XHTML_NS:
                continue
            if element.localName != 'head':
                continue
            value = element.getAttribute('profile')
            if not value:
                continue
            for ref in fn_tokenize(fn_normalize_space(value)):
                profile = fn_resolve_uri(self.uri, ref)
                debug('4.2 Profile:', profile)
                self.profiles.setdefault(root, set()).add(profile)
            break  # @@ not strictly conforming...

    def transforms_4_1(self):
        """Register rel="transformation" links of documents using the GRDDL profile."""
        for (root, profiles) in self.profiles.items():
            if _GRDDL_PROFILE not in profiles:
                continue
            for element in elements(root):
                if element.namespaceURI != _XHTML_NS:
                    continue
                if element.localName not in ('a', 'link'):
                    continue
                rel = element.getAttribute('rel')
                if not rel:
                    continue
                if 'transformation' not in fn_tokenize(fn_normalize_space(rel)):
                    continue
                href = element.getAttribute('href')
                if not href:
                    continue
                for ref in fn_tokenize(fn_normalize_space(href)):
                    turi = fn_resolve_uri(self.uri, ref)
                    debug('4.1 Transform:', turi)
                    self.transforms.setdefault(root, set()).add(turi)

    def transforms_5_1(self):
        """Register transformations found via grddl:profileTransformation.

        Only statements whose subject equals the URI being parsed are
        considered. Each root's result graph is queried at most once per
        parse() call.
        """
        Q = rdf.n3('?pdoc grddl:profileTransformation ?tx')
        for (root, G) in self.results.items():
            if root in self.transforms_5_1_done:
                continue
            for bindings in G.query(Q):
                if bindings.pdoc != self.uri:
                    continue  # @@
                for (node, profiles) in self.profiles.items():
                    # self.profiles maps each root to a *set* of profile
                    # URIs, so test the members rather than comparing the
                    # whole set against a single URI.
                    for profile in profiles:
                        if profile == bindings.pdoc:
                            debug('5.1 Transform:', bindings.tx)
                            self.transforms.setdefault(
                                node, set()).add(bindings.tx)
                            break
            self.transforms_5_1_done.add(root)


# Example usage:
#
# g = GRDDL()
# G = g.parse('http://www.w3.org/2001/sw/grddl-wg/td/titleauthor.html')
# print G
#
# g = GRDDL()
# G = g.parse('http://www.w3.org/2004/lambda/Sites/index.html')
# print G


def parse(uri):
    """Register the ``grddl`` prefix and return the GRDDL result for *uri*."""
    rdf.n3.prefix('grddl', 'http://www.w3.org/2003/g/data-view#')
    g = GRDDL()
    G = g.parse(uri)
    return G

# EOF