#!/usr/bin/env python """ rdfxml.py - An RDF/XML Parser Copyright 2007, Sean B. Palmer, inamidst.com Licensed under the Eiffel Forum License 2. """ from xml.parsers import expat class Namespace(object): def __init__(self, name): self.name = name def __getitem__(self, item): return self.name + unicode(item) def __cmp__(self, other): if not isinstance(other, basestring): return -1 if self.name == other: return 0 return -1 xml = Namespace(u'http://www.w3.org/XML/1998/namespace') rdf = Namespace(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#') import re ntriples = False r_hex4_32 = re.compile(ur'([\x00-\x08\x0b\x0C\x0E-\x1F\x7F-\uFFFF]+)') r_hex4_33 = re.compile(ur'([\x00-\x08\x0b\x0C\x0E-\x1F\x3E\x7F-\uFFFF]+)') # Cf. http://bugs.python.org/issue1477 try: r_hex6 = re.compile(u'([\U00010000-\U0010FFFF]+)') except: r_hex6 = None def hex4(m): return u''.join('\\u%04X' % ord(c) for c in m.group(1)) def hex6(m): return u''.join('\\U%08X' % ord(c) for c in m.group(1)) def escaped32(s): # http://www.w3.org/TR/rdf-testcases/#ntrip_strings s = s.replace('\\', '\\\\') s = s.replace('\t', '\\t') s = s.replace('\n', '\\n') s = s.replace('\r', '\\r') s = s.replace('"', '\\"') s = r_hex4_32.sub(hex4, s) if r_hex6 is not None: return r_hex6.sub(hex6, s) if __import__('sys').maxunicode <= 0xffff: warning = ("Warning: You're using a Narrow Python build", "This means that N-Triples output might not be fully compliant", "Use a python compiled with --enable-unicode=ucs4 to fix this") for line in warning: print >> __import__('sys').stderr, line return s raise ValueError('r_hex6 did not compile properly') def escaped33(s): # http://lists.w3.org/Archives/Public/www-rdf-comments/2007OctDec/0008 s = s.replace('\\', '\\\\') s = s.replace('\t', '\\t') s = s.replace('\n', '\\n') s = s.replace('\r', '\\r') s = s.replace('"', '\\"') s = r_hex4_33.sub(hex4, s) if r_hex6 is not None: return r_hex6.sub(hex6, s) if __import__('sys').maxunicode <= 0xffff: warning = ("Warning: You're using a Narrow Python build", "This means that N-Triples output might not be fully compliant", "Use a python compiled with --enable-unicode=ucs4 to fix this") for line in warning: print >> __import__('sys').stderr, line return s raise ValueError('r_hex6 did not compile properly') r_normalize = re.compile(r'[\x0D\x0A\x09 ]+') def normalize(s): # http://www.w3.org/TR/REC-xml/#AVNormalize r_normalize.sub(' ', s) return s.strip(' ') class Event(object): pass class Root(Event): def __init__(self, document_element, children, base_uri): self.document_element = document_element self.children = children self.base_uri = base_uri self.language = u'' URI = type('URI', (unicode,), {}) bNode = type('bNode', (unicode,), {}) Literal = type('PlainLiteral', (tuple,), { '__new__': lambda cls, lexical, language: tuple.__new__(cls, [lexical, language]) }) DatatypedLiteral = type('TypedLiteral', (tuple,), { '__new__': lambda cls, lexical, datatype: tuple.__new__(cls, [lexical, datatype]) }) # URI = type('URI', (unicode,), {}) # bNode = type('bNode', (unicode,), {}) # Literal = type('Literal', (tuple,), {}) # DatatypedLiteral = type('DatatypedLiteral', (tuple,), {}) class Element(Event): __slots__ = ('local_name', 'namespace_name', 'children', 'base_uri', 'language', 'attributes', 'URI', 'li_counter', 'subject', 'URI_string_value') def __init__(self, local_name, ns, children, base_uri, attrs, parent): self.local_name = local_name self.namespace_name = ns self.children = children self.base_uri = base_uri xml_lang = (xml.name, u'lang') if xml_lang in attrs: # self.language = normalized_value(attrs[xml_lang]) self.language = normalize(attrs[xml_lang]) elif parent is not None: self.language = parent.language else: self.language = u'' for attr in attrs.keys(): if attr[0] == xml: del attrs[attr] elif attr[1].startswith(u'xml'): del attrs[attr] g = (Attribute(name, ns, v) for ((ns, name), v) in attrs.iteritems()) self.attributes = set(g) self.URI = self.namespace_name + self.local_name if not ntriples: self.URI_string_value = URI(self.URI) else: self.URI_string_value = u'<' + escaped33(self.URI) + u'>' self.li_counter = 1 self.subject = None # @@ The specification doesn't say to add this, but it's needed self.parent = parent def free(self): for attr in dir(self): if attr.startswith('__'): continue if attr == 'subject': continue setattr(self, attr, None) # del self.__dict__[attr] class EndElement(Event): pass class Attribute(Event): def __init__(self, local_name, ns, string_value): self.local_name = local_name self.namespace_name = ns self.string_value = normalize(string_value) names = set([u'ID', u'about', u'resource', u'parseType', u'type']) if self.namespace_name is not None: self.URI = self.namespace_name + self.local_name elif self.local_name in names: self.URI = rdf + self.local_name else: raise ValueError('Non-RDF non-namespaces local names forbidden') # @@ "The construction of RDF URI references from XML attributes can # generate the same RDF URI references from different XML attributes." if not ntriples: self.URI_string_value = URI(self.URI) else: self.URI_string_value = u'<' + escaped33(self.URI) + u'>' class Text(Event): def __init__(self, *charinfo): self.string_value = u''.join(charinfo) class Comment(object): def __init__(self, data): self.string_value = u'' class URIReference(Event): def __init__(self, identifier): self.identifier = identifier if not ntriples: self.string_value = URI(self.identifier) else: self.string_value = u'<' + escaped33(self.identifier) + u'>' class BlankNodeIdentifier(Event): def __init__(self, identifier): self.identifier = identifier if not ntriples: self.string_value = bNode(self.identifier) else: self.string_value = '_:' + self.identifier # @@ "The value begins with "_:" and the entire value MUST match the # N-Triples nodeID production. The function MUST preserve distinct blank # node identity as discussed in in section 5.2 Identifiers." # assert self.string_value.startswith('_:') r_language_tag = re.compile(r'^[A-Za-z0-9]{1,8}(-[A-Za-z0-9]{1,8})*$') class PlainLiteral(Event): def __init__(self, literal_value, literal_language): self.literal_value = literal_value self.literal_language = literal_language # check that it's valid according to RFC 3066 if self.literal_language: if not r_language_tag.match(self.literal_language): raise ValueError('Not a valid language tag: %s' % literal_language) if not ntriples: if self.literal_language == u'': self.string_value = Literal(self.literal_value, None) else: args = [self.literal_value, self.literal_language] self.string_value = Literal(*args) else: if self.literal_language == u'': self.string_value = u'"' + escaped32(self.literal_value) + u'"' else: self.string_value = u'"' + escaped32(self.literal_value) + u'"@' \ + self.literal_language class TypedLiteral(Event): def __init__(self, literal_value, literal_datatype): self.literal_value = literal_value self.literal_datatype = literal_datatype if not ntriples: args = [self.literal_value, URI(self.literal_datatype)] self.string_value = DatatypedLiteral(*args) else: self.string_value = u'"' + escaped32(self.literal_value) + u'"^^<' + \ escaped32(self.literal_datatype) + u'>' def qtuple(qname): if ' ' in qname: return tuple(qname.rsplit(' ', 1)) return (u'', qname) def racine(uri): return uri.split('#')[0] class XMLDocument(object): def __init__(self, uri, f=None, text=None): self.baseURI = uri self.expat = expat.ParserCreate(namespace_separator=' ') self.expat.StartElementHandler = self.start_element self.expat.EndElementHandler = self.end_element self.expat.CharacterDataHandler = self.char_data self.expat.CommentHandler = self.comment self.expat.StartNamespaceDeclHandler = self.start_namespace self.expat.EndNamespaceDeclHandler = self.end_namespace if f is not None: self.input = f self.expat_events = self.parse_xml(self.read_file) elif text is not None: self.input = text self.expat_events = self.parse_xml(self.read_text) else: ValueError('Expected either file or text') def read_file(self, bufsiz): data = self.input.read(bufsiz) if isinstance(data, unicode): return data.encode('utf-8') return data def read_text(self, bufsiz): result = self.input[:bufsiz].encode('utf-8') self.input = self.input[bufsiz:] return result def parse_xml(self, read): eid = 0 bufsiz = 8192 # 8 KB self.prefix_bindings = {} self.uri_bindings = {} self.events_queue = [] while True: chunk = read(bufsiz) done = (not chunk) self.expat.Parse(chunk, done) for event in self.events_queue: eid += 1 yield tuple(list(event) + [eid]) # @@ do this in start_element &c. self.events_queue = [] if done: break def start_element(self, name, attrs): items = self.uri_bindings.iteritems() bindings = dict((k, v[:]) for (k, v) in items) self.events_queue.append(('START_ELEMENT', name, attrs, bindings)) def end_element(self, name): self.events_queue.append(('END_ELEMENT', name)) def char_data(self, data): self.events_queue.append(('CHARACTER_DATA', data)) def comment(self, data): self.events_queue.append(('COMMENT', data)) def start_namespace(self, prefix, uri): try: self.prefix_bindings[prefix].append(uri) except KeyError: self.prefix_bindings[prefix] = [uri] try: self.uri_bindings[uri].append(prefix) except KeyError: self.uri_bindings[uri] = [prefix] def end_namespace(self, prefix): uri = self.prefix_bindings[prefix].pop() self.uri_bindings[uri].pop() def children(self, parent_start): # parent_start is a start element expat event while parent_start in self.expat_stack: expat_event = self.expat_events.next() if expat_event[0] == 'START_ELEMENT': if self.expat_stack[-1] == parent_start: self.expat_stack.append(expat_event) element_name = expat_event[1] if ' ' in element_name: ns, local_name = element_name.rsplit(' ', 1) else: ns, local_name = u'', element_name children = self.children(expat_event) attrs = expat_event[2] attrs = dict((qtuple(k), v) for k, v in attrs.iteritems()) parent = self.rdf_stack[-1] xml_base = (xml.name, u'base') if xml_base in attrs: base_uri = racine(attrs[xml_base]) del attrs[xml_base] else: base_uri = parent.base_uri args = (local_name, ns, children, base_uri, attrs, parent) element = Element(*args) element.uri_bindings = expat_event[3] self.rdf_stack.append(element) yield element element.free() else: self.expat_stack.append(expat_event) self.rdf_stack.append(None) elif expat_event[0] == 'END_ELEMENT': self.expat_stack.pop() self.rdf_stack.pop() elif (expat_event[0] == 'CHARACTER_DATA') and \ (self.expat_stack[-1] == parent_start): yield Text(expat_event[1]) elif (expat_event[0] == 'COMMENT') and \ (self.expat_stack[-1] == parent_start): yield Comment(expat_event[1]) def root(self): self.expat_stack = [] self.rdf_stack = [] expat_event = self.expat_events.next() while expat_event[0] == 'COMMENT': expat_event = self.expat_events.next() self.expat_stack.append(expat_event) element_name = expat_event[1] if ' ' in element_name: ns, local_name = element_name.rsplit(' ', 1) else: ns, local_name = u'', element_name children = self.children(expat_event) attrs = expat_event[2] attrs = dict((qtuple(k), v) for k, v in attrs.iteritems()) parent = None xml_base = (xml.name, u'base') if xml_base in attrs: base_uri = racine(attrs[xml_base]) del attrs[xml_base] else: base_uri = self.baseURI args = (local_name, ns, children, base_uri, attrs, parent) document_element = Element(*args) document_element.uri_bindings = expat_event[3] self.rdf_stack.append(document_element) root = Root(document_element, children, base_uri) return root # RDF Grammar Actions def concat(*args): return u''.join(unicode(arg) for arg in args) import urlparse def resolve(e, s): return unicode(urlparse.urljoin(e.base_uri, s)) def generated_blank_node_id(): generated_blank_node_id.nextid += 1 return 'n%02i' % generated_blank_node_id.nextid generated_blank_node_id.nextid = 0 # def uri(identifier): # return URIReference(identifier) # def bnodeid(identifier): # return BlankNodeIdentifier(identifier) # def literal(literal_value, literal_language): # return PlainLiteral(literal_value, literal_language) # def typed_literal(literal_value, literal_datatype): # return TypedLiteral(literal_value, literal_datatype) # Other Utilities def canonicalize(e): # Apply XML Exclusive Canonicalisation to e's contents result = [] def render(e, bindings_done=None): if isinstance(e, Element): if e.uri_bindings.has_key(e.namespace_name): prefix = e.uri_bindings[e.namespace_name][-1] if prefix: result.append('<' + prefix + ':' + e.local_name) else: result.append('<' + e.local_name) if not (prefix, e.namespace_name) in bindings_done: if prefix is None: xmlns = 'xmlns' else: xmlns = 'xmlns:' + prefix result.append(' ' + xmlns + '="' + e.namespace_name + '"') bindings_done.add((prefix, e.namespace_name)) else: result.append('<' + e.local_name) # print e.uri_bindings # attrs = ((a.local_name, a.string_value) for a in e.attributes) # attrs = sorted(attrs) # for localname, value in attrs: # result.append(' ' + localname + '="' + value + '"') for a in sorted(e.attributes): result.append(' ' + a.local_name + '="' + a.string_value + '"') result.append('>') for child in e.children: render(child, bindings_done) if e.uri_bindings.has_key(e.namespace_name): prefix = e.uri_bindings[e.namespace_name][-1] if prefix: result.append('') else: result.append('') else: result.append('') else: result.append(e.string_value) # text or comment for child in e.children: render(child, bindings_done=set()) return u''.join(result) def elements(children): for child in children: if isinstance(child, Element): yield child elif isinstance(child, Text): # [3] ::= (#x20 | #x9 | #xD | #xA) # - http://www.w3.org/TR/2000/REC-xml-20001006#NT-S if child.string_value.strip('\x20\x09\x0D\x0A'): ValueError('Expected only whitespace here') coreSyntaxTerms = set([rdf['RDF'], rdf['ID'], rdf['about'], rdf['parseType'], rdf['resource'], rdf['nodeID'], rdf['datatype']]) oldTerms = set([rdf['aboutEach'], rdf['aboutEachPrefix'], rdf['bagID']]) debug = False class RDFXMLDocument(XMLDocument): def __init__(self, *args, **kargs): super(RDFXMLDocument, self).__init__(*args, **kargs) def triple(self, s, p, o): print s, p, o, '.' def parse(self, standalone=True): if debug: print 'parse' if standalone: self.docOrNodeElement() else: self.childRDForNodeElementList() def docOrNodeElement(self): if debug: print 'docOrNodeElement' root = self.root() if root.document_element.URI == rdf['RDF']: self.RDF(root.document_element) else: self.nodeElement(root.document_element) def RDF(self, e): if debug: print 'RDF' assert (e.URI == rdf['RDF']) and (not e.attributes) self.nodeElementList(e.children) def nodeElementList(self, children, keep=False): if debug: print 'nodeElementList' if keep is False: for child in elements(children): self.nodeElement(child) return result = [] for child in elements(children): self.nodeElement(child) result.append(child) # yield child return result def nodeElement(self, e): if debug: print 'nodeElement', e.local_name, e.children assert (e.URI not in coreSyntaxTerms | oldTerms | set([rdf['li']])) propertyAttrs = [] optionals = set([rdf['ID'], rdf['nodeID'], rdf['about']]) for a in e.attributes: if a.URI in optionals: if e.subject: raise ValueError('Already had ID/nodeID/about here') if a.URI == rdf['ID']: identifier = resolve(e, concat(u'#', a.string_value)) e.subject = URIReference(identifier) elif a.URI == rdf['nodeID']: e.subject = BlankNodeIdentifier(a.string_value) elif a.URI == rdf['about']: e.subject = URIReference(resolve(e, a.string_value)) else: propertyAttrs.append(a) if not e.subject: e.subject = BlankNodeIdentifier(generated_blank_node_id()) if e.URI != rdf['Description']: s = e.subject.string_value T = URIReference(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type') p = T.string_value o = e.URI_string_value self.triple(s, p, o) for a in propertyAttrs: if a.URI == rdf['type']: u = URIReference(resolve(e, a.string_value)) # @@ spec bug? s = e.subject.string_value p = URIReference( u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' ).string_value o = u.string_value self.triple(s, p, o) else: # @@ "a.string-value SHOULD be in Normal Form C" o = PlainLiteral(a.string_value, e.language) subj = e.subject.string_value pred = a.URI_string_value self.triple(subj, pred, o.string_value) self.propertyEltList(e) def propertyEltList(self, e): if debug: print 'propertyEltList' for child in elements(e.children): child.parent = e self.propertyElt(child) def propertyElt(self, e): if debug: print 'propertyElt' if e.URI == rdf['li']: count = e.parent.li_counter u = concat(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#_', count) e.parent.li_counter += 1 e.URI = u e.URI_string_value = URI(e.URI) for a in e.attributes: if a.URI == rdf['parseType']: if a.string_value == u'Literal': self.parseTypeLiteralPropertyElt(e) elif a.string_value == u'Resource': self.parseTypeResourcePropertyElt(e) elif a.string_value == u'Collection': self.parseTypeCollectionPropertyElt(e) else: self.parseTypeOtherPropertyElt(e) return import itertools # children = list(e.children) # for child in children: # if isinstance(child, Element): # self.resourcePropertyElt(e, children) # return # if children: # self.literalPropertyElt(e, children) # else: self.emptyPropertyElt(e) texts = False textsAndComments = [] for child in e.children: # print 'CHILD:', child if isinstance(child, Element): children = itertools.chain(textsAndComments, [child], e.children) self.resourcePropertyElt(e, children) return elif isinstance(child, Text): textsAndComments.append(child) texts = True elif isinstance(child, Comment): textsAndComments.append(child) if texts: self.literalPropertyElt(e, textsAndComments) else: self.emptyPropertyElt(e) def reify(self, r, s, p, o): rdf_s = u'' rdf_p = u'' rdf_o = u'' rdf_type = u'' rdf_stat = u'' self.triple(r.string_value, rdf_s, s) self.triple(r.string_value, rdf_p, p) self.triple(r.string_value, rdf_o, o) self.triple(r.string_value, rdf_type, rdf_stat) def resourcePropertyElt(self, e, children): if debug: print 'resourcePropertyElt' done = 0 for n in elements(children): self.nodeElement(n) done += 1 if done != 1: raise ValueError('resourcePropertyElt.children.len != 1') s = e.parent.subject.string_value p = e.URI_string_value o = n.subject.string_value # print 'resourcePropertyElt triple:' self.triple(s, p, o) attributes_length = len(e.attributes) if attributes_length == 1: a = e.attributes[0] assert a.URI == rdf['ID'] i = URIReference(resolve(e, concat(u'#', a.string_value))) self.reify(i, s, p, o) e.subject = i else: assert (attributes_length == 0) def literalPropertyElt(self, e, children): if debug: print 'literalPropertyElt' t_string_value = concat(*[c.string_value for c in children]) # @@ t_string_value SHOULD be in Normal Form C a, d = None, None for attribute in e.attributes: if attribute.URI == rdf['datatype']: if d: raise ValueError('Datatype already set') d = attribute elif attribute.URI == rdf['ID']: if a: raise ValueError('ID already set') a = attribute else: raise ValueError('Unexpected attribute: %s' % attribute.URI) if d: o = TypedLiteral(t_string_value, d.string_value) else: o = PlainLiteral(t_string_value, e.language) subj = e.parent.subject.string_value pred = e.URI_string_value self.triple(subj, pred, o.string_value) if a: i = URIReference(resolve(e, concat(u'#', a.string_value))) self.reify(i, subj, pred, o.string_value) e.subject = i def parseTypeLiteralPropertyElt(self, e): if debug: print 'parseTypeLiteralPropertyElt' x = canonicalize(e) xml_literal = u'http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral' o = TypedLiteral(x, xml_literal) subj = e.parent.subject.string_value pred = e.URI_string_value self.triple(subj, pred, o.string_value) a = None for attribute in e.attributes: if attribute.URI == rdf['parseType']: pass elif attribute.URI == rdf['ID']: a = attribute else: raise ValueError('Unexpected attribute: %s' % attribute.URI) if a: i = URIReference(resolve(e, concat(u'#', a.string_value))) self.reify(i, subj, pred, o.string_value) e.subject = i def parseTypeResourcePropertyElt(self, e): if debug: print 'parseTypeResourcePropertyElt' disallowed = coreSyntaxTerms | oldTerms | set([rdf['Description']]) assert (e.URI not in disallowed) n = BlankNodeIdentifier(generated_blank_node_id()) s = e.parent.subject.string_value p = e.URI_string_value o = n.string_value self.triple(s, p, o) a = None for attribute in e.attributes: if attribute.URI == rdf['parseType']: pass elif attribute.URI == rdf['ID']: a = attribute else: raise ValueError('Unexpected attribute: %s' % attribute.URI) if a: i = URIReference(resolve(e, concat(u'#', a.string_value))) self.reify(i, s, p, o) e.subject = i local_name = u'Description' children = e.children # list(e.children) base_uri = e.base_uri nodeElt = Element(local_name, rdf.name, children, base_uri, {}, e) nodeElt.subject = n # nodeElt.children = [] # for child in children: # child.parent = nodeElt # nodeElt.children.append(child) self.nodeElement(nodeElt) def parseTypeCollectionPropertyElt(self, e): if debug: print 'parseTypeCollection' nodeElementList = [] for element in self.nodeElementList(e.children, keep=True): nodeElementList.append(element) s = [] for f in nodeElementList: n = BlankNodeIdentifier(generated_blank_node_id()) s.append(n) if s: n = s[0] subj = e.parent.subject.string_value pred = e.URI_string_value objt = n.string_value self.triple(subj, pred, objt) else: subj = e.parent.subject.string_value pred = e.URI_string_value objt = u'' self.triple(subj, pred, objt) a = None for attribute in e.attributes: if attribute.URI == rdf['parseType']: pass elif attribute.URI == rdf['ID']: a = attribute else: raise ValueError('Unexpected attribute: %s' % attribute.URI) if a: i = URIReference(resolve(e, concat(u'#', a.string_value))) self.reify(i, subj, pred, o.string_value) if s: for (n, f) in zip(s, nodeElementList): # print 'N, F:', n, vars(f) first = u'' # self.triple(n.string_value, first, f.string_value) @@! wrong? self.triple(n.string_value, first, f.subject.string_value) rest = u'' for i in xrange(len(s) - 1): n, o = s[i], s[i + 1] self.triple(n.string_value, rest, o.string_value) n = s[-1] nil = u'' self.triple(n.string_value, rest, nil) def parseTypeOtherPropertyElt(self, e): if debug: print 'parseTypeOtherPropertyElt' self.parseTypeLiteralPropertyElt(e) def emptyPropertyElt(self, e): if debug: print 'emptyPropertyElt' onlyHasId = False if len(e.attributes) == 1: for attribute in e.attributes: if attribute.URI == rdf['ID']: onlyHasId = True if (not e.attributes) or onlyHasId: o = PlainLiteral(u'', e.language) subj = e.parent.subject.string_value pred = e.URI_string_value objt = o.string_value self.triple(subj, pred, objt) if onlyHasId: i = e.attributes.pop() r = URIReference(resolve(e, concat(u'#', i.string_value))) self.reify(r, subj, pred, o.string_value) else: r, i = None, None propertyAttrs = [] for attribute in e.attributes: if attribute.URI == rdf['resource']: if r: raise ValueError('Already had rdf:nodeID') r = URIReference(resolve(e, attribute.string_value)) elif attribute.URI == rdf['nodeID']: if r: raise ValueError('Already had rdf:resource') r = BlankNodeIdentifier(attribute.string_value) elif attribute.URI == rdf['ID']: i = attribute else: propertyAttrs.append(attribute) if not r: r = BlankNodeIdentifier(generated_blank_node_id()) for a in propertyAttrs: if a.URI == rdf['type']: u = URIReference(resolve(e, a.string_value)) # @@! spec bug? r_type = URIReference( u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' ).string_value self.triple(r.string_value, r_type, u.string_value) else: # @@ a.string_value SHOULD be in Normal Form C o = PlainLiteral(a.string_value, e.language) self.triple(r.string_value, a.URI_string_value, o.string_value) s = e.parent.subject.string_value p = e.URI_string_value o = r.string_value self.triple(s, p, o) if i: r = URIReference(resolve(e, concat(u'#', i.string_value))) self.reify(r, s, p, o) def ntriple(s, p, o): print s, p, o, '.' def parseText(uri, text, callback=None): if not isinstance(text, unicode): raise ValueError('Text must be a unicode instance') if callback is None: callback = ntriple class Parser(RDFXMLDocument): def triple(self, s, p, o): callback(s, p, o) p = Parser(uri, text=text) p.parse() def parseFile(uri, f, callback=None): if callback is None: callback = ntriple class Parser(RDFXMLDocument): def triple(self, s, p, o): callback(s, p, o) p = Parser(uri, f=f) p.parse() def parseURI(uri, callback=None): import urllib u = urllib.urlopen(uri) if callback is None: callback = ntriple class Parser(RDFXMLDocument): def triple(self, s, p, o): callback(s, p, o) p = Parser(uri, f=u) p.parse() u.close() def parse(uri, obj=None, callback=None): if obj is None: parseURI(uri, callback) elif isinstance(obj, unicode): parseText(uri, obj, callback) else: parseFile(uri, obj, callback) def main(argv=None): import sys if argv is None: argv = sys.argv if len(argv) == 2: parseURI(sys.argv[1]) else: print 'Usage: ./rdfxml.py ' if __name__ == '__main__': main()