#!/usr/bin/env python
"""
An RDF/XML Parser
Author: Sean B. Palmer, inamidst.com
License: GPL 2; share and enjoy!
Derived from:
   http://infomesh.net/pyrple/
   pyrple.parsers.rdfxml
"""

import io
import re
import sys
import urllib
import xml.sax
import xml.sax.handler
from contextlib import closing
from xml.sax.saxutils import escape

try:
    from urlparse import urljoin as urijoin
except ImportError:  # Python 3
    from urllib.parse import urljoin as urijoin

try:
    from urllib import urlopen
except ImportError:  # Python 3
    from urllib.request import urlopen

try:
    from cStringIO import StringIO
except ImportError:
    try:
        from StringIO import StringIO
    except ImportError:  # Python 3
        from io import StringIO

try:
    unicode
except NameError:  # Python 3: the built-in text type is already Unicode
    unicode = str


class Namespace(unicode):
    """A namespace URI whose attributes and items are names inside it.

    ``ns.name`` and ``ns[item]`` both return the namespace string with the
    name appended (as a plain text string, not another Namespace).
    """

    def __getattr__(self, name):
        # Protocol probes such as __getstate__ must fail normally; answering
        # them with a string confuses copy/pickle and similar machinery.
        if name[:2] == '__' and name[-2:] == '__':
            raise AttributeError(name)
        return self + name

    def __getitem__(self, item):
        return self + str(item)


rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
x = Namespace("http://www.w3.org/XML/1998/namespace")

# @@ NoNS
# @@ Unicode normalization doesn't seem to be required anymore
# Neither pattern is used by quote() any longer; both stay defined so that
# code importing them from this module keeps working.
r_unilower = re.compile(r'(?<=\\u)([0-9a-f]{4})|(?<=\\U)([0-9a-f]{8})')
r_hibyte = re.compile(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\xFF]')

# Code points that have a dedicated short escape in the output.
_SHORT_ESCAPES = {
    0x09: '\\t',
    0x0A: '\\n',
    0x0D: '\\r',
    0x22: '\\"',
    0x5C: '\\\\',
}


def quote(s):
    """Escape *s* so it can be placed inside an N-Triples string or URI.

    Byte strings are decoded as UTF-8 first.  Backslash, double quote, tab,
    newline and carriage return get their short escapes, printable ASCII is
    copied, and every other code point becomes ``\\uXXXX`` or ``\\UXXXXXXXX``
    with upper-case hex digits.  The result is a native ``str`` containing
    only ASCII.

    The previous implementation ran the text through the ``unicode-escape``
    codec *after* inserting its own ``\\"`` and ``\\u00XX`` escapes; on every
    interpreter where that codec doubles backslashes, those escapes were
    doubled too.  Each code point is now escaped exactly once.
    """
    if not isinstance(s, unicode):
        s = unicode(s, 'utf-8')

    pieces = []
    index, length = 0, len(s)
    while index < length:
        code = ord(s[index])
        index += 1
        # Narrow Unicode builds store astral characters as surrogate pairs;
        # join a pair back into the single code point it represents.
        if 0xD800 <= code <= 0xDBFF and index < length:
            low = ord(s[index])
            if 0xDC00 <= low <= 0xDFFF:
                code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00)
                index += 1
        if code in _SHORT_ESCAPES:
            pieces.append(_SHORT_ESCAPES[code])
        elif 0x20 <= code <= 0x7E:
            pieces.append(chr(code))
        elif code <= 0xFFFF:
            pieces.append('\\u%04X' % code)
        else:
            pieces.append('\\U%08X' % code)
    return ''.join(pieces)


r_xmlname = re.compile(u'^[A-Za-z_][A-Za-z0-9\u0080-\uFFFF_.-]*$')


def _native(text):
    """Return *text* in a form that is safe to interpolate into a message.

    On Python 2 a unicode value is encoded to UTF-8 (what the original
    messages did); on Python 3 the text is returned unchanged.
    """
    if str is bytes and isinstance(text, unicode):
        return text.encode('utf-8')
    return text


class Element(object):
    """One XML element together with the state the RDF parser tracks for it.

    Attributes set here: ``xmlns``, ``name``, ``prefix``, ``attrs``, ``URI``
    (namespace + local name), ``base`` and ``lang`` (own xml:base/xml:lang,
    else the parent's, else the supplied default), ``parent``, ``children``,
    ``text`` (character data), ``subject`` (filled in by the parser) and
    ``xtext`` -- a three item list ``[opening tag, content, closing tag]``
    used to rebuild the element's markup for XML literals.  ``qname`` is the
    tag name exactly as written in the opening tag.
    """

    def __init__(self, xmlns, name, attrs, p=None, base=None, prefix=None):
        if attrs is None:
            attrs = Attributes({})
        # Plain dicts carry no separate xml:* table.
        xmlattrs = getattr(attrs, 'x', {})

        self.xmlns, self.name, self.attrs = xmlns, name, attrs or {}
        self.URI = (xmlns or '') + name
        self.base = xmlattrs.get(x.base) or (p and p.base) or base or ''
        self.lang = xmlattrs.get(x.lang) or (p and p.lang) or ''
        self.parent, self.children, self.text, self.subject = p, [], '', None
        self.prefix = prefix

        # Call __repr__ directly: going through repr()/'%r' forces a unicode
        # result through the ASCII codec on Python 2, which fails as soon as
        # an attribute value contains a non-ASCII character.
        attrtext = attrs.__repr__()
        if attrtext:
            attrtext = ' ' + attrtext

        sameAsParent = self.parent and (self.parent.xmlns == self.xmlns)
        if sameAsParent or not self.xmlns:
            self.qname = name
            opening = '<%s%s>' % (name, attrtext)
        elif prefix:
            self.qname = '%s:%s' % (prefix, name)
            opening = '<%s:%s xmlns:%s="%s"%s>' % (
                prefix, name, prefix, xmlns, attrtext)
        else:
            self.qname = name
            opening = '<%s xmlns="%s"%s>' % (name, xmlns, attrtext)
        self.xtext = [opening, '', '']

    def __getitem__(self, attr):
        """Return the value of the attribute whose full URI is *attr*."""
        return self.attrs[attr]


class Attributes(dict):
    """Attribute table keyed by full attribute URI (namespace + local name).

    Built from a mapping of ``(namespace, localname) -> value``.  Attributes
    in the XML namespace (xml:base, xml:lang, ...) are kept apart in
    ``self.x``; everything else is stored in the dict itself.  ``self.attrs``
    keeps the original ``((namespace, localname), value)`` pairs for
    serialisation.
    """

    def __init__(self, attrs):
        self.attrs, self.x = list(attrs.items()), {}
        for (xmlns, name), value in self.attrs:
            xmlns, name = str(xmlns or ''), str(name)
            if xmlns == x:
                self.x[xmlns + name] = unicode(value)
            else:
                dict.__setitem__(self, xmlns + name, unicode(value))

    def __repr__(self):
        """Serialise the original attributes as XML attribute text.

        Namespaced attributes get a generated ``nsN`` prefix and a matching
        ``xmlns:nsN`` declaration, N being the attribute's position.
        """
        parts = []
        for i, ((xmlns, name), value) in enumerate(self.attrs):
            if xmlns is None:
                parts.append('%s="%s"' % (name, value))
            else:
                parts.append('xmlns:ns%s="%s" ns%s:%s="%s"'
                             % (i, xmlns, i, name, value))
        return ' '.join(parts)


r_id = re.compile(r'^i([rd]+)')
r_quot = re.compile(r'([^\\])"')


class Term(unicode):
    """A serialised RDF term that also remembers its raw value.

    The string value is the N-Triples form *s*; ``val`` holds ``unicode(v)``
    and is what ``repr()`` shows.
    """

    def __new__(cls, s, v):
        a = unicode.__new__(cls, s)
        a.val = unicode(v)
        return a

    def __repr__(self):
        return self.val


class RDFParser(object):
    """Turn a tree of XML elements into triples sent to ``sink.triple``.

    The tree is built through startTag / characterData / endTag; when the
    outermost element closes, document() walks it and emits
    ``triple(subject, predicate, object)`` calls.
    """

    def __init__(self, sink, base=None, qnames=True):
        self.triple = sink.triple
        self.tree = []            # stack of currently open elements
        self.base = base or ''
        self.genID = 0            # counter for generated blank nodes
        self.qnames = qnames
        self.xmlids = []          # rdf:ID URIs already used in the document
        self.disallowed = [rdf.RDF, rdf.ID, rdf.about, rdf.bagID,
                           rdf.parseType, rdf.resource, rdf.nodeID,
                           rdf.datatype, rdf.li, rdf.aboutEach,
                           rdf.aboutEachPrefix]

    # -- tree construction -------------------------------------------------

    def startTag(self, xmlns, name, attrs):
        """Open an element; *name* may be a ``prefix:local`` qualified name."""
        if ':' in name:
            prefix, name = name.split(':', 1)
        else:
            prefix = ''
        if self.tree:
            e = Element(xmlns, name, Attributes(attrs), self.tree[-1],
                        prefix=prefix)
        else:
            e = Element(xmlns, name, Attributes(attrs), base=self.base,
                        prefix=prefix)
        self.tree.append(e)

    def characterData(self, chars):
        """Append character data to the innermost open element."""
        if self.tree:
            current = self.tree[-1]
            current.text += chars
            # xtext is markup: the parser hands over decoded text, so '&',
            # '<' and '>' have to be escaped again.
            current.xtext[1] += escape(chars)

    def endTag(self, xmlns, name):
        """Close the innermost element and attach it to its parent.

        When the outermost element closes, the finished tree is handed to
        document().
        """
        element = self.tree.pop()
        # Both branches used to append an empty string, so serialised XML
        # literals never got closing tags.  Close with the very name the
        # opening tag was written with, so the two always match.
        element.xtext[2] += '</%s>' % element.qname
        if self.tree:
            parent = self.tree[-1]
            parent.children.append(element)
            parent.xtext[1] += ''.join(element.xtext)
        else:
            self.document(element)

    # -- term constructors -------------------------------------------------

    def uri(self, u):
        """Return the Term for URI reference *u* (``<...>`` form)."""
        return Term("<%s>" % quote(u), u)

    def bNode(self, label=None):
        """Return a blank node.

        With *label*, a plain ``_:label`` string is returned (prefixed with
        ``b`` when it does not start with a letter, and rewritten when it
        would collide with the generated ``id``-style names).  Without a
        label, a fresh ``_:idN`` Term is generated.
        """
        if label:
            if not label[0].isalpha():
                label = 'b' + label
            return '_:' + r_id.sub(r'ir\g<1>', label)
        number = self.genID
        self.genID = number + 1
        return Term('_:id%s' % number, number)

    def strlit(self, s, lang=None, dtype=None):
        """Return the Term for literal *s* with optional language/datatype.

        When both are supplied the language is dropped.
        """
        if lang and dtype:
            # raise "ParseError", "Can't have both"
            lang = None  # see datatypes/Manifest.rdf#test001
        langtag = ('@' + lang).lower() if lang else ''
        typetag = ('^^<%s>' % dtype) if dtype else ''
        return Term(('"%s"' % quote(s)) + langtag + typetag, s)

    # -- shared helpers ----------------------------------------------------

    def _claimID(self, e, n):
        """Validate and register the rdf:ID attribute of *e*.

        Checks that the value is an XML name, resolves it against the
        element's base, refuses a URI that was already claimed, records it
        and removes the attribute.  *n* is the number shown in the
        "already used" message.  Returns the URI Term.
        """
        eid = e[rdf.ID]
        if not r_xmlname.match(eid):
            raise Exception("Not an XML name: %s" % _native(eid))
        i = self.uri(urijoin(e.base, '#' + eid))
        if i in self.xmlids:
            raise Exception("rdf:ID already used (%s)" % n)
        self.xmlids.append(i)
        del e.attrs[rdf.ID]
        return i

    def _nodeIDNode(self, e):
        """Return the blank node named by the rdf:nodeID attribute of *e*."""
        eni = e[rdf.nodeID]
        if not r_xmlname.match(eni):
            raise Exception("rdf:nodeID is not an XML name: %s" % eni)
        return self.bNode(eni)

    def reify(self, r, s, p, o):
        """Emit the four triples describing statement (s, p, o) as node r."""
        self.triple(r, self.uri(rdf.subject), s)
        self.triple(r, self.uri(rdf.predicate), p)
        self.triple(r, self.uri(rdf.object), o)
        self.triple(r, self.uri(rdf.type), self.uri(rdf.Statement))

    # -- grammar productions -----------------------------------------------

    def document(self, doc):
        """Process the root: rdf:RDF wraps node elements, else it is one."""
        if doc.URI == rdf.RDF:
            for element in doc.children:
                self.nodeElement(element)
        else:
            self.nodeElement(doc)

    def nodeElement(self, e):
        """Process a node element: fix its subject, emit its triples.

        Raises AssertionError for a disallowed element or attribute name and
        Exception for conflicting or malformed identifying attributes.
        """
        # Raised explicitly (not via ``assert``) so that the check survives
        # running under ``python -O``; the exception type is unchanged.
        if e.URI in self.disallowed:
            raise AssertionError("Disallowed element used as node")

        if rdf.ID in e.attrs:
            e.subject = self._claimID(e, 1)
            if rdf.nodeID in e.attrs:
                raise Exception("nodeElement has rdf:ID and rdf:nodeID")
        elif rdf.about in e.attrs:
            if (e[rdf.about] == '') and ('#' in (e.base or '')):
                base = e.base[:e.base.find('#')]  # @@ urlparse oddness
                e.subject = self.uri(urijoin(base, ''))
            else:
                e.subject = self.uri(urijoin(e.base, e[rdf.about]))
            if rdf.nodeID in e.attrs:
                raise Exception("nodeElement has rdf:about and rdf:nodeID")
        elif rdf.nodeID in e.attrs:
            e.subject = self._nodeIDNode(e)
        elif e.subject is None:
            e.subject = self.bNode()

        disallowed = [rdf.RDF, rdf.bagID, rdf.resource, rdf.datatype,
                      rdf.li, rdf.aboutEach, rdf.aboutEachPrefix]
        for element in list(e.attrs):
            if element in disallowed:
                raise AssertionError("%s used as attr" % element)

        if e.URI != rdf.Description:
            # A typed node element states its own rdf:type.
            self.triple(e.subject, self.uri(rdf.type), self.uri(e.URI))
        if rdf.type in e.attrs:
            self.triple(e.subject, self.uri(rdf.type),
                        self.uri(e[rdf.type]))
        skipped = self.disallowed + [rdf.type]
        for attr in list(e.attrs):
            if attr not in skipped:
                objt = self.strlit(e[attr], e.lang)
                self.triple(e.subject, self.uri(attr), objt)

        for element in e.children:
            self.propertyElt(element)

    def propertyElt(self, e):
        """Dispatch a property element to the production that matches it.

        rdf:li is first renumbered to rdf:_N using a counter kept on the
        parent.  The attribute/element-name validity checks run after the
        production, as they always have.
        """
        if e.URI == rdf.li:
            if not hasattr(e.parent, 'liCounter'):
                e.parent.liCounter = 1
            e.URI = rdf + '_' + str(e.parent.liCounter)
            e.parent.liCounter += 1

        hasParseType = rdf.parseType in e.attrs
        if len(e.children) == 1 and not hasParseType:
            self.resourcePropertyElt(e)
        elif len(e.children) == 0 and e.text:
            self.literalPropertyElt(e)
        elif hasParseType:
            parseType = e[rdf.parseType]
            if parseType == "Resource":
                self.parseTypeResourcePropertyElt(e)
            elif parseType == "Collection":
                self.parseTypeCollectionPropertyElt(e)
            else:
                self.parseTypeLiteralOrOtherPropertyElt(e)
        elif not e.text:
            self.emptyPropertyElt(e)

        if rdf.parseType in e.attrs and rdf.resource in e.attrs:
            raise Exception("propertyElt has parseType and resource")
        if rdf.parseType in e.attrs:
            for attr in e.attrs:
                if attr not in self.disallowed:
                    raise Exception("propertyElt has parseType and other")
        if rdf.bagID in e.attrs:
            raise Exception("propertyElt has bagID")
        if e.URI in (self.disallowed + [rdf.Description]):
            raise Exception("Disallowed element used as node")

    def resourcePropertyElt(self, e):
        """Property whose value is the single child node element."""
        n = e.children[0]
        self.nodeElement(n)
        self.triple(e.parent.subject, self.uri(e.URI), n.subject)
        if rdf.ID in e.attrs:
            i = self._claimID(e, 2)
            self.reify(i, e.parent.subject, self.uri(e.URI), n.subject)

    def literalPropertyElt(self, e):
        """Property whose value is the element's text as a literal."""
        # @@ Not mentioned in rdf-syntax-grammar/#literalPropertyElt
        dtype = e.attrs.get(rdf.datatype)
        if (not dtype) and e.attrs.get(rdf.parseType) == 'Literal':
            dtype = rdf.XMLLiteral
        o = self.strlit(e.text, e.lang, dtype)
        self.triple(e.parent.subject, self.uri(e.URI), o)
        if rdf.ID in e.attrs:
            i = self._claimID(e, 3)
            self.reify(i, e.parent.subject, self.uri(e.URI), o)

    def parseTypeLiteralOrOtherPropertyElt(self, e):
        """Property whose value is the element's markup as an XMLLiteral."""
        o = self.strlit(e.xtext[1], e.lang, rdf.XMLLiteral)
        self.triple(e.parent.subject, self.uri(e.URI), o)
        if rdf.ID in e.attrs:
            e.subject = i = self._claimID(e, 4)
            self.reify(i, e.parent.subject, self.uri(e.URI), o)

    def parseTypeResourcePropertyElt(self, e):
        """Property whose content describes a fresh blank node.

        The children are re-parented onto a synthetic rdf:Description whose
        subject is that blank node, which is then processed as a node
        element.
        """
        n = self.bNode()
        self.triple(e.parent.subject, self.uri(e.URI), n)
        if rdf.ID in e.attrs:
            e.subject = i = self._claimID(e, 5)
            self.reify(i, e.parent.subject, self.uri(e.URI), n)
        c = Element(rdf, 'Description', e.attrs, e.parent, e.base)
        c.subject = n
        for child in e.children:
            child.parent = c
            c.children.append(child)
        self.nodeElement(c)

    def parseTypeCollectionPropertyElt(self, e):
        """Property whose children form an rdf:first/rdf:rest list."""
        for element in e.children:
            self.nodeElement(element)
        cells = [self.bNode() for _ in e.children]
        if not cells:
            self.triple(e.parent.subject, self.uri(e.URI),
                        self.uri(rdf.nil))
            return
        self.triple(e.parent.subject, self.uri(e.URI), cells[0])
        # for n in s: self.triple(n, self.uri(rdf.type), self.uri(rdf.List))
        # All rdf:first triples are emitted before any rdf:rest triple.
        for cell, member in zip(cells, e.children):
            self.triple(cell, self.uri(rdf.first), member.subject)
        for cell, following in zip(cells, cells[1:]):
            self.triple(cell, self.uri(rdf.rest), following)
        self.triple(cells[-1], self.uri(rdf.rest), self.uri(rdf.nil))

    def emptyPropertyElt(self, e):
        """Property element with no content.

        With no attributes (or only rdf:ID) the value is an empty literal.
        Otherwise the value is the rdf:resource URI, the rdf:nodeID blank
        node or a fresh blank node, and the remaining attributes become
        properties of that value.
        """
        if list(e.attrs) in ([], [rdf.ID]):
            r = self.strlit(e.text, e.lang)  # was o
            self.triple(e.parent.subject, self.uri(e.URI), r)
        else:
            if rdf.resource in e.attrs:
                r = self.uri(urijoin(e.base, e[rdf.resource]))
                if rdf.nodeID in e.attrs:
                    msg = "emptyPropertyElt has rdf:resource and rdf:nodeID"
                    raise Exception(msg)
            elif rdf.nodeID in e.attrs:
                r = self._nodeIDNode(e)
            else:
                r = self.bNode()

            for attr in list(e.attrs):
                if attr in self.disallowed:
                    continue
                if attr != rdf.type:
                    o = self.strlit(e.attrs[attr], e.lang)
                    self.triple(r, self.uri(attr), o)
                else:
                    self.triple(r, self.uri(rdf.type),
                                self.uri(e.attrs[attr]))
            self.triple(e.parent.subject, self.uri(e.URI), r)
        if rdf.ID in e.attrs:
            i = self._claimID(e, 6)
            self.reify(i, e.parent.subject, self.uri(e.URI), r)


class SAXRDFParser(xml.sax.handler.ContentHandler, RDFParser):
    """SAX content handler that feeds namespace-aware events to RDFParser."""

    def __init__(self, sink, base=None):
        xml.sax.handler.ContentHandler.__init__(self)
        RDFParser.__init__(self, sink, base=base)

    def startElementNS(self, name, qname, attribs):
        xmlns, localname = name
        attrs = dict(attribs.items())
        # The qualified name is optional in SAX and the stock expat driver
        # passes None; fall back to the local name (no prefix) rather than
        # failing inside startTag.
        self.startTag(xmlns, qname or localname, attrs)

    def characters(self, chars):
        self.characterData(chars)

    def endElementNS(self, name, qname):
        xmlns, localname = name
        self.endTag(xmlns, localname)


DefaultHandler = SAXRDFParser


class Sink(object):
    """Default sink: collects triples as N-Triples lines.

    ``result`` is a UTF-8 encoded byte string (``str`` on Python 2,
    ``bytes`` on Python 3).
    """

    def __init__(self):
        self.result = b''

    def triple(self, s, p, o):
        self.result += (s.encode('utf-8') + b' ' + p.encode('utf-8') +
                        b' ' + o.encode('utf-8') + b' .\n')


def parseRDF(s, base=None, sink=None):
    """Parse the RDF/XML document *s* and return ``sink.result``.

    *s* is the document text or bytes, *base* the base URI used to resolve
    relative references, *sink* an object with a ``triple(s, p, o)`` method
    and a ``result`` attribute (a fresh Sink by default).
    """
    sink = sink or Sink()
    parser = xml.sax.make_parser()
    # Driver-specific hook; not every SAX driver provides it.
    if hasattr(parser, 'start_namespace_decl'):
        parser.start_namespace_decl("xml", x)
    parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    try:
        parser.setFeature(xml.sax.handler.feature_namespace_prefixes, 1)
    except (xml.sax.SAXNotSupportedException,
            xml.sax.SAXNotRecognizedException):
        sys.stderr.write("Warning: prefixes error\n")
    parser.setContentHandler(DefaultHandler(sink, base))
    if isinstance(s, bytes) and not isinstance(s, str):
        # Python 3 bytes need a byte stream; Python 2 never gets here.
        stream = io.BytesIO(s)
    else:
        stream = StringIO(s)
    parser.parse(stream)
    return sink.result


def _fetch(uri):
    """Return the content retrieved from *uri*, closing the handle."""
    with closing(urlopen(uri)) as stream:
        return stream.read()


def parseURI(uri, base=None, sink=None):
    """Fetch *uri* and parse it, using *uri* as the base unless given."""
    return parseRDF(_fetch(uri), base=(base or uri), sink=sink)


def _emit(text):
    """Write *text* and a newline to standard output."""
    if not isinstance(text, str):
        text = text.decode('utf-8')  # Python 3: results are bytes
    sys.stdout.write(text + '\n')


if __name__ == "__main__":
    if len(sys.argv) == 3:
        _emit(parseRDF(_fetch(sys.argv[1]), base=sys.argv[2]))
    elif len(sys.argv) == 2:
        _emit(parseURI(sys.argv[1]))
    else:
        _emit(__doc__)