#!/usr/bin/env python
"""
Notation3 Pre-Processor
Author: Sean B. Palmer, inamidst.com
License: GPL 2; share and enjoy!
License: http://www.w3.org/Consortium/Legal/copyright-software
Requires: Python 2.3+
Acknowledgements: Thanks to Yosi Scharf for implementation feedback.

Design:

The @keywords declaration in Notation3 requires custom tokenization in
an N3 parser, defeating the generic approach that the RDF BNF for N3
aims to provide. TimBL suggested [1] that it might be feasible to
pre-process the @keywords declarations instead; a pre-processor would
also leave the path open for future extensibility to macros.

Clearly, an N3 pre-processor needs to tokenize according to the
patterns in the RDF BNF, getting precedence right where applicable.
Yosi noted that the problem with token-level pre-processing is that @
is used to start both language codes and keywords such as @prefix; the
test for telling them apart is whether the @ immediately follows a
string literal. This module not only implements that test, but also
buffers the N3 input.

[1] http://ilrt.org/discovery/chatlogs/swig/2005-01-12.html#T18-03-18
"""

import sys, re, urllib

bufsiz = 2048

if not hasattr(__builtins__, "set"):
    # Python 2.3 compatibility: the set builtin arrived in Python 2.4
    from sets import Set as set

# Currently r_token is hand-transcribed from:
#    http://inamidst.com/n3p/grammar/n3.n3
# @@ Machine generate this instead
r_token = re.compile(r'''(?mx)(                                # N3 Tokens
   (?:"""[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*""") |              # n3:string (a)
   (?:"[^"\\]*(?:\\.[^"\\]*)*") |                              # n3:string (b)
   => | <= | \^\^ |                                            # doubles
   \. | ; | , | = | \! | \^ | \{ | \} | \[ | \] | \( | \) |
   [ \t\r\n]+ |                                                # whitespace
   (?:\#[^\n]*)?\r?\n |                                        # comment
   (?<=")@[a-z]+(?:-[a-z0-9]+)* |                              # n3:langcode
   @[A-Za-z]+ |                                                # keyword
   [-+]?[0-9]+(?:\.[0-9]+)?(?:e[-+]?[0-9]+)? |                 # n3:numericliteral
   <[^>]*> |                                                   # n3:explicituri
   \?[A-Za-z_][A-Za-z0-9_]* |                                  # n3:variable
   (?:(?:[A-Za-z_][A-Za-z0-9_]*)?:)?(?:[A-Za-z_][A-Za-z0-9_]*)? | # n3:qname
   [A-Za-z_][A-Za-z0-9_]* |                                    # n3:barename
   .+                                                          # catchall
)''')

keywords = set('prefix keywords forAll forSome a has is of this'.split(' '))
r_barename = re.compile(r'^[A-Za-z_][A-Za-z0-9_]*$')

class N3Tokenizer(object):
    def __init__(self):
        self.uri = None
        self.buffer = None
        self.chunk = ''
        self.was = ''

    def parseURI(self, uri):
        self.uri = uri
        self.buffer = urllib.urlopen(uri)
        for token in self.parse():
            yield token
        self.buffer.close()

    def parse(self):
        while True:
            self.chunk += self.buffer.read(bufsiz)
            if not self.chunk:
                break
            for token in self.tokenize():
                yield token

    def tokenize(self):
        """Tokenize the current chunk."""
        while True:
            if not self.chunk:
                break
            # The previous token is kept in self.was and prepended so
            # that the (?<=") lookbehind for language codes can see the
            # closing quote of a preceding string token.
            waslen = len(self.was)
            m = r_token.match(self.was + self.chunk, waslen)
            if m:
                token = m.group(1)
                self.was = token
                yield token

                endpos = m.end() - waslen
                if not endpos:
                    raise ValueError("Got zero-length token")
                self.chunk = self.chunk[endpos:]
            else:
                break

class N3Proc(N3Tokenizer):
    def __init__(self, uri, output=None):
        self.keywords = set(("a", "is", "of", "this"))
        self.userkeys = False
        if output is None:
            output = sys.stdout
        self.output = output
        self.prev = ''

        super(N3Proc, self).__init__()
        self.tokens = self.parseURI(uri)
        self.preprocess()

    def preprocess(self):
        for token in self.tokens:
            if token in self.keywords:
                if self.prev.startswith('"'):
                    # The space stops the @keyword from being read as a
                    # language code on the preceding string literal
                    token = ' @' + token
                elif token in keywords:
                    token = '@' + token
                else:
                    raise ValueError("Invalid keyword: %r" % token)
            elif r_barename.match(token):
                if not self.userkeys:
                    raise ValueError("Barename used without an @keywords declaration")
                token = ':' + token

            if (token == '@keywords') and (not self.prev.startswith('"')):
                self.output.write('@keywords .\n')
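                # The user's keyword list is consumed here, up to its
                # closing ".", and remembered in self.keywords for later
                # expansion; the downstream parser only ever sees the
                # empty "@keywords ." declaration written above.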
                self.keywords = set()
                self.userkeys = True

                while True:
                    tok = self.tokens.next()
                    if tok == '.':
                        break
                    tok = tok.strip(' \t\r\n')
                    if tok and (tok != ','):
                        self.keywords.add(tok)
            else:
                self.output.write(token)
            self.prev = token

def main():
    uri = sys.argv[1]
    # tokenizer = N3Tokenizer()
    # for token in tokenizer.parseURI(uri):
    #     print '%r' % token
    N3Proc(uri)

if __name__ == "__main__":
    main()
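
# A minimal usage sketch, not part of the original script: _demo and its
# temp-file handling are illustrative assumptions. N3Proc reads its input
# through urllib.urlopen(), so a local document is handed over here as a
# file:// URI.
def _demo():
    import os, tempfile
    fd, path = tempfile.mkstemp(suffix='.n3')
    os.write(fd, '@keywords is, of, a.\ndog a Animal.\n')
    os.close(fd)
    try:
        # Writes to stdout, roughly:
        #    @keywords .
        #    :dog @a :Animal.
        N3Proc('file://' + path)
    finally:
        os.unlink(path)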