#!/usr/bin/env python
"""
Notation3 Pre-Processor
Author: Sean B. Palmer, inamidst.com
License: GPL 2; share and enjoy!
License: http://www.w3.org/Consortium/Legal/copyright-software
Requires: Python 2.3+
Acknowledgements: Thanks to Yosi Scharf for implementation feedback.

Design:

The @keywords declaration in Notation3 requires custom tokenization in
an N3 parser, defeating the generic approach that the RDF BNF for N3
aims to provide. TimBL suggested [1] that it might be feasible to
pre-process the @keywords declarations instead; a pre-processor would
also leave the path open for future extensibility to macros.

Clearly, an N3 pre-processor needs to tokenize according to the
patterns in the RDF BNF, getting precedence right where applicable.
Yosi noted that the problem with token-level pre-processing is that @
is used to start both language codes and keywords such as @prefix; the
test for telling them apart is whether the @ immediately follows a
string literal. This module not only implements that test, but also
buffers the N3 input.

[1] http://ilrt.org/discovery/chatlogs/swig/2005-01-12.html#T18-03-18
"""

import sys, re, urllib

bufsiz = 2048

if not hasattr(__builtins__, "set"):
    # Python 2.3 compatibility: the set builtin arrived in Python 2.4
    from sets import Set as set

# Currently r_token is hand-transcribed from:
#    http://inamidst.com/n3p/grammar/n3.n3
# @@ Machine generate this instead
r_token = re.compile(r'''(?mx)(                                # N3 Tokens
   (?:"""[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*""") |              # n3:string (a)
   (?:"[^"\\]*(?:\\.[^"\\]*)*") |                              # n3:string (b)
   => | <= | \^\^ |                                            # doubles
   \. | ; | , | = | \! | \^ | \{ | \} | \[ | \] | \( | \) |
   [ \t\r\n]+ |                                                # whitespace
   (?:\#[^\n]*)?\r?\n |                                        # comment
   (?<=")@[a-z]+(?:-[a-z0-9]+)* |                              # n3:langcode
   @[A-Za-z]+ |                                                # keyword
   [-+]?[0-9]+(?:\.[0-9]+)?(?:e[-+]?[0-9]+)? |                 # n3:numericliteral
   <[^>]*> |                                                   # n3:explicituri
   \?[A-Za-z_][A-Za-z0-9_]* |                                  # n3:variable
   (?:(?:[A-Za-z_][A-Za-z0-9_]*)?:)?(?:[A-Za-z_][A-Za-z0-9_]*)? | # n3:qname
   [A-Za-z_][A-Za-z0-9_]* |                                    # n3:barename
   .+                                                          # catchall
)''')

keywords = set('prefix keywords forAll forSome a has is of this'.split(' '))
r_barename = re.compile(r'^[A-Za-z_][A-Za-z0-9_]*$')

class N3Tokenizer(object):
    def __init__(self):
        self.uri = None
        self.buffer = None
        self.chunk = ''
        self.was = ''

    def parseURI(self, uri):
        self.uri = uri
        self.buffer = urllib.urlopen(uri)
        for token in self.parse():
            yield token
        self.buffer.close()

    def parse(self):
        while True:
            self.chunk += self.buffer.read(bufsiz)
            if not self.chunk:
                break
            for token in self.tokenize():
                yield token

    def tokenize(self):
        """Tokenize the current chunk."""
        while True:
            if not self.chunk:
                break
            # The previous token is kept in self.was and prepended so
            # that the (?<=") lookbehind for language codes can see the
            # closing quote of a preceding string token.
            waslen = len(self.was)
            m = r_token.match(self.was + self.chunk, waslen)
            if m:
                token = m.group(1)
                self.was = token
                yield token

                endpos = m.end() - waslen
                if not endpos:
                    raise ValueError("Got zero-length token")
                self.chunk = self.chunk[endpos:]
            else:
                break

class N3Proc(N3Tokenizer):
    def __init__(self, uri, output=None):
        self.keywords = set(("a", "is", "of", "this"))
        self.userkeys = False
        if output is None:
            output = sys.stdout
        self.output = output
        self.prev = ''

        super(N3Proc, self).__init__()
        self.tokens = self.parseURI(uri)
        self.preprocess()

    def preprocess(self):
        for token in self.tokens:
            if token in self.keywords:
                if self.prev.startswith('"'):
                    # The space stops the @keyword from being read as a
                    # language code on the preceding string literal
                    token = ' @' + token
                elif token in keywords:
                    token = '@' + token
                else:
                    raise ValueError("Invalid keyword: %r" % token)
            elif r_barename.match(token):
                if not self.userkeys:
                    raise ValueError("Barename used without an @keywords declaration")
                token = ':' + token

            if (token == '@keywords') and (not self.prev.startswith('"')):
                self.output.write('@keywords .\n')
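                # The user's keyword list is consumed here, up to its
                # closing ".", and remembered in self.keywords for later
                # expansion; the downstream parser only ever sees the
                # empty "@keywords ." declaration written above.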
                self.keywords = set()
                self.userkeys = True

                while True:
                    tok = self.tokens.next()
                    if tok == '.':
                        break
                    tok = tok.strip(' \t\r\n')
                    if tok and (tok != ','):
                        self.keywords.add(tok)
            else:
                self.output.write(token)
            self.prev = token

def main():
    uri = sys.argv[1]
    # tokenizer = N3Tokenizer()
    # for token in tokenizer.parseURI(uri):
    #     print '%r' % token
    N3Proc(uri)

if __name__ == "__main__":
    main()
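
# A minimal usage sketch, not part of the original script: _demo and its
# temp-file handling are illustrative assumptions. N3Proc reads its input
# through urllib.urlopen(), so a local document is handed over here as a
# file:// URI.
def _demo():
    import os, tempfile
    fd, path = tempfile.mkstemp(suffix='.n3')
    os.write(fd, '@keywords is, of, a.\ndog a Animal.\n')
    os.close(fd)
    try:
        # Writes to stdout, roughly:
        #    @keywords .
        #    :dog @a :Animal.
        N3Proc('file://' + path)
    finally:
        os.unlink(path)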