#!/usr/bin/env python """ sparql.py - SPARQL Parser and Abstract Syntax Copyright 2007, Sean B. Palmer, inamidst.com Licensed under the Eiffel Forum License 2. Package: http://inamidst.com/sw/trio/ """ import re class Terminal(object): pass class StringTerminal(Terminal): def __init__(self, name): self.name = name class RegexpTerminal(Terminal): def __init__(self, name, pattern): self.name = name self.pattern = pattern self.regexp = re.compile(pattern) # Terminal patterns IRI_REF = r'<([^<>"{}|^`\\\x00-\x20]*)>' try: PN_CHARS_BASE = ur'[A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370' + \ ur'-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001' + \ ur'-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF]' except: PN_CHARS_BASE = ur'[A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370' + \ ur'-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001' + \ ur'-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]' for line in ("Warning: You're using a Narrow Python build", "This means that N-Triples output might not be fully compliant", "Use a python compiled with --enable-unicode=ucs4 to fix this"): print >> __import__('sys').stderr, line PN_CHARS_U = PN_CHARS_BASE + r'|_' PN_CHARS = PN_CHARS_U + ur'|[-0-9\u00B7\u0300-\u036F\u203F-\u2040]' PN_PREFIX = PN_CHARS_BASE + r'((' + PN_CHARS + r'|\.)*' + PN_CHARS + r')?' PNAME_NS = r'(' + PN_PREFIX + r')?:' PN_LOCAL = r'(' + PN_CHARS_U + r'|[0-9])((' + PN_CHARS + r'|\.)*' + \ PN_CHARS + r')?' PNAME_LN = PNAME_NS + PN_LOCAL BLANK_NODE_LABEL = r'_:' + PN_LOCAL VARNAME = ur'(' + PN_CHARS_U + r'|[0-9])(' + PN_CHARS_U + \ ur'|[0-9\u00B7\u0300-\u036F\u203F-\u2040])*' VAR1 = r'\?' + VARNAME VAR2 = '\$' + VARNAME LANGTAG = r'@[a-zA-Z]+(-[a-zA-Z0-9]+)*' INTEGER = r'[0-9]+' DECIMAL = r'[0-9]+\.[0-9]*|\.[0-9]+' EXPONENT = r'[eE][+-]?[0-9]+' DOUBLE = r'[0-9]+\.[0-9]*' + EXPONENT + r'|\.([0-9])+' + EXPONENT + \ r'|([0-9])+' + EXPONENT INTEGER_POSITIVE = r'\+' + INTEGER DECIMAL_POSITIVE = r'\+' + DECIMAL DOUBLE_POSITIVE = r'\+' + DOUBLE INTEGER_NEGATIVE = r'-' + INTEGER DECIMAL_NEGATIVE = r'-' + DECIMAL DOUBLE_NEGATIVE = r'-' + DOUBLE ECHAR = r'\\[tbnrf\"]' STRING_LITERAL1 = r"'(([^\x27\x5C\x0A\x0D])|" + ECHAR + r")*'" STRING_LITERAL2 = r'"(([^\x22\x5C\x0A\x0D])|' + ECHAR + r')*"' STRING_LITERAL_LONG1 = r"'''(('|'')?([^'\]|" + ECHAR + r"))*'''" STRING_LITERAL_LONG2 = r'"""(("|"")?([^"\]|' + ECHAR + r'))*"""' WS = r'[\x20\x09\x0D\x0A]' NIL = r'\(' + WS + r'*\)' ANON = r'\[' + WS + r'*\]' terminals = [ RegexpTerminal('IRI_REF', IRI_REF), RegexpTerminal('PNAME_NS', PNAME_NS), RegexpTerminal('PNAME_LN', PNAME_LN), RegexpTerminal('BLANK_NODE_LABEL', BLANK_NODE_LABEL), RegexpTerminal('VAR1', VAR1), RegexpTerminal('VAR2', VAR2), RegexpTerminal('LANGTAG', LANGTAG), RegexpTerminal('INTEGER', INTEGER), RegexpTerminal('DECIMAL', DECIMAL), RegexpTerminal('DOUBLE', DOUBLE), RegexpTerminal('INTEGER_POSITIVE', INTEGER_POSITIVE), RegexpTerminal('DECIMAL_POSITIVE', DECIMAL_POSITIVE), RegexpTerminal('DOUBLE_POSITIVE', DOUBLE_POSITIVE), RegexpTerminal('INTEGER_NEGATIVE', INTEGER_NEGATIVE), RegexpTerminal('DECIMAL_NEGATIVE', DECIMAL_NEGATIVE), RegexpTerminal('DOUBLE_NEGATIVE', DOUBLE_NEGATIVE), RegexpTerminal('EXPONENT', EXPONENT), RegexpTerminal('STRING_LITERAL1', STRING_LITERAL1), RegexpTerminal('STRING_LITERAL2', STRING_LITERAL2), RegexpTerminal('STRING_LITERAL_LONG1', STRING_LITERAL_LONG1), RegexpTerminal('STRING_LITERAL_LONG2', STRING_LITERAL_LONG2), RegexpTerminal('ECHAR', ECHAR), RegexpTerminal('NIL', NIL), RegexpTerminal('WS', WS), RegexpTerminal('ANON', ANON), RegexpTerminal('PN_CHARS_BASE', PN_CHARS_BASE), RegexpTerminal('PN_CHARS_U', PN_CHARS_U), RegexpTerminal('PN_CHARS', PN_CHARS), # Unused terminals # RegexpTerminal('VARNAME', VARNAME), # RegexpTerminal('PN_PREFIX', PN_PREFIX), # RegexpTerminal('PN_LOCAL', PN_LOCAL), # Keywords RegexpTerminal('BASE', r'(?i)BASE\b'), RegexpTerminal('PREFIX', r'(?i)PREFIX\b'), RegexpTerminal('SELECT', r'(?i)SELECT\b'), RegexpTerminal('CONSTRUCT', r'(?i)CONSTRUCT\b'), RegexpTerminal('DESCRIBE', r'(?i)DESCRIBE\b'), RegexpTerminal('ASK', r'(?i)ASK\b'), RegexpTerminal('ORDER BY', r'(?i)ORDER BY\b'), RegexpTerminal('LIMIT', r'(?i)LIMIT\b'), RegexpTerminal('OFFSET', r'(?i)OFFSET\b'), RegexpTerminal('DISTINCT', r'(?i)DISTINCT\b'), RegexpTerminal('REDUCED', r'(?i)REDUCED\b'), RegexpTerminal('FROM', r'(?i)FROM\b'), RegexpTerminal('FROM NAMED', r'(?i)FROM NAMED\b'), RegexpTerminal('WHERE', r'(?i)WHERE\b'), RegexpTerminal('GRAPH', r'(?i)GRAPH\b'), RegexpTerminal('OPTIONAL', r'(?i)OPTIONAL\b'), RegexpTerminal('UNION', r'(?i)UNION\b'), RegexpTerminal('FILTER', r'(?i)FILTER\b'), RegexpTerminal('a', r'(?i)a\b'), RegexpTerminal('STR', r'(?i)STR\b'), RegexpTerminal('LANG', r'(?i)LANG\b'), RegexpTerminal('LANGMATCHES', r'(?i)LANGMATCHES\b'), RegexpTerminal('DATATYPE', r'(?i)DATATYPE\b'), RegexpTerminal('BOUND', r'(?i)BOUND\b'), RegexpTerminal('sameTERM', r'(?i)sameTERM\b'), RegexpTerminal('isURI', r'(?i)isURI\b'), RegexpTerminal('isIRI', r'(?i)isIRI\b'), RegexpTerminal('isLITERAL', r'(?i)isLITERAL\b'), RegexpTerminal('REGEX', r'(?i)REGEX\b'), RegexpTerminal('true', r'(?i)true\b'), RegexpTerminal('false', r'(?i)false\b'), # String terminals StringTerminal('{'), StringTerminal('}'), StringTerminal('.'), StringTerminal('*'), StringTerminal(','), StringTerminal('('), StringTerminal(')'), StringTerminal('=') ] class Token(object): def __init__(self, text, terminal): self.text = text self.terminal = terminal self.length = len(text) def __repr__(self): return self.terminal.name + (': %r' % self.text) def __str__(self): return self.__repr__() def tokenise(terminals, text, ignore=set()): while text: token = None for terminal in terminals: if isinstance(terminal, RegexpTerminal): m = terminal.regexp.match(text) if m: end = m.end() if (token is None) or (end > token.length): token = Token(m.group(0), terminal) elif isinstance(terminal, StringTerminal): if text.startswith(terminal.name): end = len(terminal.name) if (token is None) or (end > token.length): token = Token(terminal.name, terminal) else: raise ValueError('%r' % terminal) if token is None: break if not token.terminal.name in ignore: yield token text = text[token.length:] # SPARQL Default Terms class Term(object): pass class IRI(unicode, Term): def __repr__(self): return 'IRI(' + unicode.__repr__(self) + ')' class Variable(unicode): def __repr__(self): return 'Variable(' + unicode.__repr__(self) + ')' # SPARQL Abstract Syntax class Query(tuple): def __new__(cls, E, DS, R): return tuple.__new__(cls, (E, DS, R)) def __repr__(self): return 'Query' + tuple.__repr__(self) class BGP(list): def __new__(cls, *patterns): return list.__new__(cls, patterns) def __repr__(self): return 'BGP' + list.__repr__(self) class Filter(tuple): def __new__(cls, F, P): return tuple.__new__(cls, (F, P)) # SPARQL Results Syntax DEFAULT = None class Dataset(dict): def __init__(self, default=None): if default is not None: self[DEFAULT] = default class Multiset(list): def __repr__(self): return 'Multiset' + list.__repr__(self) def add(self, solution): self.append(solution) class SolutionMapping(dict): def __repr__(self): return 'SolutionMapping' + dict.__repr__(self) class SolutionSequence(list): pass # Results Evaluation def doFilter(F, Om): # print 'F, Om:', F, Om return Om def evaluate(D, pattern): if isinstance(pattern, Filter): F, P = pattern return doFilter(F, evaluate(D, P)) elif isinstance(pattern, BGP): G = D[DEFAULT] if hasattr(G, 'sparql'): return G.sparql(pattern) else: raise Exception('Not implemented: G.sparql') else: raise Exception('Not implemented: %r' % pattern) # SPARQL Parser debug = False def production(name): def production_outer(method, name=name): def production_inner(self, method=method, name=name): production.stack.append(name) if debug: print (' ' * (len(production.stack) - 1)) + name result = method(self) if debug: print (' ' * (len(production.stack) - 1)) + '/' + name production.stack.pop() return result return production_inner return production_outer production.stack = [] class EndOfFile(object): pass EOF = EndOfFile() class Parser(object): def __init__(self, terminals, input): self.token = None self.peek = None self.tokens = tokenise(terminals, input, ignore=set(['WS'])) self.bindings = {} def eat(self): self.token = self.peek try: self.peek = self.tokens.next() except StopIteration: self.peek = EOF # print 'EAT:', self.token return self.token def parse(self): self.eat() q = self.query() assert self.peek == EOF return q @production('Query') def query(self): # [1] @@ self.prologue() E, DS, R = self.selectQuery() return Query(E, DS, R) @production('Prologue') def prologue(self): # [2] @@ # PrefixDecl* while self.peek.terminal.name == 'PREFIX': self.prefixDecl() @production('PrefixDecl') def prefixDecl(self): # [3] 'PREFIX' PNAME_NS IRI_REF prefix = self.eat() assert prefix.terminal.name == 'PREFIX' pname = self.eat() assert pname.terminal.name == 'PNAME_NS' uri = self.eat() assert uri.terminal.name == 'IRI_REF' self.bindings[pname.text[:-1]] = uri.text[1:-1] @production('SelectQuery') def selectQuery(self): # [5] @@ select = self.eat() assert select.terminal.name == 'SELECT' # Var+ vars = [] if self.peek.terminal.name == '*': token = self.eat() vars.append(token.text) else: while self.peek.terminal.name in ('VAR1', 'VAR2'): token = self.eat() var = Variable(token.text[1:]) vars.append(var) # WhereClause E = self.whereClause() return E, None, ('SELECT', vars) @production('WhereClause') def whereClause(self): # [13] 'WHERE'? GroupGraphPattern if self.peek.terminal.name == 'WHERE': where = self.eat() assert where.terminal.name == 'WHERE' E = self.groupGraphPattern() return E @production('GroupGraphPattern') def groupGraphPattern(self): # [20] @@ # '{' TriplesBlock? ( ( GraphPatternNotTriples | Filter ) '.'? # TriplesBlock? )* '}' # GraphPatternNotTriples # OptionalGraphPattern -> "OPTIONAL" # GroupOrUnionGraphPattern # GroupGraphPattern -> "{" # GraphGraphPattern -> "GRAPH" # Filter -> "FILTER" # '{' TriplesBlock? '}' obrace = self.eat() assert obrace.terminal.name == '{' peekname = self.peek.terminal.name if peekname not in ('}', 'OPTIONAL', '{', 'GRAPH', 'FILTER'): patterns = self.triplesBlock() bgp = BGP(patterns) result = None while self.peek.terminal.name in ('OPTIONAL', '{', 'GRAPH', 'FILTER'): if self.peek.terminal.name != 'FILTER': raise Exception('Not implemented') else: expr = self.filter() # [26] result = Filter(expr, bgp) # @@ "." TriplesBlock if result is None: result = bgp cbrace = self.eat() assert cbrace.terminal.name == '}' return result @production('TriplesBlock') def triplesBlock(self): # [21] TriplesSameSubject ( '.' TriplesBlock? )? patterns = [] for pattern in self.triplesSameSubject(): patterns.append(pattern) if self.peek.terminal.name == '.': fullstop = self.eat() assert fullstop.terminal.name == '.' peekname = self.peek.terminal.name if peekname not in ('}', 'OPTIONAL', '{', 'GRAPH', 'FILTER'): for pattern in self.triplesBlock(): # [21] patterns.append(pattern) return patterns @production('Filter') def filter(self): # [26] 'FILTER' Constraint keyword = self.eat() assert keyword.terminal.name == 'FILTER' constraint = self.constraint() # [27] return constraint @production('Constraint') def constraint(self): # [27] @@ # @@ BrackettedExpression, BuiltInCall return self.brackettedExpression() # [56] # self.functionCall() # [28] @production('FunctionCall') def functionCall(self): # [28] IRIref ArgList self.IRIref() # [67] self.argList() # [29] @production('ArgList') def argList(self): # [29] NIL | '(' Expression ( ',' Expression )* ')' ) if self.peek.terminal.name == 'NIL': pass # @@ else: oparen = self.eat() assert oparen.terminal.name == '(' self.expression() # [46] while self.peek.terminal.name == ',': comma = self.eat() assert comma.terminal.name == ',' self.expression() # [46] cparen = self.eat() assert cparen.terminal.name == ')' @production('TriplesSameSubject') def triplesSameSubject(self): # [32] @@ VarOrTerm PropertyListNotEmpty | TriplesNode PropertyList subject = self.varOrTerm() pairs = self.propertyListNotEmpty() for (predicate, objects) in pairs: for object in objects: yield subject, predicate, object @production('PropertyListNotEmpty') def propertyListNotEmpty(self): # [33] @@ verb = self.verb() objects = self.objectList() return [(verb, objects)] @production('ObjectList') def objectList(self): # [35] @@ return [self.object()] @production('Object') def object(self): # [36] GraphNode return self.graphNode() @production('Verb') def verb(self): # [37] VarOrIRIref | 'a' if self.peek.terminal.name != 'a': return self.varOrIRIref() else: return token @production('GraphNode') def graphNode(self): # [41] @@ return self.varOrTerm() # [42] @production('VarOrTerm') def varOrTerm(self): # [42] Var | GraphTerm if self.peek.terminal.name in ('VAR1', 'VAR2'): return self.var() # [44] else: return self.graphTerm() # [45] @production('VarOrIRIref') def varOrIRIref(self): # [43] Var | IRIref if self.peek.terminal.name in ('VAR1', 'VAR2'): return self.var() # [44] else: return self.IRIref() # [67] @production('Var') def var(self): # [44] VAR1 | VAR2 token = self.eat() assert token.terminal.name in ('VAR1', 'VAR2') v = Variable(token.text[1:]) return v @production('GraphTerm') def graphTerm(self): # [45] @@ return self.IRIref() @production('Expression') def expression(self): # [46] ConditionalOrExpression return self.conditionalOrExpression() # [47] @production('ConditionalOrExpression') def conditionalOrExpression(self): # [47] @@ return self.conditionalAndExpression() # [48] @production('ConditionalAndExpression') def conditionalAndExpression(self): # [48] @@ return self.valueLogical() # [49] @production('ValueLogical') def valueLogical(self): # [49] RelationalExpression return self.relationalExpression() # [50] @production('RelationalExpression') def relationalExpression(self): # [50] @@ left = self.numericExpression() # [51] if self.peek.terminal.name == '=': equals = self.eat() right = self.numericExpression() # [51] return ('=', left, right) @production('NumericExpression') def numericExpression(self): # [51] AdditiveExpression return self.additiveExpression() # [52] @production('AdditiveExpression') def additiveExpression(self): # [52] @@ return self.multiplicativeExpression() # [53] @production('MultiplicativeExpression') def multiplicativeExpression(self): # [53] @@ return self.unaryExpression() # [54] @production('UnaryExpression') def unaryExpression(self): # [54] @@ return self.primaryExpression() # [55] @production('PrimaryExpression') def primaryExpression(self): # [55] @@ # need to support var and number... integer_names = set([ 'INTEGER', 'DECIMAL', 'DOUBLE', 'INTEGER_POSITIVE', 'DECIMAL_POSITIVE', 'DOUBLE_POSITIVE', 'INTEGER_NEGATIVE', 'DECIMAL_NEGATIVE', 'DOUBLE_NEGATIVE' ]) if self.peek.terminal.name in integer_names: return self.numericLiteral() # [61] elif self.peek.terminal.name in ('VAR1', 'VAR2'): return self.var() # [44] else: raise Exception('Not implemented') @production('BrackettedExpression') def brackettedExpression(self): # [56] '(' Expression ')' oparen = self.eat() assert oparen.terminal.name == '(' result = self.expression() # [46] cparen = self.eat() assert cparen.terminal.name == ')' return result @production('NumericLiteral') def numericLiteral(self): # [61] NumericLiteralUnsigned | NumericLiteralPositive | # NumericLiteralNegative unsigned = set(['INTEGER', 'DECIMAL', 'DOUBLE']) pos = set(['INTEGER_POSITIVE', 'DECIMAL_POSITIVE', 'DOUBLE_POSITIVE']) neg = set(['INTEGER_NEGATIVE', 'DECIMAL_NEGATIVE', 'DOUBLE_NEGATIVE']) if self.peek.terminal.name in unsigned: return self.numericLiteralUnsigned() # [62] elif self.peek.terminal.name in pos: return self.numericLiteralPositive() # [63] elif self.peek.terminal.name in neg: return self.numericLiteralNegative() # [64] else: raise Exception('Expected a numeric literal') @production('NumericLiteralUnsigned') def numericLiteralUnsigned(self): # [62] INTEGER | DECIMAL | DOUBLE return self.eat() @production('NumericLiteralPositive') def numericLiteralPositive(self): # [63] INTEGER_POSITIVE | DECIMAL_POSITIVE | DOUBLE_POSITIVE return self.eat() @production('NumericLiteralNegative') def numericLiteralNegative(self): # [64] INTEGER_NEGATIVE | DECIMAL_NEGATIVE | DOUBLE_NEGATIVE return self.eat() @production('IRIref') def IRIref(self): # [67] IRI_REF | PrefixedName if self.peek.terminal.name == 'IRI_REF': token = self.eat() return IRI(unicode(token.text[1:-1])) else: return self.prefixedName() # [68] @production('PrefixedName') def prefixedName(self): # [68] PNAME_LN | PNAME_NS assert self.peek.terminal.name in ('PNAME_LN', 'PNAME_NS'), self.peek token = self.eat() qname = token.text.split(':') return IRI(unicode(self.bindings[qname[0]] + qname[1])) def parse(input): p = Parser(terminals, input) query = p.parse() return query def test(): input = """\ PREFIX foaf: SELECT ?name ?mbox WHERE { ?x foaf:name ?name . ?x foaf:mbox ?mbox } """ # for token in tokenise(terminals, input, ignore=set(['WS'])): # print token print parse(input) if __name__ == '__main__': test()