#!/usr/bin/env python
"""
parser.py - Pluvo Language Parser
Author: Sean B. Palmer, inamidst.com
"""

import sys, re
from datatypes import Table, Variable, String, Number
from basics import basics

terminals = (
    ('String', r'"[^"\\]*(?:\\.[^"\\]*)*"'),
    ('Regexp', r'/[^\n/\\]*(?:\\.[^\n/\\]*)*/'),
    ('URI', r'<[^ \t\r\n"<>]+>'),
    ('Documentation', r'%% [^\n]+(?=\n)|% (?:[^\n]+|\n(?!\n))*\n'),
    ('Comment', r'#[^\n]*'),
    ('Variable', r'[$@%]?[A-Za-z]+'),
    ('Flag', r'--?[A-Za-z][A-Za-z0-9]*'),
    ('Number', r'-?(?:[1-9][0-9]*)?[0-9](?:\.[0-9]+)?'),
    ('Indent', r'\n +'),
    ('DoubleEquals', r'=='),
    ('DoubleArrow', r'=>'),
    ('DoubleStar', r'\*\*'),
    ('OpenParen', r'\('),
    ('CloseParen', r'\)'),
    ('OpenBrace', r'\{'),
    ('CloseBrace', r'\}'),
    ('Equals', r'='),
    ('Colon', r':'),
    ('Plus', r'\+'),
    ('Minus', r'-'),
    ('Star', r'\*'),
    ('Slash', r'/'),
    ('Pipe', r'\|'),
    ('Question', r'\?'),
    ('FullStop', r'\.'),
    ('OpenTriangular', r'<'),
    ('CloseTriangular', r'>'),
    ('Newline', r'\n'),
    ('SemiColon', r';'),
    ('Other', r'.')
)

CommandClosers = set(['CloseBrace', 'Newline', 'SemiColon'])

Operators = set([
    'CloseTriangular', 'Colon', 'DoubleArrow', 'DoubleEquals',
    'DoubleStar', 'Equals', 'FullStop', 'Minus', 'OpenTriangular',
    'Pipe', 'Plus', 'Question', 'Slash', 'Star'
])

Atoms = Operators | set([
    'Comment', 'Documentation', 'Flag', 'Number', 'Regexp', 'String',
    'URI', 'Variable'
])

class Token(object):
    """A parsed token, with type information.

    >>> token = Token('(', 'OpenParen')
    >>> print token.value, token.kind
    ( OpenParen
    """

    def __init__(self, value, kind):
        """Create a new token from a lexical value and type."""
        self.value = value
        self.kind = kind

    def __repr__(self):
        return "Token(%r, %r)" % (self.value, self.kind)

    def __str__(self):
        return "%s<%r>" % (self.kind, self.value)
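# An illustrative aside, not part of the original file: Parser (below)
# compiles `terminals` into a single alternation regex, so the order of
# the tuple gives longer matches precedence (e.g. DoubleEquals before
# Equals).  The kind of a match is recovered from whichever capture
# group fired.  The same trick in miniature, with a hypothetical
# two-terminal grammar:

def _classify_example(text):
    """Return (value, kind) for the first token in text.

    >>> _classify_example('say 42')
    ('say', 'Variable')
    """
    names = ['Variable', 'Number']
    pattern = re.compile(r'(([A-Za-z]+)|([0-9]+))')
    groups = pattern.search(text).groups()
    # groups[0] is the whole match; exactly one inner group repeats it,
    # and its index selects the terminal name, as in Parser.tokenise
    i = list(groups[1:]).index(groups[0])
    return groups[0], names[i]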
class Parser(object):
    """Tokenise utf-8 encoded source files.

    >>> p = Parser()
    >>> p.parseString('say "Hello World!"')
    >>> print p.tokens
    [Token(u'say', 'Variable'), Token(u'"Hello World!"', 'String')]
    """

    def __init__(self, verbose=False):
        """Make a new Parser instance."""
        self.names = [name for (name, pattern) in terminals]
        self.token = re.compile(r'(%s)' % r'|'.join([
            r'(%s)' % pattern for (name, pattern) in terminals
        ]))
        self.bufsiz = 2048
        self.verbose = verbose
        self.tokens = []
        self.length = 0
        self.position = 0
        self.program = None

    def parse(self, f):
        """Tokenise the file, f."""
        self.file = f
        self.tokenise()

    def parseString(self, string):
        """Wrap string in a file-like object and pass it to Parser.parse."""
        import cStringIO as StringIO
        f = StringIO.StringIO(string)
        f.seek(0)
        self.parse(f)

    def tokenise(self):
        """Tokenise the local file into Parser.tokens."""
        bytes = self.file.read(self.bufsiz)
        if not bytes:
            return
        chunk = unicode(bytes, 'utf-8')
        exhausted = False
        position = 0

        while True:
            match = self.token.search(chunk, position)
            # Distrust a match that runs flush against the end of the
            # buffer: it may be the head of a token that continues in
            # the next chunk
            trusted = match is not None and (
                exhausted or match.end() < len(chunk)
            )
            if trusted:
                groups = match.groups()
                value = groups[0]
                i = list(groups[1:]).index(value)
                kind = self.names[i]
                # Likewise, an unmatchable character may be the head of
                # a token cut in half by the buffer boundary, so defer
                # the error until the whole file has been read
                if kind == 'Other' and value.strip(' \t\r\n') and not exhausted:
                    trusted = False

            if trusted:
                if kind == 'Other':
                    # Stray whitespace between tokens is simply skipped
                    if value.strip(' \t\r\n'):
                        raise ValueError("Odd token: %r" % value)
                    position = match.end()
                    continue
                token = Token(value, kind)
                self.tokens.append(token)
                position = match.end()
            else:
                # Refill the buffer; an untrusted match is rescanned
                # with the extra input appended to the current chunk
                bytes = self.file.read(self.bufsiz)
                if not bytes:
                    if exhausted:
                        break
                    exhausted = True
                else:
                    chunk = chunk[position:] + unicode(bytes, 'utf-8')
                    position = 0
        self.length = len(self.tokens)

    def peek(self):
        """Return the current token in the token stream."""
        return self.tokens[self.position]

    def eat(self):
        """Move forward a token, returning the token passed."""
        if self.position >= self.length:
            raise ValueError("No more tokens left to consume")
        self.position += 1
        token = self.tokens[self.position - 1]
        if self.verbose:
            print >> sys.stderr, token
        return token

    def test(self, kind):
        """Test whether the current token's kind matches kind, which
        may be a single kind name or a set of kind names."""
        token = self.peek()
        if not isinstance(kind, set):
            return token.kind == kind
        return token.kind in kind

def trim(string):
    return string.strip(' \t\r\n')

if __name__ == '__main__':
    print trim(__doc__)
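# Consuming the token stream (an illustrative sketch, not from the
# original file): peek inspects the current token, eat advances past
# it, and test checks its kind against a name or a set such as
# Operators.  Expected session, under Python 2:
#
#     >>> p = Parser()
#     >>> p.parseString('2 + 2')
#     >>> p.test('Number')
#     True
#     >>> p.eat()
#     Token(u'2', 'Number')
#     >>> p.test(Operators)
#     True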