#!/usr/bin/env python
"""
parser.py - Gallimaufry of Whits Parser
Author: Sean B. Palmer, inamidst.com

Reads a plain text file of blank-line separated sections and writes an
HTML rendering of it to standard output.

NOTE(review): the copy of this file that was reviewed had been passed
through an HTML-to-text conversion. Markup inside string literals was
missing, runs of spaces inside literals were collapsed, and a few
statements were truncated. Every value marked "reconstructed" below is a
best guess and must be checked against the pristine source before the
output of this script is relied upon.
"""

import sys
import re
import itertools

r_date = re.compile(r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}) [A-Z]+: ?')
r_longuri = re.compile(r'<(http://[^\s>]+)\r?\n *([^\s>]+)>')
r_uri = re.compile(
    r'''(?x)
    # (?:<)?
    ((ftp|https?)://
       ([^\s<>"\'\[\]),;:.&]
        |&(?![gl]t;)
        |[\[\]),;:.](?!\ ))+)
    # >?
''')

# Marker that bind() leaves at the point where a URI wrapped over two lines
# was joined; encode() removes it from the href and turns it into a visible
# break in the link text. On Python 2 these two bytes are the UTF-8 encoding
# of U+00AD (soft hyphen); on Python 3 the same character is used directly.
if sys.version_info[0] < 3:
    _SOFT_HYPHEN = '\xC2\xAD'
else:
    _SOFT_HYPHEN = '\xAD'

# reconstructed: markup literals (stripped from the reviewed copy).
_BREAK = '<br />'
_URI_BREAK = '<br />&rarr; '

# reconstructed: the reviewed copy shows a single space in both tests of
# Block.kind(), which would make the second test swallow the first; the
# comments there ("significant" / "very significant" whitespace) suggest
# two different run lengths.
_WIDE_GAP = '  '
_VERY_WIDE_GAP = '   '


def _emit(text=''):
    """Write *text* followed by a newline to standard output.

    Stands in for the Python 2 ``print`` statement so that the module can
    be imported by either major Python version.
    """
    sys.stdout.write(text + '\n')


def bind(lines):
    """Join URIs of the form ``<http://... NEWLINE ...>`` onto one line.

    The angle brackets are dropped and the soft-hyphen marker is left at
    the join. Returns the new list of lines, which is shorter than *lines*
    by one for every URI joined.
    """
    text = '\n'.join(lines)

    def replace(match):
        first, second = match.groups()
        return first + _SOFT_HYPHEN + second

    return r_longuri.sub(replace, text).split('\n')


def encode(line):
    """Escape ``&`` and ``<`` in *line* and turn URIs into links."""
    # reconstructed: the entity forms are implied by the `&(?![gl]t;)`
    # alternative in r_uri, which skips over already-escaped brackets.
    line = line.replace('&', '&amp;')
    line = line.replace('<', '&lt;')

    def replace(match):
        uri = match.group(1)
        link = uri.replace(_SOFT_HYPHEN, '')
        broken = uri.replace(_SOFT_HYPHEN, _URI_BREAK)
        return '<a href="%s">%s</a>' % (link, broken)  # reconstructed

    return r_uri.sub(replace, line)


def delimiter(line):
    """Return True if *line* opens (``[[``) or closes (``]]...``) a quote."""
    if line.startswith('[[') and line.endswith('[['):
        return True
    return line.startswith(']]')


class Block(object):
    """One blank-line separated section plus its two neighbours."""

    def __init__(self, prev, lines, next):
        self.prev = prev
        self.lines = lines or []
        # The lines that take part in classification: delimiters excluded.
        self.scan = [line for line in self.lines if not delimiter(line)]
        self.next = next

    def first(self):
        return self.scan[0]

    def second(self):
        return self.scan[1]

    def penultimate(self):
        return self.scan[-2]

    def last(self):
        return self.scan[-1]

    def kind(self):
        """Classify the block as 'h3', 'p', 'pre' or 'ul'.

        Every guard below tests ``scan`` (the list actually indexed), not
        ``self.lines``: the two differ when the block contains delimiter
        lines, and testing the wrong one raised IndexError for blocks such
        as ``['[[', '"Hello"', ']] - Someone']``.
        """
        scan = self.scan

        if len(self.lines) == 1 and scan and r_date.match(scan[0]):
            return 'h3'

        result = 'p'
        last_index = len(scan) - 1
        for i, line in enumerate(scan):
            parts = len([token for token in line.split(' ') if token])
            if ((_WIDE_GAP in line and parts < 12)  # Significant whitespace
                    or _VERY_WIDE_GAP in line  # Very significant whitespace
                    # Starts with space / commented out / inference chains
                    or line.startswith((' ', '# ', '-> '))
                    # Any line but the last line is rather short
                    or (i < last_index and len(line) <= 35)):
                result = 'pre'

        if len(scan) >= 2:
            head, follow = scan[0], scan[1]
            # Mini citation block (i.e. a link and title)
            if (head.startswith('http://')
                    and follow.startswith(('- ', '-- '))):
                result = 'p'
            # Mini quotation block (i.e. a quote and a reference)
            elif (head.startswith('"') and scan[-2].endswith('"')
                    and scan[-1].startswith(('- ', '-- '))):
                result = 'p'
            # Reverse quotation block (i.e. label and a link)
            elif (head.endswith((':', ': '))
                    and follow.startswith('http://')):
                result = 'p'

        if len(self.lines) >= 4 and scan:
            # A weird extended quotation: no line before the final three
            # is short, and the last line starts with a dash.
            shortest = min([79] + [len(line) for line in self.lines[:-3]])
            if shortest > 35 and scan[-1].startswith('- '):
                result = 'p'

        if scan and scan[0].startswith(('* ', ' * ')):
            result = 'ul'
        return result

    def format(self):
        """Write the block to standard output according to its kind()."""
        kind = self.kind()
        if kind == 'p':
            self.element(self.p)
        elif kind == 'pre':
            self.element(self.pre)
        elif kind == 'h3':
            self.h3()
        elif kind == 'ul':
            self.element(self.ul)

    def element(self, handler):
        """Run *handler*, wrapping it in a quotation if delimiters say so.

        A leading ``[[`` line opens the quotation. A ``]]`` line, found by
        walking back from the end over lines that start with a space,
        closes it; whatever follows the ``]]`` (plus those trailing lines)
        is written as the citation.
        """
        if self.lines and self.lines[0].startswith('[['):
            _emit('<blockquote>')  # reconstructed
            self.lines = self.lines[1:]

        cit = []
        equot = False
        for i, line in enumerate(reversed(self.lines)):
            if line.startswith(' '):
                cit = [line] + cit
                continue
            if line.startswith(']]'):
                # `or True` keeps a bare "]]" distinguishable from "no
                # closing delimiter found" (False).
                equot = (line.lstrip(' ]') + ''.join(cit)) or True
                self.lines = self.lines[:-i - 1]
            break

        handler()

        if equot is True:
            _emit('</blockquote>')  # reconstructed
            _emit()
        elif equot:
            _emit('<p>%s</p>' % encode(equot))  # reconstructed
            _emit('</blockquote>')  # reconstructed
            _emit()

    def p(self):
        """Write the block as a paragraph, one source line per output line."""
        _emit('<p>')  # reconstructed
        post = None

        if len(self.lines) >= 2:
            head, follow = self.lines[0], self.lines[1]
            link_then_title = (head.startswith('http://')
                               and follow.startswith(('- ', '-- ')))
            label_then_link = (head.endswith((':', ': '))
                               and follow.startswith('http://'))
            if link_then_title or label_then_link:
                _emit(encode(head) + _BREAK)
                self.lines = self.lines[1:]

        if len(self.lines) >= 4:
            if (self.lines[-2].startswith('- ')
                    and self.lines[-1].startswith('- ')):
                # Two trailing dash lines are held back and written last.
                post = [encode(self.lines[-2]) + _BREAK,
                        encode(self.lines[-1])]
                self.lines = self.lines[:-2]

        self.lines = bind(self.lines)
        final = len(self.lines) - 1
        for i, line in enumerate(self.lines):
            encoded = encode(line)
            if len(encoded) > 79:
                # NOTE(review): the original rewrote `encoded` here with
                # encoded.replace(...); the arguments were lost in the
                # reviewed copy, so the line is written unchanged. The
                # trailing break is also a reconstruction.
                _emit(encoded + _BREAK)
            elif len(line) > 65 or i == final:
                _emit(encoded)
            else:
                _emit(encoded + _BREAK)

        if post is not None:
            for line in post:
                _emit(line)
        _emit('</p>')  # reconstructed
        _emit()

    def pre(self):
        """Write the block verbatim; adjacent 'pre' blocks share one element."""
        if (not self.prev) or self.prev.kind() != 'pre':
            _emit('<pre>')  # reconstructed

        for line in self.lines:
            _emit(encode(line))

        if (not self.next) or self.next.kind() != 'pre':
            _emit('</pre>')  # reconstructed
        # NOTE(review): indentation was lost in the reviewed copy; this
        # blank line is assumed to follow every 'pre' block, not only the
        # one that closes the element.
        _emit()

    def h3(self):
        """Write a datestamp heading with an id built from day and time."""
        datestamp = self.first()
        m = r_date.match(datestamp)
        # NOTE(review): the original also did datestamp.replace('UTC', ...)
        # here; the replacement text was lost in the reviewed copy, so the
        # datestamp is left as it is.
        day, hour, minutes = m.group(3), m.group(4), m.group(5)
        identifier = 'N' + day + hour + minutes
        # reconstructed: both literals below.
        link = '<a href="#%s">%s</a>' % (identifier, datestamp)
        _emit('<h3 id="%s">%s</h3>' % (identifier, link))
        _emit()

    def ul(self):
        """Write the block as a list.

        NOTE(review): the body of this method was lost almost entirely in
        the reviewed copy (only two print statements remained). One item
        per line is a guess.
        """
        _emit('<ul>')
        for line in self.lines:
            _emit('<li>%s</li>' % encode(line.lstrip(' *')))
        _emit('</ul>')
        _emit()


def sections(f):
    """Yield lists of consecutive non-blank lines from *f*.

    Trailing spaces and line endings are stripped. A section is yielded at
    every blank line, so two blank lines in a row yield an empty list.
    """
    lines = []
    for line in f:
        line = line.rstrip(' \r\n')
        if line:
            lines.append(line)
        else:
            yield lines
            lines = []
    if lines:
        yield lines


def window(seq, n=2):
    """Returns a sliding window (of width n) over data from the iterable
       s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...

    The input is padded with three trailing None values (whatever n is),
    so the final windows contain None.
    """
    it = itertools.chain(iter(seq), iter([None, None, None]))
    result = tuple(itertools.islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result


def _iter_blocks(groups):
    """Yield a Block, with its neighbours attached, for each section."""
    first = True
    for one, two, three in window(groups, n=3):
        if first:
            first = False
            if one is None:
                # No sections at all: the window is pure padding. Building
                # a Block from it would write a spurious empty paragraph.
                return
            yield Block(None, one, Block(None, two, None))
        if two is not None:
            yield Block(Block(None, one, None), two, Block(None, three, None))


def blocks(f):
    """Yield a Block for every section of *f*."""
    for block in _iter_blocks(sections(f)):
        yield block


def header(first, second):
    """Write the document head and introduction.

    The month is read from the last seven characters of the third line of
    *first*, with '/' replaced by '-'. *second* is accepted but not used.

    NOTE(review): all markup here is reconstructed. One line of the head
    (between the title and the feed link) and the hyperlinks of the
    introduction were lost in the reviewed copy and are not re-invented.
    """
    lines = first or []
    month = lines[2][-7:].replace('/', '-')
    atom = 'type="application/atom+xml" title="Atom Feed" href="../feed"'

    _emit('<!DOCTYPE html>')
    _emit('<html><head>')
    _emit('<title>Gallimaufry of Whits &middot; %s</title>' % month)
    _emit('<link rel="alternate" %s />' % atom)
    _emit('</head>')
    _emit('<body>')
    _emit('<h1>Gallimaufry of Whits</h1>')
    _emit('<h2>for %s</h2>' % month)
    _emit("""\
<p>These are quick notes taken by Sean B. Palmer on the Semantic Web,
Python and Javascript programming, history and antiquarianism, linguistics
and conlanging, typography, and other related matters. To receive these
bits of dreck regularly, subscribe to the feed. To browse other months,
check the contents. This file was generated from plain text source, for
convenience of posting, so apologies for all the in-your-face URIs.</p>
""")


def footer():
    """Write the closing address and end of the document (reconstructed)."""
    _emit()
    _emit('<address>')
    _emit('Sean B. Palmer, inamidst.com')
    _emit('</address>')
    _emit('</body>')
    _emit('</html>')


def whits(f):
    """Write the header, yield a Block per remaining section, write footer.

    The first two sections of *f* are consumed by the header. If *f* has
    fewer than two sections nothing is written and nothing is yielded.
    The footer is written only once the caller has exhausted the generator.
    """
    groups = sections(f)
    # islice, not next(): a StopIteration escaping a generator body is a
    # RuntimeError on current Python versions.
    head = list(itertools.islice(groups, 2))
    if len(head) < 2:
        return
    header(head[0], head[1])
    for block in _iter_blocks(groups):
        yield block
    footer()


def parse(f):
    """Format every block of *f* to standard output."""
    for block in whits(f):
        block.format()


def main():
    """Parse the file named by the first command line argument."""
    if len(sys.argv) < 2:
        sys.stderr.write('Usage: %s <filename>\n' % sys.argv[0])
        sys.exit(2)
    f = open(sys.argv[1])
    # try/finally rather than `with`, which the original had commented
    # out (presumably for older Python 2 releases).
    try:
        parse(f)
    finally:
        f.close()


if __name__ == '__main__':
    main()