#!/usr/bin/env python
"""
parser.py - Gallimaufry of Whits Parser
Author: Sean B. Palmer, inamidst.com
"""
# from __future__ import with_statement
import sys, re, itertools
# Date heading: "YYYY-MM-DD HH:MM", a space, one or more capital letters, a
# colon and an optional space. Groups 1-5 capture year, month, day, hour and
# minutes.
r_date = re.compile(r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}) [A-Z]+: ?')
# An angle-bracketed http URI that was wrapped over a line end. Group 1 is the
# text before the line break; group 2 is the text after it, with the
# continuation line's leading spaces skipped.
r_longuri = re.compile(r'<(http://[^\s>]+)\r?\n *([^\s>]+)>')
# An ftp, http or https URI. Group 1 is the whole URI and group 2 the scheme.
# After "://" each character must be one of:
#   - anything other than whitespace and < > " ' [ ] ) , ; : . &
#   - an "&" that is not followed by "gt;" or "lt;"
#   - one of [ ] ) , ; : . that is not followed by a space
r_uri = re.compile(
r'''(?x) # (?:<)?
((ftp|https?)://
([^\s<>"\'\[\]),;:.&]
|&(?![gl]t;)
|[\[\]),;:.](?!\ ))+)
# >?
''')
def bind(lines):
    """Rejoin angle-bracketed http URIs that were wrapped over two lines.

    The lines are joined with newlines, every r_longuri match is replaced by
    its two captured halves with a marker between them, and the result is
    split on newlines again. The angle brackets, the line break and the
    continuation's leading spaces are outside the captured groups, so they
    are dropped from the rejoined URI.

    Returns a list of lines.
    """
    def glue(found):
        # groups() is (text before the break, text after the break).
        # '\xC2\xAD' marks the place where the URI was wrapped.
        return '%s\xC2\xAD%s' % found.groups()

    joined = r_longuri.sub(glue, '\n'.join(lines))
    return joined.split('\n')
def encode(line):
    """Escape a line of plain text for HTML and turn its URIs into links.

    "&" and "<" are replaced by character references, then every r_uri match
    is wrapped in an anchor element. A URI that carries the '\\xC2\\xAD' wrap
    marker is linked to the marker-free URI and displayed with a visible
    break at the marker.

    Returns the encoded line.

    NOTE(review): as found, both replace() calls were no-ops, the break
    string was unterminated and the template had one "%s" for two values.
    The entity names, the "<br>&rarr; " break and the anchor template below
    are reconstructions - confirm against the original file.
    """
    # Ampersands first, so the entity written for "<" is not escaped again.
    line = line.replace('&', '&amp;')
    line = line.replace('<', '&lt;')

    def replace(match):
        uri = match.group(1)
        # The href gets the URI without the wrap marker...
        link = uri.replace('\xC2\xAD', '')
        # ...while the visible text shows where the URI was wrapped.
        broken = uri.replace('\xC2\xAD', '<br>&rarr; ')
        return '<a href="%s">%s</a>' % (link, broken)

    return r_uri.sub(replace, line)
def delimiter(line):
    """Report whether *line* is a block delimiter line.

    True when the line both starts and ends with '[[', or when it starts
    with ']]'; False otherwise.
    """
    opens = line.startswith('[[') and line.endswith('[[')
    return opens or line.startswith(']]')
class Block(object):
def __init__(self, prev, lines, next):
    """Store the block's lines and its neighbouring blocks.

    prev, next -- stored unchanged as self.prev and self.next.
    lines      -- the block's lines; a false value is stored as [].

    self.scan holds the lines for which delimiter() is false.
    """
    self.prev, self.next = prev, next
    self.lines = lines if lines else []
    self.scan = [text for text in self.lines if not delimiter(text)]
def first(self):
    """Return the first entry of self.scan (IndexError if it is empty)."""
    scanned = self.scan
    return scanned[0]
def second(self):
    """Return the second entry of self.scan (IndexError if there is none)."""
    scanned = self.scan
    return scanned[1]
def penultimate(self):
    """Return the next-to-last entry of self.scan (IndexError if too short)."""
    scanned = self.scan
    return scanned[-2]
def last(self):
    """Return the final entry of self.scan (IndexError if it is empty)."""
    scanned = self.scan
    return scanned[-1]
def kind(self):
    """Classify this block as 'h3', 'p', 'pre' or 'ul'.

    The rules are applied in order, later ones overriding earlier ones:

    * a block of exactly one line that matches r_date is 'h3' (returned
      immediately);
    * the default is 'p';
    * any non-delimiter line that trips one of the layout tests in the
      loop below makes the block 'pre';
    * a link-and-title pair, a quotation with an attribution, a label
      followed by a link, or a long-lined block ending in a '- ' line
      turns it back into 'p';
    * a first line starting with '* ' or ' * ' makes it 'ul'.

    first(), second(), penultimate() and last() index self.scan, which can
    be shorter than self.lines because delimiter lines are left out of it.
    Every use of them is therefore guarded by the length of self.scan, so a
    block such as ['[[', '"Quote"', ']]'] is classified instead of raising
    IndexError.
    """
    # A lone line that looks like a datestamp is a heading.
    if len(self.lines) == 1 and self.scan:
        if r_date.match(self.first()):
            return 'h3'

    result = 'p'
    final = len(self.scan) - 1
    for i, line in enumerate(self.scan):
        parts = len([token for token in line.split(' ') if token])
        # NOTE(review): the first two tests both look for a single space,
        # which makes the "parts < 12" clause redundant. Presumably the
        # literals originally held longer runs of spaces - confirm against
        # the original file before relying on this.
        if (((' ' in line) and (parts < 12)) or # Significant whitespace
            (' ' in line) or # Very significant whitespace
            line.startswith(' ') or # Starts with space
            line.startswith('# ') or # Commented out
            line.startswith('-> ') or # Inference chains
            # Any line but the last line is rather short
            ((i < final) and (len(line) <= 35))):
            result = 'pre'

    if len(self.scan) >= 2:
        first, second = self.first(), self.second()
        attribution = ('- ', '-- ')
        # Mini citation block (i.e. a link and title)
        if first.startswith('http://') and second.startswith(attribution):
            result = 'p'
        # Mini quotation block (i.e. a quote and a reference)
        elif (first.startswith('"') and
              self.penultimate().endswith('"') and
              self.last().startswith(attribution)):
            result = 'p'
        # Reverse quotation block (i.e. label and a link)
        elif first.endswith((':', ': ')) and second.startswith('http://'):
            result = 'p'

    if len(self.lines) >= 4:
        # A weird extended quotation: every line except the last three is
        # longer than 35 characters and the block ends with a '- ' line.
        shortest = min([79] + [len(line) for line in self.lines[:-3]])
        if (shortest > 35) and self.scan and self.last().startswith('- '):
            result = 'p'

    if self.scan:
        if self.first().startswith(('* ', ' * ')):
            result = 'ul'
    return result
def format(self):
    """Dispatch the block to the handler that matches self.kind().

    'h3' blocks call self.h3() directly. 'p', 'pre' and 'ul' blocks pass the
    handler of the same name to self.element(). Any other kind does nothing.
    """
    kind = self.kind()
    if kind == 'h3':
        # Headings bypass the element() wrapper.
        self.h3()
        return
    if kind in ('p', 'pre', 'ul'):
        # The handler is looked up only for the kind actually found.
        self.element(getattr(self, kind))
# NOTE(review): from here to the end of the file the text appears to have lost
# its line breaks and the markup inside its string literals (see the bare
# "print '" fragments and the '%s' template that is formatted with two values),
# so the statements below do not parse as written. The definitions visible in
# the run-together text are element, p, h3, footer, whits, parse and main.
# format() also dispatches to self.pre and self.ul, and whits() calls
# sections(), window() and header(); none of those has a visible "def" here.
# Recover this region from the original file before changing it - TODO confirm.
def element(self, handler):
if self.lines:
if self.lines[0].startswith('[['):
print '
' self.lines = self.lines[1:] cit = [] equot = False for i, line in enumerate(reversed(self.lines)): if line.startswith(' '): cit = [line] + cit continue elif line.startswith(']]'): equot = (line.lstrip(' ]') + ''.join(cit)) or True self.lines = self.lines[:-i-1] break else: break handler() if equot is True: print '' print elif equot: print '
%s
' % encode(equot) print '' print def p(self): print ''
post = None
if len(self.lines) >= 2:
if (self.lines[0].startswith('http://') and
(self.lines[1].startswith('- ') or
self.lines[1].startswith('-- '))):
print encode(self.lines[0]) + '
'
self.lines = self.lines[1:]
elif ((self.lines[0].endswith(':') or
self.lines[0].endswith(': ')) and
self.lines[1].startswith('http://')):
print encode(self.lines[0]) + '
'
self.lines = self.lines[1:]
if len(self.lines) >= 4:
if (self.lines[-2].startswith('- ') and
self.lines[-1].startswith('- ')):
post = [encode(self.lines[-2]) + '
',
encode(self.lines[-1])]
self.lines = self.lines[:-2]
self.lines = bind(self.lines)
for i, line in enumerate(self.lines):
encoded = encode(line)
if len(encoded) > 79:
encoded = encoded.replace(''
elif (len(line) > 65) or (i == (len(self.lines) - 1)):
print encoded
else: print encoded + '
'
if post is not None:
for line in post:
print line
print '
' for line in self.lines: print encode(line) if (not self.next) or (not self.next.kind() == 'pre'): print '' print def h3(self): datestamp = self.first() m = r_date.match(datestamp) datestamp = datestamp.replace('UTC', 'UTC') year, month, day = m.group(1), m.group(2), m.group(3) hour, minutes = m.group(4), m.group(5) identifier = 'N' + day + hour + minutes link = '%s' % (identifier, datestamp) print '
These are quick notes taken by Sean B. Palmer on the Semantic Web, Python and Javascript programming, history and antiquarianism, linguistics and conlanging, typography, and other related matters. To receive these bits of dreck regularly, subscribe to the feed. To browse other months, check the contents. This file was generated from plain text source, for convenience of posting, so apologies for all the in-your-face URIs.
""" def footer(): print print '' print 'Sean B. Palmer, inamidst.com' print '' print '' print '' def whits(f): first = True g = sections(f) first = g.next() second = g.next() header(first, second) for one, two, three in window(g, n=3): if first: yield Block(None, one, Block(None, two, None)) first = False if two is not None: yield Block(Block(None, one, None), two, Block(None, three, None)) footer() def parse(f): for block in whits(f): block.format() def main(): fn = sys.argv[1] # with open(fn) as f: # parse(f) f = open(fn) parse(f) f.close() if __name__ == '__main__': main()