#!/usr/bin/env python
"""
Avocet - A Structured Text language
Author: Sean B. Palmer, inamidst.com
Cf. http://inamidst.com/proj/avocet/
"""
import os, re, itertools
class Printer(object):
    """Buffered, word-wrapping output writer used by all parse_* handlers.

    NOTE(review): this class was corrupted in transit -- the openers/
    closers/wrappers set literals were mangled, at least one method
    header was lost (the orphaned body below references self.output,
    self.nested and self.chunks), and all indentation was stripped.
    Indentation here is a best-effort reconstruction; every code token
    is preserved exactly as found. Recover from an uncorrupted copy.
    """
    import textwrap
    # Word separator: runs of whitespace, or '--' dash runs between words.
    r_wordsep = re.compile(r'(\s+|(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')
    textwrap.TextWrapper.wordsep_re = r_wordsep
    # Wrap at 79 columns without expanding tabs or splitting long words.
    opts = {'width': 79, 'expand_tabs': False, 'break_long_words': False}
    wrap = textwrap.TextWrapper(**opts).wrap
    # NOTE(review): the following literals and the enclosing method header
    # were destroyed by the corruption; kept verbatim.
    openers = set(['
', '', ''])
    closers = set(['
', '', '', '
'])
    wrappers = set(['') or ('
\n' in bytes):
            self.output.write(bytes + '\n')
            if not self.nested:
                self.output.write('\n')
        elif bytes[:bytes.find('>')] in Printer.wrappers:
            if bytes.startswith('
<'):
                self.output.write('')
                self.output.write('\n')
            bytes = bytes[4:]
            bytes = bytes.replace(' \n', '\n')
            bytes = '\n'.join(Printer.wrap(bytes))
            self.output.write(bytes + '\n')
            if not self.nested:
                self.output.write('\n')
        elif bytes == '':
            self.output.write(bytes + '\n')
        else:
            print repr(bytes)
            self.output.write(bytes + '\n')
        self.output.write('\n\n')
        self.chunks = []
out = Printer()
kinds_cache = {}
r_simplelink = re.compile(
r'''(?imsx)([A-Za-z0-9\'_.-]+)[\ \n]+
(\\?)\{([^\s<>\'"}]+)\}(?![A-Za-z0-9])
''')
complexlink = r'(?ims)\{([^}]+?)(?: +-)?[ \n]+([^\s<>\'"}]+)\}'
r_complexlink = re.compile(complexlink)
r_footlink = re.compile(
r'''(?imsx)([A-Za-z0-9\'_.-]+)[\ \n]+
(\\?)\[([0-9]+|\#)\](?![A-Za-z0-9])
''')
r_astemphasis = re.compile(
r'''(?imsx)([*]?\*)(?=[A-Za-z0-9\'<_.-])
([^*]+)
(?<=[A-Za-z0-9\'>?!_.-])\*[*]?
''')
# _word_ / __word__ emphasis: groups are (escape, delimiter, contents).
# NOTE(review): the original pattern was corrupted (the span from a '<'
# in a lookaround to the next '>' was destroyed, leaving the invalid
# '(??!.-])'). Reconstructed by analogy with r_astemphasis and the
# three-group handler in parse_underscore_emphasis; confirm against an
# uncorrupted copy.
r_undemphasis = re.compile(
    r'''(?imsx)(\\?)(__?)(?=[A-Za-z0-9\'<_.-])
    ([^_]+)
    (?<=[A-Za-z0-9\'>?!_.-])__?(?![A-Za-z0-9])
    ''')
# Backslash-escaped entity reference: r'\&name;' / r'\&#160;' etc.
r_entity = re.compile(r'\\&((#[0-9]+|#x[0-9A-Fa-f]+|[A-Za-z]+);)')
def parse_escaped_entities(bytes):
    """Render backslash-escaped entity references as literal text.

    r'\&copy;' becomes '&amp;copy;' so the reader sees "&copy;" rather
    than the copyright sign. Runs last in the inline pipeline.
    """
    # NOTE(review): '&amp;' reconstructed -- the corrupted source had been
    # entity-decoded, collapsing the replacement to a bare '&' (which
    # would make the escape a no-op).
    return r_entity.sub(r'&amp;\g<1>', bytes)
def parse_simple_links(bytes):
    """Turn "word {uri}" simple links into anchors.

    A backslash before the brace escapes the link, which is then
    rendered literally as "word {uri}".
    """
    def link(match):
        word, togger, uri = match.groups()
        if not togger:
            # NOTE(review): anchor markup reconstructed -- the corrupted
            # source read '%s' % (uri, word): one placeholder for two
            # arguments, a guaranteed TypeError. The surviving comment
            # len('href="">') confirms an <a href=""> form was intended.
            return '<a href="%s">%s</a>' % (uri, word)
        else: return '%s {%s}' % (word, uri)
    return r_simplelink.sub(link, bytes)
# URIs harvested from footnote definition lines, consumed in document
# order by the link parsers.
document_footnotes = []
# A footnote definition: optional backslash escape, "[n]: uri".
r_footnote = re.compile(r'^(\\?)\[([0-9]+|#)\]: (\S+)')
# A footnote reference box such as "[1]" or "[#]".
# Fixed: the alternation was unparenthesized (r'\[[0-9]+|#\]'), which
# matched '[' + digits OR '#' + ']' and so never matched '[#]' at all.
r_footnotebox = re.compile(r'\[(?:[0-9]+|#)\]')
def parse_complex_links(bytes):
    """Turn "{phrase uri}" complex links into anchors.

    A uri written as a footnote box ("[1]" or "[#]") is resolved by
    consuming the next entry of document_footnotes.
    """
    def link(match):
        global document_footnotes
        phrase, uri = match.groups()
        if r_footnotebox.match(uri):
            uri = document_footnotes[0]
            document_footnotes = document_footnotes[1:]
        # NOTE(review): anchor markup reconstructed -- the corrupted source
        # read '%s' % (uri, phrase): one placeholder, two arguments.
        return '<a href="%s">%s</a>' % (uri, phrase)
    return r_complexlink.sub(link, bytes)
def parse_footlinks(bytes):
    """Turn "word [n]" footnote references into anchors.

    Each reference consumes the next URI from document_footnotes; '@@'
    marks a reference with no remaining footnote definition.
    """
    def link(match):
        global document_footnotes
        # NOTE(review): group 2 (the backslash escape) is captured by
        # r_footlink but never consulted here -- escaped footlinks are
        # treated like plain ones. Confirm whether that is intended.
        word = match.group(1)
        try: uri = document_footnotes[0]
        except IndexError: return '@@'
        document_footnotes = document_footnotes[1:]
        # NOTE(review): anchor markup reconstructed -- the corrupted source
        # read '%s' % (uri, word): one placeholder, two arguments.
        return '<a href="%s">%s</a>' % (uri, word)
    return r_footlink.sub(link, bytes)
# A {U+XX}, {U+XXXX} or {U+XXXXXX} codepoint escape, optionally
# backslash-escaped to suppress expansion.
r_unicode = re.compile(
    r'(\\?)\{U\+([0-9A-Fa-f]{2}|[0-9A-Fa-f]{4}|[0-9A-Fa-f]{6})\}'
)
def parse_unicode(bytes):
    """Expand {U+XXXX} escapes to the UTF-8 encoding of that codepoint.

    A backslash-escaped form is emitted literally, minus the backslash.
    """
    def codepoint(match):
        escaped, hexdigits = match.groups()
        if escaped:
            return '{U+' + hexdigits + '}'
        value = int(hexdigits, 16)
        return unichr(value).encode('utf-8')
    return r_unicode.sub(codepoint, bytes)
def parse_asterisk_emphasis(bytes):
    """Render *word* as <em> and **word** as <strong>.

    NOTE(review): the tags below were stripped by corruption (both
    returns read '' + contents + ''); reconstructed from the single
    versus double delimiter distinction -- confirm against an
    uncorrupted copy.
    """
    def emph(match):
        delim, contents = match.groups()
        if len(delim) == 1:
            return '<em>' + contents + '</em>'
        else: return '<strong>' + contents + '</strong>'
    return r_astemphasis.sub(emph, bytes)
def parse_underscore_emphasis(bytes):
    """Render _word_ as <em> and __word__ as <strong>.

    A backslash escape reproduces the delimited text literally.
    NOTE(review): the tags below were stripped by corruption;
    reconstructed to mirror parse_asterisk_emphasis.
    """
    def emph(match, bytes=bytes):
        esc, delim, contents = match.groups()
        if esc: return delim + contents + delim
        if len(delim) == 1:
            return '<em>' + contents + '</em>'
        else: return '<strong>' + contents + '</strong>'
    return r_undemphasis.sub(emph, bytes)
def parse_linebreaks(bytes):
    """Turn a trailing space before a newline into a hard line break.

    NOTE(review): the '<br />' tag was stripped by corruption (the
    replacement string contained only the newline); reconstructed --
    confirm the exact tag form against an uncorrupted copy.
    """
    return bytes.replace(' \n', '<br />\n')
# "(c)" and "(tm)" marks, optionally backslash-escaped.
r_copyright = re.compile(r'(\\?)\(c\)')
r_trademark = re.compile(r'(\\?)\(tm\)')
def parse_special(bytes):
    """Expand (c) and (tm) to their HTML entities.

    Backslash-escaped forms are emitted literally (minus the backslash).
    NOTE(review): '&copy;'/'&trade;' reconstructed -- the corrupted
    source had been entity-decoded, leaving raw '(c)'/'(tm)' characters.
    """
    def copyright(match):
        if not match.group(1):
            return '&copy;'
        else: return '(c)'
    bytes = r_copyright.sub(copyright, bytes)
    def trademark(match):
        if not match.group(1):
            return '&trade;'
        else: return '(tm)'
    bytes = r_trademark.sub(trademark, bytes)
    return bytes
block_elements = [
    # Cf. http://www.w3.org/TR/html401/struct/global.html#edef-BODY
    # Cf. http://www.w3.org/TR/html401/sgml/dtd.html#block
    'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'pre', 'dl', 'div',
    'noscript', 'blockquote', 'form', 'hr', 'table', 'fieldset', 'address',
    'script', 'ins', 'del'
]
# A line opening with a known block-level element tag.
r_htmlblock = re.compile(r'(?i)^<(%s)(?= |>|\Z)' % '|'.join(block_elements))
other_elements = [
    # Cf. http://www.w3.org/TR/html401/index/elements.html
    'a', 'abbr', 'acronym', 'applet', 'area', 'b', 'base', 'basefont', 'bdo',
    'big', 'body', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
    'colgroup', 'dd', 'dfn', 'dir', 'dt', 'em', 'font', 'frame', 'frameset',
    'head', 'html', 'i', 'iframe', 'img', 'input', 'isindex', 'kbd', 'label',
    'legend', 'li', 'link', 'map', 'menu', 'meta', 'noframes', 'object',
    'optgroup', 'option', 'param', 'q', 's', 'samp', 'select', 'small', 'span',
    'strike', 'strong', 'style', 'sub', 'sup', 'tbody', 'td', 'textarea',
    'tfoot', 'th', 'thead', 'title', 'tr', 'tt', 'u', 'var'
]
names = block_elements + other_elements
# A bare '&' not starting an entity reference, with optional escape.
r_amp = re.compile(r'(\\?)&(?!#[0-9]+;|#x[0-9A-Fa-f]+;|[A-Za-z]+;)')
# A '<' not opening a recognised HTML tag.
r_lt = re.compile(r'<(?!/?(?:%s)(?=[ \n>]))' % '|'.join(names))
# A backslash-escaped '<' that WOULD open a recognised tag.
r_esctag = re.compile(r'\\<(?=/?(?:%s))' % '|'.join(names))
def parse_escapables(bytes):
    """Escape stray '&' and '<' while letting real HTML pass through.

    NOTE(review): the '&amp;'/'&lt;' replacements were collapsed to raw
    '&'/'<' by entity decoding in the corrupted source; reconstructed.
    The escaped-ampersand branch (returning a bare '&') is the least
    certain reconstruction -- confirm against an uncorrupted copy.
    """
    def amp(match):
        if not match.group(1):
            return '&amp;'
        else: return '&'
    bytes = r_amp.sub(amp, bytes)
    # Non-tag '<' must be escaped; escaped tags render literally too.
    bytes = r_lt.sub('&lt;', bytes)
    bytes = r_esctag.sub('&lt;', bytes)
    return bytes
def parse_pre_escapables(bytes):
    """Escape every '&' and '<' for preformatted output.

    Fixed: the corrupted source had self-replacements ('&' -> '&', '<'
    -> '<', the entities having been decoded away), making this a no-op.
    The ampersand must be escaped first so '&lt;' is not double-escaped.
    """
    bytes = bytes.replace('&', '&amp;')
    bytes = bytes.replace('<', '&lt;')
    return bytes
def parse_inline(bytes):
    """Run the full inline-markup pipeline over a block's text."""
    # parse_unicode must run first so the link parsers see decoded text;
    # escaping runs near the end, entity unescaping last.
    stages = (
        parse_unicode,
        parse_simple_links,
        parse_complex_links,
        parse_footlinks,
        parse_asterisk_emphasis,
        parse_underscore_emphasis,
        parse_linebreaks,
        parse_special,
        parse_escapables,
        parse_escaped_entities,
    )
    for stage in stages:
        bytes = stage(bytes)
    return bytes
def parse_inline_preformatted(bytes):
    """Inline pipeline for preformatted text: no links or emphasis.

    Only codepoint escapes, &/< escaping, and escaped entities apply.
    """
    bytes = parse_unicode(bytes)  # must come before any link handling
    bytes = parse_pre_escapables(bytes)
    return parse_escaped_entities(bytes)
def parse_inline_citation(bytes):
    """Inline pipeline for blockquote citations.

    Identical to parse_inline except that hard linebreak handling is
    skipped.
    """
    # parse_unicode must run first so the link parsers see decoded text.
    for stage in (parse_unicode, parse_simple_links, parse_complex_links,
                  parse_footlinks, parse_asterisk_emphasis,
                  parse_underscore_emphasis, parse_special,
                  parse_escapables, parse_escaped_entities):
        bytes = stage(bytes)
    return bytes
def parse_h1(block, prevkind, nextkind):
    """Render an h1 block: "{ Title }" braces, or text plus an underline.

    NOTE(review): the <h1></h1> tags were stripped by corruption;
    reconstructed from the h1/h2/h3 handler symmetry.
    """
    if block[0].startswith('{'):
        lines = '\n'.join(block)
        lines = lines[1:-1]  # strip the single surrounding braces
    else: lines = '\n'.join(block[:-1])  # drop the underline line
    lines = lines.strip(' \n')
    out.write('<h1>' + parse_inline(lines) + '</h1>')
    out.flush()
def parse_h2(block, prevkind, nextkind):
    """Render an h2 block: "{{ Title }}" braces, or text plus an underline.

    NOTE(review): the <h2></h2> tags were stripped by corruption;
    reconstructed from the h1/h2/h3 handler symmetry.
    """
    if block[0].startswith('{'):
        lines = '\n'.join(block)
        lines = lines[2:-2]  # strip the double surrounding braces
    else: lines = '\n'.join(block[:-1])  # drop the underline line
    lines = lines.strip(' \n')
    out.write('<h2>' + parse_inline(lines) + '</h2>')
    out.flush()
def parse_h3(block, prevkind, nextkind):
    """Render an h3 block: "{{{ Title }}}" braces, or text plus an underline.

    NOTE(review): the <h3></h3> tags were stripped by corruption;
    reconstructed from the h1/h2/h3 handler symmetry.
    """
    if block[0].startswith('{'):
        lines = '\n'.join(block)
        lines = lines[3:-3]  # strip the triple surrounding braces
    else: lines = '\n'.join(block[:-1])  # drop the underline line
    lines = lines.strip(' \n')
    out.write('<h3>' + parse_inline(lines) + '</h3>')
    out.flush()
def parse_ul(block, prevkind, nextkind):
    # Render an unordered-list block: split it into items on the bullet
    # delimiter, de-indent each item, and parse each item's sub-blocks
    # with default kind 'li'.
    # NOTE(review): several out.write literals below were corrupted (tags
    # stripped, strings split across lines); code tokens kept exactly as
    # found -- recover the list markup from an uncorrupted copy.
    out.line('')
    out.flush()
    items = []
    # The bullet actually used: '* ', '+ ', or '- '.
    delimiter = block[0][:2]
    for line in block:
        if line.startswith(delimiter):
            # Replace the bullet with a space so continuations align.
            line = ' ' + line[2:]
            items.append([line])
        else: items[-1].append(line)
    for item in items:
        # Find the smallest shared leading indent among the item's lines.
        spaces = []
        for line in item:
            space = len(line) - len(line.lstrip(' '))
            if not space: break
            spaces.append(space)
        if spaces:
            trim = min(spaces)
            if trim:
                for i, line in enumerate(item):
                    item[i] = line[trim:]
        def item_blocks(item=item):
            # Yield the item's blank-line-separated blocks, preceded by a
            # None start marker (same protocol as blocks_from_file).
            block = []
            yield None
            for line in item:
                if line: block.append(line)
                else:
                    yield block
                    block = []
            if block:
                yield block
        out.write('- ')
        for prev, block, next in window(item_blocks()):
            parse_block(prev, block, next, default='li')
        out.write('
')
        out.flush()
    out.write('
')
    out.flush()
# Ordered-list item markers: "1. " style and "1) " style.
r_olfullstop = re.compile(r'\d+\. ')
r_olcparen = re.compile(r'\d+\) ')
def parse_ol(block, prevkind, nextkind):
    # Render an ordered-list block; mirrors parse_ul but splits items on
    # numbered markers instead of bullets.
    # NOTE(review): several out.write literals below were corrupted (tags
    # stripped, strings split across lines); code tokens kept exactly as
    # found -- recover the list markup from an uncorrupted copy.
    out.write('')
    out.flush()
    # Pick the marker regex from the first item's punctuation character.
    r_delim = {
        '.': r_olfullstop,
        ')': r_olcparen
    }[block[0][1]]
    items = []
    for line in block:
        m = r_delim.match(line)
        if m:
            end = m.end()
            # Blank out the marker so continuation lines align.
            line = (' ' * end) + line[end:]
            items.append([line])
        else: items[-1].append(line)
    for item in items:
        # Find the smallest shared leading indent among the item's lines.
        spaces = []
        for line in item:
            space = len(line) - len(line.lstrip(' '))
            if not space: break
            spaces.append(space)
        if spaces:
            trim = min(spaces)
            if trim:
                for i, line in enumerate(item):
                    item[i] = line[trim:]
        def item_blocks(item=item):
            # Yield the item's blank-line-separated blocks, preceded by a
            # None start marker (same protocol as blocks_from_file).
            block = []
            yield None
            for line in item:
                if line: block.append(line)
                else:
                    yield block
                    block = []
            if block:
                yield block
        out.write('- ')
        for prev, block, next in window(item_blocks()):
            parse_block(prev, block, next, default='li')
        out.write('
')
        out.flush()
    out.write('
')
    out.flush()
def parse_blockquote(block, prevkind, nextkind):
    # Render a blockquote. Two syntaxes: lines prefixed with '> ', or a
    # span bracketed by '[[[' / ']]]' (which may extend across several
    # blocks -- tracked via the parse_blockquote.extended flag). A final
    # line beginning with '-' supplies a citation.
    # NOTE(review): several out.write literals below were corrupted (tags
    # stripped, strings split across lines, entities decoded); code
    # tokens kept exactly as found.
    if block[0].startswith('> '):
        for i, line in enumerate(block):
            if line.startswith('> '):
                block[i] = line[2:]
        # A trailing "- name" line becomes the citation.
        if block[-1].lstrip(' ').startswith('-'):
            block, citation = block[:-1], block[-1].lstrip(' -')
        else: citation = None
        quote = '\n'.join(block)
        start, finish = True, True
    else:
        # Bracketed form: start/finish record whether the opening and
        # closing tags should be emitted by this call.
        if (block[0] == '[[[') and block[-1].startswith(']]]'):
            block, citation = block[1:-1], block[-1].lstrip('] -') or None
            # elif get_block_kind.blockquote and (block[0] == '[[['):
            # block, citation = block[1:], None
            # parse_blockquote.extended = True
            start, finish = True, True
        elif block[0] == '[[[':
            block, citation = block[1:], None
            parse_blockquote.extended = True
            start, finish = True, False
        # elif parse_blockquote.extended and block[-1].startswith(']]]'):
        # block, citation = block[:-1], block[-1].lstrip('] -') or None
        # parse_blockquote.extended = False
        elif block[-1].startswith(']]]'):
            block, citation = block[:-1], block[-1].lstrip('] -') or None
            parse_blockquote.extended = False
            start, finish = False, True
        else:
            citation = None
            start, finish = False, False
        quote = '\n'.join(block)
        # elif block[0].startswith('[[['):
        # block, citation = block[1:-1], block[-1].lstrip('] -') or None
        # quote = '\n'.join(block)
    if start:
        out.write('')
        out.flush()
    try: parse_block(None, block, None)
    except RuntimeError, e:
        # print block
        raise e
    # out.write('')
    # out.write(parse_inline(quote))
    # out.write('</p>')
    # out.flush()
    if citation is not None:
        out.write('—')
        out.write(parse_inline_citation(citation)) # @@ or just parse_inline?
        out.write('
')
        out.flush()
    if finish:
        out.write('
')
        out.flush()
    # out.flush()
# Module-level state: True while inside an unclosed '[[[' span.
parse_blockquote.extended = False
def parse_htmlblock(block, prevkind, nextkind):
    # Using an HTML block? You're on your own!
    # @@ Will this safety out.flush cause problems?
    # Answer: yes. False space.
    # out.flush()
    # Emit the raw HTML lines verbatim (bypassing the Printer), followed
    # by a blank line.
    for line in block:
        print line
    print
def parse_poesy(block, prevkind, nextkind):
    """Render a verse ('poesy') block: paragraph rendering with every
    line break kept hard."""
    return parse_p_like(block, prevkind, nextkind, poesy=True)
def parse_pre(block, prevkind, nextkind):
    """Render a preformatted block, merging adjacent 'pre' blocks.

    The opening tag is emitted only when the previous block was not
    'pre', the closing tag only when the next is not; '{{{' / '}}}'
    fences may span several blocks (parse_pre.extended tracks that).
    NOTE(review): the <pre></pre> tags were stripped by corruption;
    reconstructed.
    """
    # if (prevkind != 'pre') or get_block_kind.pre:
    if prevkind != 'pre':
        out.write('<pre>')
        out.write('\n')
    # Strip the brace fences, tracking multi-block fenced spans.
    if (block[0] == '{{{') and (block[-1] == '}}}'):
        block = block[1:-1]
    elif get_block_kind.pre and (block[0] == '{{{'):
        block = block[1:]
        parse_pre.extended = True
    elif parse_pre.extended and (block[-1] == '}}}'):
        block = block[:-1]
        parse_pre.extended = False
    if not get_block_kind.pre:
        # Remove the smallest shared leading indent.
        spaces = []
        for line in block:
            spaces.append(len(line) - len(line.lstrip(' ')))
        trim = min(spaces)
        if trim:
            for i, line in enumerate(block):
                block[i] = line[trim:]
    out.write(parse_inline_preformatted('\n'.join(block)) + '\n')
    if (nextkind != 'pre'):
        out.write('</pre>')
        out.flush()
        out.line()
# Module-level state: True while inside an unclosed '{{{' fence.
parse_pre.extended = False
def parse_hr(block, prevkind, nextkind):
    """Render a horizontal-rule block.

    NOTE(review): the tag was stripped by corruption (the call held only
    a broken empty string); '<hr />' reconstructed -- confirm the exact
    form against an uncorrupted copy.
    """
    out.write('<hr />')
    out.flush()
def parse_footnote(block, prevkind, nextkind):
    """No-op: footnote definition blocks emit no output here.

    Their URIs are collected up front by process_footnotes() and
    consumed by the link parsers.
    """
    pass
def parse_p(block, prevkind, nextkind):
    """Render an ordinary paragraph block."""
    return parse_p_like(block, prevkind, nextkind)
def parse_p_like(block, prevkind, nextkind, poesy=False):
    """Render a paragraph-like block; with poesy=True keep hard breaks.

    The <p> wrapper is suppressed when both neighbours are list items.
    NOTE(review): the <p></p> and <br /> tags were stripped by
    corruption; reconstructed.
    """
    # Poesy blocks may end with a whitespace-only marker line; drop it.
    if poesy and not block[-1].strip(' \t\r\n'):
        block = block[:-1]
    bytes = '\n'.join(block)
    bytes = parse_inline(bytes)
    bytes = bytes.strip(' \n')
    if poesy:
        bytes = bytes.replace('\n', '<br />\n')
    if not bytes:
        return
    if (prevkind != 'li') or (nextkind != 'li'):
        out.write('<p>')
    out.write(bytes)
    if (prevkind != 'li') or (nextkind != 'li'):
        out.write('</p>')
    out.flush()
# NOTE(review): the source was corrupted here -- the span from '(?<' in
# the r_spaces pattern to the '>=' of the h1 test was destroyed, taking
# the get_block_kind() header, its local-variable setup and the first
# half of the h1 test with it. Everything down to "return 'h1'" is
# reconstructed by analogy with the intact h2/h3 tests; confirm against
# an uncorrupted copy.
r_spaces = re.compile(r'(?<=\S)\s\s')  # internal multi-space run ('pre' heuristic)
def get_block_kind(block):
    """Classify a block (list of lines) and return its kind name.

    Kinds: h1, h2, h3, ul, ol, blockquote, htmlblock, poesy, pre, hr,
    footnote, or the default 'p'.
    """
    last = len(block) - 1
    length = len(block)
    # Tests for 'h1': "{ Title }" braces, or a '#' underline
    if ((block[0].startswith('{ ') and
         block[-1].endswith(' }')) or
        (block[-1].startswith('#') and
         (len(block[-1]) >= 3) and
         (length == 2) and
         not block[-1].strip('#'))):
        return 'h1'
    # Tests for 'h2'
    if ((block[0].startswith('{{ ') and
         block[-1].endswith(' }}')) or
        (block[-1].startswith('=') and
         (len(block[-1]) >= 3) and
         (length == 2) and
         not block[-1].strip('='))):
        return 'h2'
    # Tests for 'h3'
    if ((block[0].startswith('{{{ ') and
         block[-1].endswith(' }}}')) or
        (block[-1].startswith('-') and
         (len(block[-1]) >= 3) and
         (length == 2) and
         not block[-1].strip('-'))):
        return 'h3'
    # Tests for 'ul'
    if (block[0].startswith('* ') or
        block[0].startswith('+ ') or
        block[0].startswith('- ')):
        # '- ' alone is ambiguous (hr, h3 underline); require a second
        # '- ' line before treating it as a list.
        if block[0].startswith('- '):
            for line in block[1:]:
                if line.startswith('- '):
                    return 'ul'
        else: return 'ul'
    # Tests for 'ol'
    if (block[0].startswith('1. ') or
        block[0].startswith('1) ')):
        # Force starting with "1"
        return 'ol'
    # Tests for 'blockquote'
    if block[0].startswith('> '):
        return 'blockquote'
    if (block[0] == '[[['):
        if not block[last].startswith(']]]'):
            get_block_kind.blockquote = True
        return 'blockquote'
    if block[last].startswith(']]]'): # The stateless one needs this
        return 'blockquote'
    # Test for 'htmlblock'
    if r_htmlblock.match(block[0]):
        return 'htmlblock'
    # Test for 'poesy'
    if block[last] == ' ':
        return 'poesy'
    elif (block[last].endswith(' ') and
          (len(block[last]) > 3) and
          (block[last][-4] != ' ')):
        return 'poesy'
    # Tests for 'pre'
    for pos, line in enumerate(block):
        if (r_spaces.search(line[:-1]) or # Contains whitespace
            line.startswith(' ') or # Indented
            line.startswith('# ') or # Hash comment
            line.startswith('/* ') or # Multiline C/CSS comment
            line.startswith('// ')): # Single line C/CSS comment
            return 'pre'
    if (block[0] == '{{{'):
        if not (block[last] == '}}}'):
            get_block_kind.pre = True
        return 'pre'
    if (block[last] == '}}}'): # The stateless one needs this
        return 'pre'
    # Tests for 'hr'
    if length == 1:
        line = block[last]
        alpha = 'ABCDEFGHIJKLMNOPQRSTUVXWYZ'
        little = '-=#' # '_.-=+|%^&*,:;|'
        big = alpha + alpha.lower() + '0123456789' + little
        if len(line) >= 15:
            first = line[0]
            if (first in big) and (not line.strip(first)):
                return 'hr'
        elif len(line) >= 3:
            first = line[0]
            if (first in little) and (not line.strip(first)):
                return 'hr'
    # Test for 'footnotes'
    if r_footnote.match(block[0]):
        return 'footnote'
    # Default is 'p'
    return 'p'
# Module-level state for multi-block '{{{' / '[[[' spans.
get_block_kind.pre = False
get_block_kind.blockquote = False
def window(seq, n=3):
    """Returns a sliding window (of width n) over data from the iterable
    s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...

    The sequence is padded with n trailing Nones so each final element
    passes through every window position. (Fixed: the padding was
    hard-coded to three Nones regardless of n.)
    """
    it = itertools.chain(iter(seq), itertools.repeat(None, n))
    result = tuple(itertools.islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result
def blocks_from_file(f):
    """Yield None, then each blank-line-separated block of the stream.

    A block is a list of lines with their trailing newlines stripped;
    the leading None is the start marker parse_block expects.
    """
    yield None
    current = []
    for raw in f:
        stripped = raw.rstrip('\r\n')
        if stripped:
            current.append(stripped)
        elif current:
            yield current
            current = []
    if current:
        yield current
def _block_kind(block):
    """Return the block's kind, memoised in kinds_cache by tuple(block)."""
    key = tuple(block)
    try:
        return kinds_cache[key]
    except KeyError:
        kind = get_block_kind(block)
        kinds_cache[key] = kind
        return kind
def parse_block(prev, block, next, default='p'):
    """Dispatch a block to its parse_<kind> handler.

    prev/next supply context kinds (default stands in when absent).
    Returns True after parsing, None when block is None (end marker).
    """
    # @@! kinds_cache doesn't really work
    if block is None:
        return
    kind = _block_kind(block)
    prevkind = _block_kind(prev) if prev is not None else default
    nextkind = _block_kind(next) if next is not None else default
    # Look the handler up by name instead of eval() -- same dispatch,
    # no arbitrary-code evaluation.
    parse = globals()['parse_' + kind]
    # Pass a copy: several handlers mutate the block in place.
    parse(block[:], prevkind, nextkind)
    return True
def process_footnotes(f):
    """Collect every unescaped footnote URI from the stream, then rewind.

    Fills document_footnotes in document order for the link parsers.
    """
    for line in f:
        m = r_footnote.match(line)
        if m is None:
            continue
        togger, num, uri = m.groups()
        if not togger:
            document_footnotes.append(uri)
    f.seek(0)
def parse_file(f):
    """Parse a whole input stream, rendering each block through `out`."""
    import sys
    if f is sys.stdin:
        # stdin is not seekable; buffer it so process_footnotes can rewind.
        from cStringIO import StringIO
        buffered = StringIO()
        for line in f:
            buffered.write(line)
        buffered.seek(0)
        f = buffered
    process_footnotes(f)
    for before, current, after in window(blocks_from_file(f)):
        # parse_block returns None once the padding Nones arrive.
        if not parse_block(before, current, after):
            break
def test():
    """Ad-hoc driver: render ./input.txt if it is present."""
    if os.path.exists('input.txt'):
        source = open('input.txt')
        parse_file(source)
        source.close()
def main():
    """Entry point: render the file named in argv[1], or else stdin."""
    import sys
    if len(sys.argv) > 1:
        source = open(sys.argv[1])
        parse_file(source)
        source.close()
    else:
        parse_file(sys.stdin)
if __name__ == '__main__':
    main()