#!/usr/bin/env python """ Avocet - A Structured Text language Author: Sean B. Palmer, inamidst.com Cf. http://inamidst.com/proj/avocet/ """ import os, re, itertools class Printer(object): import textwrap r_wordsep = re.compile(r'(\s+|(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))') textwrap.TextWrapper.wordsep_re = r_wordsep opts = {'width': 79, 'expand_tabs': False, 'break_long_words': False} wrap = textwrap.TextWrapper(**opts).wrap openers = set(['

']) closers = set(['

', '', '', '

']) wrappers = set(['') or ('
\n' in bytes): self.output.write(bytes + '\n') if not self.nested: self.output.write('\n') elif bytes[:bytes.find('>')] in Printer.wrappers: if bytes.startswith('

<'): self.output.write('

') self.output.write('\n') bytes = bytes[4:] bytes = bytes.replace(' \n', '\n') bytes = '\n'.join(Printer.wrap(bytes)) self.output.write(bytes + '\n') if not self.nested: self.output.write('\n') elif bytes == '

': self.output.write(bytes + '\n') else: print repr(bytes) self.output.write(bytes + '\n') self.output.write('\n\n') self.chunks = [] out = Printer() kinds_cache = {} r_simplelink = re.compile( r'''(?imsx)([A-Za-z0-9\'_.-]+)[\ \n]+ (\\?)\{([^\s<>\'"}]+)\}(?![A-Za-z0-9]) ''') complexlink = r'(?ims)\{([^}]+?)(?: +-)?[ \n]+([^\s<>\'"}]+)\}' r_complexlink = re.compile(complexlink) r_footlink = re.compile( r'''(?imsx)([A-Za-z0-9\'_.-]+)[\ \n]+ (\\?)\[([0-9]+|\#)\](?![A-Za-z0-9]) ''') r_astemphasis = re.compile( r'''(?imsx)([*]?\*)(?=[A-Za-z0-9\'<_.-]) ([^*]+) (?<=[A-Za-z0-9\'>?!_.-])\*[*]? ''') r_undemphasis = re.compile( r'''(?imsx)(??!.-])__?(?![A-Za-z0-9]) ''') r_entity = re.compile(r'\\&((#[0-9]+|#x[0-9A-Fa-f]+|[A-Za-z]+);)') def parse_escaped_entities(bytes): return r_entity.sub(r'&\g<1>', bytes) def parse_simple_links(bytes): def link(match): word, togger, uri = match.groups() if not togger: # if len(uri) >= 23: # (79 / (phi * 2)) - len('href="">') # return '%s' % (uri, word) # else: return '%s' % (uri, word) else: return '%s {%s}' % (word, uri) return r_simplelink.sub(link, bytes) document_footnotes = [] r_footnote = re.compile(r'^(\\?)\[([0-9]+|#)\]: (\S+)') r_footnotebox = re.compile(r'\[[0-9]+|#\]') def parse_complex_links(bytes): def link(match): global document_footnotes phrase, uri = match.groups() if r_footnotebox.match(uri): uri = document_footnotes[0] document_footnotes = document_footnotes[1:] # if len(uri) >= 23: # (79 / (phi * 2)) - len('href="">') # return '%s' % (uri, phrase) # else: return '%s' % (uri, phrase) return r_complexlink.sub(link, bytes) def parse_footlinks(bytes): def link(match): global document_footnotes word = match.group(1) try: uri = document_footnotes[0] except IndexError: return '@@' document_footnotes = document_footnotes[1:] return '%s' % (uri, word) return r_footlink.sub(link, bytes) r_unicode = re.compile( r'(\\?)\{U\+([0-9A-Fa-f]{2}|[0-9A-Fa-f]{4}|[0-9A-Fa-f]{6})\}' ) def parse_unicode(bytes): def codepoint(match): esc, cp = match.groups() if esc: return '{U+' + cp + '}' i = int(cp, 16) return unichr(i).encode('utf-8') return r_unicode.sub(codepoint, bytes) def parse_asterisk_emphasis(bytes): def emph(match): delim, contents = match.groups() if len(delim) == 1: return '' + contents + '' else: return '' + contents + '' return r_astemphasis.sub(emph, bytes) def parse_underscore_emphasis(bytes): def emph(match, bytes=bytes): esc, delim, contents = match.groups() if esc: return delim + contents + delim if len(delim) == 1: return '' + contents + '' else: return '' + contents + '' return r_undemphasis.sub(emph, bytes) def parse_linebreaks(bytes): return bytes.replace(' \n', '
\n') r_copyright = re.compile(r'(\\?)\(c\)') r_trademark = re.compile(r'(\\?)\(tm\)') def parse_special(bytes): def copyright(match): if not match.group(1): return '©' else: return '(c)' bytes = r_copyright.sub(copyright, bytes) def trademark(match): if not match.group(1): return '™' else: return '(tm)' bytes = r_trademark.sub(trademark, bytes) return bytes block_elements = [ # Cf. http://www.w3.org/TR/html401/struct/global.html#edef-BODY # Cf. http://www.w3.org/TR/html401/sgml/dtd.html#block 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'pre', 'dl', 'div', 'noscript', 'blockquote', 'form', 'hr', 'table', 'fieldset', 'address', 'script', 'ins', 'del' ] r_htmlblock = re.compile(r'(?i)^<(%s)(?= |>|\Z)' % '|'.join(block_elements)) other_elements = [ # Cf. http://www.w3.org/TR/html401/index/elements.html 'a', 'abbr', 'acronym', 'applet', 'area', 'b', 'base', 'basefont', 'bdo', 'big', 'body', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'dd', 'dfn', 'dir', 'dt', 'em', 'font', 'frame', 'frameset', 'head', 'html', 'i', 'iframe', 'img', 'input', 'isindex', 'kbd', 'label', 'legend', 'li', 'link', 'map', 'menu', 'meta', 'noframes', 'object', 'optgroup', 'option', 'param', 'q', 's', 'samp', 'select', 'small', 'span', 'strike', 'strong', 'style', 'sub', 'sup', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'title', 'tr', 'tt', 'u', 'var' ] names = block_elements + other_elements r_amp = re.compile(r'(\\?)&(?!#[0-9]+;|#x[0-9A-Fa-f]+;|[A-Za-z]+;)') r_lt = re.compile(r'<(?!/?(?:%s)(?=[ \n>]))' % '|'.join(names)) r_esctag = re.compile(r'\\<(?=/?(?:%s))' % '|'.join(names)) def parse_escapables(bytes): def amp(match): if not match.group(1): return '&' else: return '&' bytes = r_amp.sub(amp, bytes) bytes = r_lt.sub('<', bytes) bytes = r_esctag.sub('<', bytes) return bytes def parse_pre_escapables(bytes): bytes = bytes.replace('&', '&') bytes = bytes.replace('<', '<') return bytes def parse_inline(bytes): bytes = parse_unicode(bytes) # Must come before links bytes = parse_simple_links(bytes) bytes = parse_complex_links(bytes) bytes = parse_footlinks(bytes) bytes = parse_asterisk_emphasis(bytes) bytes = parse_underscore_emphasis(bytes) bytes = parse_linebreaks(bytes) bytes = parse_special(bytes) bytes = parse_escapables(bytes) bytes = parse_escaped_entities(bytes) return bytes def parse_inline_preformatted(bytes): bytes = parse_unicode(bytes) # Must come before links # bytes = parse_simple_links(bytes) # bytes = parse_complex_links(bytes) # bytes = parse_footlinks(bytes) # bytes = parse_asterisk_emphasis(bytes) bytes = parse_pre_escapables(bytes) bytes = parse_escaped_entities(bytes) return bytes def parse_inline_citation(bytes): bytes = parse_unicode(bytes) # Must come before links bytes = parse_simple_links(bytes) bytes = parse_complex_links(bytes) bytes = parse_footlinks(bytes) bytes = parse_asterisk_emphasis(bytes) bytes = parse_underscore_emphasis(bytes) bytes = parse_special(bytes) bytes = parse_escapables(bytes) bytes = parse_escaped_entities(bytes) return bytes def parse_h1(block, prevkind, nextkind): if block[0].startswith('{'): lines = '\n'.join(block) lines = lines[1:-1] else: lines = '\n'.join(block[:-1]) lines = lines.strip(' \n') out.write('

' + parse_inline(lines) + '

') out.flush() def parse_h2(block, prevkind, nextkind): if block[0].startswith('{'): lines = '\n'.join(block) lines = lines[2:-2] else: lines = '\n'.join(block[:-1]) lines = lines.strip(' \n') out.write('

' + parse_inline(lines) + '

') out.flush() def parse_h3(block, prevkind, nextkind): if block[0].startswith('{'): lines = '\n'.join(block) lines = lines[3:-3] else: lines = '\n'.join(block[:-1]) lines = lines.strip(' \n') out.write('

' + parse_inline(lines) + '

') out.flush() def parse_ul(block, prevkind, nextkind): out.line('

') for prev, block, next in window(item_blocks()): parse_block(prev, block, next, default='li') out.write('

') out.flush() r_olfullstop = re.compile(r'\d+\. ') r_olcparen = re.compile(r'\d+\) ') def parse_ol(block, prevkind, nextkind): out.write('

') for prev, block, next in window(item_blocks()): parse_block(prev, block, next, default='li') out.write('

') out.flush() def parse_blockquote(block, prevkind, nextkind): if block[0].startswith('> '): for i, line in enumerate(block): if line.startswith('> '): block[i] = line[2:] if block[-1].lstrip(' ').startswith('-'): block, citation = block[:-1], block[-1].lstrip(' -') else: citation = None quote = '\n'.join(block) start, finish = True, True else: if (block[0] == '[[[') and block[-1].startswith(']]]'): block, citation = block[1:-1], block[-1].lstrip('] -') or None # elif get_block_kind.blockquote and (block[0] == '[[['): # block, citation = block[1:], None # parse_blockquote.extended = True start, finish = True, True elif block[0] == '[[[': block, citation = block[1:], None parse_blockquote.extended = True start, finish = True, False # elif parse_blockquote.extended and block[-1].startswith(']]]'): # block, citation = block[:-1], block[-1].lstrip('] -') or None # parse_blockquote.extended = False elif block[-1].startswith(']]]'): block, citation = block[:-1], block[-1].lstrip('] -') or None parse_blockquote.extended = False start, finish = False, True else: citation = None start, finish = False, False quote = '\n'.join(block) # elif block[0].startswith('[[['): # block, citation = block[1:-1], block[-1].lstrip('] -') or None # quote = '\n'.join(block) if start: out.write('

') out.flush() parse_block(None, block, None) # out.write('
') # out.write(parse_inline(quote)) # out.write('
') # out.flush() if citation is not None: out.write('
—') out.write(parse_inline_citation(citation)) # @@ or just parse_inline? out.write('
') out.flush() if finish: out.write('

') out.flush() # out.flush() parse_blockquote.extended = False def parse_htmlblock(block, prevkind, nextkind): # Using an HTML block? You're on your own! # @@ Will this safety out.flush cause problems? # Answer: yes. False space. # out.flush() for line in block: print line print def parse_poesy(block, prevkind, nextkind): parse_p_like(block, prevkind, nextkind, poesy=True) def parse_pre(block, prevkind, nextkind): # if (prevkind != 'pre') or get_block_kind.pre: if prevkind != 'pre': out.write('

')
      out.write('\n')

   if (block[0] == '{{{') and (block[-1] == '}}}'): 
      block = block[1:-1]
   elif get_block_kind.pre and (block[0] == '{{{'): 
      block = block[1:]
      parse_pre.extended = True
   elif parse_pre.extended and (block[-1] == '}}}'): 
      block = block[:-1]
      parse_pre.extended = False

   if not get_block_kind.pre: 
      spaces = []
      for line in block: 
         spaces.append(len(line) - len(line.lstrip(' ')))
      trim = min(spaces)
      if trim: 
         for i, line in enumerate(block): 
            block[i] = line[trim:]

   out.write(parse_inline_preformatted('\n'.join(block)) + '\n')

   if (nextkind != 'pre'): 
      out.write('

') out.flush() out.line() parse_pre.extended = False def parse_hr(block, prevkind, nextkind): out.write('

') out.flush() def parse_footnote(block, prevkind, nextkind): pass def parse_p(block, prevkind, nextkind): parse_p_like(block, prevkind, nextkind) def parse_p_like(block, prevkind, nextkind, poesy=False): if poesy and not block[-1].strip(' \t\r\n'): block = block[:-1] bytes = '\n'.join(block) bytes = parse_inline(bytes) bytes = bytes.strip(' \n') if poesy: bytes = bytes.replace('\n', '
\n') if not bytes: return if (prevkind != 'li') or (nextkind != 'li'): out.write('

') out.write(bytes) if (prevkind != 'li') or (nextkind != 'li'): out.write('

') out.flush() r_spaces = re.compile(r'(?= 3) and (length == 2) and not block[-1].strip('#'))): return 'h1' # Tests for 'h2' if ((block[0].startswith('{{ ') and block[-1].endswith(' }}')) or (block[-1].startswith('=') and (len(block[-1]) >= 3) and (length == 2) and not block[-1].strip('='))): return 'h2' # Tests for 'h3' if ((block[0].startswith('{{{ ') and block[-1].endswith(' }}}')) or (block[-1].startswith('-') and (len(block[-1]) >= 3) and (length == 2) and not block[-1].strip('-'))): return 'h3' # Tests for 'ul' if (block[0].startswith('* ') or block[0].startswith('+ ') or block[0].startswith('- ')): if block[0].startswith('- '): for line in block[1:]: if line.startswith('- '): return 'ul' else: return 'ul' # Tests for 'ol' if (block[0].startswith('1. ') or block[0].startswith('1) ')): # Force starting with "1" return 'ol' # Tests for 'blockquote' # if (block[0].startswith('> ') or # ((block[0] == '[[[') and # (block[last].startswith(']]]')))): # return 'blockquote' if block[0].startswith('> '): return 'blockquote' if (block[0] == '[[['): if not block[last].startswith(']]]'): get_block_kind.blockquote = True return 'blockquote' if block[last].startswith(']]]'): # The stateless one needs this return 'blockquote' # Test for 'htmlblock' if r_htmlblock.match(block[0]): return 'htmlblock' # Test for 'poesy' if block[last] == ' ': return 'poesy' elif (block[last].endswith(' ') and (len(block[last]) > 3) and (block[last][-4] != ' ')): return 'poesy' # Tests for 'pre' for pos, line in enumerate(block): if (r_spaces.search(line[:-1]) or # Contains whitespace line.startswith(' ') or # Indented line.startswith('# ') or # Hash comment line.startswith('/* ') or # Multiline C/CSS comment line.startswith('// ')): # Single line C/CSS comment return 'pre' if (block[0] == '{{{'): if not (block[last] == '}}}'): get_block_kind.pre = True return 'pre' if (block[last] == '}}}'): # The stateless one needs this return 'pre' # Tests for 'hr' if length == 1: line = block[last] alpha = 'ABCDEFGHIJKLMNOPQRSTUVXWYZ' little = '-=#' # '_.-=+|%^&*,:;|' big = alpha + alpha.lower() + '0123456789' + little if len(line) >= 15: first = line[0] if (first in big) and (not line.strip(first)): return 'hr' elif len(line) >= 3: first = line[0] if (first in little) and (not line.strip(first)): return 'hr' # Test for 'footnotes' if r_footnote.match(block[0]): return 'footnote' # Default is 'p' return 'p' get_block_kind.pre = False get_block_kind.blockquote = False def window(seq, n=3): """Returns a sliding window (of width n) over data from the iterable s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ... """ it = itertools.chain(iter(seq), iter([None, None, None])) result = tuple(itertools.islice(it, n)) if len(result) == n: yield result for elem in it: result = result[1:] + (elem,) yield result def blocks_from_file(f): yield None block = [] for line in f: line = line.rstrip('\r\n') if line: block.append(line) elif block: yield block block = [] if block: yield block def parse_block(prev, block, next, default='p'): if block is None: return try: kind = kinds_cache[tuple(block)] except KeyError: kind = get_block_kind(block) kinds_cache[tuple(block)] = kind if prev is not None: try: prevkind = kinds_cache[tuple(prev)] except KeyError: prevkind = get_block_kind(prev) kinds_cache[tuple(prev)] = prevkind else: prevkind = default if next is not None: try: nextkind = kinds_cache[tuple(next)] except KeyError: nextkind = get_block_kind(next) kinds_cache[tuple(next)] = nextkind else: nextkind = default parse = eval('parse_' + kind) parse(block[:], prevkind, nextkind) return True def process_footnotes(f): for line in f: m = r_footnote.match(line) if m: togger, num, uri = m.groups() if not togger: document_footnotes.append(uri) f.seek(0) def parse_file(f): import sys if f is sys.stdin: from cStringIO import StringIO c = StringIO() for line in f: c.write(line) c.seek(0) f = c process_footnotes(f) for prev, block, next in window(blocks_from_file(f)): result = parse_block(prev, block, next) if not result: break def test(): if os.path.exists('input.txt'): f = open('input.txt') parse_file(f) f.close() def main(): import sys if len(sys.argv) > 1: f = open(sys.argv[1]) parse_file(f) f.close() else: parse_file(sys.stdin) if __name__ == '__main__': main()