#!/usr/bin/env python
"""
Avocet - A Structured Text language
Author: Sean B. Palmer, inamidst.com
Cf. http://inamidst.com/proj/avocet/
"""

import os, re, itertools

class Printer(object): 
   """Buffer chunks of HTML and emit them as wrapped, spaced-out blocks.

   Callers queue text with write() or line() and then call flush(), which
   joins the queued chunks, strips surrounding whitespace, and decides from
   the leading tag how to wrap the text and how many newlines follow it.
   """
   import textwrap

   # Words break on whitespace and on runs of two or more hyphens that sit
   # between a word-ish character and a word character (e.g. "foo--bar").
   r_wordsep = re.compile(r'(\s+|(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')
   opts = {'width': 79, 'expand_tabs': False, 'break_long_words': False}

   # The custom separator is installed on this one wrapper instance only;
   # assigning it to textwrap.TextWrapper itself would silently change
   # wrapping for every other user of textwrap in the process.
   _wrapper = textwrap.TextWrapper(**opts)
   _wrapper.wordsep_re = r_wordsep
   _wrapper.wordsep_re_uni = re.compile(r_wordsep.pattern, re.U)
   wrap = _wrapper.wrap

   openers = set(['<ul>', '<ol>', '<blockquote>'])
   closers = set(['</ul>', '</ol>', '</blockquote>', '<hr />'])
   wrappers = set(['<h1', '<h2', '<h3', '<p', '<li'])

   def __init__(self, output=None): 
      """Create a printer writing to *output* (sys.stdout when None)."""
      if output is None: 
         self.output = __import__('sys').stdout
      else: self.output = output

      self.chunks = []
      # True between an opener and the next closer; suppresses the blank
      # line that otherwise follows each flushed block.
      self.nested = False

   def write(self, *args): 
      """Queue the space-joined string forms of *args*."""
      bytes = ' '.join(str(arg) for arg in args)
      self.chunks.append(bytes)

   def line(self, *args): 
      """Queue the space-joined string forms of *args* plus a newline."""
      bytes = ' '.join(str(arg) for arg in args) + '\n'
      self.chunks.append(bytes)

   def flush(self): 
      """Write the queued chunks as one block and empty the queue."""
      bytes = ''.join(self.chunks)
      bytes = bytes.strip(' \t\r\n')

      if bytes in Printer.openers: 
         self.output.write(bytes + '\n')
         self.nested = True

      elif bytes in Printer.closers: 
         self.output.write(bytes + '\n')
         self.output.write('\n')
         self.nested = False

      elif bytes.startswith('<pre>') or ('<br />\n' in bytes): 
         # Preformatted text and explicit line breaks are never rewrapped.
         self.output.write(bytes + '\n')
         if not self.nested: 
            self.output.write('\n')

      elif bytes[:bytes.find('>')] in Printer.wrappers: 
         if bytes.startswith('<li><'): 
            # A list item that opens with another tag gets "<li>" on a
            # line of its own before the remainder is wrapped.
            self.output.write('<li>')
            self.output.write('\n')
            bytes = bytes[4:]

         bytes = bytes.replace(' \n', '\n')
         bytes = '\n'.join(Printer.wrap(bytes))
         self.output.write(bytes + '\n')
         if not self.nested: 
            self.output.write('\n')

      elif bytes == '</li>': 
         self.output.write(bytes + '\n')

      else: 
         # Unrecognised chunk: keep the diagnostic, but send it to stderr
         # so it cannot end up inside the generated document.
         __import__('sys').stderr.write(repr(bytes) + '\n')
         self.output.write(bytes + '\n')
         self.output.write('\n\n')
      self.chunks = []
out = Printer()

# Maps tuple(block) -> block kind; parse_block fills it so that each
# distinct block is classified by get_block_kind only once.
kinds_cache = {}

# Simple link: a single word, whitespace, then "{uri}". Group 2 captures an
# optional backslash placed before the opening brace.
r_simplelink = re.compile(
r'''(?imsx)([A-Za-z0-9\'_.-]+)[\ \n]+
   (\\?)\{([^\s<>\'"}]+)\}(?![A-Za-z0-9])
''')

# Complex link: "{phrase uri}", where the phrase may be followed by " -"
# before the whitespace that separates it from the URI.
complexlink = r'(?ims)\{([^}]+?)(?: +-)?[ \n]+([^\s<>\'"}]+)\}'
r_complexlink = re.compile(complexlink)

# Footnote link: a single word, whitespace, then "[N]" or "[#]". Group 2
# captures an optional backslash placed before the opening bracket.
r_footlink = re.compile(
r'''(?imsx)([A-Za-z0-9\'_.-]+)[\ \n]+
   (\\?)\[([0-9]+|\#)\](?![A-Za-z0-9])
''')

# Asterisk emphasis: "*text*" or "**text**". The text may not contain an
# asterisk, and the closing delimiter is matched independently of how many
# asterisks opened the span.
r_astemphasis = re.compile(
r'''(?imsx)([*]?\*)(?=[A-Za-z0-9\'<_.-])
   ([^*]+)
   (?<=[A-Za-z0-9\'>?!_.-])\*[*]?
''')

# Underscore emphasis: "_text_" or "__text__", not adjacent to an
# alphanumeric on either side. Group 1 captures an optional backslash
# placed before the opening underscore(s).
r_undemphasis = re.compile(
r'''(?imsx)(?<![A-Za-z0-9])(\\?)(_?_)(?=[A-Za-z0-9\'<.-])
   ([^_]+)
   (?<=[A-Za-z0-9\'>?!.-])__?(?![A-Za-z0-9])
''')

r_entity = re.compile(r'\\&((#[0-9]+|#x[0-9A-Fa-f]+|[A-Za-z]+);)')
def parse_escaped_entities(bytes): 
   """Rewrite a backslash-escaped entity such as ``\\&amp;`` as literal text.

   The backslash is dropped and the ampersand becomes ``&amp;``, so the
   entity name is displayed rather than interpreted.
   """
   def literal(match): 
      return '&amp;' + match.group(1)
   return r_entity.sub(literal, bytes)

def parse_simple_links(bytes): 
   """Convert each ``word {uri}`` into an anchor around *word*.

   When the brace is preceded by a backslash the text is left as
   ``word {uri}`` with the backslash removed.
   """
   def anchor(match): 
      word, escaped, uri = match.groups()
      if escaped: 
         return '%s {%s}' % (word, uri)
      return '<a href="%s">%s</a>' % (uri, word)
   return r_simplelink.sub(anchor, bytes)

# URIs gathered by process_footnotes; footnote-style links consume them
# from the front, in document order.
document_footnotes = []

# A footnote definition line: "[N]: uri" or "[#]: uri". Group 1 captures an
# optional leading backslash.
r_footnote = re.compile(r'^(\\?)\[([0-9]+|#)\]: (\S+)')

# A footnote reference box, "[N]" or "[#]". The alternation is grouped so
# that both brackets are required whichever form is used; ungrouped, the
# pattern accepted "[12" or "#]" and rejected "[#]".
r_footnotebox = re.compile(r'\[(?:[0-9]+|#)\]')

def parse_complex_links(bytes): 
   """Convert each ``{phrase uri}`` into an anchor around *phrase*.

   When the URI position holds a footnote box, the next URI from
   document_footnotes is used instead. If no footnote URI is left the link
   is replaced by the ``@@`` marker, as parse_footlinks does.
   """
   def link(match): 
      global document_footnotes
      phrase, uri = match.groups()
      if r_footnotebox.match(uri): 
         try: uri = document_footnotes[0]
         except IndexError: return '@@'
         document_footnotes = document_footnotes[1:]
      return '<a href="%s">%s</a>' % (uri, phrase)
   return r_complexlink.sub(link, bytes)

def parse_footlinks(bytes): 
   """Convert each ``word [N]`` / ``word [#]`` into an anchor around *word*.

   Every match takes the next URI from document_footnotes; once that list
   is empty, matches are replaced by the ``@@`` marker.
   """
   def anchor(match): 
      global document_footnotes
      if not document_footnotes: 
         return '@@'
      uri, document_footnotes = document_footnotes[0], document_footnotes[1:]
      return '<a href="%s">%s</a>' % (uri, match.group(1))
   return r_footlink.sub(anchor, bytes)

r_unicode = re.compile(
   r'(\\?)\{U\+([0-9A-Fa-f]{2}|[0-9A-Fa-f]{4}|[0-9A-Fa-f]{6})\}'
)
def parse_unicode(bytes): 
   """Expand ``{U+XXXX}`` (2, 4 or 6 hex digits) to the UTF-8 character.

   A backslash before the brace keeps the notation visible instead: the
   braces are written as the numeric references ``&#x7B;`` and ``&#x7D;``.
   """
   def expand(match): 
      escaped, digits = match.group(1), match.group(2)
      if not escaped: 
         return unichr(int(digits, 16)).encode('utf-8')
      return '&#x7B;U+%s&#x7D;' % digits
   return r_unicode.sub(expand, bytes)

def parse_asterisk_emphasis(bytes): 
   """Convert ``*text*`` to <em> and ``**text**`` to <strong>."""
   def emph(match): 
      opener, contents = match.groups()
      # One asterisk opens <em>; anything longer opens <strong>.
      tag = {1: 'em'}.get(len(opener), 'strong')
      return '<%s>%s</%s>' % (tag, contents, tag)
   return r_astemphasis.sub(emph, bytes)

def parse_underscore_emphasis(bytes): 
   """Convert ``_text_`` to <em> and ``__text__`` to <strong>.

   A backslash before the opening underscore(s) suppresses the conversion;
   the text is returned with the opening delimiter on both sides and the
   backslash removed.
   """
   def emph(match): 
      escaped, opener, contents = match.groups()
      if escaped: 
         return opener + contents + opener
      # One underscore opens <em>; two open <strong>.
      tag = {1: 'em'}.get(len(opener), 'strong')
      return '<%s>%s</%s>' % (tag, contents, tag)
   return r_undemphasis.sub(emph, bytes)

def parse_linebreaks(bytes): 
   """Turn every two-spaces-then-newline sequence into ``<br />``."""
   pieces = bytes.split('  \n')
   return '<br />\n'.join(pieces)

r_copyright = re.compile(r'(\\?)\(c\)')
r_trademark = re.compile(r'(\\?)\(tm\)')
def parse_special(bytes): 
   """Replace ``(c)`` and ``(tm)`` with their numeric character references.

   A backslash before either token keeps it as plain text, with the
   backslash removed.
   """
   substitutions = (
      (r_copyright, '&#xA9;', '(c)'), 
      (r_trademark, '&#x2122;', '(tm)'), 
   )
   for pattern, entity, literal in substitutions: 
      # Defaults bind this iteration's values to the callback.
      def swap(match, entity=entity, literal=literal): 
         if match.group(1): 
            return literal
         return entity
      bytes = pattern.sub(swap, bytes)
   return bytes

# HTML 4.01 block-level element names.
# Cf. http://www.w3.org/TR/html401/struct/global.html#edef-BODY
# Cf. http://www.w3.org/TR/html401/sgml/dtd.html#block
block_elements = [
   'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'pre', 'dl', 'div',
   'noscript', 'blockquote', 'form', 'hr', 'table', 'fieldset', 'address',
   'script', 'ins', 'del'
]

# Matches a line that opens with one of the block-level tags.
r_htmlblock = re.compile(r'(?i)^<(%s)(?= |>|\Z)' % '|'.join(block_elements))

# The remaining HTML 4.01 element names.
# Cf. http://www.w3.org/TR/html401/index/elements.html
other_elements = [
   'a', 'abbr', 'acronym', 'applet', 'area', 'b', 'base', 'basefont', 'bdo',
   'big', 'body', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
   'colgroup', 'dd', 'dfn', 'dir', 'dt', 'em', 'font', 'frame', 'frameset',
   'head', 'html', 'i', 'iframe', 'img', 'input', 'isindex', 'kbd', 'label',
   'legend', 'li', 'link', 'map', 'menu', 'meta', 'noframes', 'object',
   'optgroup', 'option', 'param', 'q', 's', 'samp', 'select', 'small', 'span',
   'strike', 'strong', 'style', 'sub', 'sup', 'tbody', 'td', 'textarea',
   'tfoot', 'th', 'thead', 'title', 'tr', 'tt', 'u', 'var'
]

names = block_elements + other_elements

# "&" that does not already begin an entity; group 1 is an optional backslash.
r_amp = re.compile(r'(\\?)&(?!#[0-9]+;|#x[0-9A-Fa-f]+;|[A-Za-z]+;)')
# "<" that does not begin an opening or closing tag of a known element.
r_lt = re.compile(r'<(?!/?(?:%s)(?=[ \n>]))' % '|'.join(names))
# Backslash-escaped "<" in front of a known element name.
r_esctag = re.compile(r'\\<(?=/?(?:%s))' % '|'.join(names))
def parse_escapables(bytes): 
   """Escape stray ``&`` and ``<`` while leaving entities and tags alone.

   ``\\&`` yields a bare ampersand, and ``\\<`` in front of a known element
   name yields ``&lt;`` so the tag is shown rather than interpreted.
   """
   def ampersand(match): 
      if match.group(1): 
         return '&'
      return '&amp;'

   passes = ((r_amp, ampersand), (r_lt, '&lt;'), (r_esctag, '&lt;'))
   for pattern, replacement in passes: 
      bytes = pattern.sub(replacement, bytes)
   return bytes

def parse_pre_escapables(bytes): 
   """Escape every ``&`` and ``<`` for use inside preformatted text."""
   # Ampersands go first so the "&" of "&lt;" is not escaped a second time.
   for raw, escaped in (('&', '&amp;'), ('<', '&lt;')): 
      bytes = bytes.replace(raw, escaped)
   return bytes

def parse_inline(bytes): 
   """Run the full chain of inline transformations over *bytes*."""
   stages = (
      parse_unicode, # Must come before links
      parse_simple_links, 
      parse_complex_links, 
      parse_footlinks, 
      parse_asterisk_emphasis, 
      parse_underscore_emphasis, 
      parse_linebreaks, 
      parse_special, 
      parse_escapables, 
      parse_escaped_entities, 
   )
   for stage in stages: 
      bytes = stage(bytes)
   return bytes

def parse_inline_preformatted(bytes): 
   """Inline processing for preformatted text.

   Only Unicode escapes and HTML escaping are applied; links and emphasis
   are deliberately left untouched.
   """
   stages = (
      parse_unicode, 
      parse_pre_escapables, 
      parse_escaped_entities, 
   )
   for stage in stages: 
      bytes = stage(bytes)
   return bytes

def parse_inline_citation(bytes): 
   """Inline processing for a blockquote citation.

   Runs the same stages as parse_inline except parse_linebreaks.
   """
   stages = (
      parse_unicode, # Must come before links
      parse_simple_links, 
      parse_complex_links, 
      parse_footlinks, 
      parse_asterisk_emphasis, 
      parse_underscore_emphasis, 
      parse_special, 
      parse_escapables, 
      parse_escaped_entities, 
   )
   for stage in stages: 
      bytes = stage(bytes)
   return bytes

def parse_h1(block, prevkind, nextkind): 
   """Emit a level-one heading.

   A block starting with ``{`` has one character removed from each end of
   its joined text; any other block has its last line dropped.
   """
   if block[0].startswith('{'): 
      title = '\n'.join(block)[1:-1]
   else: 
      title = '\n'.join(block[:-1])
   title = title.strip(' \n')

   out.write('<h1>%s</h1>' % parse_inline(title))
   out.flush()

def parse_h2(block, prevkind, nextkind): 
   """Emit a level-two heading.

   A block starting with ``{`` has two characters removed from each end of
   its joined text; any other block has its last line dropped.
   """
   if block[0].startswith('{'): 
      title = '\n'.join(block)[2:-2]
   else: 
      title = '\n'.join(block[:-1])
   title = title.strip(' \n')

   out.write('<h2>%s</h2>' % parse_inline(title))
   out.flush()

def parse_h3(block, prevkind, nextkind): 
   """Emit a level-three heading.

   A block starting with ``{`` has three characters removed from each end
   of its joined text; any other block has its last line dropped.
   """
   if block[0].startswith('{'): 
      title = '\n'.join(block)[3:-3]
   else: 
      title = '\n'.join(block[:-1])
   title = title.strip(' \n')

   out.write('<h3>%s</h3>' % parse_inline(title))
   out.flush()

def parse_ul(block, prevkind, nextkind): 
   """Emit an unordered list.

   The first two characters of the block are the item delimiter. Each line
   starting with it opens a new item; other lines continue the current
   item. Every item is dedented and its sub-blocks are handed to
   parse_block with 'li' as the default neighbour kind.
   """
   out.line('<ul>')
   out.flush()

   items = []
   delimiter = block[0][:2]

   for line in block: 
      if line.startswith(delimiter): 
         # Blank out the delimiter so the text keeps its column.
         items.append(['  ' + line[2:]])
      else: items[-1].append(line)

   def item_blocks(item): 
      """Yield None, then the runs of lines separated by empty lines."""
      lines = []
      yield None

      for line in item: 
         if line: lines.append(line)
         else: 
            yield lines
            lines = []

      if lines: 
         yield lines

   for item in items: 
      # The indentation to remove is the smallest found in the leading run
      # of indented lines.
      spaces = []
      for line in item: 
         space = len(line) - len(line.lstrip(' '))
         if not space: break
         spaces.append(space)
      if spaces: 
         trim = min(spaces)
         for i, line in enumerate(item): 
            # Remove at most the line's own indentation: an unindented
            # continuation line must not lose characters of real text.
            indent = len(line) - len(line.lstrip(' '))
            item[i] = line[min(trim, indent):]

      out.write('<li>')
      for prev, current, following in window(item_blocks(item)): 
         parse_block(prev, current, following, default='li')
      out.write('</li>')
      out.flush()

   out.write('</ul>')
   out.flush()

r_olfullstop = re.compile(r'\d+\. ')
r_olcparen = re.compile(r'\d+\) ')

def parse_ol(block, prevkind, nextkind): 
   """Emit an ordered list.

   The character after the first digit ('.' or ')') selects the numbering
   style. Each line matching that style opens a new item; other lines
   continue the current item. Every item is dedented and its sub-blocks
   are handed to parse_block with 'li' as the default neighbour kind.
   """
   out.write('<ol>')
   out.flush()

   r_delim = {
      '.': r_olfullstop, 
      ')': r_olcparen
   }[block[0][1]]

   items = []
   for line in block: 
      m = r_delim.match(line)
      if m: 
         # Blank out the number so the text keeps its column.
         end = m.end()
         items.append([(' ' * end) + line[end:]])
      else: items[-1].append(line)

   def item_blocks(item): 
      """Yield None, then the runs of lines separated by empty lines."""
      lines = []
      yield None

      for line in item: 
         if line: lines.append(line)
         else: 
            yield lines
            lines = []

      if lines: 
         yield lines

   for item in items: 
      # The indentation to remove is the smallest found in the leading run
      # of indented lines.
      spaces = []
      for line in item: 
         space = len(line) - len(line.lstrip(' '))
         if not space: break
         spaces.append(space)
      if spaces: 
         trim = min(spaces)
         for i, line in enumerate(item): 
            # Remove at most the line's own indentation: an unindented
            # continuation line must not lose characters of real text.
            indent = len(line) - len(line.lstrip(' '))
            item[i] = line[min(trim, indent):]

      out.write('<li>')
      for prev, current, following in window(item_blocks(item)): 
         parse_block(prev, current, following, default='li')
      out.write('</li>')
      out.flush()

   out.write('</ol>')
   out.flush()

def parse_blockquote(block, prevkind, nextkind): 
   """Emit a blockquote, or one part of a multi-block ``[[[ ... ]]]`` quote.

   ``> `` blocks are always complete quotes; a final line starting with
   ``-`` becomes the citation. Bracketed quotes may span several blocks:
   the block holding ``[[[`` opens the element, the block holding ``]]]``
   closes it (any text after the brackets is the citation), and blocks
   with neither marker only contribute content.
   """
   citation = None
   start = finish = True

   if block[0].startswith('> '): 
      # Strip the quote marker in place.
      for i, line in enumerate(block): 
         if line.startswith('> '): 
            block[i] = line[2:]

      attribution = block[-1]
      if attribution.lstrip(' ').startswith('-'): 
         citation = attribution.lstrip(' -')
         block = block[:-1]
   else: 
      opens = (block[0] == '[[[')
      closes = block[-1].startswith(']]]')
      if closes: 
         citation = block[-1].lstrip('] -') or None

      if opens and closes: 
         block = block[1:-1]
      elif opens: 
         block = block[1:]
         parse_blockquote.extended = True
         finish = False
      elif closes: 
         block = block[:-1]
         parse_blockquote.extended = False
         start = False
      else: 
         start = finish = False

   if start: 
      out.write('<blockquote>')
      out.flush()

   parse_block(None, block, None)

   if citation is not None: 
      out.write('<p>&#x2014;<cite>')
      out.write(parse_inline_citation(citation)) # @@ or just parse_inline?
      out.write('</cite></p>')
      out.flush()

   if finish: 
      out.write('</blockquote>')
      out.flush()
parse_blockquote.extended = False

def parse_htmlblock(block, prevkind, nextkind): 
   """Pass a raw HTML block through unchanged, followed by a blank line.

   Using an HTML block? You're on your own! The lines are neither escaped
   nor wrapped.
   """
   # The lines go straight to the Printer's stream rather than through
   # out.write()/out.flush(): flushing here was tried and produced false
   # spacing. Writing to out.output (instead of a bare print) keeps the
   # block in the same stream as the rest of the document.
   for line in block: 
      out.output.write(line + '\n')
   out.output.write('\n')

def parse_poesy(block, prevkind, nextkind): 
   """Emit a verse paragraph: parse_p_like with poesy handling enabled."""
   as_verse = True
   parse_p_like(block, prevkind, nextkind, poesy=as_verse)

def parse_pre(block, prevkind, nextkind): 
   """Emit preformatted text, merging adjacent 'pre' blocks into one <pre>.

   The opening tag is written only when the previous block was not 'pre',
   and the closing tag (plus a flush) only when the next block is not.
   ``{{{`` / ``}}}`` marker lines are removed. Outside an open ``{{{``
   span the common leading indentation is stripped.
   """
   if prevkind != 'pre': 
      out.write('<pre>')
      out.write('\n')

   if (block[0] == '{{{') and (block[-1] == '}}}'): 
      block = block[1:-1]
   elif get_block_kind.pre and (block[0] == '{{{'): 
      block = block[1:]
      parse_pre.extended = True
   elif parse_pre.extended and (block[-1] == '}}}'): 
      block = block[:-1]
      parse_pre.extended = False

   if not get_block_kind.pre: 
      spaces = [len(line) - len(line.lstrip(' ')) for line in block]
      # An empty "{{{" / "}}}" pair leaves no lines to measure; min() of an
      # empty list would raise ValueError.
      trim = 0
      if spaces: 
         trim = min(spaces)
      if trim: 
         for i, line in enumerate(block): 
            block[i] = line[trim:]

   out.write(parse_inline_preformatted('\n'.join(block)) + '\n')

   if (nextkind != 'pre'): 
      out.write('</pre>')
      out.flush()
   # Queues a newline that separates this block from a following 'pre'
   # block when the two are merged.
   out.line()
parse_pre.extended = False

def parse_hr(block, prevkind, nextkind): 
   """Emit a horizontal rule; the text of *block* itself is not used."""
   rule = '<hr />'
   out.write(rule)
   out.flush()

def parse_footnote(block, prevkind, nextkind): 
   """Emit nothing: footnote definition blocks produce no output here."""
   return None

def parse_p(block, prevkind, nextkind): 
   """Emit an ordinary paragraph: parse_p_like without poesy handling."""
   parse_p_like(block, prevkind, nextkind, poesy=False)

def parse_p_like(block, prevkind, nextkind, poesy=False): 
   """Emit *block* as a paragraph, or as bare text inside a list item.

   With *poesy* set, a whitespace-only final line is dropped and every
   remaining newline becomes ``<br />``. Nothing is written when the block
   renders to empty text. The <p> wrapper and the flush are omitted only
   when both neighbouring kinds are 'li'.
   """
   lines = block
   if poesy and not lines[-1].strip(' \t\r\n'): 
      lines = lines[:-1]

   html = parse_inline('\n'.join(lines)).strip(' \n')
   if poesy: 
      html = html.replace('\n', '<br />\n')
   if not html: 
      return

   sole_item_text = (prevkind == 'li') and (nextkind == 'li')
   if sole_item_text: 
      out.write(html)
      return

   out.write('<p>')
   out.write(html)
   out.write('</p>')
   out.flush()

# Two consecutive spaces that do not directly follow a full stop.
r_spaces = re.compile(r'(?<!\.)  ')

def _is_ruled_heading(block, char): 
   """True when *block* is two lines ending in a rule of 3+ *char*."""
   rule = block[-1]
   return (rule.startswith(char) and 
           (len(rule) >= 3) and 
           (len(block) == 2) and 
           not rule.strip(char))

def get_block_kind(block): 
   """Classify *block* (a list of lines) and return its kind as a string.

   Possible results: 'p', 'h1', 'h2', 'h3', 'ul', 'ol', 'blockquote',
   'htmlblock', 'poesy', 'pre', 'hr' and 'footnote'. The tests run in that
   rough order and the first match wins; 'p' is the default.

   The function is stateful: an unclosed ``{{{`` or ``[[[`` sets the
   ``pre`` / ``blockquote`` attribute, and every following block is then
   reported as that kind until one ends with the closing marker.
   """
   # Useful information
   length = len(block)
   last = length - 1

   # Filter out empty blocks first: everything below indexes into the
   # block, including the stateful branches.
   if not length: 
      return 'p'

   # If the state says so...
   if get_block_kind.pre: 
      if block[last] == '}}}': 
         get_block_kind.pre = False
      return 'pre'

   if get_block_kind.blockquote: 
      if block[last].startswith(']]]'): 
         get_block_kind.blockquote = False
      return 'blockquote'

   first, final = block[0], block[last]

   # Tests for 'h1'
   if ((first.startswith('{ ') and final.endswith(' }')) or 
       _is_ruled_heading(block, '#')): 
      return 'h1'

   # Tests for 'h2'
   if ((first.startswith('{{ ') and final.endswith(' }}')) or 
       _is_ruled_heading(block, '=')): 
      return 'h2'

   # Tests for 'h3'
   if ((first.startswith('{{{ ') and final.endswith(' }}}')) or 
       _is_ruled_heading(block, '-')): 
      return 'h3'

   # Tests for 'ul'
   if first.startswith('* ') or first.startswith('+ '): 
      return 'ul'
   if first.startswith('- '): 
      # A lone "- " line is not enough; a second "- " line is required.
      for line in block[1:]: 
         if line.startswith('- '): 
            return 'ul'

   # Tests for 'ol'
   if first.startswith('1. ') or first.startswith('1) '): 
      # Force starting with "1"
      return 'ol'

   # Tests for 'blockquote'
   if first.startswith('> '): 
      return 'blockquote'
   if first == '[[[': 
      if not final.startswith(']]]'): 
         get_block_kind.blockquote = True
      return 'blockquote'
   if final.startswith(']]]'): # The stateless one needs this
      return 'blockquote'

   # Test for 'htmlblock'
   if r_htmlblock.match(first): 
      return 'htmlblock'

   # Test for 'poesy'
   if final == '  ': 
      return 'poesy'
   elif (final.endswith('   ') and 
         (len(final) > 3) and 
         (final[-4] != ' ')): 
      return 'poesy'

   # Tests for 'pre'
   for line in block: 
      if (r_spaces.search(line[:-1]) or # Contains whitespace
          line.startswith(' ') or # Indented
          line.startswith('# ') or # Hash comment
          line.startswith('/* ') or # Multiline C/CSS comment
          line.startswith('// ')): # Single line C/CSS comment
         return 'pre'
   if first == '{{{': 
      if not (final == '}}}'): 
         get_block_kind.pre = True
      return 'pre'
   if final == '}}}': # The stateless one needs this
      return 'pre'

   # Tests for 'hr'
   if length == 1: 
      alpha = 'ABCDEFGHIJKLMNOPQRSTUVXWYZ'
      little = '-=#' # '_.-=+|%^&*,:;|'
      big = alpha + alpha.lower() + '0123456789' + little

      # A line of one repeated character: 15 or more of any letter, digit
      # or rule character, or 3 or more of a rule character.
      if len(final) >= 15: 
         lead = final[0]
         if (lead in big) and (not final.strip(lead)): 
            return 'hr'
      elif len(final) >= 3: 
         lead = final[0]
         if (lead in little) and (not final.strip(lead)): 
            return 'hr'

   # Test for 'footnotes'
   if r_footnote.match(first): 
      return 'footnote'

   # Default is 'p'
   return 'p'
get_block_kind.pre = False
get_block_kind.blockquote = False

def window(seq, n=3): 
   """Yield successive n-tuples sliding one element at a time over *seq*.

   The data is followed by three None values, so with the default width
   the last real element is also seen in the first and middle positions:
      s -> (s0,s1,s2), (s1,s2,s3), ..., (s[-1],None,None), (None,None,None)
   Nothing is yielded if fewer than n elements are available in total.
   """
   padded = itertools.chain(seq, (None, None, None))
   frame = tuple(itertools.islice(padded, n))
   if len(frame) == n: 
      yield frame
   for item in padded: 
      frame = frame[1:] + (item,)
      yield frame

def blocks_from_file(f): 
   """Yield None, then each run of consecutive non-empty lines of *f*.

   Lines lose their trailing CR/LF before the emptiness test. The leading
   None stands in for the block before the first one.
   """
   yield None

   stripped = (raw.rstrip('\r\n') for raw in f)
   for has_text, run in itertools.groupby(stripped, bool): 
      if has_text: 
         yield list(run)

def parse_block(prev, block, next, default='p'): 
   """Classify *block* and dispatch it to the matching parse_<kind>().

   *prev* and *next* are the neighbouring blocks; when either is None its
   kind is taken to be *default*. Kinds are memoised in kinds_cache, keyed
   by the block's lines. Returns True after a block has been handled and
   None when *block* is None.

   Raises NameError if no parse_<kind> function exists for the kind.
   """
   if block is None: return

   def kind_of(lines): 
      """Return the cached kind of *lines*, classifying it on a miss."""
      key = tuple(lines)
      try: return kinds_cache[key]
      except KeyError: 
         kind = kinds_cache[key] = get_block_kind(lines)
         return kind

   # get_block_kind is stateful, so the classification order matters:
   # the block itself first, then its predecessor, then its successor.
   kind = kind_of(block)

   prevkind = default
   if prev is not None: 
      prevkind = kind_of(prev)

   nextkind = default
   if next is not None: 
      nextkind = kind_of(next)

   # Look the handler up by name instead of evaluating a built string.
   try: parse = globals()['parse_' + kind]
   except KeyError: 
      raise NameError("name 'parse_%s' is not defined" % kind)

   # Handlers get a copy so they may modify their block freely.
   parse(block[:], prevkind, nextkind)
   return True

def process_footnotes(f): 
   """Collect footnote URIs from *f*, then rewind it to the start.

   Each line matching r_footnote adds its URI to document_footnotes,
   unless the definition is preceded by a backslash.
   """
   matches = (r_footnote.match(line) for line in f)
   for m in matches: 
      if m is None: 
         continue
      escaped, num, uri = m.groups()
      if not escaped: 
         document_footnotes.append(uri)
   f.seek(0)

def parse_file(f): 
   """Convert the document in file object *f*.

   The file is read twice, once for footnotes and once for blocks, so
   standard input is first copied into a seekable in-memory buffer.
   """
   import sys
   if f is sys.stdin: 
      from cStringIO import StringIO
      buffered = StringIO()
      for chunk in f: 
         buffered.write(chunk)
      buffered.seek(0)
      f = buffered
   process_footnotes(f)

   for prev, block, next in window(blocks_from_file(f)): 
      # parse_block returns None once the window has run past the data.
      if not parse_block(prev, block, next): 
         break

def test(): 
   """Convert ./input.txt if it exists; do nothing otherwise."""
   if not os.path.exists('input.txt'): 
      return
   f = open('input.txt')
   # Close the file even when parsing raises.
   try: parse_file(f)
   finally: f.close()

def main(): 
   """Convert the file named by the first argument, or standard input."""
   import sys
   if len(sys.argv) > 1: 
      f = open(sys.argv[1])
      # Close the file even when parsing raises.
      try: parse_file(f)
      finally: f.close()
   else: parse_file(sys.stdin)

if __name__ == '__main__': 
   main()
