#!/usr/bin/env python
"""
striphtml.py - Strip HTML From Input
Author: Sean B. Palmer, inamidst.com
Example: ./striphtml.py filename.html | wc -w
"""

import sys, re
from optparse import OptionParser

# Pre-compiled patterns for the markup constructs removed by striphtml().

# A run of ASCII letters/digits; main() uses it to split the --ignore value
# into individual element names.
r_alphanum = re.compile(r'[A-Za-z0-9]+')

# CDATA section, up to the first "]]>".  The body is matched one character
# per iteration (no nested "+" inside the "*"), so an unterminated section
# fails in linear time instead of backtracking exponentially.
r_cdata = re.compile(r'<!\[CDATA\[((?:[^\]]|\](?!\]>))*)\]\]>')

# Comment "<!-- ... -->" whose body contains no "--".
r_comment = re.compile(r'<!--((?:[^-]|(?:-[^-]))*)-->')

# Processing instruction "<?target data?>": a target, whitespace, then data
# up to the first "?>".  As with r_cdata, the data is matched one character
# per iteration so an unterminated "<?php ..." cannot make the match hang.
r_pi = re.compile(r'<\?(\S+)[\t\n\r ]+(([^\?]|\?(?!>))*)\?>')

# DOCTYPE declaration, optionally carrying a bracketed internal subset.
r_doctype = re.compile(r'(?m)(<!DOCTYPE[\t\n\r ]+\S+[^\[]+?(\[[^\]]+?\])?\s*>)')

# Anything from "<" to the next ">", possibly spanning several lines.
r_tag = re.compile(r'(?ms)<[^>]+>')

def striphtml(f, ignore):
   """Read markup from f and write it to standard output with markup removed.

   f      -- object with a read() method returning the whole document.
   ignore -- iterable of element names whose entire content (start tag
             through matching end tag) is dropped; may be empty or false.

   CDATA sections, comments, processing instructions and DOCTYPE
   declarations are replaced by a single space first, then each ignored
   element, and finally every remaining tag.
   """
   data = f.read()
   for regexp in (r_cdata, r_comment, r_pi, r_doctype):
      data = regexp.sub(' ', data)

   for tag in ignore or ():
      if not tag:
         # An empty name would build a pattern that matches nothing useful.
         continue
      name = re.escape(tag)
      # The lookahead requires the name to end right there, so ignoring
      # "b" does not also swallow <body> or <br>, nor "head" <header>.
      # The closing tag may carry whitespace before ">".
      regexp = re.compile(
         r'(?is)<%s(?=[\s/>])[^>]*>.*?</%s\s*>' % (name, name))
      data = regexp.sub(' ', data)

   data = r_tag.sub(' ', data)
   # Exit quietly rather than with a traceback if stdout cannot be written
   # (presumably for a closed pipe, e.g. "| head").
   try: sys.stdout.write(data)
   except IOError: sys.exit()
def main(argv=None):
   """Command-line entry point: parse the options and strip the input.

   argv -- list of argument strings (without the program name); None lets
           optparse fall back to sys.argv[1:].

   Reads the single named file, or standard input when no filename is
   given, and passes it to striphtml().  More than one filename is a usage
   error reported through the option parser, which exits the program.
   """
   parser = OptionParser(usage='%prog [options] <filename>?')
   parser.add_option("-i", "--ignore", dest="ignore", default=False,
                     help="elements to ignore content of", metavar="IGN")
   # @@ -r --retain - elements to retain
   options, args = parser.parse_args(argv)

   if options.ignore is False:
      # Default elements to drop.  A tuple keeps the removal order fixed,
      # whereas a set's iteration order is not guaranteed.
      ignore = ('head', 'blockquote', 'q', 'ins')
   else:
      ignore = r_alphanum.findall(options.ignore)

   if len(args) > 1:
      parser.error("One filename argument maximum")

   if not args:
      striphtml(sys.stdin, ignore)
      return

   f = open(args[0], 'r')
   try:
      striphtml(f, ignore)
   finally:
      # Close the file even when striphtml() raises or calls sys.exit().
      f.close()

# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
   main()