#!/usr/bin/env python
"""
striphtml.py - Strip HTML From Input
Author: Sean B. Palmer, inamidst.com
Example: ./striphtml.py filename.html | wc -w
"""
import sys, re
from optparse import OptionParser
# Splits the value of the --ignore option into element names.
r_alphanum = re.compile(r'[A-Za-z0-9]+')

# Constructs that are removed wholesale before ordinary tags are handled,
# because their content may itself contain ">" and would otherwise be cut
# short by r_tag.
# NOTE(review): the CDATA, comment and DOCTYPE patterns (and the closing tag
# in striphtml's ignore pattern) had lost every character that looked like
# markup; one no longer compiled and two matched the empty string. They are
# rebuilt here -- confirm against a pristine copy of the script if available.
# Each uses a non-greedy scan to the first terminator rather than a nested
# "(x+|y)*" group, which backtracks exponentially on unterminated input.

# <![CDATA[ ... ]]> sections; group 1 is the content.
r_cdata = re.compile(r'(?s)<!\[CDATA\[(.*?)\]\]>')
# <!-- ... --> comments; group 1 is the content.
r_comment = re.compile(r'(?s)<!--(.*?)-->')
# <?target data?> processing instructions; groups are (target, data).
r_pi = re.compile(r'(?s)<\?(\S+)[\t\n\r ]+(.*?)\?>')
# <!DOCTYPE ...> declarations, optionally carrying an [internal subset].
r_doctype = re.compile(r'<!DOCTYPE[\t\n\r ]+[^>\[]*(\[[^\]]*\])?\s*>')
# Any remaining <...> tag; [^>] also matches newlines, so tags may span lines.
r_tag = re.compile(r'(?ms)<[^>]+>')


def striphtml(f, ignore):
    """Read markup from *f*, strip it, and write the text to sys.stdout.

    Processing order:
      1. CDATA sections, comments, processing instructions and DOCTYPE
         declarations are each replaced by a single space.
      2. For every element name in *ignore*, the whole element -- opening
         tag, content and closing tag -- is replaced by a single space
         (case-insensitive, content may span lines).
      3. Every remaining tag is replaced by a single space.

    Parameters:
      f      -- object with a read() method returning the document text.
      ignore -- iterable of element names whose content is dropped; a false
                value (None, empty sequence) disables step 2.

    Returns None. If writing to sys.stdout raises IOError (for instance the
    reader of a pipe has gone away), the process exits via sys.exit().
    """
    data = f.read()

    for regexp in (r_cdata, r_comment, r_pi, r_doctype):
        data = regexp.sub(' ', data)

    for tag in ignore or ():
        name = re.escape(tag)
        # The lookahead pins the end of the element name, so that "head"
        # does not also open on <header>; the closing tag is </name>.
        regexp = re.compile(
            r'(?is)<%s(?=[\s/>])[^>]*>.*?</%s\s*>' % (name, name))
        data = regexp.sub(' ', data)

    data = r_tag.sub(' ', data)

    try:
        sys.stdout.write(data)
    except IOError:
        # Best effort: a closed output pipe is not worth a traceback.
        sys.exit()
def main(argv=None):
    """Command-line entry point: strip markup from a file or from stdin.

    Parameters:
      argv -- list of command-line arguments without the program name;
              None makes optparse read sys.argv[1:].

    Options:
      -i IGN, --ignore=IGN  element names whose content is dropped; every
                            run of ASCII letters/digits in IGN is one name.
                            Without the option, head, blockquote, q and ins
                            are ignored.

    At most one filename may be given; with none, input is read from
    sys.stdin. More than one filename is reported through parser.error(),
    which prints the usage message and exits with status 2.
    """
    parser = OptionParser(usage='%prog [options] [filename]')
    parser.add_option("-i", "--ignore", dest="ignore", default=None,
                      help="elements to ignore content of", metavar="IGN")
    # @@ TODO: -r / --retain - elements to retain
    options, args = parser.parse_args(argv)

    if len(args) > 1:
        parser.error("One filename argument maximum")

    if options.ignore is None:
        # A tuple keeps the processing order of the defaults deterministic.
        ignore = ('head', 'blockquote', 'q', 'ins')
    else:
        ignore = r_alphanum.findall(options.ignore)

    if not args:
        striphtml(sys.stdin, ignore)
        return

    f = open(args[0], 'r')
    try:
        striphtml(f, ignore)
    finally:
        # striphtml may leave via sys.exit(); close the file regardless.
        f.close()
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()