#!/usr/bin/env python
"""
striphtml.py - Strip HTML From Input
Author: Sean B. Palmer, inamidst.com

Example:
   ./striphtml.py filename.html | wc -w
"""

import sys
import re
from optparse import OptionParser

# Elements whose whole content is dropped when -i/--ignore is not given.
DEFAULT_IGNORE = ('head', 'blockquote', 'q', 'ins')

# Splits the -i/--ignore option value into element names.
r_alphanum = re.compile(r'[A-Za-z0-9]+')

# NOTE(review): the CDATA, comment and DOCTYPE patterns were unreadable in
# the reviewed copy of this file (their markup had been stripped out), so
# they are rewritten here from the variable names -- confirm against the
# upstream script.  Lazy ``.*?`` is used instead of nested quantifiers such
# as ``([^?]+|...)*`` so unterminated constructs cannot trigger exponential
# backtracking.
r_cdata = re.compile(r'<!\[CDATA\[.*?\]\]>', re.S)
r_comment = re.compile(r'<!--.*?-->', re.S)
r_pi = re.compile(r'<\?(\S+)[\t\n\r ]+(.*?)\?>', re.S)
# Optional internal subset ``[...]`` is consumed so that any ``>`` inside it
# does not end the match early.
r_doctype = re.compile(
    r'<!DOCTYPE[\t\n\r ]+[^\[>]*(?:\[[^\]]*\][^>]*)?>', re.I)
r_tag = re.compile(r'(?ms)<[^>]+>')


def _element_pattern(tag):
    """Return a compiled regexp matching a whole ``<tag ...>...</tag>``."""
    name = re.escape(tag)
    # The lookahead requires the element name to end right after ``tag`` so
    # that e.g. ``b`` does not match ``<body>`` and ``head`` does not match
    # ``<header>``.  The lazy body stops at the first closing tag, so nested
    # elements of the same name are not balanced.
    return re.compile(r'(?ism)<%s(?=[\s/>])[^>]*>.*?</%s\s*>' % (name, name))


def _strip_markup(data, ignore):
    """Return ``data`` with markup replaced by single spaces.

    CDATA sections, comments, processing instructions and doctypes are
    removed first (they may legally contain ``>``), then every element named
    in ``ignore`` is removed together with its content, and finally all
    remaining tags are removed.
    """
    for regexp in (r_cdata, r_comment, r_pi, r_doctype):
        data = regexp.sub(' ', data)
    for tag in ignore or ():
        data = _element_pattern(tag).sub(' ', data)
    return r_tag.sub(' ', data)


def striphtml(f, ignore):
    """Read HTML from file object ``f`` and write the text to stdout.

    ``ignore`` is an iterable of element names whose content is discarded
    entirely; a false value disables that step.  If writing fails with
    ``IOError`` (for example the reader of a pipe went away) the process
    exits quietly via ``sys.exit()``.
    """
    data = _strip_markup(f.read(), ignore)
    try:
        sys.stdout.write(data)
        # Flush inside the guard so a broken pipe is handled here rather
        # than surfacing at interpreter shutdown.
        sys.stdout.flush()
    except IOError:
        sys.exit()


def main(argv=None):
    """Command-line entry point.

    ``argv`` is the argument list without the program name; ``None`` makes
    optparse use ``sys.argv[1:]``.  Reads the single optional filename
    argument, or stdin when none is given.
    """
    parser = OptionParser(usage='%prog [options] [filename]')
    parser.add_option("-i", "--ignore", dest="ignore", default=False,
                      help="elements to ignore content of", metavar="IGN")
    # @@ -r --retain - elements to retain
    options, args = parser.parse_args(argv)

    if options.ignore is False:
        ignore = DEFAULT_IGNORE
    else:
        ignore = r_alphanum.findall(options.ignore)

    if len(args) > 1:
        parser.error("One filename argument maximum")

    if not args:
        striphtml(sys.stdin, ignore)
        return

    f = open(args[0], 'r')
    try:
        striphtml(f, ignore)
    finally:
        # Also runs when striphtml() exits through sys.exit().
        f.close()


if __name__ == "__main__":
    main()