#!/usr/bin/env python
"""
xhtmlnorm.py - XHTML Normaliser
Author: Sean B. Palmer, inamidst.com

Options:
   * Entities -> numeric (-e numeric), utf-8 (-e utf-8), remove (-e remove)
   * CDATA -> entities (-t entities), remove (-t remove)
   * Doctype -> remove (-d)
   * Comments -> remove (-c)
   * PI -> remove (-p)
   * Encoding/XMLDecl -> ?
"""

import sys, re
from htmlentitydefs import name2codepoint
from optparse import OptionParser

# Patterns for the markup constructs the normaliser recognises.
#
# NOTE(review): r_cdata, r_comment and r_doctype look damaged in this copy of
# the file -- their leading "<!...>" portions appear to have been stripped
# along with other markup-like text. As written, r_cdata has unbalanced
# parentheses and will not compile as a regular expression, while r_comment
# and r_doctype only match the empty string. They are kept exactly as found;
# restore them from a pristine copy of the script before use.
r_cdata = re.compile(r'))*)\]\]>')
r_comment = re.compile(r'')
r_delimiter = re.compile(r'<|&')
r_doctype = re.compile(r'(?m)()')
r_entity = re.compile(r'&[A-Za-z0-9#]+;')
r_pi = re.compile(r'<\?(\S+)[\t\n\r ]+(([^\?]+|\?(?!>))*)\?>')
# r_tag = re.compile(r'(?ms)<[^>]+>')
r_tag = re.compile(r'<([^\'">]+("[^"]+"|\'[^\']+\')?)*>')


def escape(data):
    """Return *data* with the markup characters &, < and > escaped.

    The ampersand is replaced first so that the ampersands introduced by the
    later replacements are not escaped a second time.
    """
    data = data.replace('&', '&amp;')
    data = data.replace('<', '&lt;')
    return data.replace('>', '&gt;')


class HTMLNormaliser(object):
    """Copy a document from an input stream to an output stream, stopping at
    every '<' or '&' to hand the construct found there to a parse method.

    NOTE(review): only the first part of this class is visible in this copy
    of the file; see the note at the end of normalise().
    """

    def __init__(self, options):
        # Option object; it is stored here but not read by the visible code.
        self.options = options

    def inputFilename(self, filename):
        """Set self.input to stdin for '-' or '/dev/stdin', otherwise to the
        named file opened for reading."""
        if filename in ('-', '/dev/stdin'):
            self.input = sys.stdin
        else:
            self.input = open(filename)

    def outputFilename(self, filename):
        """Set self.output to stdout for '-' or '/dev/stdout', otherwise to
        the named file opened for writing (which truncates it)."""
        if filename in ('-', '/dev/stdout'):
            self.output = sys.stdout
        else:
            self.output = open(filename, 'w')

    def closeInput(self):
        """Close the input stream unless it is the process's stdin."""
        if self.input is not sys.stdin:
            self.input.close()

    def closeOutput(self):
        """Close the output stream unless it is the process's stdout."""
        if self.output is not sys.stdout:
            self.output.close()

    def normalise(self):
        """Read the whole input and write the normalised form to the output.

        Text before each delimiter is copied through unchanged; the remaining
        buffer (self.bytes) then starts at the delimiter and is handed to the
        matching parse method.
        """
        self.bytes = self.input.read()
        while True:
            m = r_delimiter.search(self.bytes)
            if not m:
                # No more markup: flush the remaining text and stop.
                self.output.write(self.bytes)
                break
            # The delimiter is a single character, so end() - 1 is its index.
            pos = m.end() - 1
            self.output.write(self.bytes[:pos])
            self.bytes = self.bytes[pos:]
            if self.bytes.startswith('&'):
                self.parseEntity()
            # NOTE(review): the source is cut off inside the next branch. The
            # rest of this method, parseEntity() and the remainder of the
            # class are not visible here, so nothing consumes a leading '<'
            # in the code above. The surviving fragment reads:
            #     elif self.bytes.startswith(' Test Document

Café, and—test‽.

Test CDATA section.

]]>
""") reference = textwrap.dedent("""\ Test Document

Café, and—test‽.

      <p>Test CDATA <em>section</em>.</p>
      
""") from cStringIO import StringIO input = StringIO() input.write(html) input.seek(0) output = StringIO() options = type('_', (object,), {})() options.entities = 'numeric' options.cdata = 'entities' options.doctype = True options.comments = True options.pi = True options.whitespace = True n = HTMLNormaliser(options) n.input = input n.output = output n.normalise() output.seek(0) assert output.read() == reference print reference print "Test passed" def opt(parser, flags, help): short, long = flags[1:].split('/', 1) if '=' in long: long, metavar = long.split('=', 1) kargs = dict(default=False, help=help, metavar=metavar) else: kargs = dict(action='store_true', default=False, help=help) parser.add_option('-' + short, '--' + long, **kargs) def main(): parser = OptionParser(usage="%prog [options] ") # Input and Output opt(parser, "-i/inplace", "modify file in-place") opt(parser, "-o/output=FN", "output filename") # Normalisation options opt(parser, "-e/entities=OP", "entity ops (numeric/utf-8/remove)") opt(parser, "-t/cdata=OP", "cdata ops (entities/remove)") opt(parser, "-d/doctype", "remove doctype") opt(parser, "-c/comments", "remove comments") opt(parser, "-p/pi", "remove processing instructions") opt(parser, "-w/whitespace", "remove whitespace") opt(parser, "-a/all", "normalise all") options, args = parser.parse_args() if options.inplace and options.output: parser.error("-i and -o are mutually exclusive") if options.all: options.entities = 'numeric' options.cdata = 'entities' options.doctype = True options.comments = True options.pi = True options.whitespace = True if len(args) > 1: parser.error("Maximum of one input files") if args: input = args[0] else: input = '-' if options.output: output = options.output elif options.inplace: if input in ('-', '/dev/stdin'): parser.error("Can't use in-place with stdin") output = input else: output = '-' normalise(input, output, options) if __name__=="__main__": main()