#!/usr/bin/env python """ xhtmlnorm.py - XHTML Normaliser Author: Sean B. Palmer, inamidst.com Options: * Entities -> numeric (-e numeric), utf-8 (-e utf-8), remove (-e remove) * CDATA -> entities (-t entities), remove (-t remove) * Doctype -> remove (-d) * Comments -> remove (-c) * PI -> remove (-p) * Encoding/XMLDecl -> ? """ import sys, re from htmlentitydefs import name2codepoint from optparse import OptionParser r_cdata = re.compile(r'))*)\]\]>') r_comment = re.compile(r'') r_delimiter = re.compile(r'<|&') r_doctype = re.compile(r'(?m)()') r_entity = re.compile(r'&[A-Za-z0-9#]+;') r_pi = re.compile(r'<\?(\S+)[\t\n\r ]+(([^\?]+|\?(?!>))*)\?>') # r_tag = re.compile(r'(?ms)<[^>]+>') r_tag = re.compile(r'<([^\'">]+("[^"]+"|\'[^\']+\')?)*>') def escape(data): data = data.replace('&', '&') data = data.replace('<', '<') return data.replace('>', '>') class HTMLNormaliser(object): def __init__(self, options): self.options = options def inputFilename(self, filename): if filename in ('-', '/dev/stdin'): self.input = sys.stdin else: self.input = open(filename) def outputFilename(self, filename): if filename in ('-', '/dev/stdout'): self.output = sys.stdout else: self.output = open(filename, 'w') def closeInput(self): if self.input is not sys.stdin: self.input.close() def closeOutput(self): if self.output is not sys.stdout: self.output.close() def normalise(self): self.bytes = self.input.read() while True: m = r_delimiter.search(self.bytes) if not m: self.output.write(self.bytes) break pos = m.end() - 1 self.output.write(self.bytes[:pos]) self.bytes = self.bytes[pos:] if self.bytes.startswith('&'): self.parseEntity() elif self.bytes.startswith('
Café, and—test‽.
Test CDATA section.
]]>
""")
reference = textwrap.dedent("""\
Café, and—test‽.
<p>Test CDATA <em>section</em>.</p>
""")
from cStringIO import StringIO
input = StringIO()
input.write(html)
input.seek(0)
output = StringIO()
options = type('_', (object,), {})()
options.entities = 'numeric'
options.cdata = 'entities'
options.doctype = True
options.comments = True
options.pi = True
options.whitespace = True
n = HTMLNormaliser(options)
n.input = input
n.output = output
n.normalise()
output.seek(0)
assert output.read() == reference
print reference
print "Test passed"
def opt(parser, flags, help):
    """Register one option on *parser* from a compact flag specification.

    *flags* has the form "-s/long" for a boolean switch or
    "-s/long=METAVAR" for an option that takes a value. Its first
    character is discarded and the rest is split at the first '/' into
    the short and long option names. *help* is passed through unchanged
    as the option's help text.

    Every option defaults to False. Switches use the 'store_true'
    action; valued options keep the parser's default action and display
    METAVAR in the usage output.
    """
    short_flag, long_name = flags[1:].split('/', 1)

    # Keyword arguments common to both kinds of option.
    settings = {'default': False, 'help': help}
    if '=' in long_name:
        # "name=METAVAR": the option expects an argument.
        long_name, settings['metavar'] = long_name.split('=', 1)
    else:
        # Bare name: a simple on/off switch.
        settings['action'] = 'store_true'

    parser.add_option('-' + short_flag, '--' + long_name, **settings)
def main():
    """Command-line entry point.

    Builds the option parser, validates the flag combination, resolves
    the input and output names ('-' stands for stdin/stdout) and hands
    them, together with the parsed options, to normalise().

    Invalid invocations are reported through parser.error().
    """
    parser = OptionParser(usage="%prog [options] ")

    option_table = (
        # Input and output
        ("-i/inplace", "modify file in-place"),
        ("-o/output=FN", "output filename"),
        # Normalisation options
        ("-e/entities=OP", "entity ops (numeric/utf-8/remove)"),
        ("-t/cdata=OP", "cdata ops (entities/remove)"),
        ("-d/doctype", "remove doctype"),
        ("-c/comments", "remove comments"),
        ("-p/pi", "remove processing instructions"),
        ("-w/whitespace", "remove whitespace"),
        ("-a/all", "normalise all"),
    )
    for flags, description in option_table:
        opt(parser, flags, description)

    options, args = parser.parse_args()

    if options.inplace and options.output:
        parser.error("-i and -o are mutually exclusive")

    if options.all:
        # -a switches on every normalisation, overriding any -e/-t value.
        options.entities = 'numeric'
        options.cdata = 'entities'
        for switch in ('doctype', 'comments', 'pi', 'whitespace'):
            setattr(options, switch, True)

    if len(args) > 1:
        parser.error("Maximum of one input files")

    # With no positional argument, read from stdin.
    source = '-'
    if args:
        source = args[0]

    if options.output:
        target = options.output
    elif not options.inplace:
        # Neither -o nor -i: write to stdout.
        target = '-'
    else:
        # In-place: the output name is the input name, so it must be a
        # real file rather than stdin.
        if source in ('-', '/dev/stdin'):
            parser.error("Can't use in-place with stdin")
        target = source

    normalise(source, target, options)
# Run the command-line interface only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()