#!/usr/bin/env python
"""
htmldiff.py - An HTML Diff Tool
Author: Sean B. Palmer, inamidst.com
"""
import sys
from HTMLParser import HTMLParser
from difflib import ndiff
class DiffHTMLParser(HTMLParser):
    """HTML tokenizer that flattens a document into a list of diffable parts.

    After ``feed()`` and ``close()``, ``parts`` holds one string per start
    tag, per end tag and per run of text, in document order.  Consecutive
    text, character references, entity references and declarations are
    merged into a single run, so the text between two tags is always one
    list item.
    """

    def __init__(self):
        # Finished parts, in document order.
        self.parts = []
        # Text accumulated since the last tag; None when nothing is pending.
        self.__data = None
        HTMLParser.__init__(self)

    def _format_attr(self, name, value):
        """Serialize one attribute back to markup."""
        # The parser reports valueless attributes (<input disabled>) with a
        # value of None; emit the bare name instead of name="None".
        if value is None:
            return name
        # The parser hands over attribute values with references already
        # resolved, so re-escape the characters that would break a
        # double-quoted value.
        value = value.replace('&', '&amp;').replace('"', '&quot;')
        return '%s="%s"' % (name, value)

    def handle_starttag(self, tag, attrs):
        """Flush pending text, then record the start tag as its own part."""
        self.collect_data()
        if attrs:
            attributes = ' '.join(self._format_attr(n, v) for n, v in attrs)
            start = '<%s %s>' % (tag, attributes)
        else:
            start = '<%s>' % tag
        self.parts.append(start)

    def handle_endtag(self, tag):
        """Flush pending text, then record the end tag as its own part."""
        self.collect_data()
        self.parts.append('</%s>' % tag)

    def handle_data(self, data):
        """Append literal text to the pending run."""
        self.add_data(data)

    def handle_decl(self, decl):
        """Append a declaration such as a doctype to the pending run."""
        self.add_data('<!%s>' % decl)

    def handle_charref(self, ref):
        """Append a numeric character reference, re-serialized, to the run."""
        self.add_data('&#%s;' % ref)

    def handle_entityref(self, name):
        """Append a named entity reference, re-serialized, to the run."""
        self.add_data('&%s;' % name)

    def add_data(self, data):
        """Extend the pending text run with ``data``."""
        if self.__data is None:
            self.__data = data
        else:
            self.__data += data

    def collect_data(self):
        """Move the pending text run, if any, into ``parts``."""
        if self.__data is not None:
            self.parts.append(self.__data)
            self.__data = None

    def close(self):
        """Finish parsing and flush the final text run.

        The base class may still deliver buffered text while closing (for
        example a trailing ``&``), so it has to run before the last
        ``collect_data()``; otherwise that text would never reach ``parts``.
        """
        HTMLParser.close(self)
        self.collect_data()
def parse(bytes):
    """Tokenize an HTML document into its list of diffable parts.

    Feeds ``bytes`` through a fresh DiffHTMLParser, closes it, and returns
    the parser's ``parts`` list.
    """
    parser = DiffHTMLParser()
    parser.feed(bytes)
    parser.close()
    return parser.parts
def tdiff(a, b):
    """Yield a word-level diff of two text runs as HTML fragments.

    Both strings are split on single spaces (each word keeps its trailing
    space, so joining the output reproduces the spacing).  Words only in
    ``a`` are wrapped in ``<del>...</del>``, words only in ``b`` in
    ``<ins>...</ins>``, and common words are yielded unchanged.  Consecutive
    words with the same status share one pair of tags.
    """
    def split(bytes):
        # 'a b c' -> ['a ', 'b ', 'c']: every word but the last keeps the
        # space that followed it.
        words = bytes.split(' ')
        return [word + ' ' for word in words[:-1]] + words[-1:]

    tags = {'+ ': 'ins', '- ': 'del'}
    # Name of the currently open element ('ins' or 'del'), or None.
    mode = None
    for line in ndiff(split(a), split(b)):
        tag = tags.get(line[:2])
        if tag is not None:
            if mode != tag:
                if mode is not None:
                    yield '</%s>' % mode
                yield '<%s>' % tag
                mode = tag
            yield line[2:]
        elif line.startswith('  '):
            if mode is not None:
                yield '</%s>' % mode
                # Forget the closed element; otherwise every later common
                # word would emit another close tag and a later change of
                # the same kind would never be reopened.
                mode = None
            yield line[2:]
        # '? ' hint lines produced by ndiff carry no content and are skipped.
    if mode is not None:
        yield '</%s>' % mode
def hdiff(a, b):
    """Yield the HTML of ``b`` annotated with its differences from ``a``.

    Both documents are tokenized with parse() and compared part by part:

    * tags only in ``a`` are dropped, tags only in ``b`` are passed through;
    * deleted text directly followed by inserted text is compared word by
      word with tdiff();
    * other inserted text is wrapped in ``<ins>...</ins>`` (whitespace-only
      text is passed through bare);
    * deleted text that is never paired with an insertion is emitted as
      ``<del>...</del>`` (whitespace-only text is dropped);
    * unchanged parts are passed through.
    """
    def orphaned(text):
        # Markup for deleted text that found no insertion to pair with.
        if text and text.strip(' \r\n'):
            return ['<del>' + text + '</del>']
        return []

    # Deleted text waiting for the next inserted text.  Initialised up front
    # because an insertion may well be the first change in the diff.
    old = None
    for line in ndiff(parse(a), parse(b)):
        part = line[2:]
        if line.startswith('- <'):
            continue
        elif line.startswith('+ <'):
            yield part
        elif line.startswith('- '):
            # A second deletion replaces the pending one; emit the pending
            # text instead of silently losing it.
            for chunk in orphaned(old):
                yield chunk
            old = part
        elif line.startswith('+ '):
            if old:
                for word in tdiff(old, part):
                    yield word
            elif part.strip(' \r\n'):
                yield '<ins>' + part + '</ins>'
            else:
                yield part
            old = None
        elif line.startswith('  '):
            # Unchanged content ends the replace group, so a pending
            # deletion can no longer be paired with a later insertion.
            for chunk in orphaned(old):
                yield chunk
            old = None
            yield part
        # '? ' hint lines produced by ndiff are ignored.
    for chunk in orphaned(old):
        yield chunk
def test():
    """Print the repr of every chunk hdiff() yields for a small fixture pair."""
    # NOTE(review): the original literals were unterminated and spread over
    # several physical lines, which made the whole module a SyntaxError.
    # The text content below is preserved from them; the <p> markup is a
    # reconstruction -- confirm against the upstream fixture.
    a = '<p>a b c</p>\n'
    b = '<p>a b d e f</p>\n<p>f\ng</p>\n<p>h</p>\n'
    for line in hdiff(a, b):
        # Parenthesized form behaves the same as a statement or a function.
        print('%r' % line)
def main():
    """Command-line entry point: diff two HTML files and write the result.

    Expects the old and the new file as the first two arguments and writes
    the annotated HTML produced by hdiff() to standard output.  Exits with
    status 2 and a usage message when fewer than two paths are given.
    """
    if len(sys.argv) < 3:
        sys.stderr.write('Usage: %s OLD.html NEW.html\n' % sys.argv[0])
        sys.exit(2)

    def read(path):
        # Close the handle deterministically instead of leaking it.
        with open(path, 'rb') as source:
            return source.read()

    a, b = sys.argv[1], sys.argv[2]
    for chunk in hdiff(read(a), read(b)):
        sys.stdout.write(chunk)
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()