#!/usr/bin/python
"""Extract "word: translation" pairs from an HTML table.

Usage: pass the path of an HTML file as the only argument.  Every table
row that contains exactly two attribute-bearing ``<td>`` cells is treated
as a (word, translations) pair; for each pair with a non-empty word and
at least one translation, one line ``word: t1, t2`` is printed (at most
the first two translations are shown).

The code runs unchanged on Python 2 and Python 3.
"""
import re
import sys

# NOTE(review): the tag-bearing patterns (r_comment, r_em, r_tr, r_td) and
# the entity names in resolveEntities() were reconstructed from a copy of
# this file whose HTML-like text had been stripped.  r_tr/r_td follow the
# surviving fragments "]*>.*?" and "]+>.*?"; the "(?s)" flag on r_comment
# and the second alternative of r_em ("<i>") are best guesses -- confirm
# against real input.
r_comment = re.compile(r'(?s)<!--.*?-->')
r_tag = re.compile(r'<[^>]+>')
r_em = re.compile(r'(?ims)<em>(?:.*?)</em>|<i>(?:.*?)</i>')
r_paren = re.compile(r'\([^\)]+\)')
r_tr = re.compile(r'(?ims)<tr[^>]*>.*?</tr>')
# "[^>]+" (not "*"): only cells whose opening tag carries attributes match.
r_td = re.compile(r'(?ims)<td[^>]+>.*?</td>')
r_ws = re.compile(r'[ \t\r\n]+')


def stripComments(s):
    """Return *s* with every HTML comment replaced by a single space."""
    return r_comment.sub(' ', s)


def resolveEntities(s):
    """Return *s* with a small fixed set of HTML entities decoded.

    ``&lt;`` and ``&gt;`` become ``<`` and ``>``, ``&nbsp;`` is removed,
    and ``&amp;`` becomes ``&``.
    """
    replacements = (
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&nbsp;', ''),
    )
    for before, after in replacements:
        s = s.replace(before, after)
    # '&amp;' is decoded last so that '&amp;lt;' yields the literal text
    # '&lt;' instead of being decoded a second time into '<'.
    return s.replace('&amp;', '&')


def getWord(s):
    """Return the text of cell *s*: tags removed, whitespace collapsed."""
    s = r_tag.sub(' ', s)
    s = r_ws.sub(' ', s)
    return s.strip(' \t\r\n')


def getTranslations(s):
    """Return the list of translations found in cell *s*.

    Emphasised spans (whole element, including its text), remaining tags
    and parenthesised remarks are dropped; what is left is split on
    ``', '``.  Returns an empty list when nothing is left.
    """
    s = r_em.sub(' ', s)
    s = r_tag.sub(' ', s)
    s = r_paren.sub(' ', s)
    s = r_ws.sub(' ', s)
    s = s.strip(' \t\r\n')
    if not s:
        return []
    # Removing "(...)" right before a comma leaves "word , next"; strip
    # each piece so no translation carries stray surrounding whitespace.
    pieces = (piece.strip() for piece in s.split(', '))
    return [piece for piece in pieces if piece]


def main():
    """Read the file named by ``sys.argv[1]`` and print its word pairs."""
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE' % sys.argv[0])
    fn = sys.argv[1]

    # Read the whole file; the row pattern can span several lines.
    with open(fn) as f:
        data = f.read()

    for tr in r_tr.findall(data):
        tr = stripComments(tr)
        cells = r_td.findall(tr)
        if len(cells) != 2:
            # Only rows with exactly two matching cells are entries.
            continue
        dt, dd = resolveEntities(cells[0]), resolveEntities(cells[1])
        word = getWord(dt)
        translations = getTranslations(dd)
        if word and translations:
            # Single parenthesised argument: valid as both the Python 2
            # print statement and the Python 3 print function.
            print('%s: %s' % (word, ', '.join(translations[:2])))


if __name__ == "__main__":
    main()