#!/usr/bin/python
import sys, re
# Pre-compiled patterns shared by the helpers below.
#
# NOTE(review): the HTML markup inside several of these literals had been
# stripped from the file (the r_tr literal was even split across lines and no
# longer parsed).  The patterns below are reconstructed from the variable
# names and from the fragments that survived ("]*>.*?", "]+>.*?"); confirm
# them against the upstream copy of this script.

# HTML comment; (?s) lets a comment span several lines.
r_comment = re.compile(r'(?s)<!--.*?-->')
# Any single tag.
r_tag = re.compile(r'<[^>]+>')
# An emphasised run together with its content.
# NOTE(review): the tag names (em / i) are a guess -- TODO confirm.
r_em = re.compile(r'(?ims)(?:<em>.*?</em>)|(?:<i>.*?</i>)')
# A parenthesised remark, e.g. "(fig.)".
r_paren = re.compile(r'\([^\)]+\)')
# One table row, opening tag through closing tag.
r_tr = re.compile(r'(?ims)<tr[^>]*>.*?</tr>')
# One table cell.  "[^>]+" requires at least one character after "<td", so a
# bare "<td>" is not matched.
r_td = re.compile(r'(?ims)<td[^>]+>.*?</td>')
# A run of spaces, tabs, carriage returns or newlines.
r_ws = re.compile(r'[ \t\r\n]+')
def stripComments(s):
    """Return ``s`` with every match of ``r_comment`` replaced by one space."""
    without_comments = re.sub(r_comment, ' ', s)
    return without_comments
def resolveEntities(s):
    """Decode the few HTML entities this script cares about.

    ``&lt;`` and ``&gt;`` become ``<`` and ``>``, ``&nbsp;`` is dropped, and
    ``&amp;`` becomes ``&``.  Any other entity is left untouched.

    NOTE(review): the entity names had been decoded away in the file (the
    table read ``'<': '<'`` and ``' ': ''``, which deleted every space); they
    are restored here from the function's name and the surviving replacement
    values -- confirm against the upstream copy.
    """
    replacements = (
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&nbsp;', ''),
    )
    for before, after in replacements:
        s = s.replace(before, after)
    # "&amp;" goes last so that an escaped entity such as "&amp;lt;" decodes
    # to the literal text "&lt;" instead of being unescaped twice.
    return s.replace('&amp;', '&')
def getWord(s):
    """Return the plain text of ``s`` as a single trimmed line.

    Matches of ``r_tag`` are blanked out, runs matched by ``r_ws`` are
    collapsed to one space, and leading/trailing whitespace is removed.
    """
    untagged = r_tag.sub(' ', s)
    collapsed = r_ws.sub(' ', untagged)
    return collapsed.strip(' \t\r\n')
def getTranslations(s):
    """Return the list of translations contained in ``s``.

    Matches of ``r_em``, ``r_tag`` and ``r_paren`` are blanked out, whitespace
    is collapsed, and the remaining text is split on ``', '``.  Each entry is
    trimmed and empty entries are dropped, so the result is ``[]`` when
    nothing is left.
    """
    for pattern in (r_em, r_tag, r_paren, r_ws):
        s = pattern.sub(' ', s)
    s = s.strip(' \t\r\n')
    # Blanking out a parenthetical leaves a space before the comma
    # ("foo (x), bar" -> "foo , bar"), and a leading one leaves an empty
    # first field; trim every entry and discard the empty ones.
    entries = [entry.strip() for entry in s.split(', ')]
    return [entry for entry in entries if entry]
def main():
    """Print ``word: translation[, translation]`` for each usable table row.

    Reads the file named by the first command-line argument, takes every
    match of ``r_tr`` in it, and for each row that contains exactly two
    matches of ``r_td`` prints the word from the first cell followed by at
    most the first two translations from the second cell.  Rows with any
    other number of cells, or with an empty word or no translations, are
    skipped.

    Exits with a usage message when no file name is given.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE' % sys.argv[0])
    fn = sys.argv[1]

    # "with" closes the file even if read() raises.
    with open(fn) as f:
        data = f.read()

    # get all the tr blocks
    for tr in r_tr.findall(data):
        cells = r_td.findall(stripComments(tr))
        # Only rows made of exactly a word cell and a translation cell count.
        if len(cells) != 2:
            continue
        dt, dd = resolveEntities(cells[0]), resolveEntities(cells[1])
        word = getWord(dt)
        translations = getTranslations(dd)
        if word and translations:
            # A single parenthesised argument prints identically whether
            # print is a statement (Python 2) or a function (Python 3).
            print('%s: %s' % (word, ', '.join(translations[:2])))
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()