#!/usr/bin/env python
# Rank the words of a tab-separated corpus file and print them grouped by
# rating, best first.
#
# Rank is based on:
# a) recency
# b) frequency
# c) quality (CQ)
#
# Runs unchanged on Python 2 and Python 3.

import sys
import re
import operator
import itertools

r_word = re.compile(r"[A-Za-z']+")
r_comment = re.compile(r'\[[^\]]+\]')

# Tuning knobs of the rating formula (see _rating).
FREQ_DIVISOR = 20.0    # float on purpose: keeps freq / FREQ_DIVISOR fractional
FREQ_OFFSET = 0.25     # frequency factor is freq / FREQ_DIVISOR + FREQ_OFFSET
RECENT_FROM = 750      # words last seen in a record numbered below this ...
STALE_FACTOR = 0.9     # ... have their rating multiplied by this factor


def parse(s):
    """Return the words found in *s*, lower-cased.

    Bracketed spans such as ``[note]`` are replaced by a space first.  A word
    is a run of ASCII letters and apostrophes; apostrophes at either end of
    a word are stripped.
    """
    s = r_comment.sub(' ', s)
    return [word.lower().strip("'") for word in r_word.findall(s)]


def _collect(lines):
    """Map each word to ``(ratings, rec)`` gathered from *lines*.

    A record is a line that, once surrounding spaces/tabs/newlines are
    stripped, contains exactly three tabs, i.e. four fields
    ``num, kal, eng, rat``.  Any other line is skipped.  ``num`` and ``rat``
    are parsed as floats (a ValueError propagates if they are not numeric),
    the words come from ``kal``, and ``eng`` is not used.

    ``ratings`` holds one ``rat`` per occurrence of the word and ``rec`` is
    the ``num`` of the last record, in input order, that contained it.
    """
    words = {}
    for line in lines:
        line = line.strip(' \t\r\n')
        if line.count('\t') != 3:
            continue
        num, kal, _eng, rat = line.split('\t')
        num, rat = float(num), float(rat)
        for word in parse(kal):
            ratings, _ = words.get(word, ([], None))
            ratings.append(rat)  # in place, instead of copying the list
            words[word] = (ratings, num)
    return words


def _rating(ratings, rec):
    """Return the rating of one word, rounded to one decimal place.

    The rating is the mean of *ratings* scaled by a frequency factor, then
    reduced by STALE_FACTOR when *rec* is below RECENT_FROM.
    """
    freq = len(ratings)
    # True division: with Python 2 integer division the frequency term was
    # truncated to 0 for every word seen fewer than 20 times.
    rating = (sum(ratings) / freq) * ((freq / FREQ_DIVISOR) + FREQ_OFFSET)
    if rec < RECENT_FROM:
        rating = rating * STALE_FACTOR
    return round(rating, 1)


def _rank(words):
    """Return ``[(rating, [word, ...]), ...]`` ordered by descending rating.

    Words that share a rating are listed alphabetically so that the output
    does not depend on dict iteration order.
    """
    scored = sorted((_rating(ratings, rec), word)
                    for word, (ratings, rec) in words.items())
    ranked = [(rating, [word for _, word in group])
              for rating, group
              in itertools.groupby(scored, key=operator.itemgetter(0))]
    ranked.reverse()
    return ranked


def main():
    """Rank the words of the file named by ``sys.argv[1]`` and print them.

    One line is printed per distinct rating, highest first, in the form
    ``<rating>:  <word>, <word>, ...``.  Exits with a usage message when no
    file name is given.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE' % sys.argv[0])

    # 'with' guarantees the file is closed even if a record fails to parse.
    with open(sys.argv[1]) as f:
        words = _collect(f)

    for rating, group in _rank(words):
        # Two spaces after the colon reproduce the original "print p, q".
        print('%s:  %s' % (rating, ', '.join(group)))


if __name__ == '__main__':
    main()