#!/usr/bin/env python
"""
count.py - Count Kalusa Words
Author: Sean B. Palmer, inamidst.com

Usage: count.py CORPUS [BOUND]

Reads a tab-separated corpus file and prints, in sorted order, every word
found in the second column together with the number of times it occurs.
Only rows whose fourth column is a number >= BOUND (default 100) count.
"""

import re
import sys

# A word is a run of ASCII letters and apostrophes.
r_word = re.compile(r"[A-Za-z']+")
# A bracketed comment such as "[note]"; it must contain at least one char.
r_comment = re.compile(r'\[[^\]]+\]')


def parse(s):
    """Return the lower-cased words of *s*, ignoring bracketed comments.

    Each ``[...]`` comment is replaced by a space before matching, so the
    text on either side of it is never glued into one word.  Leading and
    trailing apostrophes are stripped from every word; apostrophes inside
    a word are kept.  A token made only of apostrophes therefore comes
    back as an empty string.
    """
    s = r_comment.sub(' ', s)
    return [word.lower().strip("'") for word in r_word.findall(s)]


def _count_words(lines, bound):
    """Return a dict mapping each word to its frequency in *lines*.

    *lines* is an iterable of corpus rows.  A row is used only when it has
    exactly four tab-separated fields after surrounding whitespace is
    stripped, and its fourth field, read as a float, is at least *bound*.
    Words are taken from the second field via ``parse``.

    Raises ValueError if the fourth field of a four-field row is not a
    number.
    """
    freq = {}
    for line in lines:
        line = line.strip(' \t\r\n')
        # Anything that is not exactly four fields is silently skipped.
        if line.count('\t') != 3:
            continue
        _num, kal, _eng, cq = line.split('\t')
        if float(cq) < bound:
            continue
        for word in parse(kal):
            freq[word] = freq.get(word, 0) + 1
    return freq


def main():
    """Count the words of the corpus named on the command line.

    ``sys.argv[1]`` is the corpus path; the optional ``sys.argv[2]`` is the
    integer bound (default 100).  Extra arguments are ignored.  One line
    per word, ``word (count)``, is printed in sorted order.
    """
    args = sys.argv[1:]
    if not args:
        # Previously this failed with an opaque tuple-unpacking ValueError.
        sys.exit('Usage: count.py CORPUS [BOUND]')
    corpus, bound = (args + ['100'])[:2]
    bound = int(bound)

    # The context manager closes the file even if a malformed row raises.
    with open(corpus) as f:
        freq = _count_words(f, bound)

    for word in sorted(freq):
        # A single pre-formatted string prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print('%s (%s)' % (word, freq[word]))


if __name__ == '__main__':
    main()