#!/usr/bin/env python
"""
mergembox.py - Merge mbox files, stripping duplicates
Author: Sean B. Palmer, inamidst.com
Based on: http://home.ccil.org/~cowan/mergembox
"""

import fileinput
import hashlib
import sys


def messages(input):
    """Generate a sequence of message line lists from input.

    ``input`` is any iterable of lines. A new message starts at every line
    beginning with ``'From '``; each message is yielded as a list of its
    lines, unmodified. Lines that precede the first ``'From '`` line are
    yielded as a message of their own. Empty input yields nothing.
    """
    message = []
    for line in input:
        # Only split once something has been collected, so the very first
        # 'From ' line does not produce an empty leading message.
        if message and line.startswith('From '):
            yield message
            message = []
        message.append(line)
    if message:
        yield message


def uri(msg):
    """Get a Message-ID or MD5 URI from a message.

    ``msg`` is a list of lines as produced by ``messages()``. Returns
    ``'mid:<id>'`` built from the first non-empty Message-ID header, or
    ``'md5:<hexdigest>'`` of the complete message text when there is none.
    """
    prefix = 'message-id:'
    for line in msg:
        if not line.strip('\r\n'):
            # A blank line ends the header section; anything that looks
            # like a Message-ID after it belongs to the body (for example
            # a quoted or forwarded message) and must not be used.
            break
        # Header field names are case-insensitive ("Message-Id" is common).
        if line[:len(prefix)].lower() == prefix:
            msgid = line[len(prefix):].strip(' \t\r\n<>')
            if msgid:
                return 'mid:' + msgid
            # An empty value would make unrelated messages collide on the
            # bare 'mid:' key, so keep looking and fall back to the digest.
    data = ''.join(msg)
    if not isinstance(data, bytes):
        # hashlib needs bytes; backslashreplace keeps lone surrogates
        # distinct instead of failing or collapsing them.
        data = data.encode('utf-8', 'backslashreplace')
    return 'md5:' + hashlib.md5(data).hexdigest()


def main(input=None):
    """Print out all input messages, sans duplicates.

    ``input`` is an iterable of lines; when it is None, ``fileinput.input()``
    is used. The first message seen for each URI (see ``uri()``) is written
    to stdout unchanged. Each later message with the same URI is dropped
    and reported on stderr as ``<source>:<message index>:duplicate id <uri>``.
    """
    if input is None:
        input = fileinput.input()
    # Ask the input object itself for the current file name instead of the
    # module-level fileinput.filename(), which raises RuntimeError when the
    # caller supplied an iterable that is not the active fileinput stream.
    filename = getattr(input, 'filename', None)
    seen = set()
    for msgno, msg in enumerate(messages(input)):
        msguri = uri(msg)
        if msguri in seen:
            source = filename() if callable(filename) else '<input>'
            args = (source, msgno, msguri)
            sys.stderr.write("%s:%s:duplicate id %s\n" % args)
            continue
        seen.add(msguri)
        sys.stdout.writelines(msg)


if __name__ == "__main__":
    main()