#!/usr/bin/env python """ subset.py - Subset Discovery Author: Sean B. Palmer, inamidst.com """ import operator, itertools def intersection(sequences): """Naive list intersection.""" tally = sequences.next() for sequence in sequences: for item in tally: if item not in sequence: tally.remove(item) return tally class SetStorage(object): def __init__(self): self.database = {} self.instances = {} self.frequencies = {} self.counter = 0 def add(self, seq): """Add the sequence seq to the database.""" if not isinstance(seq, frozenset): seq = frozenset(seq) self.database[self.counter] = seq for item in seq: self.instances.setdefault(item, set()).add(self.counter) try: self.frequencies[item] += 1 except KeyError: self.frequencies[item] = 1 self.counter += 1 def lookup(self, seq): """Return sequences for which seq is a subset.""" if not isinstance(seq, frozenset): seq = frozenset(seq) g = ((self.frequencies[item], item) for item in seq) ordered = (self.instances[pair[1]] for pair in sorted(g)) for result in reduce(operator.__and__, ordered): yield self.database[result] def naive(self, seq): """Return sequences for which seq is a subset.""" if not isinstance(seq, frozenset): seq = frozenset(seq) ordered = (self.instances[item] for item in seq) for result in reduce(operator.__and__, ordered): yield self.database[result] class ListStorage(object): def __init__(self): self.database = {} self.instances = {} self.frequencies = {} self.counter = 0 def add(self, seq): """Add the sequence seq to the database.""" if not isinstance(seq, frozenset): seq = frozenset(seq) self.database[self.counter] = seq for item in seq: self.instances.setdefault(item, []).append(self.counter) try: self.frequencies[item] += 1 except KeyError: self.frequencies[item] = 1 self.counter += 1 def lookup(self, seq): """Return sequences for which seq is a subset.""" if not isinstance(seq, frozenset): seq = frozenset(seq) g = ((self.frequencies[item], item) for item in seq) ordered = (self.instances[pair[1]] for pair in sorted(g)) for result in intersection(ordered): yield self.database[result] def naive(self, seq): """Return sequences for which seq is a subset.""" if not isinstance(seq, frozenset): seq = frozenset(seq) ordered = (self.instances[item] for item in seq) for result in intersection(ordered): yield self.database[result] def test(Storage): s = Storage() s.add(['p', 'q', 'r', 's', 't']) s.add(['q', 'r', 's']) s.add(['a', 'b', 'r', 's', 't']) s.add(['p', 'q']) s.add(['a', 'p', 'b', 'q', 'c', 'r']) s.add(['s']) import time t = time.clock() for i in xrange(10000): for answer in s.lookup(['p', 'q', 'r']): pass print time.clock() - t t = time.clock() for i in xrange(10000): for answer in s.naive(['p', 'q', 'r']): pass print time.clock() - t def main(): import sys if sys.argv[1] in ('-t', '--test'): test(SetStorage) test(ListStorage) else: print __doc__.strip() if __name__ == '__main__': main()