#!/usr/bin/env python
"""
scrape.py - Archive Yahoo! Groups Emails
Author: Sean B. Palmer, inamidst.com
Requires: http://wwwsearch.sourceforge.net/ClientCookie/
   (imported below through the ``mechanize`` package)

Usage: ./scrape.py groupname 1 50
   - scrapes and archives posts 1 to 50 inclusive
"""

import re
import sys
import time

try:  # Python 2 module name
    from htmlentitydefs import name2codepoint
except ImportError:  # Python 3 module name
    from html.entities import name2codepoint

try:
    import mechanize as ClientCookie
except ImportError:
    # Keep entity()/text() importable without the HTTP dependency;
    # archive() reports the missing package when it is actually needed.
    ClientCookie = None

# Python 3 has no unichr/xrange; alias them to their equivalents.
try:
    unichr
except NameError:
    unichr = chr

try:
    xrange
except NameError:
    xrange = range

# URL template: filled with (group name, message number).
message = 'http://groups.yahoo.com/group/%s/message/%s?source=1'

# Any markup tag.
r_tag = re.compile(r'<[^>]+>')
# A literal '=20' at the end of a line (looks like a quoted-printable
# trailing space); text() replaces it with a real space.
r_space = re.compile(r'(?m)=20$')
# An entity or character reference; group 1 is the part between '&' and ';'.
r_entity = re.compile(r'&([^;\s]+);')
# NOTE(review): this pattern has nothing around its lazy capture group, so it
# matches the empty string at offset 0 of any input: archive() never takes its
# "no source" branch and every archived post is empty. Presumably the markup
# that delimits the message source was lost from the pattern - restore it from
# the real page before relying on this script (TODO confirm).
# It is a bytes pattern because it is applied to the raw, undecoded page.
r_source = re.compile(br'(?is)(.*?)')


def entity(match):
    """Return the replacement text for one ``r_entity`` match.

    Numeric references (``&#65;``, ``&#x41;``) become the character with
    that code point. Named entities are looked up in ``name2codepoint``,
    first with the exact spelling (entity names are case-sensitive, e.g.
    ``Eacute`` and ``eacute`` are different characters) and then
    lower-cased, so that spellings such as ``&AMP;`` keep resolving.

    Anything that cannot be resolved - an unknown name, or a numeric
    reference that is malformed or out of range - is returned as the
    lower-cased name wrapped in square brackets instead of raising.
    """
    name = match.group(1)
    lowered = name.lower()
    placeholder = '[' + lowered + ']'

    try:
        if lowered.startswith('#x'):
            return unichr(int(lowered[2:], 16))
        if lowered.startswith('#'):
            return unichr(int(lowered[1:]))
    except (ValueError, OverflowError):
        # Not a number, or not a valid code point: keep going with the rest
        # of the document rather than aborting the whole archive run.
        return placeholder

    for candidate in (name, lowered):
        if candidate in name2codepoint:
            return unichr(name2codepoint[candidate])
    return placeholder


def text(html):
    """Convert an HTML fragment to plain text.

    ``html`` is normally the raw page bytes, which are decoded as
    ISO-8859-1; text that is already decoded is used as it is. Tags are
    removed, entity references are resolved with ``entity`` and a trailing
    ``=20`` on any line is turned into a space. Returns the decoded text.
    """
    if isinstance(html, bytes):
        html = html.decode('iso-8859-1')
    without_tags = r_tag.sub('', html)
    resolved = r_entity.sub(entity, without_tags)
    return r_space.sub(' ', resolved)


def archive(group, i, padding):
    """Fetch message ``i`` of ``group`` and save it as a UTF-8 text file.

    The file is written to the current directory and named after ``i``,
    zero-padded to ``padding`` digits, with a ``.txt`` suffix. Progress and
    warnings go to stderr.

    Raises ImportError if the ``mechanize`` package is not installed.
    """
    if ClientCookie is None:
        raise ImportError('scrape.py needs the mechanize package to fetch pages')

    uri = message % (group, i)
    response = ClientCookie.urlopen(uri)
    try:
        page = response.read()
    finally:
        # Close the connection even when the read fails.
        response.close()

    found = r_source.search(page)
    if not found:
        sys.stderr.write('Warning: no source in %s:%s\n' % (group, i))
        return

    post = text(found.group(1)).strip(' \t\r\n') + '\n'
    fn = '%%0%si.txt' % padding
    # Binary mode: the data is already encoded, so it must be written
    # byte-for-byte without newline translation.
    with open(fn % i, 'wb') as f:
        f.write(post.encode('utf-8'))
    sys.stderr.write('Archived %s:%s\n' % (group, i))


def main():
    """Archive the inclusive range of messages named on the command line."""
    args = sys.argv[1:]
    if len(args) != 3:
        sys.exit('Usage: ./scrape.py groupname lower upper')

    group, lower, upper = args
    first, last = int(lower), int(upper)
    # File-name width is the digit count of upper + 1, computed once; this is
    # the scheme the script has always used, so existing archives still match.
    padding = len(str(last + 1))
    for i in xrange(first, last + 1):
        archive(group, i, padding)
        time.sleep(1)  # pause between requests


if __name__ == "__main__":
    main()