#!/usr/bin/env python
"""
scrape.py - Archive Yahoo! Groups Emails
Author: Sean B. Palmer, inamidst.com
Requires: mechanize (imported below as ClientCookie);
  see http://wwwsearch.sourceforge.net/ClientCookie/
Usage:
./scrape.py groupname 1 50
- scrapes and archives posts 1 to 50 inclusive
"""
import sys, re, time
from htmlentitydefs import name2codepoint
import mechanize as ClientCookie
# URL template for one message page; archive() fills in the group name and
# the message number. NOTE(review): "source=1" presumably selects the raw
# message view -- confirm against the site.
message = 'http://groups.yahoo.com/group/%s/message/%s?source=1'
# Any markup tag; text() deletes every match.
r_tag = re.compile(r'<[^>]+>')
# A literal "=20" at the end of a line ((?m) makes $ match at each line end);
# text() replaces it with a single space.
r_space = re.compile(r'(?m)=20$')
# A character reference such as "&amp;" or "&#65;"; group 1 is the part
# between "&" and ";", which entity() resolves.
r_entity = re.compile(r'&([^;\s]+);')
# Pattern archive() uses to locate the message source in the fetched page;
# group 1 is the fragment handed to text().
# NOTE(review): this literal looks damaged -- a single-quoted raw string
# cannot span two lines, and the HTML delimiters the pattern presumably
# matched appear to be missing. Restore it from the upstream script before
# relying on it.
r_source = re.compile(r'(?is)
(.*?) | ')
def entity(match):
    """Return the replacement text for one matched character reference.

    ``match`` is a match object whose group 1 holds the reference without
    the surrounding ``&`` and ``;`` (for example ``amp``, ``#65`` or
    ``#x41``).

    Returns the referenced character as a unicode string.  References that
    cannot be resolved -- unknown names, malformed numbers, or code points
    that ``unichr`` rejects -- are returned as the lower-cased reference
    wrapped in square brackets, e.g. ``[nosuch]``.
    """
    raw = match.group(1)
    value = raw.lower()
    if value.startswith('#'):
        try:
            if value.startswith('#x'):
                return unichr(int(value[2:], 16))
            return unichr(int(value[1:]))
        except (ValueError, OverflowError):
            # Malformed digits or an out-of-range code point: use the
            # bracketed placeholder instead of aborting the whole run.
            pass
    else:
        # Entity names are case-sensitive (Aacute vs. aacute), so try the
        # name exactly as written before falling back to lower case.
        for name in (raw, value):
            if name in name2codepoint:
                return unichr(name2codepoint[name])
    return '[' + value + ']'
def text(html):
    """Turn an HTML fragment (a byte string) into plain unicode text.

    The input is decoded as ISO-8859-1, every tag matched by ``r_tag`` is
    removed, character references are resolved through ``entity()``, and
    each ``=20`` at the end of a line becomes a single space.
    """
    decoded = unicode(html, 'iso-8859-1')
    # Drop the markup first, then expand what is left of the references.
    without_tags = r_tag.sub('', decoded)
    expanded = r_entity.sub(entity, without_tags)
    return unicode(r_space.sub(' ', expanded))
def archive(group, i, padding):
    """Fetch message number ``i`` of ``group`` and save it as a text file.

    The page is requested from the URL built from the module-level
    ``message`` template.  When ``r_source`` finds the message source in
    the response, it is converted with ``text()``, stripped of surrounding
    whitespace, terminated with a single newline and written UTF-8 encoded
    to a file named after ``i`` zero-padded to ``padding`` digits, with a
    ``.txt`` suffix (relative to the current directory).  A progress line
    or a warning is printed to stderr.

    Errors raised while fetching or writing are not caught here; the URL
    handle and the output file are closed even when such an error occurs.
    """
    uri = message % (group, i)
    u = ClientCookie.urlopen(uri)
    try:
        page = u.read()
    finally:
        # Release the connection even if the read fails part-way.
        u.close()

    m = r_source.search(page)
    if not m:
        print >> sys.stderr, 'Warning: no source in %s:%s' % (group, i)
        return

    post = text(m.group(1)).strip(' \t\r\n') + '\n'
    # Two-step formatting: first build e.g. '%03i.txt', then fill in i.
    fn = '%%0%si.txt' % padding
    f = open(fn % i, 'w')
    try:
        f.write(post.encode('utf-8'))
    finally:
        f.close()
    print >> sys.stderr, 'Archived %s:%s' % (group, i)
def main():
    """Archive every message in the range given on the command line.

    Expects exactly three arguments after the script name: the group name
    followed by the first and last message numbers (both inclusive).
    """
    group, lower, upper = sys.argv[1:]
    first = int(lower)
    last = int(upper)
    # File-name width is derived from the upper bound plus one.
    width = len(str(last + 1))
    for number in xrange(first, last + 1):
        archive(group, number, width)
        # Wait one second after each message before fetching the next.
        time.sleep(1)
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()