#!/usr/bin/env python
"""
getlinks.py - HTML Link Extractor
Author: Sean B. Palmer, inamidst.com
License: GPL 2; share and enjoy!

Usage: %prog [options] <uri>
"""

# docparse() turns the module docstring above into the command-line usage
# text, and every docstring line containing a colon followed by a space is
# shown to the user - keep that in mind before adding lines to it.

import re
import sys

try:  # Python 3 module layout
    from html.entities import name2codepoint
    from html.parser import HTMLParser
    from urllib.parse import urljoin
    from urllib.request import urlopen
except ImportError:  # Python 2 module layout
    from htmlentitydefs import name2codepoint
    from HTMLParser import HTMLParser
    from urllib import urlopen
    from urlparse import urljoin

try:
    unichr
except NameError:  # Python 3 has no unichr(); chr() covers every code point
    unichr = chr

# Constructs that are blanked out before the lenient parser looks for anchors.
# The lazy DOTALL form stops at the first terminator and cannot backtrack
# exponentially on an unterminated construct, which the nested-quantifier
# form "(([^x]+|x(?!y))*)" does.
r_cdata = re.compile(r'(?s)<!\[CDATA\[(.*?)\]\]>')
r_comment = re.compile(r'(?s)<!--(.*?)-->')
r_pi = re.compile(r'(?s)<\?(\S+)[\t\n\r ]+(.*?)\?>')

# NOTE(review): the next two patterns were reconstructed from the way
# getanchor()/getattrs() use them; confirm against the upstream script.
# r_link stops right after "<a" so that getattrs() can consume the
# whitespace-led attributes that follow; the lookahead keeps <abbr>, <area>
# and friends from matching.
r_link = re.compile(r'(?i)<a(?=\s)')
# Unquoted attribute value: must not start with a quote, runs to space or ">".
r_openattr = re.compile(r'\s+(\w+)\s*=\s*([^\s\'">][^\s>]*)')
r_closedattr = re.compile(r'\s+(\w+)\s*=\s*[\'"]([^\'"]*)[\'"]')
r_entity = re.compile(r'&(#x[0-9A-Fa-f]+|#[0-9]+|[A-Za-z]+);')
# Runs of characters (or, for byte strings on Python 2, bytes) outside ASCII.
r_hibytes = re.compile(r'([^\x00-\x7f]+)')

# The stdlib HTML 4 entity table has no "apos"; add it so &apos; is expanded.
# This deliberately updates the shared stdlib table, as the script always did.
name2codepoint['apos'] = 0x27


def uribase(uri):
    """Return the "directory" of uri: no fragment, no query, trailing slash."""
    uri = uri.split('#')[0]
    uri = uri.split('?')[0]
    if uri.endswith('/'):
        return uri
    # Drop the last path segment and keep everything up to the final slash.
    return '/'.join(uri.split('/')[:-1]) + '/'


def entity(m):
    """Expand one r_entity match into the character it stands for.

    Handles hexadecimal (&#x41;), decimal (&#65;) and named (&amp;)
    references. Unknown names, surrogate code points and code points that
    chr()/unichr() rejects are returned unexpanded.
    """
    name = m.group(1)
    unexpanded = '&' + name + ';'
    if name.startswith('#x'):
        # int() copes with leading zeros, including an all-zero reference.
        codepoint = int(name[2:], 16)
    elif name.startswith('#'):
        codepoint = int(name[1:])
    elif name in name2codepoint:
        codepoint = name2codepoint[name]
    else:
        return unexpanded
    if 0xD800 <= codepoint <= 0xDFFF:
        # A lone surrogate cannot be UTF-8 encoded by uriencode() later on.
        return unexpanded
    try:
        return unichr(codepoint)
    except (ValueError, OverflowError):
        return unexpanded


def hexescape(m):
    """Percent-encode the run matched by r_hibytes, one %XX per UTF-8 byte."""
    chunk = m.group(1)
    if not isinstance(chunk, bytes):
        chunk = chunk.encode('utf-8')
    # bytearray yields integers on both Python 2 and Python 3.
    return ''.join('%%%02X' % byte for byte in bytearray(chunk))


def uriencode(uri):
    """Expand character references in uri and percent-encode non-ASCII.

    The result contains ASCII characters only and is returned as the
    interpreter's native str type.
    """
    expanded = r_entity.sub(entity, uri)
    return str(r_hibytes.sub(hexescape, expanded))


def _decode(data):
    """Return data as text: UTF-8 if it decodes cleanly, otherwise Latin-1."""
    if not isinstance(data, bytes):
        return data
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this cannot fail.
        return data.decode('latin-1')


def readuri(uri):
    """Read the whole resource named by uri and return it as text.

    '-' and '/dev/stdin' read standard input, anything containing a colon is
    opened with urlopen(), and everything else is treated as a local path.
    """
    if uri in ('-', '/dev/stdin'):
        # Prefer the binary stream where there is one so decoding is uniform.
        stream = getattr(sys.stdin, 'buffer', sys.stdin)
        return _decode(stream.read())
    if ':' in uri:
        f = urlopen(uri)
    else:
        f = open(uri, 'rb')
    try:
        data = f.read()
    finally:
        f.close()
    return _decode(data)


class LinkParser(object):
    """Lenient, regular-expression based extractor of <a href> links."""

    def __init__(self, uri, baseURI=None, match=None):
        """Remember where to read from and how to resolve and filter links.

        uri is the document to read, baseURI (default: uri) is what relative
        links are resolved against, and match is a prefix every reported link
        must start with; the special value '-' means uribase(baseURI).
        """
        self.uri = uri
        self.baseURI = baseURI or uri
        if match == '-':
            self.match = uribase(self.baseURI)
        else:
            self.match = match

    def getlinks(self):
        """Yield each accepted link of the document, in document order."""
        data = readuri(self.uri)
        # Remove CDATA, comments, and PIs
        for pattern in (r_cdata, r_comment, r_pi):
            data = pattern.sub(' ', data)
        self.data = data
        self.pos = 0
        while True:
            link = self.getanchor()
            if link is None:
                break
            if link is not False:
                yield link

    def getanchor(self):
        """Advance to the next anchor tag and report on it.

        Returns None when no anchor is left, False when the anchor has no
        href or its link does not start with self.match, and the absolute,
        encoded link otherwise.
        """
        m = r_link.search(self.data, self.pos)
        if not m:
            return None
        self.pos = m.end()
        attributes = self.getattrs()
        if 'href' in attributes:
            link = uriencode(urljoin(self.baseURI, attributes['href']))
            if (not self.match) or link.startswith(self.match):
                return link
        return False

    def getattrs(self):
        """Consume the attributes at self.pos; return {lowercased name: value}."""
        attributes = {}
        while True:
            # Try the unquoted form first, then the quoted one.
            m = (r_openattr.match(self.data, self.pos) or
                 r_closedattr.match(self.data, self.pos))
            if not m:
                break
            self.pos = m.end()
            attributes[m.group(1).lower()] = m.group(2)
        return attributes


class StrictLinkParser(HTMLParser):
    """Extractor of <a href> links built on the standard HTMLParser."""

    def __init__(self, uri, baseURI=None, match=None):
        """Take the same arguments as LinkParser; see LinkParser.__init__."""
        HTMLParser.__init__(self)
        self.uri = uri
        self.baseURI = baseURI or self.uri
        if match == '-':
            self.match = uribase(self.baseURI)
        else:
            self.match = match
        self.results = []

    def getlinks(self):
        """Parse the document, then yield the collected links in order."""
        self.parse()
        for uri in self.results:
            yield uri

    def parse(self):
        """Read self.uri and feed its contents to the parser."""
        self.feed(readuri(self.uri))

    def handle_starttag(self, tag, attrs):
        """Record the first href of an <a> tag if it passes the match filter."""
        if tag != 'a':
            return
        for (attr, value) in attrs:
            if attr != 'href':
                continue
            uri = uriencode(urljoin(self.baseURI, value))
            if (not self.match) or uri.startswith(self.match):
                self.results.append(uri)
            # Only the first href of a tag is considered.
            return


def getlinks(uri, opt):
    """Return an iterator over the links of uri.

    opt supplies the strict, base and match settings (see main()).
    """
    if opt.strict:
        LinkParserClass = StrictLinkParser
    else:
        LinkParserClass = LinkParser
    parser = LinkParserClass(uri, opt.base, opt.match)
    return parser.getlinks()


def docparse(doc):
    """Build optparse usage text from a module docstring.

    The text after 'Usage: ' comes first, followed by every other line that
    contains a colon and a space, in their original order.
    """
    result = []
    # doc is None when docstrings are stripped (python -OO).
    for line in (doc or '').splitlines():
        if line.startswith('Usage: '):
            result = [line[7:]] + result
        elif ': ' in line:
            result.append(line)
    return '\n'.join(result)


def main(argv=None):
    """Command-line entry point: print one link per line for a single URI."""
    from optparse import OptionParser
    parser = OptionParser(usage=docparse(__doc__))
    parser.add_option('-b', '--base', default=False, metavar='URI',
                      help='set the base URI of the input')
    parser.add_option('-m', '--match', default=False, metavar='URI',
                      help='links must start with this URI')
    parser.add_option('-s', '--strict', dest='strict', action='store_true',
                      default=False, help='use the stricter parser')
    opt, args = parser.parse_args(argv)

    if len(args) != 1:
        parser.error('Error: must have one URI argument')

    for link in getlinks(args[0], opt):
        try:
            sys.stdout.write(link + '\n')
        except IOError:
            # The reader went away (e.g. "| head"); stop quietly.
            break


if __name__ == "__main__":
    main()