#!/usr/bin/env python
"""
getlinks.py - HTML Link Extractor
Author: Sean B. Palmer, inamidst.com
License: GPL 2; share and enjoy!
Usage: %prog [options]
"""
import sys, re, urllib, urlparse, htmlentitydefs
from HTMLParser import HTMLParser
# Markup patterns. NOTE(review): the original regex literals were corrupted
# (angle-bracket spans stripped, unbalanced parens, r_openattr missing
# entirely despite being used by LinkParser.getattrs) — reconstructed here.
# CDATA section: <![CDATA[ ... ]]>
r_cdata = re.compile(r'<!\[CDATA\[(([^\]]+|\](?!\]>))*)\]\]>')
# HTML comment, possibly spanning lines: <!-- ... -->
r_comment = re.compile(r'(?s)<!--.*?-->')
# Processing instruction: <?target data?>
r_pi = re.compile(r'<\?(\S+)[\t\n\r ]+(([^\?]+|\?(?!>))*)\?>')
# Opening of an anchor tag, up to (not including) the closing '>'
r_link = re.compile(r'(?i)<a[\t\n\r ]+([^>]*)')
# Attribute with an unquoted value, e.g. href=foo
r_openattr = re.compile(r'\s+(\w+)\s*=\s*([^\s>\'"][^\s>]*)')
# Attribute with a quoted value, e.g. href="foo" or href='foo'
r_closedattr = re.compile(r'\s+(\w+)\s*=\s*[\'"]([^\'"]*)[\'"]')
# Character or named entity reference: &#x27; &#39; &amp;
r_entity = re.compile(r'&(#x[0-9A-Fa-f]+|#[0-9]+|[A-Za-z]+);')
# One or more non-ASCII bytes, to be percent-encoded
r_hibytes = re.compile('([\x80-\xff]+)')
# &apos; is valid XML but absent from the HTML 4 entity table
htmlentitydefs.name2codepoint['apos'] = 0x27
def uribase(uri):
    """Return the directory portion of *uri*: fragment and query are
    stripped, and everything after the last '/' is dropped."""
    uri = uri.partition('#')[0].partition('?')[0]
    if uri.endswith('/'):
        return uri
    head = uri.rpartition('/')[0]
    return head + '/'
def entity(m):
    """Replace a matched HTML entity reference with its character.

    Unknown named entities are passed through unchanged.
    """
    name = m.group(1)
    if name.startswith('#x'):
        # Hexadecimal character reference, e.g. &#x27;
        # (int() accepts leading zeros; the old lstrip('0') crashed on &#x0;)
        return unichr(int(name[2:], 16))
    elif name.startswith('#'):
        # Decimal character reference, e.g. &#39;
        return unichr(int(name[1:]))
    elif name in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[name])
    else:
        return '&' + name + ';'
def hexescape(m):
    """Percent-encode every byte of the matched high-byte run."""
    escaped = ['%%%02X' % ord(ch) for ch in m.group(1)]
    return ''.join(escaped)
def uriencode(uri):
    """Resolve HTML entities in *uri*, then percent-encode non-ASCII bytes
    of its UTF-8 form."""
    decoded = r_entity.sub(entity, uri)
    utf8 = decoded.encode('utf-8')
    return r_hibytes.sub(hexescape, utf8)
def readuri(uri):
    """Return the contents of *uri*.

    '-' or '/dev/stdin' reads standard input; anything containing ':'
    is fetched over the network; otherwise it is opened as a local file.
    """
    if uri in ('-', '/dev/stdin'):
        return sys.stdin.read()
    elif ':' in uri:
        opener = urllib.urlopen
    else:
        opener = open
    f = opener(uri)
    try:
        # finally ensures the handle is closed even if read() raises
        # (the original leaked it on error)
        return f.read()
    finally:
        f.close()
class LinkParser(object):
    """Lenient link extractor: scans raw markup with regexes, so it
    tolerates malformed HTML that the strict parser would reject."""

    def __init__(self, uri, baseURI=None, match=None):
        self.uri = uri
        self.baseURI = baseURI or uri
        # match == '-' means: only links under the base URI's directory
        if match == '-':
            self.match = uribase(self.baseURI)
        else:
            self.match = match

    def getlinks(self):
        """Yield each matching link URI found in the document."""
        self.data = readuri(self.uri)
        # Remove CDATA, comments, and PIs so their contents can't be
        # mistaken for anchor tags.
        self.data = r_cdata.sub(' ', self.data)
        self.data = r_comment.sub(' ', self.data)
        self.data = r_pi.sub(' ', self.data)
        self.pos = 0
        while True:
            link = self.getanchor()
            if link is None:
                break
            if link is not False:
                yield link

    def getanchor(self):
        """Advance past the next anchor tag.

        Returns the resolved link, False for an anchor without a usable
        href (or one filtered out by self.match), or None at end of input.
        """
        m = r_link.search(self.data, self.pos)
        if not m:
            return None
        self.pos = m.end()
        attributes = self.getattrs()
        # 'in' replaces the deprecated dict.has_key()
        if 'href' in attributes:
            link = urlparse.urljoin(self.baseURI, attributes['href'])
            link = uriencode(link)
            if (not self.match) or link.startswith(self.match):
                return link
        return False

    def getattrs(self):
        """Consume attribute pairs at self.pos; return a dict keyed by
        lowercased attribute names."""
        attributes = {}
        while True:
            # Try an unquoted value first, then a quoted one.
            omatch = r_openattr.match(self.data, self.pos)
            if omatch:
                self.pos = omatch.end()
                attributes[omatch.group(1).lower()] = omatch.group(2)
                continue
            cmatch = r_closedattr.match(self.data, self.pos)
            if cmatch:
                self.pos = cmatch.end()
                attributes[cmatch.group(1).lower()] = cmatch.group(2)
            else:
                break
        return attributes
class StrictLinkParser(HTMLParser):
    """Strict link extractor built on HTMLParser; only well-formed
    markup is accepted."""

    def __init__(self, uri, baseURI=None, match=None):
        HTMLParser.__init__(self)
        self.uri = uri
        self.baseURI = baseURI or self.uri
        # match == '-' means: only links under the base URI's directory
        self.match = uribase(self.baseURI) if match == '-' else match
        self.results = []

    def getlinks(self):
        """Parse the document, then yield each collected link URI."""
        self.parse()
        for result in self.results:
            yield result

    def parse(self):
        """Fetch the document and feed it through HTMLParser."""
        self.feed(readuri(self.uri))

    def handle_starttag(self, tag, attrs):
        """Record the href of each <a> tag, filtered by self.match."""
        if tag != 'a':
            return
        for name, value in attrs:
            if name != 'href':
                continue
            uri = uriencode(urlparse.urljoin(self.baseURI, value))
            if self.match and not uri.startswith(self.match):
                return
            self.results.append(uri)
            return
def getlinks(uri, opt):
    """Instantiate the parser selected by opt.strict and return its
    link generator."""
    cls = StrictLinkParser if opt.strict else LinkParser
    return cls(uri, opt.base, opt.match).getlinks()
def docparse(doc):
    """Build an optparse usage string from the module docstring.

    'Usage: ' lines (sans prefix) come first; other 'Key: value' lines
    follow in order of appearance.
    """
    usage, body = [], []
    for line in doc.splitlines():
        if line.startswith('Usage: '):
            usage.insert(0, line[7:])
        elif ': ' in line:
            body.append(line)
    return '\n'.join(usage + body)
def main(argv=None):
from optparse import OptionParser
parser = OptionParser(usage=docparse(__doc__))
parser.add_option('-b', '--base', default=False, metavar='URI',
help='set the base URI of the input')
parser.add_option('-m', '--match', default=False, metavar='URI',
help='links must start with this URI')
parser.add_option('-s', '--strict', dest='strict',
action='store_true', default=False,
help='use the stricter parser')
opt, args = parser.parse_args(argv)
if len(args) == 1:
for link in getlinks(args[0], opt):
try: print link
except IOError: break
else: parser.error('Error: must have one URI argument')
# Run only when executed as a script, not when imported as a module.
if __name__=="__main__":
    main()