#!/usr/bin/env python
"""
getlinks.py - HTML Link Extractor
Author: Sean B. Palmer, inamidst.com
License: GPL 2; share and enjoy!
Usage: %prog [options] <uri>
"""

import sys, re, urllib, urlparse, htmlentitydefs
from HTMLParser import HTMLParser

# Markup that is blanked out before anchors are searched for. The repeated
# alternatives consume one character per iteration (no nested "+" inside the
# "*"), so an unterminated CDATA section or processing instruction fails in
# linear time instead of backtracking exponentially.
r_cdata = re.compile(r'<!\[CDATA\[((?:[^\]]|\](?!\]>))*)\]\]>')
r_comment = re.compile(r'<!--((?:[^-]|(?:-[^-]))*)-->')
r_pi = re.compile(r'<\?(\S+)[\t\n\r ]+(([^\?]|\?(?!>))*)\?>')

# Start of an anchor tag: "<a" followed by whitespace, any letter case.
r_link = re.compile(r'(?i)<a(?=[ \t\r\n])')

# Unquoted attribute: group 1 is the name, group 2 the value. The lookahead
# sits directly after "=" so that optional whitespace before a quote cannot be
# backtracked over; `href = "x"` is therefore left for r_closedattr instead of
# being captured here as an empty value.
r_openattr = re.compile(r'\s+(\w+)\s*=(?!\s*[\'"])\s*([^ \t\r\n<>]*)')

# Quoted attribute: group 1 is the name, group 2 the value. The lookbehinds
# tie the value to the quote character that opened it, so a value may contain
# the other kind of quote (href="o'brien.html") without being cut short.
r_closedattr = re.compile(
   r'\s+(\w+)\s*=\s*[\'"]((?<=")[^"]*(?=")|(?<=\')[^\']*(?=\'))[\'"]'
)

# Character reference: hexadecimal (#x...), decimal (#...), or named.
r_entity = re.compile(r'&(#x[0-9A-Fa-f]+|#[0-9]+|[A-Za-z]+);')

# One or more consecutive characters in the range 0x80-0xFF.
r_hibytes = re.compile('([\x80-\xff]+)')

# Register &apos; (U+0027) in the stdlib entity table that entity() consults.
# This mutates the shared htmlentitydefs module for the whole process.
htmlentitydefs.name2codepoint['apos'] = 0x27

def uribase(uri):
   """Return the "directory" part of uri, always ending in a slash.

   The fragment and query string are discarded first. A URI that already
   ends in "/" is returned as is; otherwise everything after the last "/"
   is dropped. A URI consisting only of scheme and authority, such as
   "http://example.org", gets a trailing slash appended rather than being
   cut back to "http://". A string containing no slash at all yields "/".
   """
   uri = uri.split('#')[0]
   uri = uri.split('?')[0]

   if uri.endswith('/'):
      return uri
   # The only slashes are the "//" introducing the authority: there is no
   # path segment to remove, so the base is the authority itself.
   if re.match(r'[A-Za-z][A-Za-z0-9+.\-]*://[^/]*$', uri):
      return uri + '/'
   parts = uri.split('/')
   parts.pop()
   return '/'.join(parts) + '/'

def entity(m):
   """Replacement callback for r_entity: decode one character reference.

   m is a match whose group 1 holds the text between "&" and ";". Numeric
   references (hexadecimal "#x..." or decimal "#...") and names found in
   htmlentitydefs.name2codepoint are returned as the corresponding
   character. Unknown names, and numeric references whose code point cannot
   be represented, are returned unchanged as "&...;".
   """
   name = m.group(1)
   try:
      # int() accepts leading zeros by itself; stripping them first would
      # turn an all-zero reference such as "&#0;" into int('') and raise.
      if name.startswith('#x'):
         return unichr(int(name[2:], 16))
      elif name.startswith('#'):
         return unichr(int(name[1:]))
   except (ValueError, OverflowError):
      # Code point out of range: keep the reference text as written.
      return '&' + name + ';'
   if name in htmlentitydefs.name2codepoint:
      return unichr(htmlentitydefs.name2codepoint[name])
   return '&' + name + ';'

def hexescape(m):
   """Replacement callback: percent-escape every character of group 1.

   Each character of the matched run becomes "%XX", where XX is its ordinal
   as two upper-case hexadecimal digits.
   """
   escaped = []
   for char in m.group(1):
      escaped.append('%%%02X' % ord(char))
   return ''.join(escaped)

def uriencode(uri):
   """Return uri as an ASCII-only byte string.

   Character references are decoded with entity(), the result is encoded
   as UTF-8, and every byte in the range 0x80-0xFF is percent-escaped.
   """
   if isinstance(uri, str):
      # In Python 2 this is a byte string. Raw high bytes must be escaped
      # before anything else: both mixing them with the unicode that
      # entity() returns and calling .encode() on them trigger an implicit
      # ASCII decode, which raises UnicodeDecodeError.
      uri = r_hibytes.sub(hexescape, uri)
   uri = r_entity.sub(entity, uri)
   uri = uri.encode('utf-8')
   return r_hibytes.sub(hexescape, uri)

def readuri(uri):
   """Return the full content found at uri.

   "-" and "/dev/stdin" read standard input. Any other value containing a
   colon is opened with urllib.urlopen; everything else is opened as a
   local file path. Errors raised while opening or reading propagate to
   the caller.
   """
   if uri in ('-', '/dev/stdin'):
      return sys.stdin.read()

   if ':' in uri:
      opener = urllib.urlopen
   else: opener = open

   f = opener(uri)
   # Close the handle even when read() fails. try/finally works for both
   # kinds of handle, whether or not they support the "with" statement.
   try:
      return f.read()
   finally:
      f.close()

class LinkParser(object):
   """Lenient, regular-expression based extractor of <a href> links.

   uri is the document to read. baseURI, when given and true, is the base
   that hrefs are resolved against; otherwise uri is used. match, when
   true, is a prefix every reported link must start with; the special
   value "-" stands for uribase() of the base URI.
   """

   def __init__(self, uri, baseURI=None, match=None):
      self.uri = uri
      self.baseURI = baseURI or uri
      self.match = match
      if match == '-':
         self.match = uribase(self.baseURI)

   def getlinks(self):
      """Read the document and yield each accepted link in document order."""
      text = readuri(self.uri)

      # Blank out CDATA sections, comments and processing instructions so
      # that anchors inside them are not reported.
      for pattern in (r_cdata, r_comment, r_pi):
         text = pattern.sub(' ', text)
      self.data = text

      self.pos = 0

      # getanchor() returns None once no further anchor exists, and False
      # for an anchor that produced no acceptable link.
      for link in iter(self.getanchor, None):
         if link is not False:
            yield link

   def getanchor(self):
      """Advance past the next anchor tag and report on it.

      Returns None when no anchor remains, False when the anchor has no
      href or its link fails the match prefix, and otherwise the resolved,
      encoded link.
      """
      found = r_link.search(self.data, self.pos)
      if not found:
         return None
      self.pos = found.end()

      attributes = self.getattrs()
      if 'href' not in attributes:
         return False

      link = uriencode(urlparse.urljoin(self.baseURI, attributes['href']))
      if self.match and not link.startswith(self.match):
         return False
      return link

   def getattrs(self):
      """Collect attributes starting at self.pos into a dict.

      Keys are lower-cased attribute names; a repeated name keeps its last
      value. self.pos is left just after the last attribute recognised.
      """
      attributes = {}
      while True:
         # The unquoted form is tried first, then the quoted one.
         hit = (r_openattr.match(self.data, self.pos)
                or r_closedattr.match(self.data, self.pos))
         if not hit:
            break
         self.pos = hit.end()
         attributes[hit.group(1).lower()] = hit.group(2)
      return attributes

class StrictLinkParser(HTMLParser):
   """Extractor of <a href> links built on the stdlib HTMLParser.

   Takes the same constructor arguments as LinkParser: the document uri,
   an optional base URI (defaulting to uri), and an optional prefix that
   links must start with ("-" stands for uribase() of the base URI).
   Accepted links accumulate in self.results.
   """

   def __init__(self, uri, baseURI=None, match=None):
      HTMLParser.__init__(self)
      self.uri = uri
      self.baseURI = baseURI or uri
      self.match = match
      if match == '-':
         self.match = uribase(self.baseURI)
      self.results = []

   def getlinks(self):
      """Parse the whole document, then yield the collected links."""
      self.parse()
      for link in self.results:
         yield link

   def parse(self):
      """Read the document at self.uri and feed it to the parser."""
      self.feed(readuri(self.uri))

   def handle_starttag(self, tag, attrs):
      """Record the link of an <a> tag; other tags are ignored.

      Only the first href attribute of the tag is considered.
      """
      if tag != 'a':
         return
      for (name, value) in attrs:
         if name != 'href':
            continue
         uri = uriencode(urlparse.urljoin(self.baseURI, value))
         if (not self.match) or uri.startswith(self.match):
            self.results.append(uri)
         return

def getlinks(uri, opt):
   """Return an iterator over the links found in the document at uri.

   opt supplies three attributes: strict selects StrictLinkParser instead
   of LinkParser, while base and match are passed on to the chosen parser.
   """
   if opt.strict:
      parser_class = StrictLinkParser
   else:
      parser_class = LinkParser
   return parser_class(uri, opt.base, opt.match).getlinks()

def docparse(doc):
   """Build usage text from a docstring made of "Key: value" lines.

   A line starting with "Usage: " contributes the text after that prefix
   and is moved to the front. Any other line containing ": " is kept whole,
   in its original order. All remaining lines are dropped. The selected
   lines are joined with newlines.
   """
   usage = []
   fields = []
   for line in doc.splitlines():
      if line.startswith('Usage: '):
         # Each further usage line goes in front of the earlier ones.
         usage.insert(0, line[7:])
      elif ': ' in line:
         fields.append(line)
   return '\n'.join(usage + fields)

def main(argv=None):
   """Command-line entry point: print every link of one document.

   argv is handed to optparse, which falls back to sys.argv[1:] when it is
   None. Exactly one positional URI argument is required; otherwise the
   option parser reports an error. Output stops quietly at the first
   IOError raised while writing.
   """
   from optparse import OptionParser

   parser = OptionParser(usage=docparse(__doc__))
   parser.add_option('-b', '--base', metavar='URI', default=False,
                     help='set the base URI of the input')
   parser.add_option('-m', '--match', metavar='URI', default=False,
                     help='links must start with this URI')
   parser.add_option('-s', '--strict', action='store_true', dest='strict',
                     default=False, help='use the stricter parser')
   opt, args = parser.parse_args(argv)

   if len(args) != 1:
      parser.error('Error: must have one URI argument')
      return

   for link in getlinks(args[0], opt):
      try:
         sys.stdout.write(link + '\n')
      except IOError:
         break

# Run main() only when executed as a script, not when imported as a module.
if __name__=="__main__": 
   main()