#!/usr/bin/env python
'Hypertext Topology Assistant'

# Walks a directory tree, extracts the links of every HTML file and reports
# which files each page points at, how often each target is referenced, and
# which non-HTML files nothing points at.
#
# Written for Python 2; the import fallbacks and name shims below also let
# the module load on Python 3.

import os
import re
import subprocess

try:  # Python 2 module names
    import HTMLParser
    import urlparse
    from htmlentitydefs import name2codepoint
except ImportError:  # the same modules under their Python 3 names
    import html.parser as HTMLParser
    import urllib.parse as urlparse
    from html.entities import name2codepoint

try:
    unicode, unichr
except NameError:  # Python 3: str is the text type and chr covers Unicode
    unicode, unichr = str, chr


class _NeverRaised(Exception):
    """Stand-in for HTMLParseError on interpreters that no longer define it."""


# HTMLParseError was removed from html.parser in Python 3.5.
HTMLParseError = getattr(HTMLParser, 'HTMLParseError', _NeverRaised)

# Per-file property cache, laid out as cache[name][key] -> value.
cache = {}


def get_cached(name, key):
    """Return property *key* of file *name*, computing it on first use.

    *key* names one of this module's property functions (``'title'``,
    ``'extension'``, ``'links'``, ...).  The function is called with *name*
    and its result is remembered in ``cache``.

    Raises ValueError when *key* is not purely alphabetic or does not name a
    callable defined in this module.
    """
    # Look the value up with the same layout set_cached() writes; probing a
    # flat (name, key) tuple key would never hit.
    entry = cache.get(name)
    if entry is not None and key in entry:
        return entry[key]
    if not key.isalpha():
        raise ValueError('Bad key: %s' % key)
    # Resolve the property function by name in the module namespace rather
    # than eval()-ing the key.
    compute = globals().get(key)
    if not callable(compute):
        raise ValueError('Bad key: %s' % key)
    value = compute(name)
    set_cached(name, key, value)
    return value


def set_cached(name, key, value):
    """Remember *value* as property *key* of *name* and return *value*.

    An already-stored value is kept; the first writer wins.
    """
    cache.setdefault(name, {}).setdefault(key, value)
    return value


def decode(bytes):
    """Decode a byte string to text, trying UTF-8 and then ISO-8859-1.

    Text that is already decoded is returned unchanged.
    """
    if isinstance(bytes, unicode):
        return bytes
    # NOTE(review): iso-8859-1 accepts every byte sequence, so the cp1252
    # fallback can never be reached; the order is kept to preserve output.
    for encoding in ('utf-8', 'iso-8859-1', 'cp1252'):
        try:
            return bytes.decode(encoding)
        except UnicodeDecodeError:
            continue


def encode(text):
    """Encode text to a byte string, preferring UTF-8.

    Values that are not text (for example byte strings) are returned as is.
    """
    if not isinstance(text, unicode):
        return text
    for encoding in ('utf-8', 'iso-8859-1', 'cp1252'):
        try:
            return text.encode(encoding)
        except UnicodeEncodeError:  # encoding raises Encode-, not DecodeError
            continue
    # Nothing could encode the text losslessly; degrade instead of failing.
    return text.encode('utf-8', 'replace')


def text(name):
    """Return the decoded contents of file *name*."""
    with open(name, 'rb') as f:
        data = f.read()
    return decode(data)


class TitleParser(HTMLParser.HTMLParser):
    """HTML parser that captures the text of the first <title> element.

    After feeding, ``title`` holds the collected text, or None when no
    complete <title>...</title> pair has been seen.
    """

    def __init__(self, *args, **kargs):
        # Explicit base call: HTMLParser is an old-style class on Python 2.
        HTMLParser.HTMLParser.__init__(self, *args, **kargs)
        self.title = None
        self.buf = None  # list of text pieces while inside <title>, else None

    def handle_starttag(self, tag, attrs):
        if tag == 'title':
            self.buf = []

    def handle_endtag(self, tag):
        # Ignore a stray </title> that has no matching start tag.
        if tag == 'title' and self.buf is not None:
            self.title = ''.join(self.buf)
            self.buf = None

    def handle_data(self, data):
        if self.buf is not None:
            self.buf.append(data)

    def handle_charref(self, name):
        if self.buf is None:
            return
        try:
            if name[:1] in ('x', 'X'):
                char = unichr(int(name[1:], 16))
            else:
                char = unichr(int(name))
        except (ValueError, OverflowError):
            char = '&#%s;' % name  # keep a malformed reference verbatim
        self.buf.append(char)

    def handle_entityref(self, name):
        if self.buf is None:
            return
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            char = '&%s;' % name  # unknown entity: keep it verbatim
        else:
            char = unichr(codepoint)
        self.buf.append(char)


def title(name):
    """Return a title for *name* as an encoded byte string of at most 79 bytes.

    HTML files use their <title> element (falling back to *name* itself),
    JPEG/PNG files use the 'Document Name' field reported by exiftool, and
    text files use their first line.  Returns None when no title is found.
    """
    # NOTE(review): the [:79] slices below cut encoded bytes, so a multi-byte
    # character at the limit can be split; kept as in the original.
    ext = get_cached(name, 'extension')
    if ext == 'html':
        p = TitleParser()
        with open(name, 'rb') as f:
            for line in f:
                try:
                    # Decode first so title text and entity characters are
                    # the same string type when they are joined.
                    p.feed(decode(line))
                except HTMLParseError:
                    continue
                if p.title is not None:
                    break
        if p.title is None:
            return name
        return set_cached(name, 'title', encode(p.title)[:79])
    elif ext in ('jpg', 'png'):
        # exif() returns None when exiftool cannot be started.
        data = get_cached(name, 'exif') or {}
        if 'Document Name' in data:
            found = encode(data['Document Name'])[:79]
            return set_cached(name, 'title', found)
    elif ext == 'txt':
        with open(name, 'rb') as f:
            line = f.readline()
        found = encode(decode(line).strip())[:79]
        return set_cached(name, 'title', found)
    return None


def modified(name):
    """Return and cache the modification time of *name*."""
    return set_cached(name, 'modified', os.path.getmtime(name))


def created(name):
    """Not implemented."""
    raise NotImplementedError('Unimplemented')


def extension(name):
    """Return and cache the extension of *name* without the leading dot.

    Only the final path component is considered, so a dot in a directory
    name is not mistaken for an extension; files without one yield ''.
    """
    ext = os.path.splitext(name)[1][1:]
    return set_cached(name, 'extension', ext)


def media(name):
    """Not implemented."""
    raise NotImplementedError('Unimplemented')


def bytes(name):
    """Return and cache the size of *name* in bytes.

    The name shadows the builtin ``bytes`` inside this module.
    """
    return set_cached(name, 'bytes', os.path.getsize(name))


def exif(name):
    """Return and cache the metadata exiftool reports for *name* as a dict.

    Returns None when the exiftool program cannot be started.
    """
    try:
        p = subprocess.Popen(['exiftool', name], stdout=subprocess.PIPE)
    except OSError:
        return None
    # communicate() drains the pipe and waits for the child to exit.
    output = p.communicate()[0]
    data = {}
    for raw in output.splitlines():
        line = decode(raw)
        if ': ' not in line:
            continue
        key, value = line.split(': ', 1)
        data[key.strip()] = value.strip()
    return set_cached(name, 'exif', data)


def dimensions(name):
    """Not implemented."""
    raise NotImplementedError('Unimplemented')


class LinkParser(HTMLParser.HTMLParser):
    """HTML parser that collects the href/src links of a document.

    ``links`` maps ``(attribute, uri)`` to ``[rel_tokens, rev_tokens]``,
    two sets, where attribute is 'href' or 'src'.  ``base`` is the URI that
    relative links are resolved against; a <base> tag replaces it.
    """

    def __init__(self, *args, **kargs):
        HTMLParser.HTMLParser.__init__(self, *args, **kargs)
        self.links = {}
        self.base = None

    def handle_starttag(self, tag, attrs):
        attrdict = dict(attrs)
        if tag != 'base':
            self.handle_link(attrdict)
        else:
            self.base = attrdict.get('href')

    def handle_link(self, attrdict):
        """Record the link carried by one tag's attributes, if any."""
        href = attrdict.get('href')
        src = attrdict.get('src')
        if not (href or src):
            return
        # Label the link with the attribute it came from.
        name = 'href' if href else 'src'
        uri = href if href else src
        uri = urlparse.urldefrag(uri)[0]
        if relative(uri):
            # Only local paths are normalised; normpath would collapse the
            # '//' of 'http://host/'.
            uri = os.path.normpath(uri)
        if self.base is not None:
            uri = urlparse.urljoin(self.base, uri)
        rels, revs = self.links.setdefault((name, uri), [set(), set()])
        # A valueless attribute (<a rel>) is reported as None.
        rels.update((attrdict.get('rel') or '').split())
        revs.update((attrdict.get('rev') or '').split())


def links(name):
    """Return and cache the links of HTML file *name*.

    The result is the ``LinkParser.links`` mapping, with relative links
    resolved against *name*.
    """
    p = LinkParser()
    p.base = name
    with open(name, 'rb') as f:
        for line in f:
            try:
                p.feed(decode(line))
            except HTMLParseError:
                continue
    return set_cached(name, 'links', p.links)


def exists(name):
    """Return and cache whether *name* exists on disk."""
    return set_cached(name, 'exists', os.path.exists(name))


def names(directory, depth):
    """Yield the normalised paths of files below *directory*.

    *depth* selects which files are reported:
      '*'     files directly inside *directory*
      '**'    files at any depth
      '**/*'  files inside subdirectories only

    Raises ValueError for any other *depth*; because this is a generator the
    error surfaces on first iteration.
    """
    if depth == '*':
        pattern = '[^/]*$'
    elif depth == '**':
        pattern = '.*$'
    elif depth == '**/*':
        pattern = '[^/]+/.*$'
    else:
        raise ValueError(depth)
    r_pattern = re.compile(pattern)
    directory = os.path.normpath(directory)
    for root, dirs, files in os.walk(directory):
        for name in files:
            path = os.path.normpath(os.path.join(root, name))
            # Match on the path relative to *directory*.  relpath handles
            # '.', '..' and dot-prefixed directories, which slicing off a
            # computed prefix does not.
            inner = os.path.relpath(path, directory)
            if os.sep != '/':
                inner = inner.replace(os.sep, '/')
            if r_pattern.match(inner):
                yield path


def absolute(uri):
    """Return True when *uri* contains a ':' (treated as a scheme marker)."""
    return ':' in uri


def relative(uri):
    """Return True when *uri* contains no ':'."""
    return ':' not in uri


def graph(directory, depth):
    """Build the link graph of the files selected by names().

    Returns ``(deps, popularity)``: *deps* maps each HTML file to the set of
    relative URIs it links to, and *popularity* maps every file and link
    target to the number of pages that link to it.
    """
    deps = {}
    popularity = {}
    for path in names(directory, depth):
        popularity.setdefault(path, 0)
        if get_cached(path, 'extension') != 'html':
            continue
        outbound = get_cached(path, 'links')
        nearby = set(uri for _, uri in outbound if relative(uri))
        deps[path] = nearby
        for uri in nearby:
            popularity[uri] = popularity.get(uri, 0) + 1
    return deps, popularity


def tree(directory, depth):
    """Summarise the link graph for reporting.

    Returns ``(pages, orphans)``.  *pages* is a sorted list of
    ``(page, [(target, count, missing), ...])`` where *count* is '(n)' when
    more than one page links to the target (otherwise None) and *missing* is
    '(X)' when the target does not exist (otherwise '').  *orphans* lists
    the non-HTML files that nothing links to.
    """
    deps, popularity = graph(directory, depth)
    pages = []
    for page in sorted(deps):
        outbounds = []
        for outbound in sorted(deps[page]):
            if outbound == '.':
                continue
            missing = '' if get_cached(outbound, 'exists') else '(X)'
            count = popularity.get(outbound, 0)
            count = ('(%s)' % count) if count > 1 else None
            outbounds.append((outbound, count, missing))
        pages.append((page, outbounds))
    orphans = [path for path, pop in sorted(popularity.items())
               if pop == 0 and get_cached(path, 'extension') != 'html']
    return pages, orphans


def _enter_base(owner, base):
    """Change into *base*, resolved from the directory of *owner*'s first call.

    *owner* is the reporting function whose ``base`` attribute remembers
    that starting directory between calls.
    """
    if not owner.base:
        owner.base = os.getcwd()
    else:
        os.chdir(owner.base)
    os.chdir(base)
    # Cache keys are paths relative to the working directory, so entries
    # collected under another directory no longer apply.
    cache.clear()


def display(directory, depth, base=None):
    """Print the link tree of *directory* as plain text.

    When *base* is given the working directory is changed to it first.
    """
    if base:
        _enter_base(display, base)
    deps, popularity = tree(directory, depth)
    for dep in deps:
        print(dep[0])
        for o in dep[1]:
            if o[1]:
                print(' '.join((' -', o[0], o[1], o[2])))
            else:
                print(' '.join((' -', o[0], o[2])))
        print('')
    for fn in popularity:
        print(fn)


display.base = None


def hypertext(directory, depth, base=None):
    """Print the link tree of *directory* (hypertext variant).

    When *base* is given the working directory is changed to it first.
    """
    # NOTE(review): the printed literals are empty and `prefix` is unused;
    # the markup this function emitted appears to have been stripped from
    # the source.  Restore it from the original file if available.
    if base:
        # Use this function's own remembered directory, not display's.
        _enter_base(hypertext, base)
    deps, popularity = tree(directory, depth)
    prefix = os.path.normpath(directory).strip('.') + '/'
    if deps:
        print('')
    if popularity:
        print('')


hypertext.base = None


def test():
    """Print a report for every entry matching '2*' in the current directory."""
    import glob
    # NOTE(review): the literals below look like headings whose markup was
    # stripped; the remaining text is reproduced as found.
    print('')
    for name in glob.glob('2*'):
        print('\n\n' + name + '\n\n')
        print('\n\nDirectory\n\n')
        hypertext(name, '*')
        print('\n\nSubdirectories\n\n')
        hypertext(name, '**/*')


if __name__ == '__main__':
    print(__doc__)
    test()