#!/usr/bin/env python
"""
mapper.py - Maps Out Sites
Author: Sean B. Palmer, inamidst.com

Walks a site root with the local ``inventory`` module and prints a sitemap
document for it on standard output.

Sitemap elements:

   * changefreq - how frequently the content at the URL is likely to change
   * lastmod - the time the content at the URL was last modified
   * loc - the URL location
   * priority - the priority of the page relative to other pages on the
     same site
   * url - this tag encapsulates the first four tags in this list
   * urlset - this tag encapsulates the first five tags in this list

NOTE(review): the entity references in encode() and the markup literals in
Sitemap.crawl()/Sitemap.uri() had been stripped to empty strings in the copy
under review. They are restored here from the element names listed above;
confirm the exact literals against a known-good copy.
"""

import os
import sys
import time

sys.path.append('dev/code')
import inventory

inventory.options = inventory.Options(['ignore', 'robots', 'strip'])

XMLNS = "http://www.google.com/schemas/sitemap/0.84"
MAX_URL_LENGTH = 2048
MAX_OUTPUT_BYTES = 10485760  # the figure in the documentation

# Order matters: '&' must be escaped first, otherwise the ampersands
# introduced by the later replacements would be escaped a second time.
_XML_ENTITIES = (
    ('&', '&amp;'),
    ("'", '&apos;'),
    ('"', '&quot;'),
    ('>', '&gt;'),
    ('<', '&lt;'),
)


def encode(text):
    """Return *text* with the five XML special characters entity-escaped."""
    for char, entity in _XML_ENTITIES:
        text = text.replace(char, entity)
    return text


def match(fn, text):
    """Return True if any line of the file named *fn* contains *text*.

    The file is always closed, including when a match returns early.
    """
    with open(fn) as f:
        for line in f:
            if text in line:
                return True
    return False


def _utf8_length(line):
    """Return the number of bytes *line* occupies when encoded as UTF-8."""
    if isinstance(line, bytes):
        return len(line)
    return len(line.encode('utf-8'))


def _derived_entry(prefix, source):
    """Return a new FileInfo at *prefix* + source.sitepath with source's mtime."""
    entry = inventory.FileInfo()
    entry.sitepath = prefix + source.sitepath
    entry.mtime = source.mtime
    return entry


class StringBuffer(list):
    """A list of output lines that supports ``buf += line`` to append one."""

    def __iadd__(self, line):
        # Append the line as a single item (list.__iadd__ would instead
        # extend the list with the string's individual characters).
        self.append(line)
        return self


class Sitemap(object):
    """Builds the sitemap for the files under *root*, served from *host*."""

    # Site paths that get an explicit <priority> element.
    PRIORITIES = {
        '/encnorm/': 0.6,
        '/sbp/': 0.7,
    }

    def __init__(self, host, root):
        self.host = host
        self.root = root
        self.xml = StringBuffer()

    def crawl(self):
        """Fill self.xml with the whole document, entries sorted by sitepath."""
        # NOTE(review): declaration and xmlns attribute are reconstructed.
        self.xml += '<?xml version="1.0" encoding="UTF-8"?>'
        self.xml += '<urlset xmlns="%s">' % XMLNS

        for path in sorted(self.paths(), key=lambda entry: entry.sitepath):
            self.uri(path)

        self.xml += '</urlset>'
        # NOTE(review): kept as an empty string, which makes output() end
        # with a blank line; confirm nothing else was meant to follow.
        self.xml += ''

    def paths(self, robots=True):
        """Yield one entry per distinct sitepath found under self.root.

        Besides each inventoried file, a '/inside' + sitepath entry is
        produced for every '.cgi' file and a '/list' + sitepath entry for
        every directory. A '.cgi' file whose source mentions PATH_INFO gets
        a trailing slash on its sitepath. *robots* is accepted for
        compatibility but is not used.
        """
        seen = set()
        opt = inventory.Options(['abspath', 'mtime', 'sitepath', 'type'])
        inventory.options |= opt

        for f in inventory.inventory(self.root):
            is_cgi = f.relpath.endswith('.cgi')
            if is_cgi and match(f.abspath, 'PATH_INFO'):
                f.sitepath += '/'

            # if (not f.sitepath.endswith('/') and
            #     metagen.exists(f.sitepath + '/')):
            #    f.sitepath += '/'
            # elif (f.sitepath.endswith('/') and
            #       metagen.exists(f.sitepath.rstrip('/'))):
            #    f.sitepath = f.sitepath.rstrip('/')

            candidates = [f]
            if is_cgi:
                candidates.append(_derived_entry('/inside', f))
            if f.type == inventory.Directory:
                candidates.append(_derived_entry('/list', f))

            for entry in candidates:
                if entry.sitepath not in seen:
                    seen.add(entry.sitepath)
                    yield entry

    def _absolute(self, path):
        """Return the escaped absolute URL for *path* on self.host.

        Raises ValueError if the result is longer than MAX_URL_LENGTH.
        """
        result = 'http://' + encode(self.host) + encode(path)
        if len(result) > MAX_URL_LENGTH:
            raise ValueError("Value too large: %s" % result)
        return result

    def uri(self, f):
        """Append the <url> element for entry *f* to self.xml.

        Raises ValueError if the entry's absolute URL is too long; in that
        case nothing is appended.
        """
        # Build the location first so a failure leaves no dangling <url>.
        location = self._absolute(f.sitepath)

        self.xml += ' <url>'
        # @@ changefreq
        # @@ lastmod -- not emitted yet. The draft helper was:
        #   def iso8601(unixtime):
        #      t = time.strftime('%Y-%m-%dT%M:%H:%S+00:00', unixtime)
        #      return encode(t)
        #   self.xml += ' <lastmod>' + iso8601(f.mtime) + '</lastmod>'
        # NOTE(review): before enabling it, the format needs %H:%M:%S (hours
        # and minutes are swapped) and time.strftime needs a struct_time
        # such as time.gmtime(unixtime), not a raw timestamp.
        self.xml += ' <loc>' + location + '</loc>'
        if f.sitepath in self.PRIORITIES:
            self.xml += ' <priority>%s</priority>' % self.PRIORITIES[f.sitepath]
        # @@ meta stuff
        # @@ priority
        self.xml += ' </url>'

    def output(self):
        """Print the document, one buffered line per output line.

        Raises ValueError before printing anything if the document would
        reach MAX_OUTPUT_BYTES, and UnicodeDecodeError on the first byte
        string that is not valid UTF-8.
        """
        linesep_len = len(os.linesep)
        size = sum(_utf8_length(line) + linesep_len for line in self.xml)
        if size >= MAX_OUTPUT_BYTES:
            raise ValueError("Output exceeds 10MB")

        for line in self.xml:
            if isinstance(line, bytes):
                # Test that the line is utf-8 encoded
                line.decode('utf-8')
            print(line)


def main():
    """Crawl the current directory as inamidst.com and print its sitemap."""
    mapper = Sitemap('inamidst.com', '.')
    mapper.crawl()
    mapper.output()


if __name__ == "__main__":
    main()