#!/usr/bin/env python
"""
mapper.py - Maps Out Sites
Author: Sean B. Palmer, inamidst.com
* changefreq - how frequently the content at the URL is likely to change
* lastmod - the time the content at the URL was last modified
* loc - the URL location
* priority - the priority of the page relative to other pages on the same site
* url - this tag encapsulates the first four tags in this list
* urlset - this tag encapsulates the first five tags in this list
"""
import sys, os, time
# Add 'dev/code' to the import search path before importing `inventory`
# (presumably the directory holding the project-local inventory module;
# the path is relative, so it depends on the working directory -- confirm).
sys.path.append('dev/code')
import inventory
# Install the module-wide inventory options used by this script.
# NOTE(review): the meaning of 'ignore', 'robots' and 'strip' is defined by
# the inventory module, which is not visible from this file.
inventory.options = inventory.Options(['ignore', 'robots', 'strip'])
def encode(text):
    """Escape the five XML-reserved characters in *text* as entity references.

    Args:
        text: the string to escape.

    Returns:
        A copy of *text* in which ``&``, ``'``, ``"``, ``>`` and ``<`` are
        replaced by their named XML entity references.
    """
    # The ampersand must be handled first: every later replacement inserts
    # a new '&', which must not be escaped a second time.
    replacements = (
        ('&', 'amp'),
        ("'", 'apos'),
        ('"', 'quot'),
        ('>', 'gt'),
        ('<', 'lt'),
    )
    for char, name in replacements:
        text = text.replace(char, '&' + name + ';')
    return text
def match(fn, text):
    """Return True if any line of the file *fn* contains *text*.

    Args:
        fn: path of the file to scan.
        text: substring to look for.

    Returns:
        True as soon as a line containing *text* is found, otherwise False.
    """
    # The context manager closes the file on every exit path, including the
    # early return taken when a matching line is found.
    with open(fn) as f:
        for line in f:
            if text in line:
                return True
    return False
class StringBuffer(list):
    """A list of output lines whose ``+=`` appends one whole item.

    A plain list would extend itself with the individual characters of a
    string operand; here ``buf += 'text'`` stores 'text' as a single element.
    """

    def __iadd__(self, line):
        # Store the operand as one element and hand back the same object so
        # the augmented assignment rebinds the name to this buffer.
        list.append(self, line)
        return self
class Sitemap(object):
    """Builds a Google Sitemap (schema 0.84) document for one site.

    Typical use is ``crawl()`` to assemble the XML lines in ``self.xml``
    followed by ``output()`` to validate and print them.
    """

    def __init__(self, host, root):
        """Remember the site host name and the directory to inventory.

        Args:
            host: host name placed after 'http://' in every <loc> URL.
            root: directory passed to ``inventory.inventory()``.
        """
        self.host = host
        self.root = root
        # Output document, one element per line.
        self.xml = StringBuffer()

    def crawl(self):
        """Fill ``self.xml`` with the complete sitemap document.

        Emits the XML declaration and the opening <urlset> tag, then one
        <url> entry per record yielded by ``paths()`` in ascending sitepath
        order, then the closing tag and a final empty line.
        """
        xmlns = "http://www.google.com/schemas/sitemap/0.84"
        self.xml += '<?xml version="1.0" encoding="UTF-8"?>'
        self.xml += '<urlset xmlns="%s">' % xmlns

        # A key function orders records exactly like comparing their
        # sitepath attributes pairwise, and works on Python 2 and 3 alike.
        for path in sorted(self.paths(), key=lambda f: f.sitepath):
            self.uri(path)

        self.xml += '</urlset>'
        self.xml += ''

    def paths(self, robots=True):
        """Yield one file record per distinct site path below ``self.root``.

        Every record produced by the inventory is yielded once.  In addition
        a synthetic record is yielded under '/inside' for each '.cgi' file
        and under '/list' for each directory, carrying the source's mtime.
        NOTE(review): what the '/inside' and '/list' URL spaces serve is not
        visible from this file.

        Args:
            robots: unused by this method; kept so existing callers that
                pass it keep working.
        """
        seen = set()
        opt = inventory.Options(['abspath', 'mtime', 'sitepath', 'type'])
        # Note: this widens the module-wide inventory options in place.
        inventory.options |= opt

        for f in inventory.inventory(self.root):
            # A CGI script that mentions PATH_INFO is listed with a
            # trailing slash on its site path.
            if f.relpath.endswith('.cgi'):
                if match(f.abspath, 'PATH_INFO'):
                    f.sitepath += '/'
            # if (not f.sitepath.endswith('/') and
            #     metagen.exists(f.sitepath + '/')):
            #    f.sitepath += '/'
            # elif (f.sitepath.endswith('/') and
            #       metagen.exists(f.sitepath.rstrip('/'))):
            #    f.sitepath = f.sitepath.rstrip('/')

            if f.sitepath not in seen:
                yield f
                seen.add(f.sitepath)

            if f.relpath.endswith('.cgi'):
                fi = inventory.FileInfo()
                fi.sitepath = '/inside' + f.sitepath
                fi.mtime = f.mtime
                if fi.sitepath not in seen:
                    yield fi
                    seen.add(fi.sitepath)

            if f.type == inventory.Directory:
                fd = inventory.FileInfo()
                fd.sitepath = '/list' + f.sitepath
                fd.mtime = f.mtime
                if fd.sitepath not in seen:
                    yield fd
                    seen.add(fd.sitepath)

    def uri(self, f):
        """Append the <url> entry for the file record *f* to ``self.xml``.

        Args:
            f: record with a ``sitepath`` attribute (path below the host).

        Raises:
            ValueError: if the absolute URL is longer than 2048 characters.
        """
        xmlhost = encode(self.host)

        def absolute(path):
            result = 'http://' + xmlhost + encode(path)
            if len(result) > 2048:
                raise ValueError("Value too large: %s" % result)
            return result

        # Build the URL before emitting anything so that an over-long URL
        # does not leave an unclosed <url> element in the buffer.
        loc = absolute(f.sitepath)

        self.xml += '  <url>'
        # @@ changefreq
        # @@ lastmod: not emitted yet.  The disabled draft used
        #    time.strftime('%Y-%m-%dT%M:%H:%S+00:00', unixtime); when it is
        #    enabled the time fields should read %H:%M:%S, and strftime()
        #    needs a struct_time such as time.gmtime(f.mtime).
        self.xml += '    <loc>' + loc + '</loc>'

        priorities = {
            '/encnorm/': 0.6,
            '/sbp/': 0.7,
        }
        if f.sitepath in priorities:
            self.xml += '    <priority>%s</priority>' % priorities[f.sitepath]
        # @@ meta stuff
        # @@ priority
        self.xml += '  </url>'

    @staticmethod
    def _utf8(line):
        """Return *line* as UTF-8 bytes, raising if a byte string is invalid."""
        if isinstance(line, bytes):
            line.decode('utf-8')  # validation only; raises UnicodeDecodeError
            return line
        return line.encode('utf-8')

    def output(self):
        """Validate the document and print it to standard output.

        The whole document is checked before anything is written, so a
        failure never leaves partial output behind.

        Raises:
            ValueError: if the document, counting one line separator per
                line, is 10485760 bytes or larger.
            UnicodeDecodeError: if a byte-string line is not valid UTF-8.
        """
        lineseplen = len(os.linesep)
        size = 0
        for line in self.xml:
            size += len(self._utf8(line)) + lineseplen
        if size >= 10485760:  # the figure in the documentation
            raise ValueError("Output exceeds 10MB")

        for line in self.xml:
            sys.stdout.write(line + '\n')
def main():
    """Crawl the current directory as inamidst.com and print its sitemap."""
    sitemap = Sitemap('inamidst.com', '.')
    sitemap.crawl()
    sitemap.output()


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()