#!/usr/bin/env python
"""
Wiki Weblog RSS 1.0 Feed Manipulator
Author: Sean B. Palmer, inamidst.com

Now with a certain-percent extra DOM Munging!
"""

# This module is written in the common subset of Python 2.6+ and Python 3.

import io
import os
import re
import time
import xml.dom.minidom

try:
    from urlparse import urljoin as urijoin  # Python 2
except ImportError:
    from urllib.parse import urljoin as urijoin  # Python 3

timenow = int(time.time())
__version__ = '2005-02-24 (loaded: %i)' % timenow

# Once the feed already holds this many entries, addItem drops the oldest
# one before inserting the new one.
MAX_ITEMS = 10

# Sentinel used to detect an exhausted iterator without catching exceptions.
_MISSING = object()


def _to_text(value):
    """Return value as text, decoding it as UTF-8 if it is a byte string.

    On Python 2 ``bytes`` is ``str``, so plain byte strings become
    ``unicode``; on Python 3 ``str`` values pass through untouched.
    """
    if isinstance(value, bytes):
        return value.decode('utf-8', 'replace')
    return value


def first(seq):
    """Return the first item of seq, or None if seq is empty."""
    for result in seq:
        return result
    return None


def the(seq):
    """Return the single item of seq.

    Raises:
        ValueError: if seq is empty or holds more than one item.
    """
    items = iter(seq)
    only = next(items, _MISSING)
    if only is _MISSING:
        raise ValueError("Sequence is empty")
    if next(items, _MISSING) is not _MISSING:
        raise ValueError("Sequence has more than one item")
    return only


def last(seq):
    """Return the last item of an indexable sequence."""
    return seq[-1]


def text(data):
    """Return a free-standing minidom Text node holding data."""
    node = xml.dom.minidom.Text()
    # Assign the payload directly: replaceWholeText() on a parentless node
    # tries to remove the node from its (missing) parent when data is empty.
    node.data = data
    return node


def blog(wikiname, description, basedir=None):
    """Add an entry for the wiki page wikiname to the feed kept in basedir.

    The feed is ``rss1.0.rss``, the page is ``<wikiname>.html`` and the
    optional plain-text log is ``rawarchive.txt``, all inside basedir
    (``./`` when basedir is None).
    """
    if basedir is None:
        basedir = './'
    rssfeed = os.path.join(basedir, 'rss1.0.rss')
    wikifile = os.path.join(basedir, ('%s.html' % wikiname))
    rawarchive = os.path.join(basedir, 'rawarchive.txt')
    blogFeed(rssfeed, wikifile, wikiname, description, rawarchive)


def blogFeed(rssfeed, wikifile, wikiname, description, rawarchive=None):
    """Prepend an item describing wikifile to the RSS 1.0 file rssfeed.

    The item link is wikiname resolved against the channel's ``rdf:about``
    URI. If rawarchive names an existing file, a tab-separated line
    (UTC timestamp, wikiname, title, description) is appended to it.
    """
    dom = xml.dom.minidom.parse(rssfeed)
    try:
        channel = the(dom.getElementsByTagName('channel'))
        baseURI = channel.getAttribute('rdf:about')
        link = urijoin(baseURI, wikiname)

        title, content = getTitleAndContent(wikifile)
        dom = addItem(dom, link, title, description, content)
        # Serialise before touching the feed file, so a failure here cannot
        # leave a truncated feed behind.
        serialized = dom.toxml('utf-8')
    finally:
        dom.unlink()

    with open(rssfeed, 'wb') as rss:
        rss.write(serialized)

    if rawarchive and os.path.isfile(rawarchive):
        now = time.strftime('%Y-%m-%d %H:%M', time.gmtime())
        fields = [now, wikiname, title, description]
        line = '\t'.join([_to_text(field) for field in fields]) + '\n'
        with io.open(rawarchive, 'a', encoding='utf-8') as f:
            f.write(line)


# NOTE(review): in the reviewed copy these two patterns had lost their tag
# delimiters (they read '(?ims)(.*?)' and '(?ims)(.*)', which can only ever
# capture the empty string and the entire file respectively). They are
# restored here to capture the contents of <title> and <body>; confirm
# against the upstream source.
r_title = re.compile(r'(?ims)<title[^>]*>(.*?)</title>')
r_body = re.compile(r'(?ims)<body[^>]*>(.*)</body>')
r_whitespace = re.compile(r'[ \t\r\n]+')


def getTitleAndContent(wikifile):
    """Return (title, content) extracted from the HTML file wikifile.

    title is the text matched by r_title with whitespace runs collapsed to
    single spaces and the ends stripped; content is the text matched by
    r_body, unmodified. The file is read as UTF-8, with undecodable bytes
    replaced.

    Raises:
        ValueError: if either pattern does not match the file.
    """
    with io.open(wikifile, encoding='utf-8', errors='replace') as f:
        html = f.read()

    title_match = r_title.search(html)
    if title_match is None:
        raise ValueError('No <title> element found in %s' % wikifile)
    body_match = r_body.search(html)
    if body_match is None:
        raise ValueError('No <body> element found in %s' % wikifile)

    title = r_whitespace.sub(' ', title_match.group(1)).strip()
    return title, body_match.group(1)


def _text_element(dom, tag, value):
    """Create a <tag> element owned by dom with value as its text child."""
    element = dom.createElement(tag)
    element.appendChild(dom.createTextNode(_to_text(value)))
    return element


def addItem(dom, link, title, description, content):
    """Insert a new item at the front of the RSS 1.0 document dom.

    The item URI is link plus ``#<unix time>``. The function:

    * rewrites the channel's ``dc:date`` (when exactly one exists) to the
      current UTC time;
    * drops the last ``rdf:li`` / ``item`` when MAX_ITEMS or more exist;
    * inserts a new ``rdf:li`` and ``item`` before the existing first ones
      (or appends them when there are none);
    * copies the channel's ``dc:creator`` into the item when exactly one
      exists.

    Returns the same dom object, modified in place.
    """
    uri = link + '#' + str(int(time.time()))
    date = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())

    rdf = the(dom.getElementsByTagName('rdf:RDF'))
    channel = the(rdf.getElementsByTagName('channel'))
    seq = the(channel.getElementsByTagName('rdf:Seq'))

    # Refresh the channel-level modification date.
    dateElements = channel.getElementsByTagName('dc:date')
    if len(dateElements) == 1:
        channelDate = the(dateElements)
        while channelDate.firstChild is not None:
            channelDate.removeChild(channelDate.firstChild)
        channelDate.appendChild(dom.createTextNode(date))

    # Table of contents: drop the oldest reference, then put the new one
    # first. The node list is a snapshot taken before any modification.
    seqChildElements = seq.getElementsByTagName('rdf:li')
    if len(seqChildElements) >= MAX_ITEMS:
        seq.removeChild(last(seqChildElements))

    li = dom.createElement('rdf:li')
    li.setAttribute('rdf:resource', uri)
    if seqChildElements:
        seq.insertBefore(li, first(seqChildElements))
    else:
        seq.appendChild(li)

    # Items: same policy as the table of contents above.
    itemElements = rdf.getElementsByTagName('item')
    if len(itemElements) >= MAX_ITEMS:
        rdf.removeChild(last(itemElements))

    item = dom.createElement('item')
    item.setAttribute('rdf:about', uri)
    item.appendChild(_text_element(dom, 'title', title))
    item.appendChild(_text_element(dom, 'description', description))
    item.appendChild(_text_element(dom, 'link', link))

    creatorElements = channel.getElementsByTagName('dc:creator')
    if len(creatorElements) == 1:
        item.appendChild(the(creatorElements).cloneNode(True))

    item.appendChild(_text_element(dom, 'dc:date', date))
    # The content is stored as a text node, so markup in it is escaped when
    # the document is serialised.
    item.appendChild(_text_element(dom, 'content:encoded', content))

    if itemElements:
        rdf.insertBefore(item, first(itemElements))
    else:
        rdf.appendChild(item)
    return dom


def test(args=None):
    """Blog a page named 'test' in the directory given as the first argument.

    args defaults to the command-line arguments; with no arguments the
    module docstring is printed instead.
    """
    import sys
    if args is None:
        args = sys.argv[1:]
    if args:
        basedir = args[0]
        blog('test', 'This is a test file.', basedir=basedir)
    else:
        print(__doc__)


# This is a manual smoke-test helper, not a unit test: keep test runners
# that collect functions by name from picking it up.
test.__test__ = False


if __name__ == "__main__":
    print(__doc__)