#! /usr/bin/env python
'''

A module and command-line tool for fetching text pages from a
MediaWiki installation.

Its advantage over urllib2.urlopen is caching: very MediaWiki-aware
caching, driven by the wiki's Recent Changes feed.

   x>> text = load(pageURL)

(The "x>>" prefix keeps doctest from running this example, since it
needs a live wiki.)

'''
__version__ = "$Revision: 1.2 $"

import sys
import urllib
import urllib2
import os.path
import time
import calendar
import xml.dom.minidom

# other packages
import debugtools
from debugtools import debug
import xmlextras

class Change:

    def __init__(self, wikiURL):
        self.wikiURL = wikiURL
        self.id = None      # really the URL of the diff
        self.title = None   # pageName
        self.link = None    # value is in the href attribute, so nodeContents misses it
        self.updated = None
        self.summary = None
        self.author = None  # wrapped in <name> element

    @property
    def time(self):

        # eg 2008-12-01T13:08:09Z
        when = time.strptime(self.updated, "%Y-%m-%dT%H:%M:%SZ")
        return calendar.timegm(when)

    @property
    def age(self):
        now = time.time()
        return now-self.time

    @property
    def url(self):
        return self.wikiURL + "/" + self.title
    
    def __repr__(self):
        return "Change("+`self.__dict__`+")"


def get_recent_changes(wikiURL, count=1):
    """

    Return an array of Change objects, the wiki's Recent Changes page.

    x>> for x in get_recent_changes('http://www.w3.org/2007/OWL/wiki/', 2):
    x..    print x.age, `x`
    
    """
    # time wget 'http://www.w3.org/2007/OWL/wiki/index.php?title=Special:Recentchanges&limit=1&feed=atom' -O last-change
    debug("cache", 'Fetching Recent Changes')
    url = (wikiURL+
           '/index.php?title=Special:Recentchanges&limit=%d&feed=atom'%
           count)
    stream = urllib2.urlopen(url)
    text = stream.read()
    stream.close()
    dom = xml.dom.minidom.parseString(text)

    changes = []
    for child in dom.documentElement.childNodes:
        if child.nodeType == child.ELEMENT_NODE:
            if child.tagName == "entry":
                entry = Change(wikiURL)
                for field in child.childNodes:
                    if field.nodeType == field.ELEMENT_NODE:
                        setattr(entry, field.tagName,
                                xmlextras.nodeContents(field))
                changes.append(entry)
    return changes
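
# For reference, an <entry> in the MediaWiki Atom feed looks roughly
# like this (a hedged sketch; exact markup varies across MediaWiki
# versions, and the URLs here are illustrative).  Each child element
# is copied onto the matching Change attribute above:
#
#   <entry>
#     <id>http://example.org/wiki/index.php?title=SomePage&diff=42&oldid=41</id>
#     <title>SomePage</title>
#     <link rel="alternate" href="http://example.org/wiki/SomePage"/>
#     <updated>2008-12-01T13:08:09Z</updated>
#     <summary>the edit comment</summary>
#     <author><name>SomeUser</name></author>
#   </entry>
#
# Note the caveats in Change.__init__: the diff URL arrives in <id>,
# and <link> carries its value in the href attribute, which
# nodeContents does not see.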

        
def print_changes(wikiURL):
    changes = get_recent_changes(wikiURL, 4)
    for change in changes:
        print change.age, change.url

def safe(text):
    """ Quote anything in the text which isn't good to be in a filename """
    return urllib.quote(text, safe="")
    
class CachedWiki:

    def __init__(self, wikiURL, max_stale=1, cache_directory_root=""):
        self.wikiURL = wikiURL
        self.max_stale = max_stale
        self.recent_changes = []
        self.last_poll_began = 0
        self.last_poll_ended = 0
        self.cache_directory = cache_directory_root+"/"+safe(wikiURL)
        if not os.path.exists(self.cache_directory):
            os.makedirs(self.cache_directory)

    def poll_if_necessary(self):

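        # Two throttles: never hit the server more than about twice
        # per second, and only re-poll once the previous poll is more
        # than max_stale seconds old.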
        start = time.time()

        if start - self.last_poll_ended < 0.5:
            return   # don't be too rough on server
        
        age = start - self.last_poll_began
        if age > self.max_stale:
            self.recent_changes = get_recent_changes(self.wikiURL, 20)
            self.last_poll_began = start
            self.last_poll_ended = time.time()

    def page_filename(self, pageName):
        return self.cache_directory + "/" + safe(pageName)

    def page_read_from_cache(self, pageName):
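        # Cache file format (see page_fetch): the first line is the
        # fetch time in float seconds since the epoch; everything
        # after that first newline is the page text.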
        filename = self.page_filename(pageName)
        debug("cache", "Looking in file", filename)
        f = open(filename, "r")
        full_text = f.read()
        f.close()
        debug("cache", "read text", filename)
        return full_text.split("\n", 1)

    def page_fetch(self, pageName):
        debug("cache", 'Fetching page', pageName)
        stream = urllib2.urlopen(self.wikiURL+"/"+pageName)
        text = stream.read()
        stream.close()

        filename = self.page_filename(pageName)
        f = open(filename, "w")
        debug("cache", "Cached to", filename)
        now = time.time()
        f.write(str(now))
        f.write("\n")
        f.write(text)
        f.close()
        
        return text

    def page_load(self, pageName):

        try:
            (line1, text) = self.page_read_from_cache(pageName)
            page_cached = float(line1)
        except (IOError, ValueError):
            # cache entry missing or corrupt; fetch fresh
            return self.page_fetch(pageName)

        now = time.time()
        
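        # If the cached copy is younger than max_stale seconds, trust
        # it without even polling Recent Changes.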
        if page_cached + self.max_stale > now:
            return text

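        # The cache is stale by the clock, so ask Recent Changes
        # whether the page actually changed since we cached it.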
        if self.page_might_have_changed_after(pageName, page_cached):
            return self.page_fetch(pageName)

        return text

    def page_might_have_changed_after(self, pageName, mark):

        self.poll_if_necessary()

        for rc in self.recent_changes:
            if rc.title == pageName:
                page_last_changed = rc.time
                if page_last_changed > mark:
                    return True
                else:
                    return False
        if not self.recent_changes:
            return True   # empty feed; assume the worst
        rc_start = self.recent_changes[-1].time
        if rc_start < mark:
            # rc goes back before mark, so the page must not have
            # changed after mark (or it would have been caught above)
            return False
        else:
            # Yeah, it might have changed, for all we know.
            #
            # If we ended up here often, we could raise the count on
            # fetching Recent Changes, but ... I don't think that's likely.
            return True 
        

wikis = { }
def load(pageURL, max_stale=1, cache_directory_root="/tmp/wiki_cache"):

    # doesn't work if pageName has a slash in it; can't use this interface
    (wikiURL, pageName) = pageURL.rsplit("/", 1)
    
    try:
        wiki = wikis[wikiURL]
    except KeyError:
        wiki = CachedWiki(wikiURL, max_stale, cache_directory_root)
        wikis[wikiURL] = wiki

    return wiki.page_load(pageName)
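
# A hedged usage sketch (the URL is illustrative, not a real wiki):
#
#    x>> text = load("http://example.org/wiki/Main_Page", max_stale=60)
#
# Repeated calls within max_stale seconds come straight from the
# on-disk cache; after that, the Recent Changes feed decides whether
# a re-fetch is needed.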

            
def main():
    from optparse import OptionParser
    
    parser = OptionParser(usage="%prog [options] sourceURL",
                          version=__version__)
    parser.set_defaults(verbose=True)
    parser.set_defaults(max_stale=1.0)
    parser.set_defaults(cache_directory="/tmp/wiki_cache")
    parser.add_option("-q", "--quiet",
                      action="store_false", dest="verbose", 
                      help="don't print status messages to stdout")
    parser.add_option("--max-stale", type="float",
                      action="store", dest="max_stale", 
                      help="How stale to allow the contents to be (in seconds)")
    parser.add_option("--cache-directory", 
                      action="store", dest="cache_directory", 
                      help="Root of cache directory")
    parser.add_option("-D", "--debug",
                      action="append", dest="debugTags", 
                      help="turn on debugging for some subsystem (try 'all')")

    global options
    (options, args) = parser.parse_args()

    if options.debugTags:
        debugtools.tags.update(options.debugTags)
    verbose = options.verbose

    if len(args) == 1:
        text = load(args[0],
                    max_stale=options.max_stale,
                    cache_directory_root=options.cache_directory)
        print len(text), "bytes of page text"
    else:
        parser.print_help()
        sys.exit(1)

if __name__ == "__main__":
    import doctest
    doctest.testmod(sys.modules[__name__])

    main()
