#! /usr/bin/env python
'''

A module and command-line tool for fetching text pages from a
MediaWiki installation.

Its advantage over urllib2.urlopen is caching: very MediaWiki-aware
caching, driven by the wiki's Recent Changes feed.

   x>> text = load(pageURL)

(The "x>>" prefix keeps doctest from running this example, since it
needs a live wiki.)

'''
__version__ = "$Revision: 1.2 $"

import sys
import urllib
import urllib2
import os.path
import time
import calendar
import xml.dom.minidom

# other packages
import debugtools
from debugtools import debug
import xmlextras

class Change:

    def __init__(self, wikiURL):
        self.wikiURL = wikiURL
        self.id = None      # really the URL of the diff
        self.title = None   # pageName
        self.link = None    # value is in the href attribute, so nodeContents misses it
        self.updated = None
        self.summary = None
        self.author = None  # wrapped in <name> element

    @property
    def time(self):

        # eg 2008-12-01T13:08:09Z
        when = time.strptime(self.updated, "%Y-%m-%dT%H:%M:%SZ")
        return calendar.timegm(when)

    @property
    def age(self):
        now = time.time()
        return now-self.time

    @property
    def url(self):
        return self.wikiURL + "/" + self.title
    
    def __repr__(self):
        return "Change("+`self.__dict__`+")"


def get_recent_changes(wikiURL, count=1):
    """

    Return an array of Change objects, the wiki's Recent Changes page.

    x>> for x in get_recent_changes('http://www.w3.org/2007/OWL/wiki/', 2):
    x..    print x.age, `x`
    
    """
    # time wget 'http://www.w3.org/2007/OWL/wiki/index.php?title=Special:Recentchanges&limit=1&feed=atom' -O last-change
    debug("cache", 'Fetching Recent Changes')
    url = (wikiURL+
           '/index.php?title=Special:Recentchanges&limit=%d&feed=atom'%
           count)
    stream = urllib2.urlopen(url)
    text = stream.read()
    stream.close()
    dom = xml.dom.minidom.parseString(text)

    changes = []
    for child in dom.documentElement.childNodes:
        if child.nodeType == child.ELEMENT_NODE:
            if child.tagName == "entry":
                entry = Change(wikiURL)
                for field in child.childNodes:
                    if field.nodeType == field.ELEMENT_NODE:
                        setattr(entry, field.tagName,
                                xmlextras.nodeContents(field))
                changes.append(entry)
    return changes
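
# For reference, an <entry> in the MediaWiki Atom feed looks roughly
# like this (a hedged sketch; exact markup varies across MediaWiki
# versions, and the URLs here are illustrative).  Each child element
# is copied onto the matching Change attribute above:
#
#   <entry>
#     <id>http://example.org/wiki/index.php?title=SomePage&diff=42&oldid=41</id>
#     <title>SomePage</title>
#     <link rel="alternate" href="http://example.org/wiki/SomePage"/>
#     <updated>2008-12-01T13:08:09Z</updated>
#     <summary>the edit comment</summary>
#     <author><name>SomeUser</name></author>
#   </entry>
#
# Note the caveats in Change.__init__: the diff URL arrives in <id>,
# and <link> carries its value in the href attribute, which
# nodeContents does not see.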

        
def print_changes(wikiURL):
    changes = get_recent_changes(wikiURL, 4)
    for change in changes:
        print change.age, change.url

def safe(text):
    """ Quote anything in the text which isn't good to be in a filename """
    return urllib.quote(text, safe="")
    
class CachedWiki:

    def __init__(self, wikiURL, max_stale=1, cache_directory_root=""):
        self.wikiURL = wikiURL
        self.max_stale = max_stale
        self.recent_changes = []
        self.last_poll_began = 0
        self.last_poll_ended = 0
        self.cache_directory = cache_directory_root+"/"+safe(wikiURL)
        if not os.path.exists(self.cache_directory):
            os.makedirs(self.cache_directory)

    def poll_if_necessary(self):

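        # Two throttles: never hit the server more than about twice
        # per second, and only re-poll once the previous poll is more
        # than max_stale seconds old.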
        start = time.time()

        if start - self.last_poll_ended < 0.5:
            return   # don't be too rough on server
        
        age = start - self.last_poll_began
        if age > self.max_stale:
            self.recent_changes = get_recent_changes(self.wikiURL, 20)
            self.last_poll_began = start
            self.last_poll_ended = time.time()

    def page_filename(self, pageName):
        return self.cache_directory + "/" + safe(pageName)

    def page_read_from_cache(self, pageName):
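        # Cache file format (see page_fetch): the first line is the
        # fetch time in float seconds since the epoch; everything
        # after that first newline is the page text.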
        filename = self.page_filename(pageName)
        debug("cache", "Looking in file", filename)
        f = open(filename, "r")
        full_text = f.read()
        f.close()
        debug("cache", "read text", filename)
        return full_text.split("\n", 1)

    def page_fetch(self, pageName):
        debug("cache", 'Fetching page', pageName)
        stream = urllib2.urlopen(self.wikiURL+"/"+pageName)
        text = stream.read()
        stream.close()

        filename = self.page_filename(pageName)
        f = open(filename, "w")
        debug("cache", "Cached to", filename)
        now = time.time()
        f.write(str(now))
        f.write("\n")
        f.write(text)
        f.close()
        
        return text

    def page_load(self, pageName):

        try:
            (line1, text) = self.page_read_from_cache(pageName)
            page_cached = float(line1)
        except (IOError, ValueError):
            # cache entry missing or corrupt; fetch fresh
            return self.page_fetch(pageName)

        now = time.time()
        
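        # If the cached copy is younger than max_stale seconds, trust
        # it without even polling Recent Changes.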
        if page_cached + self.max_stale > now:
            return text

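        # The cache is stale by the clock, so ask Recent Changes
        # whether the page actually changed since we cached it.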
        if self.page_might_have_changed_after(pageName, page_cached):
            return self.page_fetch(pageName)

        return text

    def page_might_have_changed_after(self, pageName, mark):

        self.poll_if_necessary()

        for rc in self.recent_changes:
            if rc.title == pageName:
                page_last_changed = rc.time
                if page_last_changed > mark:
                    return True
                else:
                    return False
        if not self.recent_changes:
            return True   # empty feed; assume the worst
        rc_start = self.recent_changes[-1].time
        if rc_start < mark:
            # rc goes back before mark, so the page must not have
            # changed after mark (or it would have been caught above)
            return False
        else:
            # Yeah, it might have changed, for all we know.
            #
            # If we ended up here often, we could raise the count on
            # fetching Recent Changes, but ... I don't think that's likely.
            return True 
        

wikis = { }
def load(pageURL, max_stale=1, cache_directory_root="/tmp/wiki_cache"):

    # doesn't work if pageName has a slash in it; can't use this interface
    (wikiURL, pageName) = pageURL.rsplit("/", 1)
    
    try:
        wiki = wikis[wikiURL]
    except KeyError:
        wiki = CachedWiki(wikiURL, max_stale, cache_directory_root)
        wikis[wikiURL] = wiki

    return wiki.page_load(pageName)
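
# A hedged usage sketch (the URL is illustrative, not a real wiki):
#
#    x>> text = load("http://example.org/wiki/Main_Page", max_stale=60)
#
# Repeated calls within max_stale seconds come straight from the
# on-disk cache; after that, the Recent Changes feed decides whether
# a re-fetch is needed.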

            
def main():
    from optparse import OptionParser
    
    parser = OptionParser(usage="%prog [options] sourceURL",
                          version=__version__)
    parser.set_defaults(verbose=True)
    parser.set_defaults(max_stale=1.0)
    parser.set_defaults(cache_directory="/tmp/wiki_cache")
    parser.add_option("-q", "--quiet",
                      action="store_false", dest="verbose", 
                      help="don't print status messages to stdout")
    parser.add_option("--max-stale", type="float",
                      action="store", dest="max_stale", 
                      help="How stale to allow the contents to be (in seconds)")
    parser.add_option("--cache-directory", 
                      action="store", dest="cache_directory", 
                      help="Root of cache directory")
    parser.add_option("-D", "--debug",
                      action="append", dest="debugTags", 
                      help="turn on debugging for some subsystem (try 'all')")

    global options
    (options, args) = parser.parse_args()

    if options.debugTags:
        debugtools.tags.update(options.debugTags)
    verbose = options.verbose

    if len(args) == 1:
        text = load(args[0],
                    max_stale=options.max_stale,
                    cache_directory_root=options.cache_directory)
        print len(text), "bytes of page text"
    else:
        parser.print_help()
        sys.exit(1)

if __name__ == "__main__":
    import doctest
    doctest.testmod(sys.modules[__name__])

    main()
