#! /usr/bin/env python
"""
sandro@ubuhebe:~/gd$ python wiki-dump.py -p 'UCR.*' -d dump/ http://www.w3.org/2005/rules/wg/wiki

Core/Positive_Conditions is a good (difficult) test page

TODO:
    - better css
    - handle attachments
    - load into MediaWiki???  with leading {{Migrated|...}} template
          convert {{{ ... }}} to <pre> ... </pre>
    - link to latest, which is dup'd [ or same-time image??? ]
    - find out how to get the who & why of an edit (action=info)
    - group changes made in a 24 hour period, to reduce size of dump, etc

"""
__version__ = "$Revision: 1.6 $"

import sys
import urllib
import urllib2
import re
import os
import time
from optparse import OptionParser

import html

def trace(s):
    if options.verbose:
        print s

cacheDir = "./wiki-dump-cache"

def fetch(url):
    """Download the contents over the web.    But we super-cache
    everything, since wikis aren't so good about giving us good
    cacheability information.

    wiki-dump-cache/quoted-URL
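
    For example (hypothetical URL), http://example.org/FrontPage?action=raw
    would be cached as
    wiki-dump-cache/http%3A%2F%2Fexample.org%2FFrontPage%3Faction%3Draw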
    """
    if not os.path.exists(cacheDir):
        os.makedirs(cacheDir)
    filename = cacheDir+"/"+urllib.quote(url, "")
    if not os.path.exists(filename):
        trace('   fetching '+url)
        urllib.urlretrieve(url, filename)
        time.sleep(0.2)
    return open(filename).read()

class NoSuchVersion (RuntimeError):
    pass
class CantParseTemplate (RuntimeError):
    pass
class BadWiki (RuntimeError):
    pass

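# Patterns for scraping the wiki's rendered HTML (it looks like MoinMoin
# output): the page body div, the "no such page" notice, TitleIndex
# entries, the revision banner, and hyperlinks.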
contentPattern = re.compile(r'''.*<div id="page" lang=".*?" dir="ltr">(.*)</div>\s*<div id="footer">.*''', re.DOTALL)

noSuchPage = re.compile(r'''.*<strong>This page does not exist yet. You can create a new empty page, or use one of the page templates.</strong>.*''', re.DOTALL)
pageListPattern = re.compile(r'''<br><a href="(.*?)">.*?</a>(<a href=".*?action=AttachFile.*)?''')

revisionDatePattern = re.compile(r'''<div id="message">\s*<p>(<strong>This page redirects to page "(.*?)"</strong><br>)?<strong>Revision (\d+) as of (.*)</strong><br></p>''', re.DOTALL)

linkPattern = re.compile(r'''<a (class=".*?" |)href="(.*?)">(.*?)</a>''', re.DOTALL)

class Page:

    def __init__(self, dumper, name, hasAttachments):
        self.dumper = dumper
        self.name = name
        self.hasAttachments = hasAttachments
        self.versions = [None]
        self.rawVersions = [None]
        self.versionDates = [None]
        self.maxVersion = None

    def doMirror(self):
        trace('Fetching all versions of '+self.name)
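        # Probe revisions 1, 2, 3, ... until the wiki reports that the
        # revision does not exist (or the request 404s); the last one
        # found is maxVersion.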
        for version in xrange(1,99999):
            try:
                trace('   trying version '+str(version))
                self.getPageVersion(version)
                self.maxVersion = version
            except NoSuchVersion:
                break
            except urllib2.HTTPError, err:
                if err.code == 404:
                    break
                else:
                    raise BadWiki('HTTP error '+str(err))
        trace('   maxVersion '+str(self.maxVersion))
        if self.maxVersion is None:
            # no revision could be fetched at all; nothing to dump
            return
        for version in xrange(1, self.maxVersion+1):
            self.dumpVersion(version)
        self.writeVersionPage()

        # save some memory... we won't need these any more....
        self.versions = None
        self.rawVersions = None

    def getPageVersion(self, version):
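        # Fetch the rendered HTML for this revision (action=recall),
        # scrape out the page body, then fetch the raw wiki markup
        # (action=raw) and record the revision date from the page header.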
        url = (self.dumper.hostpart+
               self.dumper.pathpart+
               self.name+
               "?action=recall&rev="+str(version))
        pagetext = fetch(url)
        snapfile = '/tmp/wiki-dumper-trace'
        tmp = open(snapfile, 'w')
        # trace('   snapshot to '+snapfile)
        tmp.write('trace data for '+url+"\n")
        tmp.write(pagetext)
        tmp.close()

        m = contentPattern.match(pagetext)
        if not m:
            raise BadWiki('page content pattern failed')
        content = m.group(1)

        m = noSuchPage.match(content)
        if m:
            raise NoSuchVersion

        content = self.relink(content)

        assert len(self.versions) == version
        self.versions.append(content)

        url = (self.dumper.hostpart+
               self.dumper.pathpart+
               self.name+
               "?action=raw&rev="+str(version))
        rawtext = fetch(url)
        self.rawVersions.append(rawtext)

        m = revisionDatePattern.search(pagetext)
        assert m
        # group 2 is the redirection information
        assert m.group(3) == str(version)
        self.versionDates.append(m.group(4))

    def relink(self, text):
        """Find all the internal links in this html and change them"""
        return linkPattern.sub(self.relink2, text)

    def relink2(self, match):
        link = match.group(2)
        if link.startswith(self.dumper.pathpart):
            fromWikiRoot = link[len(self.dumper.pathpart):]
            toWikiRoot = "../" * (1+self.name.count("/"))
            link = toWikiRoot + fromWikiRoot + "/history.html"
            # instead of history, should we link to the latest
            # (but what is it called?  latest.html as html redirect?)
        ## catch "?" links....
        return '<a '+match.group(1)+'href="'+link+'">'+match.group(3)+'</a>'
        
    def pagedir(self):
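        # Each page gets its own directory under the dump directory:
        # <directory><PageName>/rev_<N>.html for each revision, plus
        # <directory><PageName>/history.html listing all the revisions.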
        dirname = self.dumper.directory+self.name
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        return dirname+"/"
        
    def dumpVersion(self, version):

        trace('dumping '+self.name+' version '+str(version))
        filename = self.pagedir()+"rev_"+str(version)+".html"

        d = html.Document()
        d.head << html.title(self.name + ", revision "+str(version))
        d.head << html.stylelink('http://www.w3.org/2007/04/wiki/modern/css/common.css')
        d.head << html.stylelink('http://www.w3.org/2007/04/wiki/modern/css/screen.css')
        d.head << html.stylelink('http://www.w3.org/2007/04/wiki/modern/css/print.css')
        d.head << html.stylelink('http://www.w3.org/2007/04/wiki/modern/css/projection.css')
        d << html.h1(self.name)
        d << html.h2('Revision '+str(version)+" of ",
                     html.a(str(self.maxVersion),
                            attrs={'href':'history.html'}),
                     ", "+self.versionDates[version])
        if version > 1:
            d << html.a('Previous Version', attrs={'href':"rev_"+str(version-1)+".html"})
        else:
            d << html.span('No Previous Versions')
        d << html.span(" ... ")
        if version < self.maxVersion:
            d << html.a('Next Version', attrs={'href':"rev_"+str(version+1)+".html"})
        else:
            d << html.span('No Later Version')
        d << html.Comment('HTML Code from the wiki begins here')
        d << html.Raw(self.versions[version])
        
        w = open(filename, "w")
        w.write(str(d))
        w.close()

    def writeVersionPage(self):
        d = html.Document()
        d.head << html.title(self.name + " revision history")
        d << html.h1(self.name + " Page Revision History")
        t = html.table()
        for version in xrange(1, self.maxVersion+1):
            t << html.tr(
                html.td(html.a("Revision "+str(version),
                               attrs={"href":"rev_"+str(version)+".html"})),
                html.td(self.versionDates[version])
                )
        d << t
        filename = self.pagedir()+"history.html"
        w = open(filename, "w")
        w.write(str(d))
        w.close()

        
urisplitpattern = re.compile(r'''(http://[^/]*)(/?.*?)(/?)$''')
def urisplit(uripat):
    """
    Take the HTTP URI of the wiki and split it into the host part and
    the wiki path part (always ending in a slash).
    This doesn't work for wiki page names which contain a slash -- we
    wouldn't be able to tell where the root of the wiki was.

    >>> urisplit('http://www.w3.org/2005/rules/wg/wiki/')
    ('http://www.w3.org', '/2005/rules/wg/wiki/')
    >>> urisplit('http://www.w3.org/wiki')
    ('http://www.w3.org', '/wiki/')
    >>> urisplit('http://example.org')
    ('http://example.org', '/')
    """
    m = urisplitpattern.match(uripat)
    host = m.group(1)
    path = m.group(2)+"/"
    if path == "//":
        path = "/"
    return (host, path)

class WikiDumper:

    def __init__(self, wikiAddress, pagePattern, directory):
        self.pages = []
        (self.hostpart, self.pathpart) = urisplit(wikiAddress)
        self.pageNamePattern = re.compile(pagePattern)
        self.directory = directory

    def run(self):
        self.getPageList()
        trace(str(len(self.pages))+' pages found.')
        matched = 0
        for page in self.pages:
            m = self.pageNamePattern.match(page.name)
            if m:
                matched += 1
        trace(str(matched)+' of those match the name pattern.')
        for page in self.pages:
            m = self.pageNamePattern.match(page.name)
            if m:
                page.doMirror()

    def getPageList(self):
        url = self.hostpart+self.pathpart+"TitleIndex"
        trace('Fetching list of pages from '+url)
        indextext = fetch(url)
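        # TitleIndex lists one page per line as "<br><a href=...>...</a>";
        # the pattern's optional second group matches a trailing
        # action=AttachFile link, which tells us the page has attachments.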
        for line in indextext.splitlines():
            m = pageListPattern.match(line)
            if m:
                link = m.group(1)
                assert link.startswith(self.pathpart)
                link = link[len(self.pathpart):]
                if m.group(2):
                    page = Page(self, link, hasAttachments=True)
                else:
                    page = Page(self, link, hasAttachments=False)
                self.pages.append(page)


templatePattern = re.compile('{{(.*?)}}', re.DOTALL)

def templates(text):
    """
    Returns an iterator over all the MediaWiki template instances
    found in the text.  For each instance, yields a tuple
    (TemplateName, ArgumentDict).

    Not expected to work in the presence of nested templates or of
    template arguments like {{{...}}}.

    >>> list(templates('foo'))
    []
    >>> list(templates('foo{{bar}}'))
    [('bar', {})]
    >>> list(templates('foo{{  a-z_x*x   }}'))
    [('a-z_x*x', {})]

    This slightly convoluted example is written so that it succeeds even
    if the dictionary's ordering changes.
    
    >>> result = list(templates('foo{{Person|name=Sandro Hawke| age = 43 }}'))
    >>> expected = [('Person', {'age': ' 43 ', 'name': 'Sandro Hawke'})]
    >>> result == expected
    True
    
    """
    for tm in templatePattern.finditer(text):
        parts = tm.group(1).split("|")
        template = parts[0].strip()
        d = {}
        for arg in parts[1:]:
            (key, value) = arg.split("=", 1)   # split on the first "=" only
            key = key.strip()
            d[key] = value
        yield template, d
    
if __name__ == "__main__":
    import doctest
    doctest.testmod()

    parser = OptionParser(usage="%prog [options] wiki-url",
                          version=__version__)
    parser.add_option("-d", "--directory", dest="directory", default='./',
                      help="name of directory to store dumped tree")
    parser.add_option("-p", "--page-pattern", dest="pagePattern", default='.*',
                      help="regexp of page names to process")
    parser.add_option("-q", "--quiet",
                      action="store_false", dest="verbose", default=True,
                      help="don't print status messages to stdout")

    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.print_help()
        sys.exit(1)

    wikidumper = WikiDumper(args[0], options.pagePattern, options.directory)
    wikidumper.run()
    
