#! /usr/bin/env python

'''

This program ("Wiki Snapper") generates a set of web pages (on local
disk) in proper W3C Tech Report (TR) format from a set of MediaWiki
pages.

It is guided by RDF data about the group, the particular snapshot you
want to take, and the pages to be included in that snapshot.


To Do:
    - move all the used fields to dictionary & _default_ form
    - document all the fields
    - make CSS dynamic instead of hard-coded
    - SOME way to have multiple convergent series of documents...
       (version numbers other than dates?   multi-level snaps?)
    - link checking
    - anchor checking
    - html validation?
    
    - generate the wiki table about the documents?
    - stuff to help with handling of review comments
    
Maybe do my own:
    - toc
    - section numbering


ENTITIES:

http://www.dpawson.co.uk/xsl/sect2/nbsp.html
  NBSP:  1. &#160;    2. &#xA0;


             

'''
__version__ = "$Revision: 1.21 $"


import urlparse
import urllib2 
import time
import datetime
import tempfile
import os
import os.path
import sys
import re
from optparse import OptionParser
from xml.dom.minidom import parseString
import htmlentitydefs

import debugtools
from debugtools import debug
import webdata

WSNAP = "http://www.w3.org/2008/03/wikisnapper#"
webdata_ns = WSNAP
 
doctypes = {
    "ED": "Editor's Draft",
    "WD": "Working Draft",
    }

def run():

    parser = OptionParser(usage="%prog [options] load_uri snapshot_uri",
                          version=__version__)
    parser.set_defaults(verbose=True)
    parser.set_defaults(ed_only=True)
    parser.add_option("-q", "--quiet",
                      action="store_false", dest="verbose", 
                      help="don't print status messages to stdout")
    parser.add_option("--real",
                      action="store_false", dest="ed_only", 
                      help="Allow more the Editors Draft")
    parser.add_option("-D", "--debug",
                      action="append", dest="debugTags", 
                      help="turn on debugging for some subsystem (try 'all')")

    global options
    (options, args) = parser.parse_args()

    if options.debugTags:
        debugtools.tags.update(options.debugTags)
    verbose = options.verbose

    if len(args) == 1:
        snapshot = webdata.toPython(args[0])
    elif len(args) == 2:
        webdata.load(args[0])
        snapshot = webdata.toPython(args[1], autoload=False)
    else:
        parser.print_help()
        sys.exit(1)

    # Sanity-checking RDF data is hard....
    assert isinstance(snapshot, Snapshot)
    
    # set an uplink, to save us on parameter passing
    # (it's an inverse of the .pages property)
    for p in snapshot.pages+snapshot.oldPages:
        p.snapshot = snapshot

    for p in snapshot.oldPages:
        p.oldPage = True

    for p in snapshot.pages:
        p.fetch()

    for p in snapshot.oldPages:
        # just to get refTitle, inLineCredit
        p.fetch()

    for p in snapshot.pages:
        p.generate()

    width = 0
    for p in snapshot.pages:
        l = len(p.thisVersion)
        if l > width: width = l
    print
    print "Documents ready: (but may need CVS check-in)"
    for p in snapshot.pages:
        print " %-*s  %s" % (width, p.thisVersion, p["shortTitle"])
        

class NoData (RuntimeError):
    pass

class Person:

    # name, optional url, optional affiliation

    @property
    def line(self):

        try:
            return self.msg  # msg is a hack over-ride
        except AttributeError:
            pass
        
        try:
            result = u'''<a href="%s">%s</a>''' % (self.url, self.name)
        except AttributeError:
            result = self.name
        try:
            result += u", "+self.affiliation
        except AttributeError:
            pass

        # stupid hack way to do this --- should be general!
        result = re.sub("&", "&amp;", result)
        
        return result

    def loadFromXML(self, node):
        """node is a Dom node (such as a p, span, dd, or td) which
        contains a description of a Person, in some standard format.

        eg: <dd> <a href="http://www.cs.man.ac.uk/~bmotik/" class="external text" title="http://www.cs.man.ac.uk/~bmotik/">Boris Motik</a>, Oxford University
</dd>


        Should use bi-directional grammar templates!
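
        A doctest sketch of the <a>-plus-affiliation form (hypothetical
        person; run by the doctest hook at the bottom of this file):

        >>> dd = parseString('<dd><a href="http://example.org/">Jo Smith</a>, Example Corp.</dd>').documentElement
        >>> p = Person(); p.loadFromXML(dd)
        >>> (p.name, p.url, p.affiliation)
        (u'Jo Smith', u'http://example.org/', u'Example Corp.')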

        """

        node.normalize()
        here = node.firstChild
        if here.nodeType == here.TEXT_NODE and here.data.strip() == "":
            here = here.nextSibling
        if here is None:
            raise NoData
        if here.nodeType == here.ELEMENT_NODE and here.tagName == "a":
            self.url = here.getAttribute("href")
            self.name = here.firstChild.data
            here = here.nextSibling

            if here.nodeType == here.TEXT_NODE:
                text = here.data
                assert text[0:1] == ","
                text = text[1:]
                text = text.strip()
                self.affiliation = text
            else:
                raise RuntimeError, "expected text affiliation after name link"
        elif here.nodeType == here.TEXT_NODE:
            text = here.data.strip()
            (self.name, self.affiliation) = text.split(", ", 1)
        elif here.nodeType == here.ELEMENT_NODE and here.tagName == "i":
            self.msg = "".join([x.toxml() for x in node.childNodes])
        else:
            raise Exception, "This doesn't look like a person: %s" % `here`
        
        
def parseCredits(node, page):
    """If the xml can be matched as an author/editor/contributor list,
    then do it, and add them to Page & return true; if it can't return false.
    """

    node = getDivById(node, 'editors')
    for e in node.getElementsByTagName('dl'):
        for (dt, dd) in parseDL(e):
            dtText = nodeContents(dt).strip().lower()
            if dtText.startswith('author'):
                attr = 'authors'
            elif dtText.startswith('editor'):
                attr = 'editors'
            elif dtText.startswith('contrib'):
                attr = 'contributors'
            else:
                raise RuntimeError, 'editor section with bad dt'
            #print 'Got a DT, "%s" ==>   %s' % (dtText,attr)
            ddText = nodeContents(dd)
            #print '   ...DD, "%s"' %ddText
            person = Person()
            try:
                person.loadFromXML(dd)
            except NoData:
                continue
            getattr(page, attr).append(person)
            
def parseDL(node):
    '''Given a dl node, yield successive (dt, dd) pairs; a dt is
    paired with every dd that follows it.
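
    A small doctest sketch:

    >>> dl = parseString('<dl><dt>Editor</dt><dd>A</dd><dd>B</dd></dl>').documentElement
    >>> [(dt.firstChild.data, dd.firstChild.data) for (dt, dd) in parseDL(dl)]
    [(u'Editor', u'A'), (u'Editor', u'B')]
    '''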
    
    dt = None
    for child in node.childNodes:
        try:
            if child.tagName == 'dt':
                dt = child
        except AttributeError:
            pass
        try:
            if child.tagName == 'dd':
                yield (dt, child)
        except AttributeError:
            pass

def expandEntities(text):
    """
    Given some HTML text, expand any of the standard HTML entities --
    BUT NOT the XML ones in it.

    This is a total hack -- our HTML parser should be doing this for
    us, but right now I can do this faster.  :-(  :-(
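
    For example (XML-core entities like &amp; pass through unchanged;
    &nbsp; becomes the two UTF-8 bytes of U+00A0):

    >>> expandEntities("&amp; &lt;")
    '&amp; &lt;'
    >>> len(expandEntities("&nbsp;"))
    2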

    """
    result = re.sub("&(\w+);", expandEntity, text)
    return result

def expandEntity(match):
    entity = match.group(1)
    if (entity == "lt" or entity == "gt" or
        entity == "quot" or entity == "amp" or entity == "apos"):
        return "&"+entity+";"
    try:
        expansion = htmlentitydefs.name2codepoint[entity]
        return unichr(expansion).encode('utf-8')
    except KeyError:
        raise Exception, "undefined entity %s" % `entity`
    
def entify(text):
    """
    Given the whole HTML page, convert the code points to named HTML entities,
    and maybe convert some &quot; entities back into quote characters.

    This is needed to make the diff-to-wiki not so ugly.
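
    A small doctest sketch (stderr chatter aside):

    >>> entify(unichr(233) + u' < "x"')
    '&eacute; < "x"'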
    """

    print >>sys.stderr, "Entifying..."
    out = []
    for x in text:
        try:
            entity = htmlentitydefs.codepoint2name[ord(x)]
            if (entity == "lt" or entity == "amp" or entity == "gt" or entity=="quot"):
                out.append(x)
            else:
                #print >>sys.stderr, "Got one!  %s" % entity
                out.append("&"+entity+";")
        except KeyError:
            out.append(x)

    text = (u''.join(out)).encode( "utf-8" )

    # @@@@ OMG this is so wrong -- but I can't think of an easy way to
    # do it correctly, and I need something working right now.   The
    # problem is that we should use quote characters instead of &quot;
    # UNLESS we're inside an attribute value.   So, this really needs
    # to be done by the XML serializer.
    #
    # henry suggests using rxp  http://www.ltg.ed.ac.uk/~richard/rxp.ht
    text = re.sub("&quot;", '"', text)
    return text

    
class Page:
    """The thing being snapped....

    Implements the dictionary interface on its own properties,
    so that we can do "%" substitution with it, like:
        "The page %(title)s from %(date)s" % page
    http://docs.python.org/ref/sequence-types.html

    
    """

    # we're just using __slots__ so webdata won't poke at all our
    # @properties, which don't always have well-defined values.
    
    __slots__ = [ "shortname",
                  "statusExtra",
                  "wikiPageName",
                  "doctypecode",
                  "pleaseComment",
                  "snapshot",
                  "authors",
                  "editors",
                  "contributors",
                  "pversioncode",
                  "pdateCode",
                  "previousVersion",
                  "commentOn",
                  "tidy",

                  # overrides for oldPage
                  "pubdate",
                  "oldPage",
                  "fragmentBase",

                  ]
    
    def __init__(self):

        for x in self.__slots__:
            setattr(self, x, None)
            
        self.statusExtra=""
        self.commentOn="these Working Drafts"

        # not deserialized, at present...
        # (tablejumper should be able to get a lot of this stuff, ideally)
        self.authors = []
        self.editors = []
        self.contributors = []
        self.stylesheets = None

    def fetch(self):
        print
        print 'Fetching', self.wikiPageURL
        
        tidy = (self.tidy is not None and self.tidy.lower() == "yes")
                
        (self.fetched_page_text, self.raw_page_text) = fetch_page(self.wikiPageURL,tidy)

        save = open("/tmp/wikisnapper-save-posttidy-%s.html" % self.wikiPageName, "w")
        save.write(self.fetched_page_text)
        save.close()
        dom = parseString(self.fetched_page_text)
        self.domTree = dom

        save = open("/tmp/wikisnapper-save-postxml-%s.html" % self.wikiPageName, "w")
        save.write(self.domTree.toxml('utf-8'))
        save.close()

        self.shortTitle = nodeContents(getDivById(dom, 'short-title'))
        # self.mediumTitle = 'OWL 1.1 '+self.shortTitle
        self.refTitle = self.snapshot.titlePrefix+self.shortTitle
        # self.fullTitle = nodeContents(getDivById(dom, 'full-title'))

        self.htmlTitle = self.snapshot.titlePrefix+self.shortTitle
        self.h1Title = self.snapshot.titlePrefix+"<br />"+self.shortTitle

        self.abstract = nodeContents(getDivById(dom, 'abstract'))
        # self.editors = nodeContents(getDivById(dom, 'editors'))

        self.docbodyNode = getDivById(dom, 'docbody')
        
        parseCredits(dom, self)
        handle_editsections(self.docbodyNode)

        self.findPreviousVersion()

    def handle_links(self):
        for e in self.docbodyNode.getElementsByTagName('a'):
            href = e.getAttribute("href")
            if href.startswith("/"):
                uri = "http://www.w3.org"+href
            else:
                uri = href

            if uri.startswith("http://www.w3.org/2005/rules/wiki/"):
                href = uri[len("http://www.w3.org"):]
                
                try:
                    (main, fragment) = uri.split("#")
                    fragment = "#" + fragment
                except ValueError:
                    main = uri
                    fragment = ""

                if main == self.wikiPageURL:
                    debug("links", 'rewriting self link to %s' % fragment)
                    e.setAttribute("href", fragment)
                    continue

                self.matchedPage = None
                if self.wikiDocMatch(main):
                    if self.matchedPage.fragmentBase:
                        base = self.matchedPage.fragmentBase
                    else:
                        base = self.matchedPage.thisVersion
                    new_uri = base + fragment
                    e.setAttribute("href", new_uri)
                    debug("links", 'rewriting self %s to %s' % (
                        href, new_uri))
                    continue

                e.setAttribute("href", uri)
                if href.find('&action=edit') > -1:
                    continue   # just leave edit links in the text, hidden
                if href.find('/Image:') > -1:
                    continue   # just leave wiki image links in the text ?!?! @@@
                print >>sys.stderr, '? Wiki link to: ', href


    def generate(self):

        print >>sys.stderr, ""

        filename = self.directory+"/Overview.html"
        try:
            os.mkdir(self.directory)
        except OSError:
            pass

        # has to be after directory is made
        handle_images(self.docbodyNode, self.directory)

        # has to be after directory is made
        self.stylesheets = handle_css(self.domTree, self.directory)

        # has to be after all fetches are done
        self.handle_corefs(self.domTree)

        self.handle_review_comments()

        self.unlink_example_links()
        
        # should be after handle_corefs so it doesn't warn about them
        self.handle_links()

        self.docbody = nodeContents(self.docbodyNode)

        out = tr(self)

        # stupid hack to work around the fact that in HTML you can't
        # actually repeat the xmlns declaration for XHTML -- which
        # rdf:Literals give us.
        out = re.sub('<div xmlns="http://www.w3.org/1999/xhtml">', '<div>', out)

        # even stupider hack to work around a bug in rif-ucr javascript
        out = re.sub('<b class="syntax-head"/>',
                     '<b class="syntax-head"></b>', out)
        
        f = open(filename, "w")

        #str = out.encode("utf-8")
        str = out
        str = entify(str)
        f.write(str)
        f.close()
        print 'wrote', filename

        f = open(self.directory+"/wiki.html", "w")
        f.write(self.raw_page_text)
        f.close()
        cmd = ("hdiff %s/wiki.html %s/Overview.html > %s/diff-from-wiki.html"%
               (self.directory, self.directory, self.directory))
        os.system(cmd)

        cmd = ("prince -s print.css -o %s/all.pdf %s/Overview.html" %
               (self.directory, self.directory))
        os.system(cmd)

        if self.hasPrevious:
            debug("defaults", "self.pversioncode=%s" % self.pversioncode)
            debug("defaults", "self['pversioncode']=%s" % self['pversioncode'])
            cmd = 'hdiff %(pversioncode)s/Overview.html %(versioncode)s/Overview.html > %(diffFile)s.html' % self
            print 'producing %(diffFile)s.html' % self
            os.system(cmd)

        
    # make this look like a dictionary, so we can just do template
    # substitution with it.   And inherit from snapshot and
    # snapshot.group as we do it. 
    # http://docs.python.org/ref/sequence-types.html
    #
    # To override, you must (a) define the attribute, and (b) give it a value
    # that is not None.   You can override it with "", however.
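    # For example, page["name"] typically resolves to
    # snapshot.group.name, and page["doctype"] falls back to the
    # _default_doctype() method (both are fields the tr() template
    # actually uses).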
    def __getitem__(self, key):

        for obj in (self, self.snapshot, self.snapshot.group):
            value = None
            try:
                value = getattr(obj, key) 
            except AttributeError:
                pass
            if value is not None:
                return value

        for obj in (self, self.snapshot, self.snapshot.group):
            value = None
            try:
                value = getattr(obj, "_default_"+key)()
            except AttributeError:
                pass
            if value is not None:
                return value
        
        # at one point I allowed an "eval" here, but now that some
        # content comes from the web, that's a security problem.
        
        raise KeyError, key
    def __setitem__(self, key, value):
        raise RuntimeError, 'not mutable this way'
    def __iter__(self):
        return self.iterkeys()
    def iterkeys(self):
        # this isn't really right....  e.g. it omits the @property and
        # _default_ keys that __getitem__ can serve
        for x in self.__dict__.iterkeys():
            if not x.startswith("_"):
                yield x

    def wikiDocMatch(self, href):
        for page in self.snapshot.pages:
            if page.wikiPageURL.endswith(href):
                self.matchedPage = page
                return True
        for page in self.snapshot.oldPages:
            if page.wikiPageURL.endswith(href):
                self.matchedPage = page
                return True
        return False

    def handle_review_comments(self):
        for e in self.docbodyNode.getElementsByTagName('div'):
            try:
                this_class = e.attributes["class"].value
            except KeyError:
                continue
            if this_class == "note":
                e.parentNode.removeChild(e)

        for e in self.docbodyNode.getElementsByTagName('div'):
            try:
                this_class = e.attributes["class"].value
            except KeyError:
                continue
            if this_class == "review":
                e.parentNode.removeChild(e)
                
        
        #for a in tree_search(self.docbodyNode, is_review_comment):
        #    while a.hasChildNodes():
        #        child = a.firstChild
        #        a.removeChild(child)
        #        child.unlink()

    example_uri_pat = re.compile(r"""http://[^/]*example\..*""")
    
    def unlink_example_links(self):
        """
        Wikis turn URLs into links, more than we want.

        Undo that for URLs whose host contains "example."  (e.g.
        http://example.org/), when the link text equals the URL.
        """
        for e in self.docbodyNode.getElementsByTagName('a'):
            try:
                link = e.getAttribute('href')
                text = e.firstChild.data
            except AttributeError:   # no href, or no text child
                continue
            if link == text:
                m = self.example_uri_pat.match(link)
                if m:
                    parent = e.parentNode
                    new = e.firstChild
                    parent.insertBefore(new, e)
                    parent.removeChild(e)
            
    def handle_corefs(self, xml):
        self.matchedPage = None
        for dd in tree_search(xml, is_ref, self):
            while dd.hasChildNodes():
                child = dd.firstChild
                dd.removeChild(child)
                child.unlink()
                
            newStuff = toDom(self.matchedPage.referenceText)
            dd.appendChild(newStuff)
            self.matchedPage = None

    def generatePastDates(self):
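        # Yield every YYYYMMDD code from 2006-01-01 up to (but not
        # including) this page's own date code.  Impossible dates like
        # 20070230 are yielded too; they're harmless because callers
        # only test them against the filesystem.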
        for pyear in xrange(2006, 3000):
            for pmonth in xrange(1, 13):
                for pday in xrange(1,32):
                    pdateCode = "%04d%02d%02d" % (pyear, pmonth, pday)
                    #debug("previous", "%s > %s" % (`pdateCode`, `self.dateCode`))
                    if pdateCode >= self._dateCode:
                        return
                    yield pdateCode

    def findPreviousVersion(self):

        # allow override -- if pdateCode is provided, we don't scan
        if self.pdateCode:
            self.hasPrevious = True
            return
        
        # this is perhaps not very efficient, but I don't want to
        # hard-code the filename format in two places, and have to
        # parse it apart.  Dumb?  I dunno.
        found = None
        self.hasPrevious = True
        debug("previous(", "looking for previous versions")
        for pdateCode in self.generatePastDates():
            self.pdateCode = pdateCode
            code = self["pversioncode"]
            debug("previous", "past date directory ", code)
            if os.path.exists(code):
                debug("previous", "Found a previous version", code)
                found = pdateCode
        if not found:
            self.hasPrevious = False
        self.pdateCode = found
        debug("previous)",
              "hasPrevious = %s, code=%s", (self.hasPrevious,found))

    def _default_doctype(self):
        return doctypes[self.maxdoctypecode]

    @property
    def maxdoctypecode(self):
        global options
        if options.ed_only and not self.oldPage:
            return "ED"
        else:
            return self.doctypecode
    
    @property
    def _dateCode(self):
        return dateCode(self["pubdate"])
        #return self.snapshot._dateCode

    @property
    def _dateName(self):
        return dateName(self["pubdate"])
        #return self.snapshot._dateName

    @property
    def commentsDue(self):
        return self.snapshot.commentsDue

    @property
    def hasPrevious(self):
        return hasattr(self, 'pdateCode') or hasattr(self.snapshot, 'pdateCode')
    
    @property
    def pdateCode(self):
        return self.snapshot.pdateCode

    @property
    def directory(self):
        return self.versioncode
    
    @property
    def versioncode(self):
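        # e.g. "WD-rif-bld-20080415" for doctypecode "WD", group
        # shortnamePrefix "rif-", shortname "bld" (illustrative values)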
        return (self.maxdoctypecode+"-"+
                self.snapshot.group.shortnamePrefix+
                self.shortname+"-"+
                self._dateCode)

    def _default_pversioncode(self):
        assert self.hasPrevious
        return (self.maxdoctypecode+"-"+
                self.snapshot.group.shortnamePrefix+
                self.shortname+"-"+
                self.pdateCode)

    @property
    def versionPrefixURL(self):
        #  can't easily do relative URLs, because we tell the
        #  users these URLs...
        if self.maxdoctypecode == 'ED':
            return self.snapshot.group.draftsURL
        if self.maxdoctypecode == 'WD':
            return "http://www.w3.org/TR/2008/"
        raise RuntimeError, 'dont know how to make version URL'        
        
    @property
    def thisVersion(self):
        return self.versionPrefixURL+self.versioncode+"/"


    def _default_previousVersion(self):
        return self.versionPrefixURL+self["pversioncode"]+"/"

    @property
    def latestURL(self):
        if self.maxdoctypecode == 'ED':
            prefix = self.versionPrefixURL
        elif self.maxdoctypecode == 'WD':
            prefix = "http://www.w3.org/TR/"   # not quite the same!
        else:
            raise RuntimeError, 'dont know how to make latest URL'
        return prefix+self.snapshot.group.shortnamePrefix+self.shortname+"/"

    @property
    def wikiPageURL(self):
        return self.snapshot.group.wikiURL+self.wikiPageName
    
    @property
    def inLineCredit(self):
        if self.authors:
            return ", ".join([person.name for person in self.authors])
        if len(self.editors) == 1:
            ed = "editor"
        else:
            ed = "eds"
        return ", ".join([person.name for person in self.editors]) + ", " + ed


    def _default_css(self):
        result = """<style type="text/css">
   .editsection { display: none; }
</style>
"""
        for stylesheet in self.stylesheets:
            result+=('''<link rel="stylesheet" type="text/css" href="%s" />\n'''
                     % stylesheet)
        result += """<link rel="stylesheet" type="text/css" href="http://www.w3.org/StyleSheets/TR/W3C-%(maxdoctypecode)s" />\n""" % self

        return result

    def _default_javascript(self):
        return """
<script type="text/javascript">/*<![CDATA[*/
/*
	Written by Jonathan Snook, http://www.snook.ca/jonathan
	Add-ons by Robert Nyman, http://www.robertnyman.com
	Author says "The credit comment is all it takes, no license. Go crazy with it!:-)"
	From http://www.robertnyman.com/2005/11/07/the-ultimate-getelementsbyclassname/
*/

function getElementsByClassName(oElm, strTagName, oClassNames){
	var arrElements = (! (! (strTagName == "*") || ! (oElm.all)))? oElm.all : oElm.getElementsByTagName(strTagName);
	var arrReturnElements = new Array();
	var arrRegExpClassNames = new Array();
	if(typeof oClassNames == "object"){
		for(var i=0; !(i>=oClassNames.length); i++){ /*>*/
			arrRegExpClassNames.push(new RegExp("(^|\\s)" + oClassNames[i].replace(/\-/g, "\\-") + "(\\s|$)"));
		}
	}
	else{
		arrRegExpClassNames.push(new RegExp("(^|\\s)" + oClassNames.replace(/\-/g, "\\-") + "(\\s|$)"));
	}
	var oElement;
	var bMatchesAll;
	for(var j=0; !(j>=arrElements.length); j++){ /*>*/
		oElement = arrElements[j];
		bMatchesAll = true;
		for(var k=0; !(k>=arrRegExpClassNames.length); k++){ /*>*/
			if(!arrRegExpClassNames[k].test(oElement.className)){
				bMatchesAll = false;
				break;
			}
		}
		if(bMatchesAll){
			arrReturnElements.push(oElement);
		}
	}
	return (arrReturnElements)
}

function set_display_by_class(el, cls, newValue) {
   var e = getElementsByClassName(document, el, cls);
   if (e != null) {
      for (var i=0; !(i>=e.length); i++) {
        e[i].style.display = newValue;
      }
   }
}

function set_display_by_id(id, newValue) {
   var e = document.getElementById(id);
   if (e != null) {
     e.style.display = newValue;
   }
}
/*]]>*/
</script>

        
        """

    @property
    def credits(self):
        result = u"<dl>"
        for tag in ('Author', 'Editor', 'Contributor'):
            attr = tag.lower()+"s"
            people = getattr(self, attr)
            if people:
                if len(people) > 1:
                    result += u"<dt>%ss:</dt>" % tag
                else:
                    result += u"<dt>%s:</dt>" % tag
                for person in people:
                    result += u"<dd>"+person.line+u"</dd>\n"
        result += u"</dl>"
        return result

    @property
    def formats(self):
        return '''<p>This document is also available in these non-normative formats: <a href="all.pdf">PDF version</a>.</p>'''

    @property
    def referenceText(self):
        """Return standard text that one can use in a References
        section to refer to this version of this document
        """
        result = u"""<span><cite><a class="external text" href="%(thisVersion)s">%(refTitle)s</a></cite> %(inLineCredit)s. W3C %(doctype)s, %(_dateName)s, <a class="external free" href="%(thisVersion)s">%(thisVersion)s</a>.  Latest version available at <a class="external free" href="%(latestURL)s">%(latestURL)s</a>.</span>""" % self
        return result

    @property
    def numDocs(self):
        return len(self.snapshot.pages)

    @property
    def allDocs(self):
        result = "<ol>\n"
        for page in self.snapshot.pages:
            if page is self:
                thisOne = "(this document)"
            else:
                thisOne = ""
            result += u"""<li><a href="%(thisVersion)s">%(shortTitle)s</a> """%page + thisOne+"</li>\n"
        result += "</ol>\n"
        return result

    @property
    def sotdSOD(self):
        if len(self.snapshot.pages) < 2:
            return ""
        
        result = """
<h4 class="no-toc no-num" id="related">Set of Documents</h4>

<p>This document is being published as one of a set of %(numDocs)s documents: </p>
%(allDocs)s
"""%self
        return result

    @property
    def pleaseCommentText(self):
        if self.pleaseComment is not None:
            
            return self.pleaseComment
        else:
            return """
        <h4 class="no-toc no-num" id="please">Please Comment By %(commentsDue)s</h4>

    <p>The <a class="http" href="%(homeURL)s"
    >%(name)s</a> seeks
    public feedback on %(commentOn)s.  Please send your
    comments to <a class="mailto"
    href="mailto:%(commentsList)s@w3.org"
    shape="rect">%(commentsList)s@w3.org</a> (<a class="http"
    href="http://lists.w3.org/Archives/Public/%(commentsList)s/"
    shape="rect">public archive</a>).  If possible, please offer
    specific changes to the text that would address your
    concern.  You may also wish to check the <a
    href="%(wikiPageURL)s">Wiki
    Version</a> of this document for internal-review comments and changes being
    drafted which may address your concerns. </p>""" % self 
    
    @property
    def diffURL(self):
        return self.versionPrefixURL+self.diffFile

    @property
    def diffFile(self):
        return self.directory+"/diff-since-"+self.pdateCode
    
    @property
    def diffText(self):
        if self.maxdoctypecode == 'ED':
            return u' (<a href="%s">color-coded diff</a>)' % self.diffURL
        else:
            return ""

    @property
    def labelForLatest(self):
        if self.maxdoctypecode == 'ED':
            return "Latest editor's draft"
        else:
            return "Latest version"
        
    @property
    def versionStuff(self):
        result = u"""
<dl>
<dt>This version:</dt>
<dd><a id="this-version-url" href="%(thisVersion)s">%(thisVersion)s</a></dd>
<dt>%(labelForLatest)s:</dt>
<dd><a href="%(latestURL)s">%(latestURL)s</a></dd>
""" % self
        if self.hasPrevious:
           result += u"""
<dt>Previous version:</dt>
<dd><a href="%(previousVersion)s">%(previousVersion)s</a>%(diffText)s</dd>
""" % self
        result += u"</dl>"
        return result


def toDom(text):
    newDoc = parseString(text.encode("utf-8"))
    # deep copy and set the ownerDocument? 
    return newDoc.documentElement

def fetch_page(URL, tidy):

    t0 = time.time()
    stream = urllib2.urlopen(URL)
    text = stream.read()
    save = open("/tmp/wikisnapper-save-pretidy.html", "w")
    save.write(text)
    save.close()
    if tidy:
        print 'running tidy on it'
        to_tidy = tempfile.NamedTemporaryFile()
        to_tidy.write(text)
        to_tidy.flush()
        from_tidy = tempfile.NamedTemporaryFile("r")
        tidy = "/usr/bin/tidy"
        tidy_error_sink = "/tmp/tidy.errors"
        cmd = ("""%s -quiet -asxml -utf8 -f %s < %s > %s""" %
                   (tidy, tidy_error_sink, to_tidy.name, from_tidy.name))
        #cmd = ("""%s -numeric -quiet -asxml -utf8 -f %s < %s > %s""" %
        #           (tidy, tidy_error_sink, to_tidy.name, from_tidy.name))
        code = os.system(cmd)
        to_tidy.close()
        xml = from_tidy.read()
    else:
        print 'not running tidy...'
        xml = text

    # minidom is not handling &nbsp entities; hack around it.
    xml = expandEntities(xml)
    t1 = time.time()
    print >>sys.stderr, len(text),"bytes copied from web in",(t1-t0),"seconds."

    return (xml, text)

def filterThrough(commandLine, inputText):
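    """Pipe inputText through a shell command and return its stdout.

    Assuming a POSIX tr(1) on the PATH:

    >>> filterThrough("tr a-z A-Z", "hello")
    'HELLO'
    """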
    toFilter = tempfile.NamedTemporaryFile()
    toFilter.write(inputText)
    toFilter.flush()
    fromFilter = tempfile.NamedTemporaryFile("r")
    cmd = ("""%s < %s > %s""" %
           (commandLine, toFilter.name, fromFilter.name))
    code = os.system(cmd)
    toFilter.close()
    result = fromFilter.read()
    fromFilter.close()
    return result


def nodeContents(xml):
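    """Serialize the children of a node (not the node itself) to unicode.

    >>> nodeContents(parseString('<div>Hi <b>there</b></div>').documentElement)
    u'Hi <b>there</b>'
    """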
    result = []
    for e in xml.childNodes:
        result.append(e.toxml())
    result = "".join(result)
    try:
        result = unicode(result)
    except UnicodeDecodeError:
        print >>sys.stderr, 'Unicode error in string', result
    assert type(result) == unicode
    return result
        
def getDivById(xml, id):
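    '''Return the first div (or span) with the given id attribute.

    >>> getDivById(parseString('<body><div id="abstract">A</div></body>'), 'abstract').toxml()
    u'<div id="abstract">A</div>'
    '''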
    for e in xml.getElementsByTagName('div'):
        try:
            this_id = e.attributes["id"].value
        except KeyError:
            continue
        if this_id == id:
            return e
    for e in xml.getElementsByTagName('span'):     # DUMB DUMB DUMB!
        try:
            this_id = e.attributes["id"].value
        except KeyError:
            continue
        if this_id == id:
            return e
                
    raise RuntimeError, '''Cannot find a div with id="'''+id+'".'


def handle_editsections(xml):
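    '''Strip MediaWiki "[edit]" section links from the tree.

    >>> doc = parseString('<div>Hi<span class="editsection">[edit]</span></div>')
    >>> handle_editsections(doc)
    >>> doc.documentElement.toxml()
    u'<div>Hi</div>'
    '''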
    for e in xml.getElementsByTagName('span'):
        cls = e.getAttribute("class")
        if cls == "editsection":
            e.parentNode.removeChild(e)
            e.unlink()

def tree_search(tree, condition, extra=None):
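    '''Yield nodes where condition(node, extra) holds, without
    descending below a matching node.

    >>> doc = parseString('<r><x/><y><x/></y></r>')
    >>> [e.tagName for e in tree_search(doc, lambda n, _: getattr(n, 'tagName', None) == 'x')]
    [u'x', u'x']
    '''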
    if condition(tree, extra):
        yield tree
    else:
        for child in tree.childNodes:
            for result in tree_search(child, condition, extra):
                yield result

def is_ref(node, extra):
    if hasattr(node, 'tagName') and node.tagName == 'dd':
        #print 'found a dd', node
        for links in tree_search(node, links_to_docs, extra):
            return True
    return False

def links_to_docs(node, extra):
    try:
        href = node.getAttribute('href')
        if href == "":
            return False
        return extra.wikiDocMatch(href)
    except AttributeError:
        return False

def handle_images(xml, directory):
    print >>sys.stderr, 'Downloading any embedded images.'
    t0 = time.time()
    imageCount = 0
    byteCount = 0
    for e in xml.getElementsByTagName('img'):
        src = e.getAttribute("src")
        # print >>sys.stderr, 'Image: ', src
        if src.startswith("http://"):
            fullsrc = src
        else:
            fullsrc = "http://www.w3.org"+src
        key = fullsrc[fullsrc.rindex("/")+1:]
        e.setAttribute("src", key)
        filename = directory+"/"+key
        #print >>sys.stderr, 'Downloading image\n   ', fullsrc, '-> ', filename

        inStream = urllib2.urlopen(fullsrc)
        text = inStream.read()
        byteCount+=len(text)
        imageCount+=1
        inStream.close()
        outStream = open(filename, "w")
        outStream.write(text)
        outStream.close()

        # if image is in an <a ...>, then remove the <a ...>
        parent = e.parentNode
        grandparent = parent.parentNode
        if parent.tagName == "a":
            # move e up to grandparent
            parent.removeChild(e)
            grandparent.insertBefore(e, parent)
            # remove parent
            grandparent.removeChild(parent)
            parent.unlink()

    t1 = time.time()
    print >>sys.stderr, ("%d images, %d bytes copied from web in %f seconds."%
                         (imageCount, byteCount, t1-t0))
    
css_url_pat = re.compile(r""".*/wiki/index.php\?title=CSS/(.*)&action=raw&ctype=text/css""")

def handle_css(xml, directory):

    '''Find all the CSS links which are NOT to mediawiki skins.
    Download them, and return their names for use in the header.'''

    
    print >>sys.stderr, 'Downloading custom style sheets...'
    t0 = time.time()
    fileCount = 0
    byteCount = 0
    result = []
    for e in xml.getElementsByTagName('link'):
        rel = e.getAttribute("rel")
        #print >>sys.stderr, 'found link', rel
        if rel != "stylesheet":
            continue
        src = e.getAttribute("href")
        if src.find("wiki/skins/common") >= 0:
            continue
        if src.startswith("http://"):
            fullsrc = src
        else:
            fullsrc = "http://www.w3.org"+src
        #print >>sys.stderr, 'abs form', fullsrc

        match = css_url_pat.match(fullsrc)
        
        if match:
            #print >>sys.stderr, 'pat-match on key'
            key = match.group(1)
        else:
            #print >>sys.stderr, 'Not matched: "%s"' % fullsrc
            continue
        print >>sys.stderr, 'key', key
        result.append(key)
        #e.setAttribute("href", key)
        filename = directory+"/"+key
        print >>sys.stderr, 'Downloading style sheet\n   ', fullsrc, '-> ', filename

        inStream = urllib2.urlopen(fullsrc)
        text = inStream.read()
        byteCount+=len(text)
        fileCount+=1
        inStream.close()
        outStream = open(filename, "w")
        outStream.write(text)
        outStream.close()


    t1 = time.time()
    #print >>sys.stderr, ("%d files, %d bytes copied from web in %f seconds."%
    #                     (fileCount, byteCount, t1-t0))
    return result


def tr(fields): 

    text1 = u"""<?xml version="1.0" encoding="UTF-8"?><!--*- nxml -*-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
       "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
  <title>%(htmlTitle)s</title>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  %(css)s
  %(javascript)s
</head>
<body>

<div class="head">
<a href="http://www.w3.org/"><img alt="W3C" height="48"
src="http://www.w3.org/Icons/w3c_home" width="72"/></a><h1 style="clear:both" id="title">%(h1Title)s</h1>

<h2 id="W3C-doctype">W3C %(doctype)s %(_dateName)s</h2>

%(versionStuff)s

%(credits)s

%(formats)s

<hr />

<p class="copyright"><a href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> &copy; 2008 <a href="http://www.w3.org/"><acronym title="World Wide Web Consortium">W3C</acronym></a><sup>&reg;</sup> (<a href="http://www.csail.mit.edu/"><acronym title="Massachusetts Institute of Technology">MIT</acronym></a>, <a href="http://www.ercim.org/"><acronym title="European Research Consortium for Informatics and Mathematics">ERCIM</acronym></a>, <a href="http://www.keio.ac.jp/">Keio</a>), All Rights Reserved. W3C <a href="http://www.w3.org/Consortium/Legal/ipr-notice#Legal_Disclaimer">liability</a>, <a href="http://www.w3.org/Consortium/Legal/ipr-notice#W3C_Trademarks">trademark</a> and <a href="http://www.w3.org/Consortium/Legal/copyright-documents">document use</a> rules apply.</p>

</div>
<hr/>
<h2><a id="abstract" name="abstract">Abstract</a></h2>

<div>
%(abstract)s
</div>

<h2 class="no-toc no-num">
<a id="status" name="status">Status of this Document</a>
</h2>
    
<h4 class="no-toc no-num" id="may-be">May Be Superseded</h4>
    
<p><em>This section describes the status of this document at the time of its publication. Other documents may supersede this document. A list of current W3C publications and the latest revision of this technical report can be found in the <a href="http://www.w3.org/TR/">W3C technical reports index</a> at http://www.w3.org/TR/.</em></p>
    
%(sotdSOD)s    

%(snapshotStatusExtra)s

%(statusExtra)s

%(pleaseCommentText)s
    
<h4 class="no-toc no-num" id="no-endorsement">No Endorsement</h4>
    
<p><em>Publication as a Working Draft does not imply endorsement by the W3C Membership. This is a draft document and may be updated, replaced or obsoleted by other documents at any time. It is inappropriate to cite this document as other than work in progress.</em></p>
    
<h4 class="no-toc no-num" id="patents">Patents</h4>
    
<p><em>This document was produced by a group operating under the <a href="http://www.w3.org/Consortium/Patent-Policy-20040205/">5 February 2004 W3C Patent Policy</a>. W3C maintains a <a rel="disclosure" href="http://www.w3.org/2004/01/pp-impl/%(id)d/status">public list of any patent disclosures</a> made in connection with the deliverables of the group; that page also includes instructions for disclosing a patent. An individual who has actual knowledge of a patent which the individual believes contains <a href="http://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential">Essential Claim(s)</a> must disclose the information in accordance with <a href="http://www.w3.org/Consortium/Patent-Policy-20040205/#sec-Disclosure"> section 6 of the W3C Patent Policy</a>.</em></p>

<hr title="Separator After Status Section" />

%(docbody)s

</body>
</html>
""" % fields
    return text1
    #text2 = filterThrough('num', text1)
    #text3 = filterThrough('toc -l 2 -h 3 -x -t', text2)
    #print text3



class Group:

    def __init__(self):
        self.shortname = None
        self.id = None
        self.draftsURL = None
        self.wikiURL = None
        self.homeURL = None
        self.name = None
        self.commentsList = None
        self.shortnamePrefix = ""
        
class Snapshot:

    def __init__(self):
        self.pubdate=None
        self.commentsDue=None
        self.group=None
        self.snapshotStatusExtra=""    # along with page.statusExra
        self.pages=[]
        self.oldPages=[]
        self.titlePrefix=""

    @property
    def _dateCode(self):
        """Return  the 8-digit version of the pubdate"""
        return (self.pubdate[0:4]+
                self.pubdate[5:7]+
                self.pubdate[8:10])

    @property
    def _dateName(self):
        """Return pubdate in form "2 January 2020"
        """
        # could use datetime.today ?
        date = datetime.date(int(self.pubdate[0:4]),
                             int(self.pubdate[5:7]),
                             int(self.pubdate[8:10]))
        return date.strftime("%d %B %Y")
        
    


def dateCode(pubdate):
        """Return  the 8-digit version of the pubdate"""
        return (pubdate[0:4]+
                pubdate[5:7]+
                pubdate[8:10])

def dateName(pubdate):
        """Return pubdate in form "2 January 2020"
        """
        # could use datetime.today ?
        date = datetime.date(int(pubdate[0:4]),
                             int(pubdate[5:7]),
                             int(pubdate[8:10]))
        return date.strftime("%d %B %Y")
        
if __name__ == "__main__":
    import doctest, sys
    doctest.testmod(sys.modules[__name__])

    run()
