#! /usr/bin/env python
"""

Load data from the web, starting with (and focused on) the source Document.

This could be a method of Doc, but the logic gets messy enough that
it seems cleaner as its own module.

This could be big and slow, so we'd like to cache this information.
If the data is on a wiki or in w3c-cvs, we can actually be notified
when it's changed.  We could have a separate program to prepare all
this data for us...
    
"""
__version__ = "$Revision: 1.3 $"

# standard
import datetime
import subprocess
import sys
import os
import time
import tempfile
import urllib2
from xml.dom.minidom import parseString
import htmlentitydefs
import re

# other packages
import debugtools
from debugtools import debug
import wiki_cache

def fetch_page(URL, useTidy):
    """Fetch the page at URL (through the wiki_cache layer), optionally
    cleaning it up with HTML Tidy; return (xml_text, raw_text)."""
    t0 = time.time()
    #stream = urllib2.urlopen(URL)
    #text = stream.read()
    text = wiki_cache.load(URL)
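    # keep a pre-tidy snapshot around for debugging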
    save = open("/tmp/wikisnapper-save-pretidy.html", "w")
    save.write(text)
    save.close()
    if useTidy:
        print 'running tidy on it'
        to_tidy = tempfile.NamedTemporaryFile()
        to_tidy.write(text)
        to_tidy.flush()
        from_tidy = tempfile.NamedTemporaryFile("r")
        tidy = "/usr/bin/tidy"
        tidy_error_sink = "/tmp/tidy.errors"
        cmd = ("""%s -quiet -asxml -utf8 -f %s < %s > %s""" %
                   (tidy, tidy_error_sink, to_tidy.name, from_tidy.name))
        #cmd = ("""%s -numeric -quiet -asxml -utf8 -f %s < %s > %s""" %
        #           (tidy, tidy_error_sink, to_tidy.name, from_tidy.name))
        code = os.system(cmd)
        if code != 0:
            # tidy exits non-zero for warnings as well as errors
            print >>sys.stderr, 'tidy exit status:', code, '-- see', tidy_error_sink
        to_tidy.close()
        xml = from_tidy.read()
    else:
        print 'not running tidy...'
        xml = text

    # minidom is not handling &nbsp entities; hack around it.
    xml = expandEntities(xml)
    t1 = time.time()
    print >>sys.stderr, len(text), "bytes fetched (possibly from cache) in", (t1-t0), "seconds."

    return (xml, text)
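

# Illustrative use of fetch_page (hypothetical URL; wiki_cache.load does
# the actual fetch-and-cache):
#
#     (xml, raw) = fetch_page("http://example.org/wiki/SomePage", True)
#     dom = parseString(xml)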


def load(doc, options):
    """Fetch doc.sourceURL, parse the result into a DOM tree, and
    scrape document metadata (title, abstract, credits, ...) onto doc.
    """

    print
    print 'Fetching', doc.sourceURL

    useTidy = (doc.tidy is not None and doc.tidy.lower() == "yes")

    (doc.fetched_page_text, doc.raw_page_text) = fetch_page(doc.sourceURL, useTidy)

    save = open("/tmp/wikisnapper-save-posttidy-%s.html" % doc.pageName, "w")
    save.write(doc.fetched_page_text)
    save.close()
    dom = parseString(doc.fetched_page_text)
    doc.domTree = dom

    save = open("/tmp/wikisnapper-save-postxml-%s.html" % doc.pageName, "w")
    save.write(doc.domTree.toxml('utf-8'))
    save.close()

    div_scrape(doc)

    gather_nearby(doc)
    
    # @@@ findPreviousVersion(doc)
    #   we want previous snapshots, which are ... recorded in our local db?
    #   ... and previous TR, as well -- which is from ... w3.org?
    #   ... or manually configured?
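

# What load() expects on doc (inferred from this module; the real Doc
# class lives elsewhere):
#     doc.sourceURL   -- the page to fetch
#     doc.tidy        -- "yes" (any case) to run HTML Tidy first
#     doc.pageName    -- used in the /tmp snapshot filenames
#     doc.authors / doc.editors / doc.contributors
#                     -- lists; parseCredits() appends Person objects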


def div_scrape(doc):
    """
    Assume doc is loaded; set more attributes by looking at various ids.

    This is old style -- before open-data-tables.
    
    """
    dom = doc.domTree
    doc.shortTitle = nodeContents(getDivById(dom, 'short-title'))
    doc.abstract = nodeContents(getDivById(dom, 'abstract'))
    doc.docbodyNode = getDivById(dom, 'docbody')
    parseCredits(dom, doc)
    
    # (older variants, kept for reference:)
    # doc.mediumTitle = 'OWL 1.1 '+doc.shortTitle
    # doc.refTitle = doc.titlePrefix+doc.shortTitle
    # doc.fullTitle = nodeContents(getDivById(dom, 'full-title'))
    # doc.htmlTitle = doc.titlePrefix+doc.shortTitle
    # doc.h1Title = doc.titlePrefix+"<br />"+doc.shortTitle
    # doc.editors = nodeContents(getDivById(dom, 'editors'))

def gather_nearby(doc):
    #  guess at Group URL
    #  use ODT to read Group data, including Rounds
    #  ... or use RDF we gathered from there?
    raise NotImplementedError, 'gather_nearby: not implemented yet'

class NoData (Exception):
    pass

def parseCredits(node, doc):
    """Find the 'editors' div and parse its author/editor/contributor
    lists, appending a Person to the matching doc attribute for each
    dd entry that describes one.
    """

    node = getDivById(node, 'editors')
    for e in node.getElementsByTagName('dl'):
        for (dt, dd) in parseDL(e):
            dtText = nodeContents(dt).strip().lower()
            if dtText.startswith('author'):
                attr = 'authors'
            elif dtText.startswith('editor'):
                attr = 'editors'
            elif dtText.startswith('contrib'):
                attr = 'contributors'
            else:
                raise RuntimeError, 'editor section with unrecognized dt: %s' % `dtText`
            #print 'Got a DT, "%s" ==>   %s' % (dtText,attr)
            ddText = nodeContents(dd)
            #print '   ...DD, "%s"' %ddText
            person = Person()
            try:
                person.loadFromXML(dd)
            except NoData:
                continue
            getattr(doc, attr).append(person)
            
def parseDL(node):
    '''Given a dl node, yield successive (dt, dd) pairs.'''
    
    dt = None
    for child in node.childNodes:
        try:
            if child.tagName == 'dt':
                dt = child
        except AttributeError:
            pass
        try:
            if child.tagName == 'dd':
                yield (dt, child)
        except AttributeError:
            pass
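
# The pairing behaviour of parseDL, by (hypothetical) example: one dt can
# own several following dds, so
#     <dl><dt>Editors</dt> <dd>A</dd> <dd>B</dd></dl>
# yields (dt, dd-A) and then (dt, dd-B).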



def nodeContents(xml):
    """Return the serialized XML of a node's children, as a unicode
    string."""
    result = []
    for e in xml.childNodes:
        result.append(e.toxml())
    result = "".join(result)
    try:
        result = unicode(result)
    except UnicodeDecodeError:
        print >>sys.stderr, 'Unicode error in string', result
    assert type(result) == unicode
    return result
        
def getDivById(xml, id):
    """Return the first div -- or, failing that, span -- element with
    the given id attribute."""
    for tag in ('div', 'span'):     # spans too; crude, but some pages use them
        for e in xml.getElementsByTagName(tag):
            try:
                this_id = e.attributes["id"].value
            except KeyError:
                continue
            if this_id == id:
                return e
    raise RuntimeError, 'Cannot find a div with id="%s".' % id




def expandEntities(text):
    """
    Given some HTML text, expand any of the standard HTML entities --
    BUT NOT the XML ones in it.

    This is a total hack -- our HTML parser should be doing this for
    us, but right now I can do this faster.  :-(  :-(

    """
    result = re.sub(r"&(\w+);", expandEntity, text)
    return result
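
# By example: the five XML entities pass through untouched, other names
# known to htmlentitydefs become UTF-8 bytes, unknown names raise:
#
#     expandEntities("a &lt; b&nbsp;c")  ->  "a &lt; b\xc2\xa0c"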

def expandEntity(match):
    entity = match.group(1)
    if (entity == "lt" or entity == "gt" or
        entity == "quot" or entity == "amp" or entity == "apos"):
        return "&"+entity+";"
    try:
        expansion = htmlentitydefs.name2codepoint[entity]
        return unichr(expansion).encode('utf-8')
    except KeyError:
        raise Exception, "undefined entity %s" % `entity`
    
class Person:
    """One credited person: a name, an optional url, and an optional
    affiliation (plus msg as a free-form override; see loadFromXML)."""

    @property
    def line(self):

        try:
            return self.msg  # msg is a hack over-ride
        except AttributeError:
            pass
        
        try:
            result = u'''<a href="%s">%s</a>''' % (self.url, self.name)
        except AttributeError:
            result = self.name
        try:
            result += u", "+self.affiliation
        except AttributeError:
            pass

        # stupid hack way to do this --- should be general!
        result = re.sub("&", "&amp;", result)
        
        return result
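
    # Example of what .line produces (hypothetical data):
    #     u'<a href="http://example.org/~alice">Alice</a>, Example Corp'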

    def loadFromXML(self, node):
        """node is a Dom node (such as a p, span, dd, or td) which
        contains a description of a Person, in some standard format.

        eg: <dd> <a href="http://www.cs.man.ac.uk/~bmotik/" class="external text" title="http://www.cs.man.ac.uk/~bmotik/">Boris Motik</a>, Oxford University
</dd>


        Should use bi-directional grammar templates!

        """

        node.normalize()
        here = node.firstChild
        if (here is not None and here.nodeType == here.TEXT_NODE
            and here.data.strip() == ""):
            here = here.nextSibling
        if here is None:
            raise NoData
        if here.nodeType == here.ELEMENT_NODE and here.tagName == "a":
            self.url = here.getAttribute("href")
            self.name = here.firstChild.data
            here = here.nextSibling
            if here is None:
                pass     # a linked name with no trailing affiliation text
            elif here.nodeType == here.TEXT_NODE:
                text = here.data
                assert text[0:1] == ","
                self.affiliation = text[1:].strip()
            else:
                raise RuntimeError, "unexpected node after name link: %s" % `here`
        elif here.nodeType == here.TEXT_NODE:
            text = here.data.strip()
            (self.name, self.affiliation) = text.split(", ", 1)
        elif here.nodeType == here.ELEMENT_NODE and here.tagName == "i":
            self.msg = "".join([x.toxml() for x in node.childNodes])
        else:
            raise Exception, "This doesn't look like a person: %s" % `here`
    def generatePastDates(self):
        """Yield every YYYYMMDD date code from 20060101 up to, but not
        including, self._dateCode.  (Impossible dates like 20060231 are
        yielded too; callers only probe them for existence.)"""
        for pyear in xrange(2006, 3000):
            for pmonth in xrange(1, 13):
                for pday in xrange(1, 32):
                    pdateCode = "%04d%02d%02d" % (pyear, pmonth, pday)
                    #debug("previous", "%s > %s" % (`pdateCode`, `self._dateCode`))
                    if pdateCode >= self._dateCode:
                        return
                    yield pdateCode
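
    # e.g. with self._dateCode == "20070103", the generator yields
    # "20060101", "20060102", ... "20070102" and then stops (string
    # comparison, so the impossible dates sort in order too).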

    def findPreviousVersion(self):

        # allow override -- if pdateCode is provided, we don't scan
        if self.pdateCode:
            self.hasPrevious = True
            return
        
        # This is perhaps not very efficient, but I don't want to
        # hard-code the filename format in two places and have to
        # parse it apart.  Dumb?  I dunno.
        found = None
        self.hasPrevious = True
        debug("previous(", "looking for previous versions")
        for pdateCode in self.generatePastDates():
            self.pdateCode = pdateCode
            code = self["pversioncode"]
            debug("previous", "past date directory ", code)
            if os.path.exists(code):
                debug("previous", "Found a previous version", code)
                found = pdateCode
        if not found:
            self.hasPrevious = False
        self.pdateCode = found
        debug("previous)",
              "hasPrevious = %s, code=%s", (self.hasPrevious,found))
