#! /usr/bin/env python

'''Fetch a wiki page and collect the people it lists.

The page is parsed as XML (optionally after a pass through HTML tidy)
and each <li> inside the div with id="people" becomes a meeting.Person.
'''
__version__ = "$Revision: 1.1 $"


import os
import sys
import tempfile
import time
import urllib2
from xml.dom.minidom import parseString

import meeting

class Page:
    """Scraper for a wiki page that lists meeting participants.

    Fetches the page at ``wikiPageURL`` (optionally piping it through
    HTML tidy first), parses it as XML, and collects a ``meeting.Person``
    for each <li> inside the div with id="people".
    """

    #'http://www.w3.org/2007/OWL/wiki/Participants'
    def __init__(self, location):
        """Remember the page URL; nothing is fetched until run()."""
        self.wikiPageURL = location
        self.tidy = "no"     # set to "yes" to clean the page with HTML tidy
        self.people = []     # meeting.Person objects found by getnames()

    def getnames(self, node):
        """Fill self.people from the <li> items under div id="people".

        Raises RuntimeError when an item matches neither recognized shape.
        """
        #print >>sys.stderr, "Names"
        div = getDivById(node, "people")
        for e in div.getElementsByTagName('li'):
            if self.tryPersonWithURL(e):
                pass
            elif self.tryPerson(e):
                pass
            else:
                raise RuntimeError("badly formatted name, I think")

    def tryPersonWithURL(self, xml):
        """If the <li> wraps the name in an <a href=...>, record that person.

        Returns True when a link was found and a Person was appended to
        self.people, False otherwise.
        """
        for child in xml.childNodes:
            if hasattr(child, "tagName") and child.tagName == "a":
                href = child.getAttribute("href")
                text = nodeContents(child).strip()
                text = text.encode('utf-8')
                p = meeting.Person(text, url=href)
                self.people.append(p)
                return True
        return False

    def tryPerson(self, xml):
        """Fallback: treat the whole <li> content as a bare name.

        Bug fix: the original definition was missing ``self`` (so every
        call raised TypeError) and silently discarded the Person it
        built.  It now normalizes the text and appends to self.people,
        mirroring tryPersonWithURL.
        """
        text = nodeContents(xml).strip()
        text = text.encode('utf-8')
        p = meeting.Person(text)
        self.people.append(p)
        return True

    def run(self):
        """Fetch, optionally tidy, parse, and extract the people list.

        Side effects: writes debug snapshots of the page under /tmp.
        """
        #print
        #print 'Fetching', self.wikiPageURL

        # self.tidy may be None or a "yes"/"no" string.
        tidy = self.tidy is not None and self.tidy.lower() == "yes"

        self.fetched_page_text = fetch_page(self.wikiPageURL, tidy)
        save = open("/tmp/wikisnapper-save-posttidy.html", "w")
        save.write(self.fetched_page_text)
        save.close()

        dom = parseString(self.fetched_page_text)
        self.domTree = dom

        save = open("/tmp/wikisnapper-save-postxml.html", "w")
        save.write(self.domTree.toxml('utf-8'))
        save.close()

        self.getnames(dom)


def fetch_page(URL, tidy):
    """Fetch URL and return its body, optionally cleaned by HTML tidy.

    URL  -- the page to download (opened with urllib2)
    tidy -- when true, run /usr/bin/tidy over the page and return the
            resulting XHTML; otherwise return the raw page text.

    Side effects: saves the raw page to /tmp/wikisnapper-save-pretidy.html
    and tidy's error output to /tmp/tidy.errors.
    """
    t0 = time.time()
    stream = urllib2.urlopen(URL)
    try:
        text = stream.read()
    finally:
        stream.close()
    save = open("/tmp/wikisnapper-save-pretidy.html", "w")
    try:
        save.write(text)
    finally:
        save.close()
    if tidy:
        #print 'running tidy on it'
        to_tidy = tempfile.NamedTemporaryFile()
        from_tidy = tempfile.NamedTemporaryFile("r")
        try:
            to_tidy.write(text)
            to_tidy.flush()
            # Bug fix: the local was previously named ``tidy``, shadowing
            # the boolean parameter.
            tidy_cmd = "/usr/bin/tidy"
            tidy_error_sink = "/tmp/tidy.errors"
            # NOTE(review): string-built shell command.  The interpolated
            # names are local temp files, not user input, but a
            # subprocess call with an argument list would be safer.
            cmd = ("""%s -numeric -quiet -asxml -utf8 -f %s < %s > %s""" %
                       (tidy_cmd, tidy_error_sink, to_tidy.name, from_tidy.name))
            os.system(cmd)
            xml = from_tidy.read()
        finally:
            # Bug fix: from_tidy was previously never closed (temp-file leak).
            to_tidy.close()
            from_tidy.close()
    else:
        #print 'not running tidy...'
        xml = text
    t1 = time.time()
    #print >>sys.stderr, len(text),"bytes copied from web in",(t1-t0),"seconds."
    return xml

def nodeContents(xml):
    """Return the serialized markup of all of xml's child nodes.

    Joins each child's toxml() output and coerces the result to unicode;
    the assert guarantees callers always receive a unicode string.
    """
    result = "".join(e.toxml() for e in xml.childNodes)
    try:
        result = unicode(result)
    except UnicodeDecodeError:
        # Bug fix: this was ``print >>stderr, ...`` with the undefined
        # name ``stderr``, so the error path itself raised NameError.
        sys.stderr.write('Unicode error in string %s\n' % result)
    assert type(result) == unicode
    return result
        
def getDivById(xml, id):
    """Return the first <div> under *xml* whose "id" attribute equals *id*.

    Divs lacking an "id" attribute are skipped; document order is used.

    Raises RuntimeError when no matching div exists.
    """
    for div in xml.getElementsByTagName('div'):
        id_attr = div.attributes.get("id")
        if id_attr is not None and id_attr.value == id:
            return div
    raise RuntimeError('''Cannot find a div with id="''' + id + '".')


if __name__ == "__main__":
    import doctest, sys
    doctest.testmod(sys.modules[__name__])

    # Bug fix: Page() requires a location argument and has no fetch()
    # method (the entry point is run()).  Take the page URL from the
    # command line.
    if len(sys.argv) > 1:
        p = Page(sys.argv[1])
        p.run()
