#! /usr/bin/env python
"""
Load data from the web, starting with (and focussed on) the source Document.
Could be a method of doc, but the logic gets so messy, I thought it
was cleaner as its own module.
This could be big and slow, so we'd like to cache this information.
If the data is on a wiki or in w3c-cvs, we can actually be notified
when it's changed. We could have a separate program to prepare all
this data for us...
"""
__version__ = "$Revision: 1.3 $"
# standard
import datetime
import subprocess
import sys
import os
import tempfile
import time
import urllib2
from xml.dom.minidom import parseString
import htmlentitydefs
import re

# other packages
import debugtools
from debugtools import debug
import wiki_cache
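
# Note (an assumption inferred from how this module uses its "doc" argument):
# the Document object passed to load() is expected to provide at least
# sourceURL, tidy, and pageName, plus list attributes named authors, editors,
# and contributors that parseCredits() can append Person objects to.  load()
# then sets fetched_page_text, raw_page_text, domTree, shortTitle, abstract,
# and docbodyNode on it.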


def fetch_page(URL, useTidy):
    """Fetch the page at URL (via wiki_cache), optionally clean it up with
    HTML Tidy, and return (xml, text): the (possibly tidied) markup and the
    raw page text."""
    t0 = time.time()
    #stream = urllib2.urlopen(URL)
    #text = stream.read()
    text = wiki_cache.load(URL)
    save = open("/tmp/wikisnapper-save-pretidy.html", "w")
    save.write(text)
    save.close()
    if useTidy:
        print 'running tidy on it'
        to_tidy = tempfile.NamedTemporaryFile()
        to_tidy.write(text)
        to_tidy.flush()
        from_tidy = tempfile.NamedTemporaryFile("r")
        tidy = "/usr/bin/tidy"
        tidy_error_sink = "/tmp/tidy.errors"
        cmd = ("""%s -quiet -asxml -utf8 -f %s < %s > %s""" %
               (tidy, tidy_error_sink, to_tidy.name, from_tidy.name))
        #cmd = ("""%s -numeric -quiet -asxml -utf8 -f %s < %s > %s""" %
        #       (tidy, tidy_error_sink, to_tidy.name, from_tidy.name))
        # tidy's exit status is ignored; its complaints go to tidy_error_sink
        code = os.system(cmd)
        to_tidy.close()
        xml = from_tidy.read()
    else:
        print 'not running tidy...'
        xml = text
    # minidom is not handling &nbsp; entities; hack around it.
    xml = expandEntities(xml)
    t1 = time.time()
    print >>sys.stderr, len(text), "bytes copied from web in", (t1-t0), "seconds."
    return (xml, text)
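
# For reference, a typical call would look like the line below (the URL is
# only an illustration, not one this module requires; it just has to be
# something wiki_cache.load can fetch):
#
#   xml, raw = fetch_page("http://example.org/wiki/SomePage", True)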


def load(doc, options):
    """Fetch doc.sourceURL, optionally tidy it, parse it, and fill in the
    page-derived attributes of doc (text, DOM tree, scraped metadata)."""
    print
    print 'Fetching', doc.sourceURL
    useTidy = (doc.tidy is not None and doc.tidy.lower() == "yes")
    (doc.fetched_page_text, doc.raw_page_text) = fetch_page(doc.sourceURL, useTidy)
    save = open("/tmp/wikisnapper-save-posttidy-%s.html" % doc.pageName, "w")
    save.write(doc.fetched_page_text)
    save.close()
    dom = parseString(doc.fetched_page_text)
    doc.domTree = dom
    save = open("/tmp/wikisnapper-save-postxml-%s.html" % doc.pageName, "w")
    save.write(doc.domTree.toxml('utf-8'))
    save.close()
    div_scrape(doc)
    gather_nearby(doc)
    # @@@ findPreviousVersion(doc)
    # we want previous snapshots, which are ... recorded in our local db?
    # ... and previous TR, as well -- which is from ... w3.org?
    # ... or manually configured?
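
# A minimal driver sketch (assumption: "Document" here stands in for whatever
# class the rest of wikisnapper actually uses; only the attributes read above
# are set, and note that load() currently stops at gather_nearby(), which is
# not implemented yet):
#
#   class Document(object):
#       def __init__(self, url, name):
#           self.sourceURL, self.pageName, self.tidy = url, name, "yes"
#           self.authors, self.editors, self.contributors = [], [], []
#
#   load(Document("http://example.org/wiki/SomePage", "SomePage"), options=None)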


def div_scrape(doc):
    """
    Assume doc is loaded; set more attributes by looking at various ids.
    This is old style -- before open-data-tables.
    """
    dom = doc.domTree
    doc.shortTitle = nodeContents(getDivById(dom, 'short-title'))
    doc.abstract = nodeContents(getDivById(dom, 'abstract'))
    doc.docbodyNode = getDivById(dom, 'docbody')
    parseCredits(dom, doc)
    # doc.mediumTitle = 'OWL 1.1 '+doc.shortTitle
    #### doc.refTitle = doc.titlePrefix+doc.shortTitle
    # doc.fullTitle = nodeContents(getDivById(dom, 'full-title'))
    ###doc.htmlTitle = doc.titlePrefix+doc.shortTitle
    ###doc.h1Title = doc.titlePrefix+"<br />"+doc.shortTitle
    # doc.editors = nodeContents(getDivById(dom, 'editors'))


def gather_nearby(doc):
    # guess at Group URL
    # use ODT to read Group data, including Rounds
    # ... or use RDF we gathered from there?
    raise NotImplementedError('gather_nearby is not implemented yet')


class NoData(Exception):
    pass


def parseCredits(node, doc):
    """Look for an author/editor/contributor list under the 'editors' div
    and append a Person to doc.authors, doc.editors, or doc.contributors
    for each entry that can be parsed; entries with no data are skipped."""
    node = getDivById(node, 'editors')
    for e in node.getElementsByTagName('dl'):
        for (dt, dd) in parseDL(e):
            dtText = nodeContents(dt).strip().lower()
            if dtText.startswith('author'):
                attr = 'authors'
            elif dtText.startswith('editor'):
                attr = 'editors'
            elif dtText.startswith('contrib'):
                attr = 'contributors'
            else:
                raise RuntimeError, 'editor section with bad dt'
            #print 'Got a DT, "%s" ==> %s' % (dtText, attr)
            ddText = nodeContents(dd)
            #print '   ...DD, "%s"' % ddText
            person = Person()
            try:
                person.loadFromXML(dd)
            except NoData:
                continue
            getattr(doc, attr).append(person)


def parseDL(node):
    '''Given a dl node, return successive pairs of dt/dd.'''
    dt = None
    for child in node.childNodes:
        try:
            if child.tagName == 'dt':
                dt = child
        except AttributeError:
            pass
        try:
            if child.tagName == 'dd':
                yield (dt, child)
        except AttributeError:
            pass
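
# For reference (illustrative input, not something the code requires): given
#   <dl><dt>Editors:</dt><dd>Alice</dd><dd>Bob</dd></dl>
# parseDL yields (dt "Editors:", dd "Alice") then (dt "Editors:", dd "Bob"),
# i.e. each dd is paired with the most recently seen dt.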


def nodeContents(xml):
    result = []
    for e in xml.childNodes:
        result.append(e.toxml())
    result = "".join(result)
    try:
        result = unicode(result)
    except UnicodeDecodeError:
        print >>sys.stderr, 'Unicode error in string', result
    assert type(result) == unicode
    return result
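
# For reference: nodeContents returns the serialized children of a node as a
# unicode string; e.g. for <dd>Jos <b>de</b> Bruijn</dd> it would return
# u'Jos <b>de</b> Bruijn' (markup inside the node is kept).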


def getDivById(xml, id):
    for e in xml.getElementsByTagName('div'):
        try:
            this_id = e.attributes["id"].value
        except KeyError:
            continue
        if this_id == id:
            return e
    for e in xml.getElementsByTagName('span'):   # DUMB DUMB DUMB!
        try:
            this_id = e.attributes["id"].value
        except KeyError:
            continue
        if this_id == id:
            return e
    raise RuntimeError, 'Cannot find a div with id="%s".' % id
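
# Note: despite the name, getDivById falls back to span elements carrying the
# same id (the "DUMB DUMB DUMB!" case above), and it raises RuntimeError
# rather than returning None when nothing matches.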


def expandEntities(text):
    """
    Given some HTML text, expand any of the standard HTML entities --
    BUT NOT the XML ones in it.
    This is a total hack -- our HTML parser should be doing this for
    us, but right now I can do this faster.  :-(  :-(
    """
    result = re.sub(r"&(\w+);", expandEntity, text)
    return result


def expandEntity(match):
    entity = match.group(1)
    if (entity == "lt" or entity == "gt" or
            entity == "quot" or entity == "amp" or entity == "apos"):
        return "&"+entity+";"
    try:
        expansion = htmlentitydefs.name2codepoint[entity]
        return unichr(expansion).encode('utf-8')
    except KeyError:
        raise Exception, "undefined entity %s" % repr(entity)
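
# For reference: expandEntities("caf&eacute; &amp; x&nbsp;y") returns the
# UTF-8 bytes for "café &amp; x\u00a0y" -- named HTML entities are expanded to
# their characters, while the five XML built-ins (lt, gt, quot, amp, apos) are
# left untouched for the XML parser to handle.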


class Person:
    # name, optional url, optional affiliation

    @property
    def line(self):
        try:
            return self.msg     # msg is a hack over-ride
        except AttributeError:
            pass
        try:
            result = u'''<a href="%s">%s</a>''' % (self.url, self.name)
        except AttributeError:
            result = self.name
        try:
            result += u", "+self.affiliation
        except AttributeError:
            pass
        # stupid hack way to do this --- should be general!
        result = re.sub("&", "&amp;", result)
        return result
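
    # For reference (assuming the anchor markup above matches the original
    # intent): a Person with name "Alice", url "http://example.org/alice",
    # and affiliation "Example Corp" would have
    # .line == u'<a href="http://example.org/alice">Alice</a>, Example Corp'.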

    def loadFromXML(self, node):
        """node is a Dom node (such as a p, span, dd, or td) which
        contains a description of a Person, in some standard format.
        eg: