#! /usr/bin/env python

'''

This program ("Wiki Snapper") generates a set of web pages (on local
disk) in proper W3C Tech Report (TR) format from a set of MediaWiki
pages.

It is guided by RDF data about the group, the particular snapshot you
want to take, and the pages to be included in that snapshot.


To Do:
    - move all the used fields to dictionary & _default_ form
    - document all the fields
    - make CSS dynamic instead of hard-coded
    - SOME way to have multiple convergent series of documents...
       (version numbers other than dates?   multi-level snaps?)
    - link checking
    - anchor checking
    - html validation?
    
    - generate the wiki table about the documents?
    - stuff to help with handling of review comments
    
Maybe do my own:
    - toc
    - section numbering


ENTITIES:

http://www.dpawson.co.uk/xsl/sect2/nbsp.html
  NBSP:  1. &#160;    2. &#xA0;


             

'''
__version__ = "$Revision: 1.21 $"


import urlparse
import urllib2 
import time
import datetime
import tempfile
import os
import os.path
import sys
import re
from optparse import OptionParser
from xml.dom.minidom import parseString
import htmlentitydefs

import debugtools
from debugtools import debug
import webdata

WSNAP = "http://www.w3.org/2008/03/wikisnapper#"
webdata_ns = WSNAP
 
doctypes = {
    "ED": "Editor's Draft",
    "WD": "Working Draft",
    }

def run():

    parser = OptionParser(usage="%prog [options] load_uri snapshot_uri",
                          version=__version__)
    parser.set_defaults(verbose=True)
    parser.set_defaults(ed_only=True)
    parser.add_option("-q", "--quiet",
                      action="store_false", dest="verbose", 
                      help="don't print status messages to stdout")
    parser.add_option("--real",
                      action="store_false", dest="ed_only", 
                      help="Allow more the Editors Draft")
    parser.add_option("-D", "--debug",
                      action="append", dest="debugTags", 
                      help="turn on debugging for some subsystem (try 'all')")

    global options
    (options, args) = parser.parse_args()

    if options.debugTags:
        debugtools.tags.update(options.debugTags)
    verbose = options.verbose

    if len(args) == 1:
        snapshot = webdata.toPython(args[0])
    elif len(args) == 2:
        webdata.load(args[0])
        snapshot = webdata.toPython(args[1], autoload=False)
    else:
        parser.print_help()
        sys.exit(1)

    # Sanity-checking RDF data is hard....
    assert isinstance(snapshot, Snapshot)
    
    # set an uplink, to save us on parameter passing
    # (it's an inverse of the .pages property)
    for p in snapshot.pages+snapshot.oldPages:
        p.snapshot = snapshot

    for p in snapshot.oldPages:
        p.oldPage = True

    for p in snapshot.pages:
        p.fetch()

    for p in snapshot.oldPages:
        # just to get refTitle, inLineCredit
        p.fetch()

    for p in snapshot.pages:
        p.generate()

    width = 0
    for p in snapshot.pages:
        l = len(p.thisVersion)
        if l > width: width = l
    print
    print "Documents ready: (but may need CVS check-in)"
    for p in snapshot.pages:
        print " %-*s  %s" % (width, p.thisVersion, p["shortTitle"])
        

class NoData (RuntimeError):
    pass

class Person:

    # name, optional url, optional affiliation

    @property
    def line(self):

        try:
            return self.msg  # msg is a hack over-ride
        except AttributeError:
            pass
        
        try:
            result = u'''<a href="%s">%s</a>''' % (self.url, self.name)
        except AttributeError:
            result = self.name
        try:
            result += u", "+self.affiliation
        except AttributeError:
            pass

        # stupid hack way to do this --- should be general!
        result = re.sub("&", "&amp;", result)
        
        return result

    def loadFromXML(self, node):
        """node is a Dom node (such as a p, span, dd, or td) which
        contains a description of a Person, in some standard format.

        eg: <dd> <a href="http://www.cs.man.ac.uk/~bmotik/" class="external text" title="http://www.cs.man.ac.uk/~bmotik/">Boris Motik</a>, Oxford University
</dd>


        Should use bi-directional grammar templates!
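
        A doctest sketch of the <a>-plus-affiliation form (hypothetical
        person; run by the doctest hook at the bottom of this file):

        >>> dd = parseString('<dd><a href="http://example.org/">Jo Smith</a>, Example Corp.</dd>').documentElement
        >>> p = Person(); p.loadFromXML(dd)
        >>> (p.name, p.url, p.affiliation)
        (u'Jo Smith', u'http://example.org/', u'Example Corp.')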

        """

        node.normalize()
        here = node.firstChild
        if here.nodeType == here.TEXT_NODE and here.data.strip() == "":
            here = here.nextSibling
        if here is None:
            raise NoData
        if here.nodeType == here.ELEMENT_NODE and here.tagName == "a":
            self.url = here.getAttribute("href")
            self.name = here.firstChild.data
            here = here.nextSibling

            if here.nodeType == here.TEXT_NODE:
                text = here.data
                assert text[0:1] == ","
                text = text[1:]
                text = text.strip()
                self.affiliation = text
            else:
                raise RuntimeError, "expected text affiliation after name link"
        elif here.nodeType == here.TEXT_NODE:
            text = here.data.strip()
            (self.name, self.affiliation) = text.split(", ", 1)
        elif here.nodeType == here.ELEMENT_NODE and here.tagName == "i":
            self.msg = "".join([x.toxml() for x in node.childNodes])
        else:
            raise Exception, "This doesn't look like a person: %s" % `here`
        
        
def parseCredits(node, page):
    """If the xml can be matched as an author/editor/contributor list,
    then do it, and add them to Page & return true; if it can't return false.
    """

    node = getDivById(node, 'editors')
    for e in node.getElementsByTagName('dl'):
        for (dt, dd) in parseDL(e):
            dtText = nodeContents(dt).strip().lower()
            if dtText.startswith('author'):
                attr = 'authors'
            elif dtText.startswith('editor'):
                attr = 'editors'
            elif dtText.startswith('contrib'):
                attr = 'contributors'
            else:
                raise RuntimeError, 'editor section with bad dt'
            #print 'Got a DT, "%s" ==>   %s' % (dtText,attr)
            ddText = nodeContents(dd)
            #print '   ...DD, "%s"' %ddText
            person = Person()
            try:
                person.loadFromXML(dd)
            except NoData:
                continue
            getattr(page, attr).append(person)
            
def parseDL(node):
    '''Given a dl node, yield successive (dt, dd) pairs; a dt is
    paired with every dd that follows it.
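
    A small doctest sketch:

    >>> dl = parseString('<dl><dt>Editor</dt><dd>A</dd><dd>B</dd></dl>').documentElement
    >>> [(dt.firstChild.data, dd.firstChild.data) for (dt, dd) in parseDL(dl)]
    [(u'Editor', u'A'), (u'Editor', u'B')]
    '''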
    
    dt = None
    for child in node.childNodes:
        try:
            if child.tagName == 'dt':
                dt = child
        except AttributeError:
            pass
        try:
            if child.tagName == 'dd':
                yield (dt, child)
        except AttributeError:
            pass

def expandEntities(text):
    """
    Given some HTML text, expand any of the standard HTML entities --
    BUT NOT the XML ones in it.

    This is a total hack -- our HTML parser should be doing this for
    us, but right now I can do this faster.  :-(  :-(
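
    For example (XML-core entities like &amp; pass through unchanged;
    &nbsp; becomes the two UTF-8 bytes of U+00A0):

    >>> expandEntities("&amp; &lt;")
    '&amp; &lt;'
    >>> len(expandEntities("&nbsp;"))
    2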

    """
    result = re.sub("&(\w+);", expandEntity, text)
    return result

def expandEntity(match):
    entity = match.group(1)
    if (entity == "lt" or entity == "gt" or
        entity == "quot" or entity == "amp" or entity == "apos"):
        return "&"+entity+";"
    try:
        expansion = htmlentitydefs.name2codepoint[entity]
        return unichr(expansion).encode('utf-8')
    except KeyError:
        raise Exception, "undefined entity %s" % `entity`
    
def entify(text):
    """
    Given the whole HTML page, convert the code points to named HTML entities,
    and maybe convert some &quot; entities back into quote characters.

    This is needed to make the diff-to-wiki not so ugly.
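
    A small doctest sketch (stderr chatter aside):

    >>> entify(unichr(233) + u' < "x"')
    '&eacute; < "x"'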
    """

    print >>sys.stderr, "Entifying..."
    out = []
    for x in text:
        try:
            entity = htmlentitydefs.codepoint2name[ord(x)]
            if (entity == "lt" or entity == "amp" or entity == "gt" or entity=="quot"):
                out.append(x)
            else:
                #print >>sys.stderr, "Got one!  %s" % entity
                out.append("&"+entity+";")
        except KeyError:
            out.append(x)

    text = (u''.join(out)).encode( "utf-8" )

    # @@@@ OMG this is so wrong -- but I can't think of an easy way to
    # do it correctly, and I need something working right now.   The
    # problem is that we should use quote characters instead of &quot;
    # UNLESS we're inside an attribute value.   So, this really needs
    # to be done by the XML serializer.
    #
    # henry suggests using rxp  http://www.ltg.ed.ac.uk/~richard/rxp.ht
    text = re.sub("&quot;", '"', text)
    return text

    
class Page:
    """The thing being snapped....

    Implements the dictionary interface on its own properties,
    so that we can do "%" substitution with it, like:
        "The page %(title)s from %(date)s" % page
    http://docs.python.org/ref/sequence-types.html

    
    """

    # we're just using __slots__ so webdata won't poke at all our
    # @properties, which don't always have well-defined values.
    
    __slots__ = [ "shortname",
                  "statusExtra",
                  "wikiPageName",
                  "doctypecode",
                  "pleaseComment",
                  "snapshot",
                  "authors",
                  "editors",
                  "contributors",
                  "pversioncode",
                  "pdateCode",
                  "previousVersion",
                  "commentOn",
                  "tidy",

                  # overrides for oldPage
                  "pubdate",
                  "oldPage",
                  "fragmentBase",

                  ]
    
    def __init__(self):

        for x in self.__slots__:
            setattr(self, x, None)
            
        self.statusExtra=""
        self.commentOn="these Working Drafts"

        # not deserialized, at present...
        # (tablejumper should be able to get a lot of this stuff, ideally)
        self.authors = []
        self.editors = []
        self.contributors = []
        self.stylesheets = None

    def fetch(self):
        print
        print 'Fetching', self.wikiPageURL
        
        tidy = (self.tidy is not None and self.tidy.lower() == "yes")
                
        (self.fetched_page_text, self.raw_page_text) = fetch_page(self.wikiPageURL,tidy)

        save = open("/tmp/wikisnapper-save-posttidy-%s.html" % self.wikiPageName, "w")
        save.write(self.fetched_page_text)
        save.close()
        dom = parseString(self.fetched_page_text)
        self.domTree = dom

        save = open("/tmp/wikisnapper-save-postxml-%s.html" % self.wikiPageName, "w")
        save.write(self.domTree.toxml('utf-8'))
        save.close()

        self.shortTitle = nodeContents(getDivById(dom, 'short-title'))
        # self.mediumTitle = 'OWL 1.1 '+self.shortTitle
        self.refTitle = self.snapshot.titlePrefix+self.shortTitle
        # self.fullTitle = nodeContents(getDivById(dom, 'full-title'))

        self.htmlTitle = self.snapshot.titlePrefix+self.shortTitle
        self.h1Title = self.snapshot.titlePrefix+"<br />"+self.shortTitle

        self.abstract = nodeContents(getDivById(dom, 'abstract'))
        # self.editors = nodeContents(getDivById(dom, 'editors'))

        self.docbodyNode = getDivById(dom, 'docbody')
        
        parseCredits(dom, self)
        handle_editsections(self.docbodyNode)

        self.findPreviousVersion()

    def handle_links(self):
        for e in self.docbodyNode.getElementsByTagName('a'):
            href = e.getAttribute("href")
            if href.startswith("/"):
                uri = "http://www.w3.org"+href
            else:
                uri = href

            if uri.startswith("http://www.w3.org/2005/rules/wiki/"):
                href = uri[len("http://www.w3.org"):]
                
                try:
                    (main, fragment) = uri.split("#")
                    fragment = "#" + fragment
                except ValueError:
                    main = uri
                    fragment = ""

                if main == self.wikiPageURL:
                    debug("links", 'rewriting self link to %s' % fragment)
                    e.setAttribute("href", fragment)
                    continue

                self.matchedPage = None
                if self.wikiDocMatch(main):
                    if self.matchedPage.fragmentBase:
                        base = self.matchedPage.fragmentBase
                    else:
                        base = self.matchedPage.thisVersion
                    new_uri = base + fragment
                    e.setAttribute("href", new_uri)
                    debug("links", 'rewriting self %s to %s' % (
                        href, new_uri))
                    continue

                e.setAttribute("href", uri)
                if href.find('&action=edit') > -1:
                    continue   # just leave edit links in the text, hidden
                if href.find('/Image:') > -1:
                    continue   # just leave wiki image links in the text ?!?! @@@
                print >>sys.stderr, '? Wiki link to: ', href


    def generate(self):

        print >>sys.stderr, ""

        filename = self.directory+"/Overview.html"
        try:
            os.mkdir(self.directory)
        except OSError:
            pass

        # has to be after directory is made
        handle_images(self.docbodyNode, self.directory)

        # has to be after directory is made
        self.stylesheets = handle_css(self.domTree, self.directory)

        # has to be after all fetches are done
        self.handle_corefs(self.domTree)

        self.handle_review_comments()

        self.unlink_example_links()
        
        # should be after handle_corefs so it doesn't warn about them
        self.handle_links()

        self.docbody = nodeContents(self.docbodyNode)

        out = tr(self)

        # stupid hack to work around the fact that in HTML you can't
        # actually repeat the xmlns declaration for XHTML -- which
        # rdf:Literals give us.
        out = re.sub('<div xmlns="http://www.w3.org/1999/xhtml">', '<div>', out)

        # even stupider hack to work around a bug in rif-ucr javascript
        out = re.sub('<b class="syntax-head"/>',
                     '<b class="syntax-head"></b>', out)
        
        f = open(filename, "w")

        #str = out.encode("utf-8")
        str = out
        str = entify(str)
        f.write(str)
        f.close()
        print 'wrote', filename

        f = open(self.directory+"/wiki.html", "w")
        f.write(self.raw_page_text)
        f.close()
        cmd = ("hdiff %s/wiki.html %s/Overview.html > %s/diff-from-wiki.html"%
               (self.directory, self.directory, self.directory))
        os.system(cmd)

        cmd = ("prince -s print.css -o %s/all.pdf %s/Overview.html" %
               (self.directory, self.directory))
        os.system(cmd)

        if self.hasPrevious:
            debug("defaults", "self.pversioncode=%s" % self.pversioncode)
            debug("defaults", "self['pversioncode']=%s" % self['pversioncode'])
            cmd = 'hdiff %(pversioncode)s/Overview.html %(versioncode)s/Overview.html > %(diffFile)s.html' % self
            print 'producing %(diffFile)s.html' % self
            os.system(cmd)

        
    # make this look like a dictionary, so we can just do template
    # substitution with it.   And inherit from snapshot and
    # snapshot.group as we do it. 
    # http://docs.python.org/ref/sequence-types.html
    #
    # To override, you must (a) define the attribute, and (b) give it a value
    # that is not None.   You can override it with "", however.
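    # For example, page["name"] typically resolves to
    # snapshot.group.name, and page["doctype"] falls back to the
    # _default_doctype() method (both are fields the tr() template
    # actually uses).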
    def __getitem__(self, key):

        for obj in (self, self.snapshot, self.snapshot.group):
            value = None
            try:
                value = getattr(obj, key) 
            except AttributeError:
                pass
            if value is not None:
                return value

        for obj in (self, self.snapshot, self.snapshot.group):
            value = None
            try:
                value = getattr(obj, "_default_"+key)()
            except AttributeError:
                pass
            if value is not None:
                return value
        
        # at one point I allowed an "eval" here, but now that some
        # content comes from the web, that's a security problem.
        
        raise KeyError, key
    def __setitem__(self, key, value):
        raise RuntimeError, 'not mutable this way'
    def __iter__(self):
        return self.iterkeys()
    def iterkeys(self):
        # this isn't really right....  e.g. it omits the @property and
        # _default_ keys that __getitem__ can serve
        for x in self.__dict__.iterkeys():
            if not x.startswith("_"):
                yield x

    def wikiDocMatch(self, href):
        for page in self.snapshot.pages:
            if page.wikiPageURL.endswith(href):
                self.matchedPage = page
                return True
        for page in self.snapshot.oldPages:
            if page.wikiPageURL.endswith(href):
                self.matchedPage = page
                return True
        return False

    def handle_review_comments(self):
        for e in self.docbodyNode.getElementsByTagName('div'):
            try:
                this_class = e.attributes["class"].value
            except KeyError:
                continue
            if this_class == "note":
                e.parentNode.removeChild(e)

        for e in self.docbodyNode.getElementsByTagName('div'):
            try:
                this_class = e.attributes["class"].value
            except KeyError:
                continue
            if this_class == "review":
                e.parentNode.removeChild(e)
                
        
        #for a in tree_search(self.docbodyNode, is_review_comment):
        #    while a.hasChildNodes():
        #        child = a.firstChild
        #        a.removeChild(child)
        #        child.unlink()

    example_uri_pat = re.compile(r"""http://[^/]*example\..*""")
    
    def unlink_example_links(self):
        """
        Wikis turn URLs into links, more than we want.

        Undo that for URLs whose host contains "example."  (e.g.
        http://example.org/), when the link text equals the URL.
        """
        for e in self.docbodyNode.getElementsByTagName('a'):
            try:
                link = e.getAttribute('href')
                text = e.firstChild.data
            except AttributeError:   # no href, or no text child
                continue
            if link == text:
                m = self.example_uri_pat.match(link)
                if m:
                    parent = e.parentNode
                    new = e.firstChild
                    parent.insertBefore(new, e)
                    parent.removeChild(e)
            
    def handle_corefs(self, xml):
        self.matchedPage = None
        for dd in tree_search(xml, is_ref, self):
            while dd.hasChildNodes():
                child = dd.firstChild
                dd.removeChild(child)
                child.unlink()
                
            newStuff = toDom(self.matchedPage.referenceText)
            dd.appendChild(newStuff)
            self.matchedPage = None

    def generatePastDates(self):
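        # Yield every YYYYMMDD code from 2006-01-01 up to (but not
        # including) this page's own date code.  Impossible dates like
        # 20070230 are yielded too; they're harmless because callers
        # only test them against the filesystem.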
        for pyear in xrange(2006, 3000):
            for pmonth in xrange(1, 13):
                for pday in xrange(1,32):
                    pdateCode = "%04d%02d%02d" % (pyear, pmonth, pday)
                    #debug("previous", "%s > %s" % (`pdateCode`, `self.dateCode`))
                    if pdateCode >= self._dateCode:
                        return
                    yield pdateCode

    def findPreviousVersion(self):

        # allow override -- if pdateCode is provided, we don't scan
        if self.pdateCode:
            self.hasPrevious = True
            return
        
        # this is perhaps not very efficient, but I don't want to
        # hard-code the filename format in two places, and have to
        # parse it apart.  Dumb?  I dunno.
        found = None
        self.hasPrevious = True
        debug("previous(", "looking for previous versions")
        for pdateCode in self.generatePastDates():
            self.pdateCode = pdateCode
            code = self["pversioncode"]
            debug("previous", "past date directory ", code)
            if os.path.exists(code):
                debug("previous", "Found a previous version", code)
                found = pdateCode
        if not found:
            self.hasPrevious = False
        self.pdateCode = found
        debug("previous)",
              "hasPrevious = %s, code=%s", (self.hasPrevious,found))

    def _default_doctype(self):
        return doctypes[self.maxdoctypecode]

    @property
    def maxdoctypecode(self):
        global options
        if options.ed_only and not self.oldPage:
            return "ED"
        else:
            return self.doctypecode
    
    @property
    def _dateCode(self):
        return dateCode(self["pubdate"])
        #return self.snapshot._dateCode

    @property
    def _dateName(self):
        return dateName(self["pubdate"])
        #return self.snapshot._dateName

    @property
    def commentsDue(self):
        return self.snapshot.commentsDue

    @property
    def hasPrevious(self):
        return hasattr(self, 'pdateCode') or hasattr(self.snapshot, 'pdateCode')
    
    @property
    def pdateCode(self):
        return self.snapshot.pdateCode

    @property
    def directory(self):
        return self.versioncode
    
    @property
    def versioncode(self):
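        # e.g. "WD-rif-bld-20080415" for doctypecode "WD", group
        # shortnamePrefix "rif-", shortname "bld" (illustrative values)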
        return (self.maxdoctypecode+"-"+
                self.snapshot.group.shortnamePrefix+
                self.shortname+"-"+
                self._dateCode)

    def _default_pversioncode(self):
        assert self.hasPrevious
        return (self.maxdoctypecode+"-"+
                self.snapshot.group.shortnamePrefix+
                self.shortname+"-"+
                self.pdateCode)

    @property
    def versionPrefixURL(self):
        #  can't easily do relative URLs, because we tell the
        #  users these URLs...
        if self.maxdoctypecode == 'ED':
            return self.snapshot.group.draftsURL
        if self.maxdoctypecode == 'WD':
            return "http://www.w3.org/TR/2008/"
        raise RuntimeError, 'dont know how to make version URL'        
        
    @property
    def thisVersion(self):
        return self.versionPrefixURL+self.versioncode+"/"


    def _default_previousVersion(self):
        return self.versionPrefixURL+self["pversioncode"]+"/"

    @property
    def latestURL(self):
        if self.maxdoctypecode == 'ED':
            prefix = self.versionPrefixURL
        elif self.maxdoctypecode == 'WD':
            prefix = "http://www.w3.org/TR/"   # not quite the same!
        else:
            raise RuntimeError, 'dont know how to make latest URL'
        return prefix+self.snapshot.group.shortnamePrefix+self.shortname+"/"

    @property
    def wikiPageURL(self):
        return self.snapshot.group.wikiURL+self.wikiPageName
    
    @property
    def inLineCredit(self):
        if self.authors:
            return ", ".join([person.name for person in self.authors])
        if len(self.editors) == 1:
            ed = "editor"
        else:
            ed = "eds"
        return ", ".join([person.name for person in self.editors]) + ", " + ed


    def _default_css(self):
        result = """<style type="text/css">
   .editsection { display: none; }
</style>
"""
        for stylesheet in self.stylesheets:
            result+=('''<link rel="stylesheet" type="text/css" href="%s" />\n'''
                     % stylesheet)
        result += """<link rel="stylesheet" type="text/css" href="http://www.w3.org/StyleSheets/TR/W3C-%(maxdoctypecode)s" />\n""" % self

        return result

    def _default_javascript(self):
        return """
<script type="text/javascript">/*<![CDATA[*/
/*
	Written by Jonathan Snook, http://www.snook.ca/jonathan
	Add-ons by Robert Nyman, http://www.robertnyman.com
	Author says "The credit comment is all it takes, no license. Go crazy with it!:-)"
	From http://www.robertnyman.com/2005/11/07/the-ultimate-getelementsbyclassname/
*/

function getElementsByClassName(oElm, strTagName, oClassNames){
	var arrElements = (! (! (strTagName == "*") || ! (oElm.all)))? oElm.all : oElm.getElementsByTagName(strTagName);
	var arrReturnElements = new Array();
	var arrRegExpClassNames = new Array();
	if(typeof oClassNames == "object"){
		for(var i=0; !(i>=oClassNames.length); i++){ /*>*/
			arrRegExpClassNames.push(new RegExp("(^|\\s)" + oClassNames[i].replace(/\-/g, "\\-") + "(\\s|$)"));
		}
	}
	else{
		arrRegExpClassNames.push(new RegExp("(^|\\s)" + oClassNames.replace(/\-/g, "\\-") + "(\\s|$)"));
	}
	var oElement;
	var bMatchesAll;
	for(var j=0; !(j>=arrElements.length); j++){ /*>*/
		oElement = arrElements[j];
		bMatchesAll = true;
		for(var k=0; !(k>=arrRegExpClassNames.length); k++){ /*>*/
			if(!arrRegExpClassNames[k].test(oElement.className)){
				bMatchesAll = false;
				break;
			}
		}
		if(bMatchesAll){
			arrReturnElements.push(oElement);
		}
	}
	return (arrReturnElements)
}

function set_display_by_class(el, cls, newValue) {
   var e = getElementsByClassName(document, el, cls);
   if (e != null) {
      for (var i=0; !(i>=e.length); i++) {
        e[i].style.display = newValue;
      }
   }
}

function set_display_by_id(id, newValue) {
   var e = document.getElementById(id);
   if (e != null) {
     e.style.display = newValue;
   }
}
/*]]>*/
</script>

        
        """

    @property
    def credits(self):
        result = u"<dl>"
        for tag in ('Author', 'Editor', 'Contributor'):
            attr = tag.lower()+"s"
            people = getattr(self, attr)
            if people:
                if len(people) > 1:
                    result += u"<dt>%ss:</dt>" % tag
                else:
                    result += u"<dt>%s:</dt>" % tag
                for person in people:
                    result += u"<dd>"+person.line+u"</dd>\n"
        result += u"</dl>"
        return result

    @property
    def formats(self):
        return '''<p>This document is also available in these non-normative formats: <a href="all.pdf">PDF version</a>.</p>'''

    @property
    def referenceText(self):
        """Return standard text that one can use in a References
        section to refer to this version of this document
        """
        result = u"""<span><cite><a class="external text" href="%(thisVersion)s">%(refTitle)s</a></cite> %(inLineCredit)s. W3C %(doctype)s, %(_dateName)s, <a class="external free" href="%(thisVersion)s">%(thisVersion)s</a>.  Latest version available at <a class="external free" href="%(latestURL)s">%(latestURL)s</a>.</span>""" % self
        return result

    @property
    def numDocs(self):
        return len(self.snapshot.pages)

    @property
    def allDocs(self):
        result = "<ol>\n"
        for page in self.snapshot.pages:
            if page is self:
                thisOne = "(this document)"
            else:
                thisOne = ""
            result += u"""<li><a href="%(thisVersion)s">%(shortTitle)s</a> """%page + thisOne+"</li>\n"
        result += "</ol>\n"
        return result

    @property
    def sotdSOD(self):
        if len(self.snapshot.pages) < 2:
            return ""
        
        result = """
<h4 class="no-toc no-num" id="related">Set of Documents</h4>

<p>This document is being published as one of a set of %(numDocs)s documents: </p>
%(allDocs)s
"""%self
        return result

    @property
    def pleaseCommentText(self):
        if self.pleaseComment is not None:
            
            return self.pleaseComment
        else:
            return """
        <h4 class="no-toc no-num" id="please">Please Comment By %(commentsDue)s</h4>

    <p>The <a class="http" href="%(homeURL)s"
    >%(name)s</a> seeks
    public feedback on %(commentOn)s.  Please send your
    comments to <a class="mailto"
    href="mailto:%(commentsList)s@w3.org"
    shape="rect">%(commentsList)s@w3.org</a> (<a class="http"
    href="http://lists.w3.org/Archives/Public/%(commentsList)s/"
    shape="rect">public archive</a>).  If possible, please offer
    specific changes to the text that would address your
    concern.  You may also wish to check the <a
    href="%(wikiPageURL)s">Wiki
    Version</a> of this document for internal-review comments and changes being
    drafted which may address your concerns. </p>""" % self 
    
    @property
    def diffURL(self):
        return self.versionPrefixURL+self.diffFile

    @property
    def diffFile(self):
        return self.directory+"/diff-since-"+self.pdateCode
    
    @property
    def diffText(self):
        if self.maxdoctypecode == 'ED':
            return u' (<a href="%s">color-coded diff</a>)' % self.diffURL
        else:
            return ""

    @property
    def labelForLatest(self):
        if self.maxdoctypecode == 'ED':
            return "Latest editor's draft"
        else:
            return "Latest version"
        
    @property
    def versionStuff(self):
        result = u"""
<dl>
<dt>This version:</dt>
<dd><a id="this-version-url" href="%(thisVersion)s">%(thisVersion)s</a></dd>
<dt>%(labelForLatest)s:</dt>
<dd><a href="%(latestURL)s">%(latestURL)s</a></dd>
""" % self
        if self.hasPrevious:
           result += u"""
<dt>Previous version:</dt>
<dd><a href="%(previousVersion)s">%(previousVersion)s</a>%(diffText)s</dd>
""" % self
        result += u"</dl>"
        return result


def toDom(text):
    newDoc = parseString(text.encode("utf-8"))
    # deep copy and set the ownerDocument? 
    return newDoc.documentElement

def fetch_page(URL, tidy):

    t0 = time.time()
    stream = urllib2.urlopen(URL)
    text = stream.read()
    save = open("/tmp/wikisnapper-save-pretidy.html", "w")
    save.write(text)
    save.close()
    if tidy:
        print 'running tidy on it'
        to_tidy = tempfile.NamedTemporaryFile()
        to_tidy.write(text)
        to_tidy.flush()
        from_tidy = tempfile.NamedTemporaryFile("r")
        tidy = "/usr/bin/tidy"
        tidy_error_sink = "/tmp/tidy.errors"
        cmd = ("""%s -quiet -asxml -utf8 -f %s < %s > %s""" %
                   (tidy, tidy_error_sink, to_tidy.name, from_tidy.name))
        #cmd = ("""%s -numeric -quiet -asxml -utf8 -f %s < %s > %s""" %
        #           (tidy, tidy_error_sink, to_tidy.name, from_tidy.name))
        code = os.system(cmd)
        to_tidy.close()
        xml = from_tidy.read()
    else:
        print 'not running tidy...'
        xml = text

    # minidom is not handling &nbsp entities; hack around it.
    xml = expandEntities(xml)
    t1 = time.time()
    print >>sys.stderr, len(text),"bytes copied from web in",(t1-t0),"seconds."

    return (xml, text)

def filterThrough(commandLine, inputText):
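    """Pipe inputText through a shell command and return its stdout.

    Assuming a POSIX tr(1) on the PATH:

    >>> filterThrough("tr a-z A-Z", "hello")
    'HELLO'
    """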
    toFilter = tempfile.NamedTemporaryFile()
    toFilter.write(inputText)
    toFilter.flush()
    fromFilter = tempfile.NamedTemporaryFile("r")
    cmd = ("""%s < %s > %s""" %
           (commandLine, toFilter.name, fromFilter.name))
    code = os.system(cmd)
    toFilter.close()
    result = fromFilter.read()
    fromFilter.close()
    return result


def nodeContents(xml):
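    """Serialize the children of a node (not the node itself) to unicode.

    >>> nodeContents(parseString('<div>Hi <b>there</b></div>').documentElement)
    u'Hi <b>there</b>'
    """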
    result = []
    for e in xml.childNodes:
        result.append(e.toxml())
    result = "".join(result)
    try:
        result = unicode(result)
    except UnicodeDecodeError:
        print >>sys.stderr, 'Unicode error in string', result
    assert type(result) == unicode
    return result
        
def getDivById(xml, id):
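    '''Return the first div (or span) with the given id attribute.

    >>> getDivById(parseString('<body><div id="abstract">A</div></body>'), 'abstract').toxml()
    u'<div id="abstract">A</div>'
    '''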
    for e in xml.getElementsByTagName('div'):
        try:
            this_id = e.attributes["id"].value
        except KeyError:
            continue
        if this_id == id:
            return e
    for e in xml.getElementsByTagName('span'):     # DUMB DUMB DUMB!
        try:
            this_id = e.attributes["id"].value
        except KeyError:
            continue
        if this_id == id:
            return e
                
    raise RuntimeError, '''Cannot find a div with id="'''+id+'".'


def handle_editsections(xml):
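    '''Strip MediaWiki "[edit]" section links from the tree.

    >>> doc = parseString('<div>Hi<span class="editsection">[edit]</span></div>')
    >>> handle_editsections(doc)
    >>> doc.documentElement.toxml()
    u'<div>Hi</div>'
    '''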
    for e in xml.getElementsByTagName('span'):
        cls = e.getAttribute("class")
        if cls == "editsection":
            e.parentNode.removeChild(e)
            e.unlink()

def tree_search(tree, condition, extra=None):
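    '''Yield nodes where condition(node, extra) holds, without
    descending below a matching node.

    >>> doc = parseString('<r><x/><y><x/></y></r>')
    >>> [e.tagName for e in tree_search(doc, lambda n, _: getattr(n, 'tagName', None) == 'x')]
    [u'x', u'x']
    '''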
    if condition(tree, extra):
        yield tree
    else:
        for child in tree.childNodes:
            for result in tree_search(child, condition, extra):
                yield result

def is_ref(node, extra):
    if hasattr(node, 'tagName') and node.tagName == 'dd':
        #print 'found a dd', node
        for links in tree_search(node, links_to_docs, extra):
            return True
    return False

def links_to_docs(node, extra):
    try:
        href = node.getAttribute('href')
        if href == "":
            return False
        return extra.wikiDocMatch(href)
    except AttributeError:
        return False

def handle_images(xml, directory):
    print >>sys.stderr, 'Downloading any embedded images.'
    t0 = time.time()
    imageCount = 0
    byteCount = 0
    for e in xml.getElementsByTagName('img'):
        src = e.getAttribute("src")
        # print >>sys.stderr, 'Image: ', src
        if src.startswith("http://"):
            fullsrc = src
        else:
            fullsrc = "http://www.w3.org"+src
        key = fullsrc[fullsrc.rindex("/")+1:]
        e.setAttribute("src", key)
        filename = directory+"/"+key
        #print >>sys.stderr, 'Downloading image\n   ', fullsrc, '-> ', filename

        inStream = urllib2.urlopen(fullsrc)
        text = inStream.read()
        byteCount+=len(text)
        imageCount+=1
        inStream.close()
        outStream = open(filename, "w")
        outStream.write(text)
        outStream.close()

        # if image is in an <a ...>, then remove the <a ...>
        parent = e.parentNode
        grandparent = parent.parentNode
        if parent.tagName == "a":
            # move e up to grandparent
            parent.removeChild(e)
            grandparent.insertBefore(e, parent)
            # remove parent
            grandparent.removeChild(parent)
            parent.unlink()

    t1 = time.time()
    print >>sys.stderr, ("%d images, %d bytes copied from web in %f seconds."%
                         (imageCount, byteCount, t1-t0))
    
css_url_pat = re.compile(r""".*/wiki/index.php\?title=CSS/(.*)&action=raw&ctype=text/css""")

def handle_css(xml, directory):

    '''Find all the CSS links which are NOT to mediawiki skins.
    Download them, and return their names for use in the header.'''

    
    print >>sys.stderr, 'Downloading custom style sheets...'
    t0 = time.time()
    fileCount = 0
    byteCount = 0
    result = []
    for e in xml.getElementsByTagName('link'):
        rel = e.getAttribute("rel")
        #print >>sys.stderr, 'found link', rel
        if rel != "stylesheet":
            continue
        src = e.getAttribute("href")
        if src.find("wiki/skins/common") >= 0:
            continue
        if src.startswith("http://"):
            fullsrc = src
        else:
            fullsrc = "http://www.w3.org"+src
        #print >>sys.stderr, 'abs form', fullsrc

        match = css_url_pat.match(fullsrc)
        
        if match:
            #print >>sys.stderr, 'pat-match on key'
            key = match.group(1)
        else:
            #print >>sys.stderr, 'Not matched: "%s"' % fullsrc
            continue
        print >>sys.stderr, 'key', key
        result.append(key)
        #e.setAttribute("href", key)
        filename = directory+"/"+key
        print >>sys.stderr, 'Downloading style sheet\n   ', fullsrc, '-> ', filename

        inStream = urllib2.urlopen(fullsrc)
        text = inStream.read()
        byteCount+=len(text)
        fileCount+=1
        inStream.close()
        outStream = open(filename, "w")
        outStream.write(text)
        outStream.close()


    t1 = time.time()
    #print >>sys.stderr, ("%d files, %d bytes copied from web in %f seconds."%
    #                     (fileCount, byteCount, t1-t0))
    return result


def tr(fields): 

    text1 = u"""<?xml version="1.0" encoding="UTF-8"?><!--*- nxml -*-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
       "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
  <title>%(htmlTitle)s</title>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  %(css)s
  %(javascript)s
</head>
<body>

<div class="head">
<a href="http://www.w3.org/"><img alt="W3C" height="48"
src="http://www.w3.org/Icons/w3c_home" width="72"/></a><h1 style="clear:both" id="title">%(h1Title)s</h1>

<h2 id="W3C-doctype">W3C %(doctype)s %(_dateName)s</h2>

%(versionStuff)s

%(credits)s

%(formats)s

<hr />

<p class="copyright"><a href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> &copy; 2008 <a href="http://www.w3.org/"><acronym title="World Wide Web Consortium">W3C</acronym></a><sup>&reg;</sup> (<a href="http://www.csail.mit.edu/"><acronym title="Massachusetts Institute of Technology">MIT</acronym></a>, <a href="http://www.ercim.org/"><acronym title="European Research Consortium for Informatics and Mathematics">ERCIM</acronym></a>, <a href="http://www.keio.ac.jp/">Keio</a>), All Rights Reserved. W3C <a href="http://www.w3.org/Consortium/Legal/ipr-notice#Legal_Disclaimer">liability</a>, <a href="http://www.w3.org/Consortium/Legal/ipr-notice#W3C_Trademarks">trademark</a> and <a href="http://www.w3.org/Consortium/Legal/copyright-documents">document use</a> rules apply.</p>

</div>
<hr/>
<h2><a id="abstract" name="abstract">Abstract</a></h2>

<div>
%(abstract)s
</div>

<h2 class="no-toc no-num">
<a id="status" name="status">Status of this Document</a>
</h2>
    
<h4 class="no-toc no-num" id="may-be">May Be Superseded</h4>
    
<p><em>This section describes the status of this document at the time of its publication. Other documents may supersede this document. A list of current W3C publications and the latest revision of this technical report can be found in the <a href="http://www.w3.org/TR/">W3C technical reports index</a> at http://www.w3.org/TR/.</em></p>
    
%(sotdSOD)s    

%(snapshotStatusExtra)s

%(statusExtra)s

%(pleaseCommentText)s
    
<h4 class="no-toc no-num" id="no-endorsement">No Endorsement</h4>
    
<p><em>Publication as a Working Draft does not imply endorsement by the W3C Membership. This is a draft document and may be updated, replaced or obsoleted by other documents at any time. It is inappropriate to cite this document as other than work in progress.</em></p>
    
<h4 class="no-toc no-num" id="patents">Patents</h4>
    
<p><em>This document was produced by a group operating under the <a href="http://www.w3.org/Consortium/Patent-Policy-20040205/">5 February 2004 W3C Patent Policy</a>. W3C maintains a <a rel="disclosure" href="http://www.w3.org/2004/01/pp-impl/%(id)d/status">public list of any patent disclosures</a> made in connection with the deliverables of the group; that page also includes instructions for disclosing a patent. An individual who has actual knowledge of a patent which the individual believes contains <a href="http://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential">Essential Claim(s)</a> must disclose the information in accordance with <a href="http://www.w3.org/Consortium/Patent-Policy-20040205/#sec-Disclosure"> section 6 of the W3C Patent Policy</a>.</em></p>

<hr title="Separator After Status Section" />

%(docbody)s

</body>
</html>
""" % fields
    return text1
    #text2 = filterThrough('num', text1)
    #text3 = filterThrough('toc -l 2 -h 3 -x -t', text2)
    #print text3



class Group:

    def __init__(self):
        self.shortname = None
        self.id = None
        self.draftsURL = None
        self.wikiURL = None
        self.homeURL = None
        self.name = None
        self.commentsList = None
        self.shortnamePrefix = ""
        
class Snapshot:

    def __init__(self):
        self.pubdate=None
        self.commentsDue=None
        self.group=None
        self.snapshotStatusExtra=""    # along with page.statusExra
        self.pages=[]
        self.oldPages=[]
        self.titlePrefix=""

    @property
    def _dateCode(self):
        """Return  the 8-digit version of the pubdate"""
        return (self.pubdate[0:4]+
                self.pubdate[5:7]+
                self.pubdate[8:10])

    @property
    def _dateName(self):
        """Return pubdate in form "2 January 2020"
        """
        # could use datetime.today ?
        date = datetime.date(int(self.pubdate[0:4]),
                             int(self.pubdate[5:7]),
                             int(self.pubdate[8:10]))
        return date.strftime("%d %B %Y")
        
    


def dateCode(pubdate):
        """Return  the 8-digit version of the pubdate"""
        return (pubdate[0:4]+
                pubdate[5:7]+
                pubdate[8:10])

def dateName(pubdate):
        """Return pubdate in form "2 January 2020"
        """
        # could use datetime.today ?
        date = datetime.date(int(pubdate[0:4]),
                             int(pubdate[5:7]),
                             int(pubdate[8:10]))
        return date.strftime("%d %B %Y")
        
if __name__ == "__main__":
    import doctest, sys
    doctest.testmod(sys.modules[__name__])

    run()
