#! /usr/bin/env python
'''

This program ("Snapper", previously "WikiSnapper") generates a web
page (on local disk, expected to be made visible in web space) in
proper W3C Tech Report (TR) format from a MediaWiki page (or similar
Web page) which has the basic text content.  It performs various
cleanups on the text, and gets various metadata from that source page
and from linked pages as well.

'''
__version__ = "$Revision: 1.3 $"

# standard
import datetime
import htmlentitydefs
import os.path
import re
import subprocess
import sys

# other packages
import debugtools
from debugtools import debug

# this package
import snapper_w3c_style
import snapper_load
from property_stack import PropertyStack




################################################################


def expandEntity(match):
    """
    Expand one named-entity match into the text that should replace it.

    `match` is a regular-expression match object whose group 1 is the
    entity name (the part between "&" and ";").

    The five entities lt, gt, quot, amp and apos are deliberately left
    as entity references ("&name;").  Any other name found in
    htmlentitydefs.name2codepoint is returned as the UTF-8 encoding of
    its character.

    Raises Exception if the entity name is not defined.
    """
    entity = match.group(1)
    if entity in ("lt", "gt", "quot", "amp", "apos"):
        return "&" + entity + ";"
    # Only the table lookup can raise KeyError, so keep the try minimal.
    try:
        codepoint = htmlentitydefs.name2codepoint[entity]
    except KeyError:
        raise Exception("undefined entity %s" % repr(entity))
    return unichr(codepoint).encode('utf-8')
    
def entify(text):
    """
    Given the whole HTML page, convert the code points to named HTML entities,
    and convert &quot; entities back into quote characters.

    This is needed to make the diff-to-wiki not so ugly.

    Every character that has a name in htmlentitydefs.codepoint2name is
    replaced by "&name;", except for the characters <, &, > and ", which
    are passed through unchanged.  All other characters are left as they
    are.  The result is returned encoded as UTF-8.

    Writes a progress message ("Entifying...") to stderr.
    """

    sys.stderr.write("Entifying...\n")

    passthrough = ("lt", "amp", "gt", "quot")
    names = htmlentitydefs.codepoint2name
    out = []
    for ch in text:
        name = names.get(ord(ch))
        if name is None or name in passthrough:
            out.append(ch)
        else:
            out.append("&" + name + ";")
    joined = u''.join(out)

    # @@@@ OMG this is so wrong -- but I can't think of an easy way to
    # do it correctly, and I need something working right now.   The
    # problem is that we should use quote characters instead of &quot;
    # UNLESS we're inside an attribute value.   So, this really needs
    # to be done by the XML serializer.
    #
    # henry suggests using rxp  http://www.ltg.ed.ac.uk/~richard/rxp.html
    #
    # "&quot;" is pure ASCII, so replacing it before encoding gives the
    # same bytes as replacing it afterwards, and needs no regex.
    joined = joined.replace(u"&quot;", u'"')
    return joined.encode("utf-8")

    

################################################################

class Document:   # aka Deliverable
    """
    The source page a snapshot is made from, identified by its URL.

    Only the URL-derived fields and the (initially empty) people lists
    are set here.
    """

    def __init__(self, url):
        """
        url -- address of the source page.  Everything after the last
               "/" is taken as the page name, everything before it as
               the URL prefix.

        Raises ValueError if url contains no "/".
        """
        parts = url.rsplit("/", 1)
        if len(parts) != 2:
            # Without this check the tuple unpacking below fails with an
            # unhelpful "need more than 1 value to unpack".
            raise ValueError("source URL %r has no '/' before the page name"
                             % (url,))
        self.sourceURL = url
        self.tidy = None
        (self.pageURLPrefix, self.pageName) = parts
        # Attributes not set here (presumably filled in later by the
        # load/transform steps -- confirm against those modules):
        # fetched_page_text
        # raw_page_text
        # domTree
        # shortTitle, refTitle,
        # docbodyNode
        self.authors = []
        self.editors = []
        self.contributors = []
    
def in_latest_open_round(doc):
    """
    Return a DocumentInRound for this document, if there is one, in
    the latest round which is still open.   This is the "intelligent"
    default for DocumentInRound, so people can just snapshot a
    Document and use the DocumentInRound information.

    @@@ leave it out for now!   Not implemented yet: `doc` is ignored
    and the result is always None.
    """
    return None
    
    
################################################################

class Snapshot:
    """
    One rendering of a Document into a directory on local disk.

    Values such as the output directory and the document template are
    read through self._properties, a PropertyStack built from a list of
    layers: the options, this Snapshot, the document-in-round, its
    round, the document, its group, the group's docStyle and the W3C
    style defaults.  (Presumably earlier layers take precedence --
    confirm against PropertyStack.)
    """

    def __init__(self, doc, options):
        """
        doc     -- the Document to snapshot
        options -- the parsed command-line options; first layer of the
                   property stack
        """
        self.doc = doc
        self.options = options

        self.documentInRound = in_latest_open_round(doc)

        stack = [options, self]
        # Optional layers, each reached by following an attribute chain
        # from self.  A chain that hits a missing attribute is skipped.
        # NOTE(review): a documentInRound of None is still appended as a
        # layer, exactly as before -- confirm PropertyStack tolerates it.
        optional_layers = (("documentInRound",),
                           ("documentInRound", "round"),
                           ("doc",),
                           ("doc", "group"),
                           ("doc", "group", "docStyle"))
        for chain in optional_layers:
            layer = self
            try:
                for attr in chain:
                    layer = getattr(layer, attr)
            except AttributeError:
                continue
            stack.append(layer)
        stack.append(snapper_w3c_style.defaults)

        self._properties = PropertyStack(stack)

    def generate(self):
        """
        Produce the snapshot: create the output directory if needed, run
        snapper_transform on this snapshot, fill in the document
        template, write the page and then the auxiliary files.
        """
        self.directory = self._properties.directory
        if not os.path.exists(self.directory):
            # makedirs rather than mkdir so a nested directory works too
            os.makedirs(self.directory)

        # NOTE(review): snapper_transform is not imported anywhere in
        # the visible part of this file; unless it is bound elsewhere
        # this line raises NameError.  It probably needs an
        # "import snapper_transform" next to the other snapper_* imports.
        snapper_transform.run(self)

        template = self._properties["documentTemplate"]
        self._text = template % self._properties

        self.save()

        self.generateAux()

    def save(self):
        """
        Write the generated page (self._text, entified) to Overview.html
        in the snapshot directory and report the file name on stdout.
        """
        # hack to work around the fact that in HTML you can't actually
        # repeat the xmlns declaration for HTML -- which rdf:Literals
        # give us.
        out = self._text.replace('<div xmlns="http://www.w3.org/1999/xhtml">',
                                 '<div>')

        # NOTE(review): this method used to refer to an undefined name
        # `filename`.  Overview.html inside self.directory is the file
        # generateAux() reads back, so that is what is written here --
        # confirm this is the intended location.
        filename = os.path.join(self.directory, "Overview.html")

        text = entify(out)
        f = open(filename, "w")
        try:
            f.write(text)
        finally:
            f.close()
        print('wrote %s' % filename)

    def generateAux(self):
        """
        Write a bunch of other, related documents, like diffs and the
        PDF.   Has to be done after the main document is done.

        Files produced in the snapshot directory: wiki.html (the raw
        source page), diff-from-wiki.html (hdiff output) and all.pdf
        (prince output).  If the hasPrevious property is true, a diff
        against the previous version is written to <diffFile>.html.

        The external tools are run on a best-effort basis; a tool that
        fails or is missing does not stop the run.
        """
        # NOTE(review): hasPrevious, pversioncode, versioncode and
        # diffFile used to be read from `self` (self.x, self[...] and
        # "..." % self), but Snapshot has no __getitem__, so the mapping
        # uses could not work.  They are now read through
        # self._properties, the same way generate() reads its values.
        # This assumes PropertyStack resolves attributes set on its
        # layers (self is one of them) -- confirm.
        props = self._properties
        directory = self.directory
        overview = os.path.join(directory, "Overview.html")
        wiki = os.path.join(directory, "wiki.html")

        f = open(wiki, "w")
        try:
            f.write(self.doc.raw_page_text)
        finally:
            f.close()

        self._run(["hdiff", wiki, overview],
                  os.path.join(directory, "diff-from-wiki.html"))

        self._run(["prince", "-s", "print.css",
                   "-o", os.path.join(directory, "all.pdf"), overview])

        if props.hasPrevious:
            debug("defaults", "self.pversioncode=%s" % props.pversioncode)
            debug("defaults", "self['pversioncode']=%s" % props['pversioncode'])
            diff_path = '%(diffFile)s.html' % props
            print('producing %s' % diff_path)
            self._run(['hdiff',
                       '%(pversioncode)s/Overview.html' % props,
                       '%(versioncode)s/Overview.html' % props],
                      diff_path)

    def _run(self, argv, stdout_path=None):
        """
        Run an external tool without going through a shell, so that
        paths containing spaces or shell metacharacters are passed
        through untouched.

        argv        -- the command and its arguments, as a list
        stdout_path -- if given, the tool's standard output is written
                       to this file

        Returns the tool's exit status, or None if the tool could not be
        started or the output file could not be opened; in that case a
        message is written to stderr and the run continues.
        """
        out = None
        try:
            try:
                if stdout_path is not None:
                    out = open(stdout_path, "w")
                return subprocess.call(argv, stdout=out)
            except (IOError, OSError):
                # e.g. the tool is not installed
                sys.stderr.write("could not run %s: %s\n"
                                 % (argv[0], sys.exc_info()[1]))
                return None
        finally:
            if out is not None:
                out.close()


################################################################

def main(argv=None):
    """
    Command-line entry point: load the page named by the single
    sourceURL argument and generate a snapshot of it.

    argv -- list of command-line arguments, without the program name;
            when None (the default) optparse uses sys.argv[1:].

    The parsed options are also stored in the module-level global
    `options`.  If the number of positional arguments is not exactly
    one, the help text is printed and the process exits with status 1.
    """
    from optparse import OptionParser

    global options

    parser = OptionParser(usage="%prog [options] sourceURL",
                          version=__version__)
    parser.set_defaults(verbose=True, ed_only=True)
    parser.add_option("-q", "--quiet",
                      action="store_false", dest="verbose",
                      help="don't print status messages to stdout")
    parser.add_option("--real",
                      action="store_false", dest="ed_only",
                      help="Allow more than Editors Draft")
    parser.add_option("-D", "--debug",
                      action="append", dest="debugTags",
                      help="turn on debugging for some subsystem (try 'all')")

    (options, args) = parser.parse_args(argv)

    if options.debugTags:
        debugtools.tags.update(options.debugTags)

    if len(args) != 1:
        parser.print_help()
        sys.exit(1)

    d = Document(args[0])
    snapper_load.load(d, options)
    snap = Snapshot(d, options)

    # Steps that are currently disabled:
    #snap.sanity_check()
    #snap.cross_link()
    snap.generate()
    #snap.summarize()

if __name__ == "__main__":
    # Script mode only: run this module's doctests first, then the
    # command-line entry point.
    import doctest
    import sys

    doctest.testmod(sys.modules[__name__])

    main()





