#! /usr/bin/env python
"""

Various transformations we do to Snapshots, starting with the content
we got from the source Document and ending with something we can use.

This includes downloading CSS and images, putting them in the
filesystem under snap.directory, and changing the document to point to
their new locations.

Logically, these could all be methods of Snapshot, but that would make
the class too crowded.

"""
__version__ = "$Revision: 1.2 $"

import re
import sys
import time

from debugtools import debug
import wiki_cache
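
# wiki_cache.load(url) is used below as a cached stand-in for
# urllib2.urlopen(url).read(): it fetches a URL and returns the body as
# a string (see the commented-out urllib2 calls it replaced).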

def run(snap):
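    """Apply all the transformations, in order, to one Snapshot.

    Judging from the uses below, snap is expected to provide at least:
    domTree and docbodyNode (DOM nodes), directory (where downloaded
    files go), wikiPageURL, example_uri_pat, wikiDocMatch() together
    with matchedPage, and a stylesheets attribute, which is set here.

    A rough sketch of the intended use (the Snapshot class itself
    lives elsewhere):

        snap = ...   # a Snapshot built from its source Document
        run(snap)    # rewrites snap.domTree in place; downloads go
                     # into snap.directory, and snap.stylesheets
                     # lists the custom CSS files found
    """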
    
    handle_editsections(snap.docbodyNode)

    handle_images(snap.docbodyNode, snap.directory)

    snap.stylesheets = handle_css(snap.domTree, snap.directory)

    handle_corefs(snap, snap.domTree)

    handle_review_comments(snap)

    unlink_example_links(snap)
        
    # should be after handle_corefs so it doesn't warn about them
    handle_links(snap)


def handle_editsections(xml):
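    """Strip the wiki's section-edit links: every
    <span class="editsection"> is removed from the tree."""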
    for e in xml.getElementsByTagName('span'):
        cls = e.getAttribute("class")
        if cls == "editsection":
            e.parentNode.removeChild(e)
            e.unlink()

def handle_images(xml, directory):
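    """Download every embedded image into `directory`, point the img's
    src at the bare local filename, and unwrap any <a> element that
    directly encloses the image."""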
    print >>sys.stderr, 'Downloading any embedded images.'
    t0 = time.time()
    imageCount = 0
    byteCount = 0
    for e in xml.getElementsByTagName('img'):
        src = e.getAttribute("src")
        # print >>sys.stderr, 'Image: ', src
        if src.startswith("http://"):
            fullsrc = src
        else:
            fullsrc = "http://www.w3.org"+src
        key = fullsrc[fullsrc.rindex("/")+1:]
        e.setAttribute("src", key)
        filename = directory+"/"+key
        #print >>sys.stderr, 'Downloading image\n   ', fullsrc, '-> ', filename

        #inStream = urllib2.urlopen(fullsrc)
        #text = inStream.read()
        #inStream.close()
        text = wiki_cache.load(fullsrc)

        byteCount+=len(text)
        imageCount+=1
        outStream = open(filename, "wb")
        outStream.write(text)
        outStream.close()

        # if image is in an <a ...>, then remove the <a ...>
        parent = e.parentNode
        grandparent = parent.parentNode
        if parent.tagName == "a":
            # move e up to grandparent
            parent.removeChild(e)
            grandparent.insertBefore(e, parent)
            # remove parent
            grandparent.removeChild(parent)
            parent.unlink()

    t1 = time.time()
    print >>sys.stderr, ("%d images, %d bytes copied from web in %f seconds."%
                         (imageCount, byteCount, t1-t0))
    
css_url_pat = re.compile(r""".*/wiki/index.php\?title=CSS/(.*)&action=raw&ctype=text/css""")
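# e.g. http://www.w3.org/wiki/index.php?title=CSS/foo&action=raw&ctype=text/css
# matches, with group(1), the stylesheet "key", being "foo".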

def handle_css(xml, directory):
    '''Find all the CSS links which are NOT to mediawiki skins.
    Download them, and return their names for use in the header.'''

    print >>sys.stderr, 'Downloading custom style sheets...'
    t0 = time.time()
    fileCount = 0
    byteCount = 0
    result = []
    for e in xml.getElementsByTagName('link'):
        rel = e.getAttribute("rel")
        #print >>sys.stderr, 'found link', rel
        if rel != "stylesheet":
            continue
        src = e.getAttribute("href")
        if src.find("wiki/skins/common") >= 0:
            continue
        if src.startswith("http://"):
            fullsrc = src
        else:
            fullsrc = "http://www.w3.org"+src
        #print >>sys.stderr, 'abs form', fullsrc

        match = css_url_pat.match(fullsrc)
        
        if match:
            #print >>sys.stderr, 'pat-match on key'
            key = match.group(1)
        else:
            #print >>sys.stderr, 'Not matched: "%s"' % fullsrc
            continue
        print >>sys.stderr, 'key', key
        result.append(key)
        #e.setAttribute("href", key)
        filename = directory+"/"+key
        print >>sys.stderr, 'Downloading style sheet\n   ', fullsrc, '-> ', filename

        #inStream = urllib2.urlopen(fullsrc)
        #text = inStream.read()
        #inStream.close()
        text = wiki_cache.load(fullsrc)
        
        byteCount+=len(text)
        fileCount+=1
        outStream = open(filename, "w")
        outStream.write(text)
        outStream.close()


    t1 = time.time()
    #print >>sys.stderr, ("%d files, %d bytes copied from web in %f seconds."%
    #                     (fileCount, byteCount, t1-t0))
    return result


def handle_corefs(snap, xml):
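    """Fill in cross-references: for each reference element that
    tree_search/is_ref picks out (setting snap.matchedPage along the
    way), replace its children with the matched page's referenceText.

    Note that tree_search, is_ref and toDom are neither defined nor
    imported in this module; they presumably come from elsewhere.
    """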
    snap.matchedPage = None
    for dd in tree_search(xml, is_ref, snap):
        while dd.hasChildNodes():
            child = dd.firstChild
            dd.removeChild(child)
            child.unlink()

        newStuff = toDom(snap.matchedPage.referenceText)
        dd.appendChild(newStuff)
        snap.matchedPage = None

def handle_review_comments(snap):
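    """Drop reviewer-facing material: every <div class="note"> and
    <div class="review"> in the document body."""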
    for e in snap.docbodyNode.getElementsByTagName('div'):
        try:
            this_class = e.attributes["class"].value
        except KeyError:
            continue
        if this_class in ("note", "review"):
            e.parentNode.removeChild(e)


def unlink_example_links(snap):
    """
    Wikis turn URLs into links more often than we want.

    Undo that for URLs matching snap.example_uri_pat
    (i.e. those starting with http://example.*).
    """
    for e in snap.docbodyNode.getElementsByTagName('a'):
        try:
            link = e.getAttribute('href')
            text = e.firstChild.data
        except AttributeError:  # no text child to compare against
            continue
        if link == text:
            m = snap.example_uri_pat.match(link)
            if m:
                parent = e.parentNode
                new = e.firstChild
                parent.insertBefore(new, e)
                parent.removeChild(e)



def handle_links(snap):
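    """Rewrite the site-relative links.

    Links back to this very page become bare #fragment links; links to
    other pages in the snapshot set (via snap.wikiDocMatch) become
    their thisVersion URIs; anything else is made absolute against
    http://www.w3.org, and unexpected wiki links (other than edit and
    Image: links) are reported on stderr."""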
    for e in snap.docbodyNode.getElementsByTagName('a'):
        href = e.getAttribute("href")
        if href.startswith("/"):
            uri = "http://www.w3.org"+href

            try:
                (main, fragment) = uri.split("#", 1)
                fragment = "#" + fragment
            except ValueError:   # no fragment
                main = uri
                fragment = ""

            if main == snap.wikiPageURL:
                debug("links", 'rewriting snap link to %s' % fragment)
                e.setAttribute("href", fragment)
                continue

            snap.matchedPage = None
            if snap.wikiDocMatch(main):
                new_uri = snap.matchedPage.thisVersion + fragment
                e.setAttribute("href", new_uri)
                debug("links", 'rewriting snap %s to %s' % (
                    href, new_uri))
                continue

            e.setAttribute("href", uri)
            if href.find('&action=edit') > -1:
                continue   # just leave edit links in the text, hidden
            if href.find('/Image:') > -1:
                continue   # just leave wiki image links in the text ?!?! @@@
            print >>sys.stderr, '? Wiki link to: ', href

    # NB: this looks like a method meant for the Snapshot class (the loop
    # above calls it as snap.wikiDocMatch(main)), but as written here it
    # sits unused, nested inside handle_links.
    def wikiDocMatch(self, href):
        for page in self.snapshot.pages:
            if page.wikiPageURL.endswith(href):
                self.matchedPage = page
                return True
        return False
