"""Read an XML instance, using the fully-striped syntax


see also http://www.w3.org/2000/10/swap/StripeSkippingParser.py

http://docs.python.org/lib/dom-node-objects.html


kind of works --- parses fully-striped XML to triples,
   using <List> for rdf:List structures
   lacks ability to put strings in lists
   lacks datatyping
   uri support untested
   qname support not implemented


NOW: how can we abbreviate the XML by using the object model?

unless we're willing to put properties in order, it's hard!

"""

__version__ = "$Revision: 1.2 $"
# $Source: /sources/public/2007/asn/xmli.py,v $

from sys import stderr
from xml.dom.minidom import parse
from xml.sax.handler import feature_namespaces

from rdflib.Graph import Graph
from rdflib import URIRef, Literal, BNode, RDF

class Error(Exception):
    """Problem found in the XML input while parsing it."""

class Parser:
    """Parse fully-striped XML into RDF triples.

    Element names alternate between classes and properties ("stripes"):
    a class element becomes a node typed by the element's URI, each of
    its child elements is a property of that node, and the content of a
    property element is either text (a literal) or a nested class
    element.  A <List> element becomes an rdf:List.  All triples are
    added to ``self.graph``.

    >>> import opendata.xmli
    >>> p = opendata.xmli.Parser()
    >>> world = { }    # map from URI to python objects

    >>> stream = open("test-data/poscond-m3.xml", "r")
    >>> root = p.load(stream, world)

    >>> p.graph.serialize("out.rdf", format="pretty-xml")
    >>> p.graph.serialize("out.nt", format="nt")


    """

    def __init__(self):
        self.indent = 0
        self.graph = Graph()

    def load(self, input, world):
        """Parse the XML in *input* and return the node for its root element.

        input -- whatever xml.dom.minidom.parse accepts
        world -- accepted for interface compatibility; not used by this
                 method

        The parsed document is kept in self.dom; the triples found are
        added to self.graph.
        """
        self.dom = parse(input)
        return self.value(self.dom.documentElement)

    def _trace(self, message):
        """Write one line of diagnostic output to stderr."""
        # stderr.write instead of the "print >>stderr" statement, so the
        # same code runs on Python 2 and Python 3.
        stderr.write(message + "\n")

    def _element_uri(self, e):
        """Return the URIRef formed from e's namespace URI + local name.

        Raises Error if the element has no namespace, since there is
        then nothing to concatenate the local name onto.
        """
        if e.namespaceURI is None:
            raise Error("element '%s' has no namespace, so no URI can be "
                        "formed for it" % e.nodeName)
        return URIRef(e.namespaceURI + e.localName)

    def printElement(self, e, prefix=""):
        """Dump the DOM subtree rooted at *e* to stderr, for debugging.

        Elements are shown as their local name, followed by the
        namespace URI in parentheses when there is one; children are
        shown underneath with a longer prefix.  Whitespace-only text
        nodes are omitted.
        """
        if e.nodeType == e.ELEMENT_NODE:
            line = prefix + e.localName
            if e.namespaceURI:
                line += " (" + e.namespaceURI + ")"
            self._trace(line)
            for child in e.childNodes:
                self.printElement(child, prefix + "|    ")
        elif e.nodeType == e.TEXT_NODE:
            if e.data.strip():
                self._trace(prefix + repr(e.data))
        else:
            self._trace(prefix + "unknown node type")

    def value(self, e, text_ok=True):
        """Return the value denoted by DOM node *e*.

        Text node: its data if text_ok is true; otherwise None for
        whitespace-only text (so the caller can skip it), and Error for
        anything else.

        <List> element: the head of the rdf:List (see listValue).

        Any other element: a node typed by the element's URI.  The node
        is a URIRef if a <uri> child names it, else a fresh BNode.  The
        remaining children are read as properties of that node (see pv)
        and added to self.graph.

        Raises RuntimeError for any other kind of DOM node.
        """
        if e.nodeType == e.TEXT_NODE:
            if text_ok:
                return e.data
            if e.data.strip():
                raise Error("text '%s' where not expected" % e.data)
            return None   # whitespace only: let parent skip

        if e.nodeType != e.ELEMENT_NODE:
            raise RuntimeError('looking for value, got non-text, non-element')

        self._trace("looking for value (class) at " + e.nodeName)

        if e.localName == "List":
            return self.listValue(e)

        class_uri = self._element_uri(e)

        # One pass over the children: those that give a URI name the
        # subject (the last one wins); the rest are property elements.
        subject = None
        property_nodes = []
        for child in e.childNodes:
            uri = self.givesURI(child)
            if uri:
                subject = URIRef(uri)
            else:
                property_nodes.append(child)
        if subject is None:
            subject = BNode()

        self.graph.add((subject, RDF.type, class_uri))
        for child in property_nodes:
            result = self.pv(child)
            if result:
                (prop, value) = result
                self.graph.add((subject, prop, value))

        return subject

    def givesURI(self, e):
        """Return the URI text if *e* is a <uri> element, else None.

        A <qname> element raises NotImplementedError (a RuntimeError
        subclass), since qname support is not implemented.
        """
        if e.nodeType != e.ELEMENT_NODE:
            return None
        if e.localName == "uri":     # @@ namespace?
            return self.children_as_string(e)
        if e.localName == "qname":
            raise NotImplementedError('not implemented yet')
        return None

    def pv(self, e):
        """Return the (property, value) pair denoted by DOM node *e*.

        Returns None for a whitespace-only text node.  For an element,
        the property is the element's URI.  If every child carries
        character data, the value is a Literal of the concatenated
        data; otherwise the value is the first child that yields one
        (see value).

        Raises Error for non-whitespace text, and RuntimeError when the
        node is neither text nor element or has no usable content.
        """
        if e.nodeType == e.TEXT_NODE:
            if e.data.strip():
                raise Error("text '%s' where property element expected"
                            % e.data)
            return None

        self._trace("looking for property at " + e.nodeName)
        if e.nodeType != e.ELEMENT_NODE:
            raise RuntimeError('unexpect xml structure')

        prop = self._element_uri(e)

        try:
            text = self.children_as_string(e)
        except AttributeError:   # some child has no "data": not plain text
            pass
        else:
            return (prop, Literal(text))

        # there should only be one child -- a type --
        # unless the type is being skipped.
        for child in e.childNodes:
            value = self.value(child, text_ok=False)
            self._trace("  got value:  " + repr(value))
            if value is None:
                continue
            return (prop, value)

        self.printElement(e)
        raise RuntimeError('property %s has no usable content'
                           % repr(e.localName))

    def children_as_string(self, e):
        """Return the concatenated ``data`` of all children of *e*.

        Node types are not checked: a child without a ``data``
        attribute (e.g. an element) makes this raise AttributeError,
        which pv relies on to detect non-text content.
        """
        return "".join([child.data for child in e.childNodes])

    def listValue(self, e):
        """Build an rdf:List from the children of *e*; return its head.

        Each child that yields a value gets a fresh BNode cell with
        rdf:first pointing at the value and rdf:rest pointing at the
        next cell; the last cell's rdf:rest is rdf:nil.  Whitespace-only
        text is skipped; other text raises Error (strings cannot be put
        in lists).  An empty list is rdf:nil itself and adds no triples.
        """
        first = RDF.nil
        prev = None
        for child in e.childNodes:
            value = self.value(child, text_ok=False)
            if value is None:
                continue
            here = BNode()
            if prev is None:
                first = here
            else:
                self.graph.add((prev, RDF.rest, here))
            self.graph.add((here, RDF.first, value))
            prev = here
        # Only terminate a list that has cells; for an empty list there
        # is no cell to hang rdf:rest on.
        if prev is not None:
            self.graph.add((prev, RDF.rest, RDF.nil))
        return first
            

if __name__ == "__main__":
    # Run the doctests embedded in this module's docstrings.
    import doctest
    import sys
    doctest.testmod(sys.modules[__name__])
