#! /usr/bin/env python
"""

Tools for providing a Python object-oriented view of data on the web
in XML and RDF.

By Sandro Hawke (sandro@w3.org)

See webdata_demo_1.py / webdata_demo_1.rdf


Basic functions:

    toPython(id, autoload=True)
            Returns a python object with the same data as the
            web object with that id.   id may be a URI or a
            pair of (nsuri, id).   This works for both RDF and
            XML instances, with certain conventions.

            (should this accept Nodes, in general, or just IDs?)

            toPython(node, autoload=True)

    getNode(pythonObject)
            Returns an ID for the given pythonObject

            fromPython(obj)
                returns a Node corresponding to that obj
                (probably changing the store)
            
    loadRDF(url)
    loadXML(url)
    load(url)      - smart version that does either one
            Use these when you want to add some data manually,
            instead of relying only on autoload.


TO DO:

     - use an OWL ontology to find out the properties
       (or at least sanity check?)
       
     - have a way to get a warning on extra/missing data???
    
"""
__version__ = """$Id: webdata.py,v 1.5 2009-03-03 00:29:05 sandro Exp $"""

# standard python libraries
import urlparse
import urllib
import urllib2 
import time
import sys
import os
import re
from optparse import OptionParser

# third party libraries
import rdflib

# my toolkit libraries
from debugtools import debug
import debugtools
import xmlextras

# sub-modules
import webdata_rdf

################################################################

class null_impl:
    """Implementation stand-in for objects nothing is known about.

    Offers the same ``toPython(view, node, autoload)`` entry point that
    View.toPython invokes on the object returned by View.moduleFor, but
    it never produces a value.
    """

    @staticmethod
    def toPython(view, node, autoload=True):
        """Refuse to convert `node`; always raises RuntimeError.

        All three arguments are accepted only for signature
        compatibility and are ignored.
        """
        # Call-style raise instead of the legacy "raise Class, value"
        # statement form; same exception type and message.
        raise RuntimeError("not enough known about this object")
    
class View:
    """A 'View' keeps the state involved in mapping between python
    instances and the various XML and RDF data stores (documents) out
    there.

    ################

    The public API for View is also available directly on the webdata
    module, for the convenience of applications which only need one
    View.  So:

    >> import webdata
    >> v = webdata.View()
    >> v.whatever()

    is the same as:

    >> import webdata
    >> webdata.whatever()

    """

    def __init__(self):
        """Create an empty view: nothing cached, nothing loaded, no
        classes registered, and an empty RDF graph."""
        # normalized id -> python object already produced for it
        self.instanceCache = {}
        # location already handed to a loader -> implementation module
        self._moduleFor = {}
        # ID (as computed by objID) -> python class registered for it
        self.classForID = {}

        # RDF specific stuff
        self.graph = rdflib.ConjunctiveGraph()

    def toPython(self, id, autoload=True):
        """Return the python object for the web object named by `id`.

        `id` is passed through normalizeID first.  Results are cached
        per normalized id, so repeated calls return the same object.
        When `autoload` is true, id.ns is loaded (see load) before the
        conversion is attempted.  The conversion itself is delegated to
        the toPython function of whatever moduleFor(id) returns.
        """
        id = normalizeID(id)
        try:
            return self.instanceCache[id]
        except KeyError:
            pass

        # Literals and blank nodes have no namespace document to fetch.
        # NOTE(review): normalizeID wraps everything except IDs and
        # Literals in an ID, so a BNode probably never arrives here as
        # a BNode instance -- confirm.
        if autoload and not isinstance(id, (rdflib.Literal, rdflib.BNode)):
            self.load(id.ns)

        result = self.moduleFor(id).toPython(self, id, autoload)
        self.instanceCache[id] = result
        return result

    def moduleFor(self, id):
        """return a python module (or something like that) which
        implements various functions, suitable for handling id.
        Expected to be either webdata_xml or webdata_rdf, based on the
        type of data found in dereferencing id.ns"""
        if isinstance(id, (rdflib.Literal, rdflib.BNode)):
            return webdata_rdf
        try:
            return self._moduleFor[id.ns]
        except KeyError:
            #  @@@ this can't be right
            return webdata_rdf

    def load(self, location):
        """Load `location` into this view unless it was already
        attempted.  Currently everything is loaded as RDF."""
        if location in self._moduleFor:
            return

        # hack for now...    add a cache and sniff, when we need XML.
        self.loadRDF(location)

    def loadRDF(self, location):
        """Parse `location` into self.graph, best-effort.

        A failure to read or parse is reported on stderr and otherwise
        ignored.  Either way the location is recorded in _moduleFor, so
        load() will not try it again.
        """
        debug("webdata.load", "loading %s" % location)

        try:
            self.graph.parse(location)    # cacheLocation, pretendLocation
        except Exception:
            # Deliberately best-effort, but say what went wrong so the
            # failure can be diagnosed.  %r keeps the handler itself
            # from failing on exceptions with non-ASCII messages.
            sys.stderr.write("Ignored error reading %s (%r)\n"
                             % (location, sys.exc_info()[1]))
        self._moduleFor[location] = webdata_rdf
        debug("webdata.load", "view.graph up to %d triples" % len(self.graph))

    def scanAllModules(self):
        """Search through all imported modules for any which provide
        webdata namespace mappings, by giving all-module values for
        webdata_ns, webdata_prefix, webdata_rename, or webdata_special;
        call self.registerClass on all the classes defined in those
        modules (classes merely imported into them are skipped).

        Maybe the classes should have to say "webdata_class = True" or
        something?  Or inherit from webdataClass?

        """

        # Iterate over snapshots: sys.modules and module namespaces can
        # change while we are looking at them.
        for module in list(sys.modules.values()):
            if not module:
                continue

            ns = getattr(module, "webdata_ns", None)
            prefix = getattr(module, "webdata_prefix", {})
            rename = getattr(module, "webdata_rename", {})
            special = getattr(module, "webdata_special", None)

            if not (ns or prefix or rename or special):
                continue

            debug("webdata.register(", "searching module %s " % repr(module))
            for entry in list(module.__dict__.values()):
                # New-style classes are instances of type; old-style
                # (Python 2) classes have the type named "classobj".
                is_class = (isinstance(entry, type) or
                            type(entry).__name__ == "classobj")
                if is_class and entry.__module__ == module.__name__:
                    self.registerClass(entry)
            debug("webdata.register)")

    def registerClass(self, cls):
        """Record that this class can be instantiated for realizing the
        corresponding XML or RDF classes.

        Classes for which objID finds no mapping are not registered.
        """
        debug("webdata.register", "registering class %s " % repr(cls))
        cls_id = objID(cls)
        if cls_id is None:
            # objID returns None when the module's mappings do not
            # cover this class name; avoid filing it under key None.
            debug("webdata.register", "no ID for %s, skipped" % repr(cls))
            return
        self.classForID[cls_id] = cls
        debug("webdata.register", "registered as %s" % repr(cls_id))

        
################################################################

def objID(cls, attr=None):
    """Find out the ID for a Python class or an attribute of a Python class.

    If attr is given, it's the name of the attribute.  If it's omitted,
    then the namespaced name of the class itself is returned.

    The namespaced name is determined on a module-by-module basis, using
    these attributes of the module that defines cls, tried in this order:
        webdata_special -- a callable to do the mapping
        webdata_rename --- a dict from names to namespaced names
        webdata_prefix --- a dict from prefixes to namespaces; a name of
                           the form <prefix>_<rest> maps to <rest> in
                           that namespace (longest matching prefix wins)
        webdata_ns     --- the default namespace for the module

    Returns the normalized ID, or None when none of the above applies.
    """
    module = sys.modules[cls.__module__]
    webdata_ns = getattr(module, "webdata_ns", None)
    webdata_prefix = getattr(module, "webdata_prefix", {})
    webdata_rename = getattr(module, "webdata_rename", {})
    webdata_special = getattr(module, "webdata_special", None)
    name = attr or cls.__name__

    if webdata_special:
        return normalizeID(webdata_special(name))

    # Only the lookup is guarded, so a KeyError raised while
    # normalizing is not mistaken for "no rename entry".
    try:
        renamed = webdata_rename[name]
    except KeyError:
        pass
    else:
        return normalizeID(renamed)

    if name.endswith("_"):   # a python convention for python keywords
        name = name[:-1]

    # Try longer prefixes first so that overlapping prefixes (say "a"
    # and "a_b") resolve the same way regardless of dict ordering.
    for prefix in sorted(webdata_prefix, key=len, reverse=True):
        marker = prefix + "_"
        if name.startswith(marker):
            return normalizeID(webdata_prefix[prefix], name[len(marker):])

    if webdata_ns:
        return normalizeID(webdata_ns, name)
    return None

################################################################

def normalizeID(arg1, arg2=None):
    """Idempotent front door to the ID constructor.

    An ID is handed back unchanged (and then arg2 must be omitted), an
    rdflib.Literal is handed back unchanged, and anything else is
    passed on to ID(arg1, arg2).  Safe to apply repeatedly to the same
    value.

    >>> i1 = ID("http://example.com/foo", "bar")
    >>> i1 is normalizeID(i1)
    True

    """
    already_id = isinstance(arg1, ID)
    if already_id:
        # a second component makes no sense for a ready-made ID
        assert arg2 is None

    if already_id or isinstance(arg1, rdflib.Literal):
        return arg1

    return ID(arg1, arg2)

        
class ID:
    '''A Web object identifier (or reference of some sort).  Sometimes
    used as a URIRef, sometimes as separate (ns, local).  Lets you
    form it either way.  If you provide it as a pair, the formed
    URIRef gets a "#" added if needed.

    >>> ID('http://example.com#foo')
    ID{ns='http://example.com', local='foo', uri=rdflib.URIRef('http://example.com#foo')}

    >>> ID("http://example.com/foo", "bar")
    ID{ns='http://example.com/foo', local='bar', uri=rdflib.URIRef('http://example.com/foo#bar')}

    A single URI without exactly one "#" is accepted as-is: the whole
    string becomes ns and local is empty.  That is ambiguous (do you
    mean the page?) but existing code relies on it.  (Maybe we can
    provide an indirection flag, someday.)

    >>> ID("http://example.com/foo")
    ID{ns='http://example.com/foo', local='', uri=rdflib.URIRef('http://example.com/foo')}

    >>> i1 = ID("http://example.com#foo")
    >>> i2 = ID("http://example.com", "foo")
    >>> i1 == i2
    True
    >>> hash(i1) == hash(i2)
    True


    >>> ID(rdflib.URIRef("http://example.org#foo"))
    ID{ns='http://example.org', local='foo', uri=rdflib.URIRef('http://example.org#foo')}

    '''


    # an implementation without redundancy in storage, and based on
    # __slots__ would be excellent.   Maybe we should not use
    # rdflib.URIRef like this?

    def __init__(self, arg1, arg2=None):
        """Build an ID from one full URI, or from a (ns, local) pair.

        With one argument, arg1 is the whole URI (string, URIRef, or
        other rdflib node); it is split into ns and local at "#" when
        it contains exactly one.  With two arguments, both must be
        strings; they are joined with "#" unless ns already ends in
        "#" or "/".

        Sets self.ns, self.local and self.rdfNode (an rdflib.URIRef).
        """
        if arg2 is None:
            if isinstance(arg1, rdflib.URIRef):
                arg1 = str(arg1)

            # Same call for strings, URIRefs and BNodes; safe, it's
            # idempotent.
            self.rdfNode = rdflib.URIRef(arg1)

            try:
                (self.ns, self.local) = arg1.split("#")
            except ValueError:
                # No "#", or more than one.  It seems dangerous to
                # proceed this way, but .. we have code that relies on
                # it now.  :-/
                self.ns = arg1
                self.local = ""
        else:
            assert isinstance(arg1, basestring)
            assert isinstance(arg2, basestring)
            self.ns = arg1
            self.local = arg2
            # endswith() also copes with an empty ns, where indexing
            # the last character would raise IndexError.
            if not arg1.endswith(("#", "/")):
                arg1 += "#"
            self.rdfNode = rdflib.URIRef(arg1 + arg2)

    def __repr__(self):
        """Show ns, local and the combined URIRef."""
        return ("ID{ns=%s, local=%s, uri=%s}" %
                (repr(self.ns), repr(self.local), repr(self.rdfNode))
                )

    def __eq__(self, other):
        """Two IDs are equal when their URIRefs are equal."""
        if not isinstance(other, ID):
            # Let Python fall back to its default handling instead of
            # failing with AttributeError on other.rdfNode.
            return NotImplemented
        return self.rdfNode == other.rdfNode

    def __ne__(self, other):
        """Inverse of __eq__ (Python 2 does not derive it)."""
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        """Hash of the URIRef, consistent with __eq__."""
        #debug("hash", "hash of %s is %d" % (`self.uri`, hash(self.uri)))
        return hash(self.rdfNode)
    
################################################################

# The View shared by the module-level convenience functions; created on
# first use by _view().
_global_view = None


def _view():
    """Return the shared module-level View, building it on first call.

    The first call creates the View and immediately has it scan all
    imported modules for classes to register; later calls return that
    same View.
    """
    global _global_view
    if _global_view is not None:
        return _global_view

    _global_view = View()
    _global_view.scanAllModules()
    return _global_view


def load(location):
    """Module-level counterpart of View.load.

    Loads `location` into the shared View obtained from _view() and
    returns whatever View.load returns.
    """
    shared_view = _view()
    return shared_view.load(location)

def toPython(*args, **kwargs):
    """Module-level counterpart of View.toPython.

    Forwards all positional and keyword arguments to the toPython
    method of the shared View obtained from _view().
    """
    convert = _view().toPython
    return convert(*args, **kwargs)



################################################################



def run():
    """Command-line entry point.

    Parses the command line (exactly one input location is required;
    otherwise the help text is printed and the process exits with
    status 1), processes the input into a fresh rdflib graph, and, when
    --output is given, writes that graph as N3 to the named file or to
    stdout for "-".
    """
    parser = OptionParser(usage="%prog [options] input-location",
                          version=__version__)
    parser.set_defaults(verbose=True)
    parser.add_option("-q", "--quiet",
                      action="store_false", dest="verbose",
                      help="don't print status messages to stdout")
    parser.add_option("-D", "--debug",
                      action="append", dest="debugTags",
                      help="turn on debugging of a particular type (try 'all')")
    parser.add_option("-o", "--output", action="store", dest="output",
                      help="Save the transformed output to this file, or '-' for stdout")

    (options, args) = parser.parse_args()

    if options.debugTags:
        debugtools.tags.update(options.debugTags)

    if len(args) != 1:
        parser.print_help()
        sys.exit(1)

    graph = rdflib.ConjunctiveGraph()
    for source in args:
        # NOTE(review): doPage is neither defined nor imported in the
        # visible part of this module -- confirm where it comes from.
        doPage(source, graph)

    if not options.output:
        sys.stdout.write("Output not requested.  Use --output to request it.\n")
        return

    # Serialize before touching the destination, so a serialization
    # failure does not leave a truncated output file behind.
    serialized = graph.serialize(format="n3")

    if options.output == "-":
        sys.stdout.write(serialized)
        return

    out = open(options.output, "w")
    try:
        out.write(serialized)
    finally:
        # always release the file handle, even if the write fails
        out.close()


if __name__ == "__main__":
    # Script mode: run this module's doctests first, then the
    # command-line tool.
    import doctest
    import sys

    # Inside this guard __name__ is "__main__", i.e. the running script.
    doctest.testmod(sys.modules["__main__"])
    run()
