#!/usr/bin/python
"""
$Id: characterencoding.py,v 1.12 2006/09/08 09:38:20 dom Exp $

The CharacterEncodingTestCase implements the validator.testcase.TestCase
interface for the tests related to the following BP:
* Ensure that content is encoded using a character encoding that is known to be supported by the target device.
* Indicate in the response the character encoding being used.

License
-------
Copyright (c) 2006 World Wide Web Consortium, (Massachusetts
Institute of Technology, European Research Consortium for Informatics
and Mathematics, Keio University). All Rights Reserved. This work is
distributed under the W3C Software License [1] in the hope that it
will be useful, but WITHOUT ANY WARRANTY; without even the implied
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

[1] http://www.w3.org/Consortium/Legal/copyright-software

"""


from  validator import testcase 

# Tests regarding the CHARACTER_ENCODING_* BP
class CharacterEncodingTestCase(testcase.LinksBasedTestCase):
    """Checks the character encoding of a document and its embedded resources.

    Covers the CHARACTER_ENCODING_SUPPORT and CHARACTER_ENCODING_USE best
    practices.  For the document itself and for each embedded link, the
    encoding is looked up, in order, in the protocol headers, in what the
    resource declares about itself, in a hint given in the markup, and
    finally in the default defined for the media type.

    Observations recorded by _observeLinks:

    * CE1 -- a text/* resource for which no encoding could be determined;
    * CE2 -- a resource whose encoding is not in the accepted list
      (currently only utf-8); its arguments carry the encoding and the
      string "implicit " when the media type default was used;
    * CE3 -- recorded on the document when neither CE1 nor CE2 is found
      in self._observations.
    """
    BpId=["CHARACTER_ENCODING_SUPPORT","CHARACTER_ENCODING_USE"]

    def startDocument(self):
        """Reset the per-document state, then delegate to the base class."""
        import re
        # Becomes True once non-whitespace character data has been seen.
        self._hasContent = False
        # charset found in a <meta http-equiv="content-type"> element.
        self._markupEncoding = None
        # Matches text made only of XML white space: space (U+0020),
        # tab (U+0009), carriage return (U+000D) and line feed (U+000A).
        # The escapes take hexadecimal code points; the decimal values
        # (32, 9, 13, 10) would match "2", TAB, DC3 and DLE instead.
        self._whitespace_regexp = re.compile(u"^[\u0020\u0009\u000D\u000A]*$")
        testcase.LinksBasedTestCase.startDocument(self)

    def startElement(self, name, attrs):
        """Remember the charset declared by a <meta http-equiv="content-type">.

        name  -- name of the element being opened
        attrs -- mapping holding the attributes of the element

        The base class implementation is always called afterwards.
        """
        # "in" is used rather than has_key() so that the lookups work on
        # Python 2 and Python 3 mappings alike.
        if (name == "meta"
                and "http-equiv" in attrs
                and attrs["http-equiv"].lower() == "content-type"
                and "content" in attrs):
            from cgi import parse_header
            _, params = parse_header(attrs["content"])
            if "charset" in params:
                # A declaration coming after some content has already been
                # parsed used to be reported as CE4; while interesting,
                # that isn't really relevant for the BP checker, so the
                # declaration is accepted wherever it appears.
                self._markupEncoding = params["charset"]
        testcase.LinksBasedTestCase.startElement(self, name, attrs)

    def characters(self, content):
        """Note the first non-whitespace character data of the document.

        content -- the chunk of character data reported by the parser
        """
        if not self._hasContent and not self._whitespace_regexp.match(content):
            self._hasContent = True
        testcase.LinksBasedTestCase.characters(self, content)

    def _getDeclaredEncoding(self, uri, type):
        """Return the encoding a resource declares for itself, or None.

        uri  -- address of the resource
        type -- "xml", "css" or None, as computed by _observeLinks

        Only XML resources are inspected: the resource is requested with
        _http_request and its content is handed to
        _getXMLDeclaredEncoding.  None is returned for every other type.
        """
        if type == "xml":
            _, content = self._http_request(uri)
            return self._getXMLDeclaredEncoding(content)
        # For "css": reading @charset in a CSS depends on CSS2 support;
        # in particular, this doesn't apply to the default delivery
        # context.
        # @@@ needs to be sent as a param from ddc profile
        return None

    def _isXMLMimeType(self, mime):
        """Tell whether a media type follows the XML convention of RFC 3023.

        mime -- media type such as "application/xhtml+xml", or None

        Returns True when the subtype is "xml" or ends with "+xml",
        False otherwise (including when mime is empty or has no "/").
        """
        if not mime:
            return False
        parts = mime.split("/")
        if len(parts) < 2:
            return False
        subtype = parts[1]
        return subtype == "xml" or subtype.endswith("+xml")

    def _getMimeDefaultEncoding(self, mime):
        """Return the default encoding known for a media type, or None.

        mime -- media type such as "text/html", or None

        Returns "iso-8859-1" for text/* types, "utf-8" for the other
        XML media types, and None when no default is known, which
        includes an empty media type and one without a "/".
        """
        if not mime:
            return None
        parts = mime.split("/")
        if len(parts) < 2:
            # Malformed media type: there is no default to apply.  This
            # case used to fall through to an unassigned local variable.
            return None
        if parts[0] == "text":
            # http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
            # says that encoding of text/* MIME representation over HTTP
            # defaults to iso-8859-1
            # Should this send a warning?@@@
            return "iso-8859-1"
        if self._isXMLMimeType(mime):
            # we're hitting an XML mime type, so applying the default
            # encoding as per RFC 3023
            return "utf-8"
        return None

    def _observeLinks(self):
        """Record the CE1/CE2/CE3 observations for the document and its links.

        The document is added in front of the list returned by getLinks()
        so that it is checked by the same loop as the embedded resources.
        Only links whose nature is utils.LINK_TYPE_EMBED are examined.
        """
        # looking at the main document encoding
        mime = None
        encoding = None
        if self._headers and "content-type" in self._headers:
            from cgi import parse_header
            mime, params = parse_header(self._headers["content-type"])
            if "charset" in params:
                encoding = params["charset"]
        # include the current doc as part of the link list
        # so that it can be handled in the same loop
        from validator import utils
        selflink = utils.LinkTarget(self.uri, utils.LINK_TYPE_EMBED, mime,
                                    None, testcase.Location(self.uri))
        selflink.protocolEncoding = encoding
        # if some encoding was defined in the markup, we use it as a hint
        selflink.encodingHint = self._markupEncoding
        links = self.getLinks()
        links.insert(0, selflink)
        for link in links:
            if link.nature != utils.LINK_TYPE_EMBED:
                continue
            resourceType = None
            if link.mime == "text/css":
                resourceType = "css"
            elif self._isXMLMimeType(link.mime):
                resourceType = "xml"
            # encoding is determined in priority by the protocol
            # then by what the resource itself declares
            # then by a hint given in the markup
            # and eventually by the default rule for the said mime type.
            # The "or" chain stops at the first source giving a value, so
            # the resource is only requested when the protocol is silent.
            # @@@ Should send a warning when discrepancy between
            # declared encoding?
            enc = (link.protocolEncoding
                   or self._getDeclaredEncoding(link.target, resourceType)
                   or link.encodingHint)
            implicit = ""
            if not enc:
                enc = self._getMimeDefaultEncoding(link.mime)
                implicit = "implicit "

            # NOTE(review): _getMimeDefaultEncoding returns "iso-8859-1"
            # for every text/* type, so the CE1 branch looks unreachable
            # as written -- confirm whether that is intended.
            if link.mime and len(link.mime.split("/")) > 1 and link.mime.split("/")[0] == "text" and not enc:
                self.addObservation(testcase.Observation("CE1", link.location))
            # @@@ the list of accepted encodings
            # needs to be loaded from the profile rather than hard set
            elif enc and enc.lower() not in ["utf-8"]:
                self.addObservation(testcase.Observation("CE2", link.location, {"encoding": enc, "implicit": implicit}))
        if "CE1" not in self._observations and "CE2" not in self._observations:
            self.addObservation(testcase.Observation("CE3", testcase.Location(self.uri)))
            

# -------------------------
# Unit tests for this module
import unittest

from validator.testcase import TestResults, Observation, Location, LineColumnLocation
class Tests(unittest.TestCase):
    """Unit tests running CharacterEncodingTestCase on sample documents.

    Each test passes the URL of a sample document to the test case and
    compares what run() returns with the expected TestResults.
    """

    def _test(self,inp,res):
        """Run a fresh CharacterEncodingTestCase on inp and expect res."""
        checker = CharacterEncodingTestCase()
        outcome = checker.run(inp)
        self.assertEqual(outcome,res)

    def testWithOKCharacterEncoding(self):
        """A sample document expected to yield the single observation CE3."""
        url = "http://dev.w3.org/cvsweb/~checkout~/2006/mwbp-validator/tests/characterencoding-1.xhtml?content-type=application/xhtml%2Bxml"
        expected = TestResults([Observation("CE3",Location(url))])
        self._test(url,expected)

    def testWithBadCharacterEncoding(self):
        """A sample document expected to yield two CE2 observations."""
        url = "http://dev.w3.org/cvsweb/~checkout~/2006/mwbp-validator/tests/characterencoding-2.xhtml?content-type=application/xml"
        observations = [
            Observation('CE2',Location(url),{'encoding': u'iso-8859-1'}),
            Observation('CE2',LineColumnLocation(url,'utf-8',7,0),{'encoding': 'iso-8859-1'}),
        ]
        self._test(url,TestResults(observations))

    def testWithBadCharacterEncodingInXMLDecl(self):
        """Same expectation as above, for the third sample document."""
        url = "http://dev.w3.org/cvsweb/~checkout~/2006/mwbp-validator/tests/characterencoding-3.xhtml?content-type=application/xhtml%2Bxml"
        observations = [
            Observation('CE2',Location(url),{'encoding': 'iso-8859-1'}),
            Observation('CE2',LineColumnLocation(url,'utf-8',7,0),{'encoding': 'iso-8859-1'}),
        ]
        self._test(url,TestResults(observations))

    def testWithMiscplacedCharacterEncodingDecl(self):
        """Disabled test for a misplaced character encoding declaration.

        Misplaced declarations are not tested for anymore, so the method
        returns before doing anything; the statements after the return
        are kept only as a record of what used to be expected.
        """
        return True
        url = "http://dev.w3.org/cvsweb/~checkout~/2006/mwbp-validator/tests/characterencoding-4.xhtml?content-type=application/xhtml%2Bxml"
        observations = [
            #Observation('CE4',LineColumnLocation(url,'utf-8',6,0)),
            Observation('CE2',LineColumnLocation(url,'utf-8',7,0),{'encoding': u'iso-8859-2'}),
        ]
        self._test(url,TestResults(observations))


def _test():
    """Run the unit tests of this module by calling unittest.main()."""
    unittest.main()

# Run the unit tests when this module is executed as a script.
if __name__ == '__main__':
    _test()





