#!/usr/local/bin/python
""" $Id: nschecker,v 1.3 2005/02/09 10:59:49 hugo Exp $
"""

import cgi
import sys
import os
import urlparse
import urllib
import http_head
import http_auth
import re
import popen2

# HTML boilerplate printed BEFORE the results: CGI Content-Type header,
# doctype and page introduction.  The two %s slots receive a <title>
# suffix and an <h1> suffix (both empty strings when no URI was given).
Page1 = """Content-Type: text/html; charset=utf-8


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US">
<head>
<link href="http://www.w3.org/StyleSheets/base" rel="stylesheet"/>
<link href="http://www.w3.org/2001/11/results" rel="stylesheet" />
<title>Namespaces checker service%s</title></head>
<body>

<p><a href="http://www.w3.org/"><img src="http://www.w3.org/Icons/w3c_home" alt="W3C"/></a></p>

<h1>Namespaces checker%s</h1>
<h2>Description</h2>
<p>This tool takes the URI of an (X)HTML document as input and outputs the "visible" URIs in it and make a HTTP HEAD on them to check their validity. The intent is to help the <a href="/Guide/pubrules">pubrules</a> checking. It <strong>does not</strong> check the validity of the anchors.</p>
<p>You may want to use <a href="http://validator.w3.org/checklink">the linkchecker tool</a> to check all your links in your document.</p>
"""

# HTML boilerplate printed AFTER the results: the input form (one %s slot
# pre-fills the form with the URI just checked) and the page footer.
Page2 = """
<form method="get" action='/2003/09/nschecker'>
<div>
<label>URI(s) of the document(s) you want to check URIs: <input type="text" name="uri" value="%s"/></label>
<input type="submit" value="Get results"/>
</div>
</form>

<hr />
<address>
script $Revision: 1.3 $ of $Date: 2005/02/09 10:59:49 $<br />
by <a href="http://www.w3.org/People/Dom/">Dominique Hazael-Massieux</a><br />
</address>
</body>
</html>
"""

class myHEADURLopener(http_head.HEADURLopener):
    """HEAD-request opener that records HTTP status reports as HTML.

    Error statuses encountered while opening a URI (redirect hops,
    4xx/5xx answers) are appended to the .res buffer instead of being
    raised at the caller.  formatUriCheck() prints and resets .res
    after each URI it checks.

    Fixed: the original body mixed tab and space indentation, which is
    rejected by ``python -tt`` (and by Python 3); normalized to spaces.
    """

    # Accumulated HTML status snippets for the URI currently checked.
    res = ""

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        # Swallow otherwise-unhandled HTTP errors: a failed HEAD simply
        # yields no stream, and the status already landed in self.res.
        return None

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        # Record the status for display, then defer to the stock
        # handling so redirects / http_error_<code> hooks still run.
        self.res = self.res + formatHeaders(errcode, errmsg, headers)
        return urllib.URLopener.http_error(self, url, fp, errcode,
                                           errmsg, headers, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        # Never prompt for credentials: a 401 is reported, not retried.
        return None

def formatHeaders(errcode, errmsg, headers):
    """Return an HTML snippet reporting one HTTP status.

    errcode -- numeric HTTP status code
    errmsg  -- reason phrase to display
    headers -- unused; kept so existing call sites keep working
    """
    # Map the status onto a CSS class: 'yes' for OK, no styling for
    # redirects, 'tocheck' for auth-protected, 'no' for everything else.
    if errcode == 200:
        classe = " class='yes'"
    elif errcode == 301 or errcode == 302:
        classe = ""
    elif errcode == 401:
        classe = " class='tocheck'"
    else:
        classe = " class='no'"
    # str() replaces the deprecated backtick repr (removed in Python 3);
    # output is identical for integer status codes.  Also fixes the one
    # tab-indented line that clashed with the 8-space body.
    return ("-> <span" + classe + ">" + str(errcode) +
            "</span> (<span class='errmsg'>" + errmsg + "</span>) ")

def formatUriCheck(uri,number,head_request):
        plural= ""
        if number>1:
                plural = "s"
        print "<dt><a href='"+uri+"'>"+uri+"</a> (%s occurence%s)</dt>\n" % (number,plural)
        print "<dd>"
        head = head_request.open(uri)
        print head_request.res
        if head:
                print formatHeaders(200,"OK","")
                head.close()
        head_request.res = ""
        print "</dd>"

def serveRequest():
    fields = cgi.FieldStorage()

    if not fields.has_key('uri'):
        print Page1 % ("","")
	print Page2 % ("")
    else:
        addr = fields['uri'].value
	if len(urlparse.urlparse(addr)[0])<2:
		print "Status: 403"
		print "Content-Type: text/plain"
		print
		print "sorry, I decline to handle file: addresses"
	else:
                title = " for %s" % (addr)
                link = " for <a href='%s'>%s</a>" % (addr,addr)
                import http_auth
		url_opener = http_auth.ProxyAuthURLopener()
                error = ""
                try:
        		doc = url_opener.open(addr)
           	except IOError, (errno, strerror):
			doc = None
		print Page1 % (title,link)

                command = "/usr/bin/lynx -nolist -dump -width=256 -stdin"
                (piperfd,pipewfd,pipeErr) = popen2.popen3(command)
                if (doc):
                        pipewfd.write(doc.read())
                        doc.close()
                        pipewfd.close()
                        if (piperfd):
                                head_request = myHEADURLopener()
                                line = piperfd.readline()
                                pattern = "(http://[^><\s\"'\&\)]*)[>|<|\s|\"|'|\&\)]"
                                uriMatcher = re.compile(pattern)
                                foundUris = {}
                                while line:
                                        results = uriMatcher.findall(line)
                                        for uri in results:
                                                if uri[-1]==".":
                                                        uri=uri[:-1]
                                                if foundUris.has_key(uri):
                                                        foundUris[uri] = foundUris[uri] + 1
                                                else:
                                                        foundUris[uri] = 1
                                        line = piperfd.readline()
                                piperfd.close()
        		print '<p>URIs found :</p>\n'
                        uris = foundUris.keys()
                        w3_org = "(http://([^.]*\.)*w3\.org($|/))"
                        w3_org_matcher = re.compile(w3_org)
                        example_org = "(http://((([^.]*\.)*example(\.(net|org|com))?)|(([^.]*\.)+example))($|/))"
                        example_org_matcher = re.compile(example_org)
                        exampleUris = []
                        nsUris = []
                        w3Uris = []
                        otherUris = []
                        uris.sort()
                        for uri in uris:
                                if example_org_matcher.search(uri):
                                        exampleUris.append(uri)
                                elif uri[:20]=="http://www.w3.org/TR":
                                        w3Uris.append(uri)
                                elif uri[:27]=="http://www.ietf.org/rfc/rfc":
                                        w3Uris.append(uri)
                                elif uri[:18]=="http://www.w3.org/":
                                        nsUris.append(uri)
                                elif w3_org_matcher.search(uri):
                                        w3Uris.append(uri)
                                else:
                                        otherUris.append(uri)
                        if len(nsUris):
                                print "<h3><code>www.w3.org</code> URIs</h3>"
                                print "<p>The following visible URIs in the <code>www.w3.org</code> domain have been found; some of them may be namespaces, in which case they need to follow <a href='http://www.w3.org/1999/10/nsuri'>the namespaces rules</a>.</p>"
                                print "<dl>"
                                for uri in nsUris:
                                        formatUriCheck(uri,foundUris[uri],head_request)
                                print "</dl>"
                        if len(otherUris):
                                print "<h3>URIs not in <code>*.w3.org</code> nor <code>*.example.*</code></h3>"
                                print "<p>The following visible URIs are neither in w3.org nor example.org; check they aren't used as namespaces.</p>"
                                print "<dl>"
                                for uri in otherUris:
                                        formatUriCheck(uri,foundUris[uri],head_request)
                                print "</dl>"
                        if len(w3Uris) or len(exampleUris):
                                print "<h3>Other URIs</h3>"
                                if len(w3Uris):
                                        print "<p>The following URIs aren't likely to be problematic, but make sure they are dereferenceable; you may want to have a quick look at them to check they aren't used as formal namespaces too.</p>"
                                        print "<dl>"
                                        for uri in w3Uris:
                                                formatUriCheck(uri,foundUris[uri],head_request)
                                        print "</dl>"
                                if len(exampleUris):
                                        print "<p>The following examples URIs were found:</p>"
                                        print "<ul>"
                                        for uri in exampleUris:
                                                print "<li>%s</li>" % (uri)
                                        print "</ul>"
                        if len(uris)==0:
                                print "<p>No visible URIs were found in the document.</p>"
                else:
                        print "<p><span class='no'>An error</span> (%s) occured trying to get <a href='%s'>%s</a>.</p>" % (url_opener.error,addr,addr)
		print Page2 % (addr)	

if __name__ == '__main__':
    # Only act when invoked as a CGI script: the web server always sets
    # SCRIPT_NAME in the environment.
    if os.environ.get('SCRIPT_NAME') is not None:
        serveRequest()
