#!/usr/local/bin/python
""" $Id: nschecker,v 1.3 2005/02/09 10:59:49 hugo Exp $
"""

import cgi
import sys
import os
import urlparse
import urllib
import http_head
import http_auth
import re
import popen2

# HTML boilerplate printed BEFORE the results: CGI Content-Type header,
# doctype and page introduction.  The two %s slots receive a <title>
# suffix and an <h1> suffix (both empty strings when no URI was given).
Page1 = """Content-Type: text/html; charset=utf-8


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US">
<head>
<link href="http://www.w3.org/StyleSheets/base" rel="stylesheet"/>
<link href="http://www.w3.org/2001/11/results" rel="stylesheet" />
<title>Namespaces checker service%s</title></head>
<body>

<p><a href="http://www.w3.org/"><img src="http://www.w3.org/Icons/w3c_home" alt="W3C"/></a></p>

<h1>Namespaces checker%s</h1>
<h2>Description</h2>
<p>This tool takes the URI of an (X)HTML document as input and outputs the "visible" URIs in it and make a HTTP HEAD on them to check their validity. The intent is to help the <a href="/Guide/pubrules">pubrules</a> checking. It <strong>does not</strong> check the validity of the anchors.</p>
<p>You may want to use <a href="http://validator.w3.org/checklink">the linkchecker tool</a> to check all your links in your document.</p>
"""

# HTML boilerplate printed AFTER the results: the input form (one %s slot
# pre-fills the form with the URI just checked) and the page footer.
Page2 = """
<form method="get" action='/2003/09/nschecker'>
<div>
<label>URI(s) of the document(s) you want to check URIs: <input type="text" name="uri" value="%s"/></label>
<input type="submit" value="Get results"/>
</div>
</form>

<hr />
<address>
script $Revision: 1.3 $ of $Date: 2005/02/09 10:59:49 $<br />
by <a href="http://www.w3.org/People/Dom/">Dominique Hazael-Massieux</a><br />
</address>
</body>
</html>
"""

class myHEADURLopener(http_head.HEADURLopener):
    """HEAD-request opener that records HTTP status reports as HTML.

    Error statuses encountered while opening a URI (redirect hops,
    4xx/5xx answers) are appended to the .res buffer instead of being
    raised at the caller.  formatUriCheck() prints and resets .res
    after each URI it checks.

    Fixed: the original body mixed tab and space indentation, which is
    rejected by ``python -tt`` (and by Python 3); normalized to spaces.
    """

    # Accumulated HTML status snippets for the URI currently checked.
    res = ""

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        # Swallow otherwise-unhandled HTTP errors: a failed HEAD simply
        # yields no stream, and the status already landed in self.res.
        return None

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        # Record the status for display, then defer to the stock
        # handling so redirects / http_error_<code> hooks still run.
        self.res = self.res + formatHeaders(errcode, errmsg, headers)
        return urllib.URLopener.http_error(self, url, fp, errcode,
                                           errmsg, headers, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        # Never prompt for credentials: a 401 is reported, not retried.
        return None

def formatHeaders(errcode, errmsg, headers):
    """Return an HTML snippet reporting one HTTP status.

    errcode -- numeric HTTP status code
    errmsg  -- reason phrase to display
    headers -- unused; kept so existing call sites keep working
    """
    # Map the status onto a CSS class: 'yes' for OK, no styling for
    # redirects, 'tocheck' for auth-protected, 'no' for everything else.
    if errcode == 200:
        classe = " class='yes'"
    elif errcode == 301 or errcode == 302:
        classe = ""
    elif errcode == 401:
        classe = " class='tocheck'"
    else:
        classe = " class='no'"
    # str() replaces the deprecated backtick repr (removed in Python 3);
    # output is identical for integer status codes.  Also fixes the one
    # tab-indented line that clashed with the 8-space body.
    return ("-> <span" + classe + ">" + str(errcode) +
            "</span> (<span class='errmsg'>" + errmsg + "</span>) ")

def formatUriCheck(uri,number,head_request):
        plural= ""
        if number>1:
                plural = "s"
        print "<dt><a href='"+uri+"'>"+uri+"</a> (%s occurence%s)</dt>\n" % (number,plural)
        print "<dd>"
        head = head_request.open(uri)
        print head_request.res
        if head:
                print formatHeaders(200,"OK","")
                head.close()
        head_request.res = ""
        print "</dd>"

def serveRequest():
    fields = cgi.FieldStorage()

    if not fields.has_key('uri'):
        print Page1 % ("","")
	print Page2 % ("")
    else:
        addr = fields['uri'].value
	if len(urlparse.urlparse(addr)[0])<2:
		print "Status: 403"
		print "Content-Type: text/plain"
		print
		print "sorry, I decline to handle file: addresses"
	else:
                title = " for %s" % (addr)
                link = " for <a href='%s'>%s</a>" % (addr,addr)
                import http_auth
		url_opener = http_auth.ProxyAuthURLopener()
                error = ""
                try:
        		doc = url_opener.open(addr)
           	except IOError, (errno, strerror):
			doc = None
		print Page1 % (title,link)

                command = "/usr/bin/lynx -nolist -dump -width=256 -stdin"
                (piperfd,pipewfd,pipeErr) = popen2.popen3(command)
                if (doc):
                        pipewfd.write(doc.read())
                        doc.close()
                        pipewfd.close()
                        if (piperfd):
                                head_request = myHEADURLopener()
                                line = piperfd.readline()
                                pattern = "(http://[^><\s\"'\&\)]*)[>|<|\s|\"|'|\&\)]"
                                uriMatcher = re.compile(pattern)
                                foundUris = {}
                                while line:
                                        results = uriMatcher.findall(line)
                                        for uri in results:
                                                if uri[-1]==".":
                                                        uri=uri[:-1]
                                                if foundUris.has_key(uri):
                                                        foundUris[uri] = foundUris[uri] + 1
                                                else:
                                                        foundUris[uri] = 1
                                        line = piperfd.readline()
                                piperfd.close()
        		print '<p>URIs found :</p>\n'
                        uris = foundUris.keys()
                        w3_org = "(http://([^.]*\.)*w3\.org($|/))"
                        w3_org_matcher = re.compile(w3_org)
                        example_org = "(http://((([^.]*\.)*example(\.(net|org|com))?)|(([^.]*\.)+example))($|/))"
                        example_org_matcher = re.compile(example_org)
                        exampleUris = []
                        nsUris = []
                        w3Uris = []
                        otherUris = []
                        uris.sort()
                        for uri in uris:
                                if example_org_matcher.search(uri):
                                        exampleUris.append(uri)
                                elif uri[:20]=="http://www.w3.org/TR":
                                        w3Uris.append(uri)
                                elif uri[:27]=="http://www.ietf.org/rfc/rfc":
                                        w3Uris.append(uri)
                                elif uri[:18]=="http://www.w3.org/":
                                        nsUris.append(uri)
                                elif w3_org_matcher.search(uri):
                                        w3Uris.append(uri)
                                else:
                                        otherUris.append(uri)
                        if len(nsUris):
                                print "<h3><code>www.w3.org</code> URIs</h3>"
                                print "<p>The following visible URIs in the <code>www.w3.org</code> domain have been found; some of them may be namespaces, in which case they need to follow <a href='http://www.w3.org/1999/10/nsuri'>the namespaces rules</a>.</p>"
                                print "<dl>"
                                for uri in nsUris:
                                        formatUriCheck(uri,foundUris[uri],head_request)
                                print "</dl>"
                        if len(otherUris):
                                print "<h3>URIs not in <code>*.w3.org</code> nor <code>*.example.*</code></h3>"
                                print "<p>The following visible URIs are neither in w3.org nor example.org; check they aren't used as namespaces.</p>"
                                print "<dl>"
                                for uri in otherUris:
                                        formatUriCheck(uri,foundUris[uri],head_request)
                                print "</dl>"
                        if len(w3Uris) or len(exampleUris):
                                print "<h3>Other URIs</h3>"
                                if len(w3Uris):
                                        print "<p>The following URIs aren't likely to be problematic, but make sure they are dereferenceable; you may want to have a quick look at them to check they aren't used as formal namespaces too.</p>"
                                        print "<dl>"
                                        for uri in w3Uris:
                                                formatUriCheck(uri,foundUris[uri],head_request)
                                        print "</dl>"
                                if len(exampleUris):
                                        print "<p>The following examples URIs were found:</p>"
                                        print "<ul>"
                                        for uri in exampleUris:
                                                print "<li>%s</li>" % (uri)
                                        print "</ul>"
                        if len(uris)==0:
                                print "<p>No visible URIs were found in the document.</p>"
                else:
                        print "<p><span class='no'>An error</span> (%s) occured trying to get <a href='%s'>%s</a>.</p>" % (url_opener.error,addr,addr)
		print Page2 % (addr)	

if __name__ == '__main__':
    # Only act when invoked as a CGI script: the web server always sets
    # SCRIPT_NAME in the environment.
    if os.environ.get('SCRIPT_NAME') is not None:
        serveRequest()
