File:  [Public] / 2006 / surbl.py
Revision 1.1: download - view: text, annotated - select for diffs
Tue Apr 18 11:12:59 2006 UTC (18 years, 1 month ago) by dom
Branches: MAIN
CVS tags: HEAD
surbl resolver

#!/usr/bin/python
"""
$Id: surbl.py,v 1.1 2006/04/18 11:12:59 dom Exp $

SURBL implementation
http://www.surbl.org/

This module implements a SurblChecker class that checks whether the domain of a given URL is listed in SURBL.


It relies on DNSPython
http://dnspython.org/

License
-------
Copyright (c) 2006 World Wide Web Consortium, (Massachusetts
Institute of Technology, European Research Consortium for Informatics
and Mathematics, Keio University). All Rights Reserved. This work is
distributed under the W3C Software License [1] in the hope that it
will be useful, but WITHOUT ANY WARRANTY; without even the implied
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

[1] http://www.w3.org/Consortium/Legal/copyright-software

"""


import dns.resolver
import urlparse

class SurblChecker:
    """Check whether the domain of a given URL is listed in SURBL.

    Example:
        S = surbl.SurblChecker('/path/to/two-level-tlds')
        if S.isMarkedAsSpam('http://www.w3.org/2006/'):
            print("w3.org has been marked as spam!")
    """

    def __init__(self, twoLevelsTlds):
        """Load the list of two-level TLDs.

        twoLevelsTlds is the path of a text file listing, one per line, the
        known TLDs under which registration happens at the second level.
        An example of such a file can be downloaded from:
        http://spamcheck.freeapp.net/two-level-tlds

        Raises IOError if the file cannot be opened.
        """
        f = open(twoLevelsTlds)
        # try/finally rather than "with" so the module keeps running on the
        # old Python 2 interpreters it was written for.
        try:
            lines = f.readlines()
        finally:
            f.close()
        # Entries are stored stripped and lower-cased in a set: the match no
        # longer depends on line endings (e.g. a last line without a newline)
        # or on letter case, and the lookup is O(1).
        entries = [line.strip().lower() for line in lines]
        self._twoLevelsTlds = set([entry for entry in entries if entry])

    def isMarkedAsSpam(self, uri):
        """Return 1 if the registered domain of uri is listed in SURBL, else 0.

        uri may be a full URI ('http://www.w3.org/2006/') or a bare host
        name ('www.w3.org'). Returns 0 when no host can be extracted.
        Resolver errors other than NXDOMAIN are propagated to the caller.
        """
        parsedUri = urlparse.urlparse(uri)
        # The authority (netloc) is the 2nd item of the tuple, i.e. index 1.
        authority = parsedUri[1]
        if not authority:
            # No "//" in the input: a bare host name ends up in the path
            # component, so use its first segment as the authority.
            authority = parsedUri[2].split('/')[0]
        registeredName = self._extractRegisteredDomain(authority)
        if not registeredName:
            # Nothing to look up.
            return 0
        try:
            dns.resolver.query(registeredName + '.multi.surbl.org', 'A')
        except dns.resolver.NXDOMAIN:
            return 0
        return 1

    def _extractRegisteredDomain(self, authorityComponent):
        """Return the registered domain of a URI authority component.

        The userinfo ('user:password@') and port (':80') parts are dropped,
        as is a trailing root dot, and the host is lower-cased. The result is
        the last three labels when the last two labels form a known two-level
        TLD and the host has more than two labels, otherwise the last two
        labels. A host with fewer than two labels is returned as is.
        """
        hostComponent = authorityComponent
        # Removing userinfo: it ends at the last '@' of the authority.
        atIndex = hostComponent.rfind('@')
        if atIndex >= 0:
            hostComponent = hostComponent[atIndex + 1:]
        # Removing port.
        colonIndex = hostComponent.find(':')
        if colonIndex >= 0:
            hostComponent = hostComponent[:colonIndex]
        # DNS names are case-insensitive, and "example.org." designates the
        # same host as "example.org".
        hostComponent = hostComponent.rstrip('.').lower()
        dnsParts = hostComponent.split('.')
        if len(dnsParts) <= 2:
            return hostComponent
        secondLevelTld = '.'.join(dnsParts[-2:])
        if secondLevelTld in self._twoLevelsTlds:
            return '.'.join(dnsParts[-3:])
        return secondLevelTld

import unittest

class Tests(unittest.TestCase):
    """Unit tests for SurblChecker.

    Both tests build the checker from the two-level TLD list at TLD_FILE;
    testMarkedAsSpam additionally goes through isMarkedAsSpam, which performs
    DNS lookups under multi.surbl.org.
    """

    # Location of the two-level TLD list used to build the checker.
    TLD_FILE = '/home/dom/data/2006/04/two-level-tlds'

    def setUp(self):
        # A fresh checker for every test, built from the shared fixture file.
        self.checker = SurblChecker(self.TLD_FILE)

    def testDomainExtraction(self):
        """Each authority component maps to the expected registered domain."""
        cases = (
            ("www.w3.org", "w3.org"),
            ('chirurgiens-dentistes.fr', 'chirurgiens-dentistes.fr'),
            ("myteeth.example.chirurgiens-dentistes.fr",
             "example.chirurgiens-dentistes.fr"),
            ("example:example@www.example.org:80", "example.org"),
        )
        for inp, exp in cases:
            # assertEqual rather than the deprecated assertEquals alias,
            # which recent Python versions no longer provide.
            self.assertEqual(self.checker._extractRegisteredDomain(inp), exp)

    def testMarkedAsSpam(self):
        """Each host name gets the expected listed (1) / not listed (0) result."""
        cases = (
            ("www.w3.org", 0),
            ("www.microsoft.com", 0),
            ("allofall.net", 1),
        )
        for inp, exp in cases:
            self.assertEqual(self.checker.isMarkedAsSpam(inp), exp)

def _test():
    """Run the doctests of the surbl module, then the unittest test cases."""
    import doctest
    import surbl
    doctest.testmod(surbl)
    unittest.main()

# Run the test suite only when this file is executed as a script.
if __name__ == '__main__':
    _test()

Webmaster