Synopsis - Cross-Reference

File: /scripts/html-validator
  1#!/usr/bin/env python
  2#
  3# Copyright (C) 2004 Stefan Seefeld
  4# All rights reserved.
  5# Licensed to the public under the terms of the GNU LGPL (>= 2),
  6# see the file COPYING for details.
  7#
  8
  9
 10from xml.sax import saxexts, saxlib, saxutils
 11import sys, os, string, urllib, urlparse
 12import getopt
 13
# Module-wide verbosity flag; main() switches it on for -p/--print and the
# progress messages ('parsing ...', 'validating ...') are only printed then.
verbose = False
 15
 16class Reference:
 17    def __init__(self, orig, line, ref):
 18        self.orig = orig
 19        self.line = line
 20        self.ref = ref
 21
 22class DocumentHandler(saxlib.DocumentHandler):
 23    """Store urefs with the linenumbers they were encountered in,
 24    so we can either traverse them, too, or report errors with specific
 25    line numbers."""
 26
 27    def __init__(self):
 28
 29        self.urefs = {}
 30        self.locator = None
 31
 32    def get_urefs(self):
 33        
 34        urefs = [Reference(self.locator.getSystemId(), l, u) for (u, l) in self.urefs.items()]
 35        self.urefs = {}
 36        self.locator = None
 37        return urefs
 38
 39    def setDocumentLocator(self, locator):
 40        "Receive an object for locating the origin of SAX document events."
 41        self.locator = locator
 42
 43    def startElement(self, name, attrs):
 44        "Look for ancors and store links."
 45
 46        if name == 'a':
 47            href = attrs.getValue('href')
 48            if not self.urefs.has_key(href):
 49                self.urefs[href] = self.locator.getLineNumber()
 50
 51from xml.sax.drivers import drv_xmlproc
 52SAXparser=drv_xmlproc.SAX_XPParser()
 53
 54handler = DocumentHandler()
 55SAXparser.setDocumentHandler(handler)
 56SAXparser.setErrorHandler(saxutils.ErrorRaiser())
 57
 58def validate(url):
 59    """validate (x)html conformance using 'tidy'."""
 60
 61    if verbose: print 'validating', url
 62    status = os.system('tidy -errors -quiet "%s"'%url)
 63    if os.WIFSIGNALED(status):
 64        print 'internal error:', os.WTERMSIG(status)
 65    elif os.WIFEXITED(status):
 66        if os.WEXITSTATUS(status) == 2:
 67            print 'validation failed'
 68        return
 69    else:
 70        print 'internal error !'
 71        
 72def usage():
 73   print 'Usage : %s [options] <input files>'%sys.argv[0]
 74   print """
 75List of options:
 76
 77  -h, --help             help
 78  -p, --print            provide verbose feedback during validation
 79  -m, --maximum          maximum number of pages to validate
 80  -v, --validate         call http://validator.w3.org to validate html
 81  -e, --external         follow external links
 82"""
 83
 84def main():
 85   global verbose
 86
 87   max = 50
 88   external = False
 89   do_validate = False
 90
 91   opts, args = getopt.getopt(sys.argv[1:],
 92                              'pm:evh',
 93                              ['print', 'maximum=', 'external', 'validate', 'help'])
 94   for o, a in opts:
 95      if o in ['-h', '--help']:
 96         usage()
 97         sys.exit(0)
 98      elif o in ['-p', '--print']: verbose = True
 99      elif o in ['-m', '--maximum']: max = int(a)
100      elif o in ['-e', '--external']: external = True
101      elif o in ['-v', '--validate']: do_validate = True
102
103   if not args:
104         usage()
105         sys.exit(0)
106
107   done = []
108   urefs = [Reference('.', 0, args[0])]
109   while urefs and (max == -1 or len(done) < max):
110
111       uref = urefs.pop(0)
112       url = urlparse.urljoin(uref.orig, uref.ref)
113       scheme, location, path, query, fragment = urlparse.urlsplit(url)
114       if not external and scheme and scheme != 'file': continue
115       url = urlparse.urlunsplit((scheme, location, path, query, ''))
116       if url in done: continue
117       try:
118           if verbose: print 'parsing', url
119           SAXparser.parse(url)
120           if do_validate: validate(url)
121           done.append(url)
122           urefs.extend(handler.get_urefs())
123            
124       except saxlib.SAXParseException, e:
125           sys.stderr.write('%s; processing aborted\n'%e)
126           break
127
# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':

    main()