Synopsis - Cross-Reference
#!/usr/bin/env python
#
# File: /scripts/html-validator
#
# Copyright (C) 2004 Stefan Seefeld
# All rights reserved.
# Licensed to the public under the terms of the GNU LGPL (>= 2),
# see the file COPYING for details.
#
# NOTE: this is Python 2 code (PyXML 'saxlib'/'saxexts', 'urlparse').
# The syntax used below ('except ... as', 'subprocess') needs Python >= 2.6.


from xml.sax import saxexts, saxlib, saxutils
import sys, os, string, urllib, urlparse
import getopt
import subprocess

# Module-wide switch: when true, progress messages are printed to stdout.
verbose = False


class Reference:
    """A link found in a document, together with where it was found.

    orig -- system id of the document that contains the link
    line -- line number at which the link was first seen in 'orig'
    ref  -- the link target exactly as written in the 'href' attribute
    """

    def __init__(self, orig, line, ref):
        self.orig = orig
        self.line = line
        self.ref = ref


class DocumentHandler(saxlib.DocumentHandler):
    """Store urefs with the linenumbers they were encountered in,
    so we can either traverse them, too, or report errors with specific
    line numbers."""

    def __init__(self):

        # Maps href -> line number of its first occurrence in the
        # document currently being parsed.
        self.urefs = {}
        self.locator = None

    def get_urefs(self):
        """Return the links collected so far as a list of Reference objects.

        The handler is reset afterwards (collected links and locator are
        dropped), so every call only reports links gathered since the
        previous call. A locator must have been set beforehand.
        """

        origin = self.locator.getSystemId()
        urefs = [Reference(origin, line, href)
                 for (href, line) in self.urefs.items()]
        self.urefs = {}
        self.locator = None
        return urefs

    def setDocumentLocator(self, locator):
        "Receive an object for locating the origin of SAX document events."
        self.locator = locator

    def startElement(self, name, attrs):
        "Look for anchors and store links."

        if name != 'a':
            return
        # Named anchors (<a name="...">) carry no 'href'. Depending on the
        # attribute-list implementation a missing attribute may raise
        # KeyError or yield None; treat both as "no link here".
        try:
            href = attrs.getValue('href')
        except KeyError:
            href = None
        if not href:
            return
        # Only the first occurrence of each target is recorded.
        if href not in self.urefs:
            self.urefs[href] = self.locator.getLineNumber()


from xml.sax.drivers import drv_xmlproc
SAXparser = drv_xmlproc.SAX_XPParser()

handler = DocumentHandler()
SAXparser.setDocumentHandler(handler)
# Any problem reported by the parser is raised as an exception.
SAXparser.setErrorHandler(saxutils.ErrorRaiser())


def validate(url):
    """validate (x)html conformance using 'tidy'.

    Runs 'tidy -errors -quiet <url>' and prints 'validation failed' when
    tidy exits with status 2. Termination by a signal, or failure to start
    tidy at all, is reported as an internal error. Nothing is returned.
    """

    if verbose:
        print('validating %s' % url)
    # The url may originate from a crawled page, so it is passed as a
    # separate argument instead of being pasted into a shell command line.
    try:
        status = subprocess.call(['tidy', '-errors', '-quiet', url])
    except OSError as e:
        # tidy could not be started (e.g. it is not installed).
        print('internal error: %s' % e)
        return
    if status < 0:
        # subprocess reports "killed by signal N" as return code -N.
        print('internal error: %s' % -status)
    elif status == 2:
        print('validation failed')
def usage():
    """Print the command line synopsis and the list of options to stdout."""

    print('Usage : %s [options] <input files>' % sys.argv[0])
    print("""
List of options:

  -h, --help             help
  -p, --print            provide verbose feedback during validation
  -m, --maximum          maximum number of pages to validate
  -v, --validate         run 'tidy' to validate html
  -e, --external         follow external links
""")


def main():
    """Crawl the given documents and the links found in them.

    Starting from the input files named on the command line, every
    document is parsed, optionally validated, and the links it contains
    are queued for processing in turn. Crawling stops when the queue is
    empty, when the page limit is reached (default 50, -1 for no limit),
    or when the parser reports an error.

    Exits with status 0 after printing the usage (help requested or no
    input given) and with status 2 on malformed command line options.
    """
    global verbose

    max_pages = 50
    external = False
    do_validate = False

    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   'pm:evh',
                                   ['print', 'maximum=', 'external',
                                    'validate', 'help'])
    except getopt.GetoptError as e:
        # Unknown option or missing option argument.
        sys.stderr.write('%s\n' % e)
        usage()
        sys.exit(2)

    for o, a in opts:
        if o in ('-h', '--help'):
            usage()
            sys.exit(0)
        elif o in ('-p', '--print'):
            verbose = True
        elif o in ('-m', '--maximum'):
            try:
                max_pages = int(a)
            except ValueError:
                sys.stderr.write('invalid value for %s: %r\n' % (o, a))
                usage()
                sys.exit(2)
        elif o in ('-e', '--external'):
            external = True
        elif o in ('-v', '--validate'):
            do_validate = True

    if not args:
        usage()
        sys.exit(0)

    # Urls that have been processed already (fragment stripped).
    done = set()
    # Work queue, seeded with every input file given on the command line.
    urefs = [Reference('.', 0, arg) for arg in args]
    while urefs and (max_pages == -1 or len(done) < max_pages):

        uref = urefs.pop(0)
        # Resolve the link relative to the document it was found in.
        url = urlparse.urljoin(uref.orig, uref.ref)
        scheme, location, path, query, fragment = urlparse.urlsplit(url)
        # Unless asked otherwise, only follow local (scheme-less or file:) links.
        if not external and scheme and scheme != 'file':
            continue
        # Drop the fragment: it addresses a position inside the same page.
        url = urlparse.urlunsplit((scheme, location, path, query, ''))
        if url in done:
            continue
        try:
            if verbose:
                print('parsing %s' % url)
            SAXparser.parse(url)
            if do_validate:
                validate(url)
            done.add(url)
            urefs.extend(handler.get_urefs())

        except saxlib.SAXParseException as e:
            sys.stderr.write('%s; processing aborted\n' % e)
            break


if __name__ == '__main__':

    main()