# link-check.tcl
# AOLserver link verifier
# This program crawls through all of your Web content and finds dead links.
# Save it as "link-check.tcl" somewhere accessible through an AOLserver 2.2
# installation (2.1 might work as well, but no guarantees). Request the URL
# and the program will grind through your Web content.
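# For example (illustrative paths; yours will differ): with a pageroot of
# /web/yourserver/www, you could save this file as
# /web/yourserver/www/admin/link-check.tcl and then request
# http://www.yourserver.com/admin/link-check.tcl from a browser.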
# Copyright Jin Choi (jsc@arsdigita.com) and Philip Greenspun (philg@mit.edu)
# distributed under the GNU General Public License
global webroot
global httproot
global debug_link_checker
global running_on_wimpy_machine
# set to 1 to get a progress note for each directory visited
set debug_link_checker 0
# set to 1 to make the checker sleep for 1 second before descending into
# each subdirectory, thus giving regular Web service a chance
set running_on_wimpy_machine 0
# webroot is the fully qualified Unix path of the server's pageroot;
# httproot is the URL prefix from which that pageroot is served
set webroot [ns_info pageroot]
set httproot [ns_conn location $conn]
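# e.g. (illustrative values), webroot might come back as
# "/web/yourserver/www" and httproot as "http://www.yourserver.com"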
proc ReturnHeaders {conn {content_type text/html}} {
    # the blank line after Content-Type is required; it separates
    # the HTTP headers from the response body
    ns_write $conn "HTTP/1.0 200 OK
MIME-Version: 1.0
Content-Type: $content_type

"
}
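# e.g., "ReturnHeaders $conn" starts an HTML response;
# "ReturnHeaders $conn text/plain" starts a plain-text one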
proc check_file {f conn} {
    # name the file being checked and open a sublist for its bad links
    ns_write $conn "<li>$f\n<ul>\n"
    set stream [open $f]
    set content [read $stream]
    close $stream
    foreach url [ns_hrefs $content] {
        # we only want to check http: and relative refs
        if { [regexp -nocase "^mailto:" $url] || (![regexp -nocase "^http:" $url] && [regexp {^[^/]+:} $url]) || [regexp "^\\#" $url] } {
            # it was a mailto:, an ftp:// or some other non-http: scheme
            # that ns_httpopen won't like, or just a plain #foobar fragment
            # ns_write $conn "- skipping $url because it doesn't look like HTTP:// or relative ref\n"
            continue
        }
        # strip off any trailing #foo section directive to browsers,
        # e.g., "glossary.html#cache" gets checked as "glossary.html"
        regexp {^(.*/?[^/]+)\#[^/]+$} $url dummy url
        if [catch { set response [check_link $f $url] } errmsg] {
            # we got an error (probably a dead server)
            set response "probably the foreign server isn't responding at all"
        }
        if { $response == 404 || $response == 405 || $response == 500 } {
            # try again with a full GET, because a lot of program-backed
            # servers return 404 for HEAD when a GET works fine
            if [catch { set response [check_link $f $url 1] } errmsg] {
                set response "probably the foreign server isn't responding at all"
            }
        }
        if { $response != 200 && $response != 302 } {
            ns_write $conn "<li>$url: $response\n"
        }
    }
    ns_write $conn "</ul>\n"
}
proc walk_tree {dir procedure conn seen_already_cache {pattern {.*}}} {
    upvar $seen_already_cache seen
    global debug_link_checker
    global running_on_wimpy_machine
    # cd so that pwd returns the canonical directory name;
    # that's how we avoid infinite loops among symlinked directories
    cd $dir
    set canonical_dirname [pwd]
    if [info exists seen($canonical_dirname)] {
        if { $debug_link_checker == 1 } {
            ns_write $conn "walk_tree: skipping directory $canonical_dirname (already seen)\n"
        }
        return
    }
    set seen($canonical_dirname) 1
    if { $debug_link_checker == 1 } {
        ns_write $conn "walk_tree: checking out directory $dir\n"
    }
    foreach f [glob -nocomplain $dir/*] {
        if [file readable $f] {
            if [file isdirectory $f] {
                if { $running_on_wimpy_machine == 1 } {
                    # sleep for one second so regular Web service isn't starved
                    ns_sleep 1
                }
                walk_tree $f $procedure $conn seen $pattern
            } else {
                if {[file isfile $f]} {
                    # file names are case-insensitive on Windows NT
                    if {[ns_info winnt]} {
                        set match [regexp -nocase $pattern $f]
                    } else {
                        set match [regexp $pattern $f]
                    }
                    if $match {
                        $procedure $f $conn
                    }
                }
            }
        }
    }
}
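# Illustrative use with a hypothetical subtree and pattern: run check_file
# over every .htm file under /web/yourserver/www/docs
#   set my_cache(fake_entry_to_create_array) 1
#   walk_tree /web/yourserver/www/docs check_file $conn my_cache {\.htm$}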
## Assumes url is a URL valid for use with ns_httpopen
proc get_http_status {url {use_get_p 0} {timeout 30}} {
    if $use_get_p {
        set http [ns_httpopen GET $url "" $timeout]
    } else {
        set http [ns_httpopen HEAD $url "" $timeout]
    }
    # philg changed these to close BOTH rfd and wfd
    set rfd [lindex $http 0]
    set wfd [lindex $http 1]
    close $rfd
    close $wfd
    set headers [lindex $http 2]
    # ns_httpopen names the header ns_set after the server's status line,
    # e.g., "HTTP/1.0 200 OK"; word 1 of that line is the numeric status
    set response [ns_set name $headers]
    set status [lindex $response 1]
    ns_set free $headers
    return $status
}
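# e.g. (illustrative), [get_http_status "http://www.yourserver.com/"]
# should return "200" if the server's home page is healthy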
proc check_link {base_file reference_inside_href {use_get_p 0}} {
    # base_file is the full file system path where the
    # HTML was found; reference_inside_href is the string
    # that was inside the HREF, either a full URL, an absolute
    # path from the server root, or a relative path
    global webroot
    global httproot
    if [regexp -nocase {^http://} $reference_inside_href] {
        # already a full URL; check it as-is
        set url $reference_inside_href
    } elseif [regexp {^/} $reference_inside_href] {
        # absolute reference from the server root
        set url "$httproot$reference_inside_href"
    } else {
        # relative reference; map the directory of base_file from the
        # file system back into URL space and resolve against that
        set dir_in_url_space [file dirname [string range $base_file [string length $webroot] end]]
        set url "$httproot$dir_in_url_space/$reference_inside_href"
    }
    return [get_http_status $url $use_get_p]
}
ReturnHeaders $conn

ns_write $conn "<head>
<title>Testing Links at $httproot</title>
</head>
<body>
<h2>Testing Links</h2>
at $httproot
<hr>
All HTML files:
<ul>
"

# set one fake element so the array exists when walk_tree upvars it
set seen_already_cache(fake_entry_to_create_array) 1

walk_tree $webroot check_file $conn seen_already_cache {\.html$}

ns_write $conn "</ul>
<hr>
<address><a href=\"mailto:jsc@arsdigita.com\">Jin S. Choi</a></address>
</body>
"
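# To check only part of the site (illustrative), point walk_tree at the
# subtree instead:
#   walk_tree $webroot/photo check_file $conn seen_already_cache {\.html$}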