#!/usr/bin/perl -w # Copyright © 2003-2013 Jamie Zawinski # # Permission to use, copy, modify, distribute, and sell this software and its # documentation for any purpose is hereby granted without fee, provided that # the above copyright notice appear in all copies and that both that # copyright notice and this permission notice appear in supporting # documentation. No representations are made about the suitability of this # software for any purpose. It is provided "as is" without express or # implied warranty. # # Created: 21-Dec-2003. require 5; use diagnostics; use strict; use LWP::UserAgent; use LWP::Simple; my $progname = $0; $progname =~ s@.*/@@g; my $version = q{ $Revision: 1.41 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/; my $verbose = 0; # The HTML validator generates an error if we use a tag that isn't # in this list (in order to detect typos.) # "1" means balanced tag; "2" means no close tag expected. # "3" means no close tag expected, but allowed (e.g.,

, ). # my %allowed_tags = ( "A" => 1, "ABBR" => 1, "AREA" => 2, "B" => 1, "BASE" => 2, "BLOCKQUOTE" => 1, "BODY" => 1, "BR" => 3, "DD" => 1, "DIV" => 1, "DL" => 1, "DT" => 1, "FORM" => 1, "FRAMESET" => 1, "FRAME" => 2, "H1" => 1, "H2" => 1, "H3" => 1, "H4" => 1, "HEAD" => 1, "HR" => 2, "HTML" => 1, "I" => 1, "IMG" => 3, "INPUT" => 3, "LI" => 1, "LINK" => 3, "MAP" => 1, "META" => 3, "NOBR" => 1, "NOFRAMES" => 1, "NOSCRIPT" => 1, "OL" => 1, "OPTION" => 1, "P" => 3, "PRE" => 1, "SCRIPT" => 1, "SELECT" => 1, "SPAN" => 1, "STRIKE" => 1, "STYLE" =>1, "SUP" => 1, "TABLE" => 1, "TEXTAREA" => 1, "TD" => 1, "TH" => 1, "TITLE" => 1, "TR" => 1, "TT" => 1, "U" => 1, "UL" => 1, "WBR" => 2, "BLINK" => 1, "SUB" => 1, "SUP" => 1, "MARQUEE" => 1, "OBJECT" => 1, "PARAM" => 1, "EMBED" => 1, "HYPE" => 2, "CANVAS" => 1, "IFRAME" => 1, "FB:LOGIN-BUTTON" => 1, "CITE" => 1, "SMALL" => 1, "LABEL" => 1, "CODE" => 1, "EM" => 1, "STRONG" => 1, # "FONT" => 1, # "LJ" => 2, "LJ-CUT" => 3, "LJ-TEMPLATE" => 1, "LJ-EMBED" => 1, # "LJ-POLL" => 1, "LJ-PQ" => 1, "LJ-PI" => 1, # "FB:SWF" => 1, ); # Does some simple syntax-checking on the HTML: makes sure tags are # balanced, etc. # sub validate_html($$) { my ($filename, $html) = @_; my @stack = (); my $debug = ($verbose > 1); # Kludge for doctype: turn it into a comment. # $html =~ s@^(\s*]*>)@@s; # Kludge for PHP embeds: turn them into comments. # # $html =~ s@(<\?.*?\?>)@@gs; # Kludge for self-closed tags like : map to # $html =~ s@(<([a-z:]+)\b[^<>]*?)/>@$1>@gsi; # lose text inside comments (but keep the newlines, for line numbering)... # $html =~ s@()@{ my ($a, $b, $c) = ($1, $2, $3); $b =~ s/^.*$//gm; "$a$b$c"; }@gse; $html =~ s/()//g; # lose comment tags themselves # lose text inside )@{ my ($a, $b, $c) = ($1, $2, $3); $b =~ s/^.*$//gm; "$a$b$c"; }@gsei; my $lineno = 1; my $upcoming_lines = 0; # Simpleminded check for stray ampersands and mis-typed entities. { my $ents = $html; $ents =~ s/&/\001&/gi; # Allow non-entity ampersands inside URLs. Technically illegal. $ents =~ s/\bHREF=\"[^\"\n]+\"//gsi; my @ents = split(m/\001/, $ents); my $count = 0; foreach my $e (@ents) { $lineno += $upcoming_lines; $upcoming_lines = -1; foreach (split ("\n", $e)) { $upcoming_lines++; } next if ($count++ == 0); if ($e !~ m/^&([a-z]+\d*|#\d\d+|#x[\da-f][\da-f]+);/si) { $e =~ s/^([^\s]*\s+[^\s<>]*)\b.*$/$1/s; error ("$filename: $lineno: non-entity ampersand: \"$e\""); } } } $html =~ s/\n/\001 /gs; $html =~ s/\s]+)\s*([^<>]*)/; next unless defined ($tag); $tag = uc($tag); error ("$filename: $lineno: broken tag: <$tag>") if (m/^<[^<>]*(<|$)/ || # < closed by < or EOL m/^<[^<>\"]*\"[^<>\"]*(>|$)/); # only one " before > or EOL my $t2 = $tag; $t2 =~ s@^/@@; my $code = $allowed_tags{$t2}; if ($tag =~ m@^/@) { # closing a tag my $otag = pop @stack; my $popped = 1; if (!defined ($otag)) { error ("$filename: $lineno: extranious <$tag>"); } elsif ($tag ne "/$otag") { my $t2 = $tag; $t2 =~ s@^/@@; $code = $allowed_tags{$t2}; if ($code && $code == 2) { error ("$filename: $lineno: unexpected close-tag form: <$tag>"); } elsif ($code && $code == 3) { # Optional close tag (e.g.,

, ). # This isn't our closer. Put it back. push @stack, $otag; $popped = 0; } else { error ("$filename: $lineno: <$otag> closed by <$tag>"); } } print STDERR "$lineno: " . ($popped ? "POP " : "SKIP") . " " . ('. ' x ($#stack+1)) . "/$otag\n" if ($debug); } elsif (! $code) { error ("$filename: $lineno: unknown tag <$tag>"); } else { # if ($tag eq 'TD' && $attrs !~ m/\bALIGN=/) { # print STDERR "$filename: $lineno: <$tag> has no ALIGN attribute\n"; # } my $otag = $stack[$#stack] || ''; if ( ($tag eq 'TD' && $otag ne 'TR') || ($tag eq 'TH' && $otag ne 'TR') || ($tag eq 'TR' && $otag ne 'TABLE') || ($tag eq 'DT' && $otag ne 'DL') || ($tag eq 'DD' && $otag ne 'DL') || ($tag eq 'FRAME' && $otag ne 'FRAMESET') || ($otag eq 'TR' && $tag ne 'TD' && $tag ne 'TH') || ($otag eq 'TABLE' && $tag ne 'TR') || ($otag eq 'FRAMESET' && $tag ne 'FRAME' && $tag ne 'FRAMESET') || ($otag eq 'OPTION' && $tag eq 'OPTION') || ($otag eq 'LI' && $tag eq 'LI') || ($otag eq 'A' && $tag eq 'A') || ($otag eq 'A' && $tag eq 'UL') || ($otag eq 'A' && $tag eq 'OL') # || ($otag ne 'HEAD' && $tag eq 'STYLE') ) { error ("$filename: $lineno: <$tag> not allowed inside <$otag>"); } if ($code == 2 || $code == 3) { # ignore these -- don't insist on closing tags. print STDERR "$lineno: SKIP " . ('. ' x ($#stack+1)) . "$tag\n" if ($debug); } else { # opening a new tag print STDERR "$lineno: PUSH " . ('. ' x ($#stack+1)) . "$tag\n" if ($debug); push @stack, $tag; } } } if ($#stack >= 0) { error ("$filename: $lineno: unclosed tags: <" . join (">, <", @stack) . ">"); } } # expands the first URL relative to the second. # sub expand_url($$) { my ($url, $base) = @_; $url =~ s/^\s+//gs; # lose whitespace at front and back $url =~ s/\s+$//gs; if ($url =~ m@^[a-z]+:|^//@si) { print STDERR "$progname: absolute URL: $url\n" if ($verbose > 5); } else { $base =~ s@(\#.*)$@@; # strip anchors $base =~ s@(\?.*)$@@; # strip arguments $base =~ s@/[^/]*$@/@; # take off trailing file component my $tail = ''; if ($url =~ s@(\#.*)$@@) { $tail = $1; } # save anchors if ($url =~ s@(\?.*)$@@) { $tail = "$1$tail"; } # save arguments my $base2 = $base; $base2 =~ s@^(([a-z]+:)?//[^/]+)/.*@$1@si # url is an absolute path if ($url =~ m@^/@); my $ourl = $url; $url = $base2 . $url; $url =~ s@/\./@/@g; # expand "." 1 while ($url =~ s@/[^/]+/\.\./@/@); # expand ".." $url .= $tail; # put anchors/args back print STDERR "$progname: relative URL: $ourl --> $url\n" if ($verbose > 4); } return $url; } my %image_size_cache = (); sub image_size($;$) { my ($file, $body) = @_; my $cache = $image_size_cache{$file}; return @{$cache} if $cache; my $file2 = $file; if ($file =~ m@^[a-z]+://@si) { error ("$file: no body") unless $body; my ($suf) = ($file =~ m@\.([a-z\d]+)$@si); $file2 = sprintf ("%s/img-%08x.%s", ($ENV{TMPDIR} ? $ENV{TMPDIR} : "/tmp"), rand(0xFFFFFFFF), $suf); open (my $out, '>', $file2) || error ("$file2: $!"); print $out $body; close $out; print STDERR "$progname: $file: wrote $file2\n" if ($verbose > 5); } else { error ("$file does not exist") unless -f $file; return (0, 0) unless -f $file; } my $cmd = ("identify -format '%[width]x%[height]' '${file2}[0]'"); print STDERR "$progname: executing: $cmd\n" if ($verbose > 4); my $result = `$cmd`; print STDERR "$progname: ==> $result\n" if ($verbose > 4); unlink $file2 unless ($file eq $file2); my ($w, $h) = ($result =~ m/^(\d+)x(\d+)$/); error ("no size: $file") unless ($w && $h); my @c = ($w, $h); $image_size_cache{$file} = \@c; return ($w, $h); } # Checks that the IMG tag has width and height matching the image. # sub validate_image($$$$) { my ($filename, $img, $body, $tag_opts) = @_; return if ($img =~ m@(^|/)logo\.gif$@s); # Kludge return if ($img =~ m@(^|/)(facebook|twitter)\.png$@s); # Kludge my ($fw, $fh) = image_size ($img, $body); my ($tw) = ($tag_opts =~ m@\b WIDTH \s* = \s* [\"\']? (\d+)@six); my ($th) = ($tag_opts =~ m@\b HEIGHT \s* = \s* [\"\']? (\d+)@six); ($tw) = ($tag_opts =~ m@\b width: \s* (\d+) \s* px \b@six) unless $tw; ($th) = ($tag_opts =~ m@\b height: \s* (\d+) \s* px \b@six) unless $th; if (!$tw || !$th) { # error ("$filename: $img: missing width/height (${fw}x$fh)") print STDERR "$progname: $filename: $img: missing width/height (${fw}x$fh)\n"; } elsif ($fw == $tw && $fh == $th) { print STDERR "$progname: $filename: size good (${tw}x$th)\n" if ($verbose > 3); } else { # error ("$filename: ${fw}x$fh, not ${tw}x$th"); print STDERR "$progname: $filename: $img: ${fw}x$fh, not ${tw}x$th\n"; } } # Hit the web site to check whether the URL exists. # my %validate_cache; sub validate_url($$$$) { my ($filename, $url, $images_p, $tag_opts) = @_; $url =~ s@#.*$@@s; # no anchors my $ret = $validate_cache{$url}; if (defined ($ret)) { print STDERR "$progname: $filename: cached: $url\n" if ($verbose > 4); } else { my $ua = LWP::UserAgent->new; $ua->agent("$progname/$version"); print STDERR "$progname: $filename: checking: $url\n" if ($verbose > 3); $images_p = 0 if ($images_p && $url !~ m@\.(p?jpe?g|gif|png)$@si); my $res = ($images_p ? $ua->get ($url) : $ua->head ($url)); $ret = ($res && $res->code) || 'null'; $validate_cache{$url} = $ret; if ($ret ne '200') { print STDERR "$progname: $filename: $url: status: $ret\n"; return; } validate_image ($filename, $url, $res->content, $tag_opts) if ($images_p); } } # Checks for broken A and IMG tags. # $urls_p: whether to validate URLs, or only relative links to local files. # 1 = only on the local site; # 2 = validate all URLs. # sub validate_links($$$$) { my ($filename, $html, $urls_p, $images_p) = @_; my @errors = (); return if ($filename eq 'header.html'); # Kludge. $html =~ s///gs; foreach (split (m/]*)>/si; my ($tag, $opts) = ($1, $2); my ($kind, $q, $url) = ($opts =~ m/(SRC|HREF|ONCLICK)=(["'])([^<>]+?)\2/si); next if ($opts =~ m/\b(ONCLICK)=/si); if (! $url) { print STDERR "$progname: $filename: skip: $tag $opts\n" if ($verbose > 3); next if ($opts =~ m/\b(NAME|TITLE)=/si); error ("$filename: unparsable " . uc($tag) . " tag: $opts"); } if ($url =~ m@^[a-z]+:|^/@si) { # has protocol or is web-rooted if ($filename =~ m@^[a-z]+:@si) { # base is a URL too $url = expand_url ($url, $filename); } else { if ($url !~ m@^[a-z]+:@si) { # no base to expand against print STDERR "$progname: $filename: skip absolute URL: $url\n" if ($verbose > 2); next; } } } if ($url =~ m@^(mailto|webcal|about|ftp|news|javascript|mms):@si || (!$urls_p && $url =~ m@^(https?):@si)) { print STDERR "$progname: $filename: skip URL: $url\n" if ($verbose > 2); next; } my $f2 = $filename; $f2 =~ s@/[^/]*$@/@s; $f2 =~ s@^[^/]*$@@s; if ($url =~ m@^[a-z]+:|^/@s) { $f2 = $url; } else { $f2 .= $url; } 1 while ($f2 =~ s@[^/]+[^./]/\.\./@@s); $f2 =~ s@\#.*$@@s; $f2 =~ s@\?.*$@@s; next if ($f2 eq ''); # points at "./" next if ($f2 eq '/'); if ($f2 =~ m@\.\./@s) { push @errors, "$filename: too many ..'s in \"$url\""; next; } my $this_images_p = ($images_p && $tag =~ m/^IMG$/si); if ($urls_p && $f2 =~ m@^([a-z]+://[^/]+/?)@si) { if ($urls_p > 1 || $filename =~ m@^\Q$1@s) { # URLs on same site validate_url ($filename, $f2, $this_images_p, $opts); } else { print STDERR "$progname: $filename: skip remote URL: $url\n" if ($verbose > 2); } next; } # Relative URL, but we're validating HTML from a URL. next if ($f2 =~ m@^https?:@si && $filename =~ m@^https?:@si); # Kludge next if ($f2 =~ m@^(calendar|flyers|gallery|backstage/log)(/\.)?/latest\.html$@s); next if ($f2 eq 'calendar/index.html'); next if ($f2 eq 'webcast/archive/'); next if ($f2 eq 'webcast/mixtapes/'); next if ($f2 eq 'backstage/src/archiver/archive.rss'); next if ($f2 eq '/dnalounge.css'); $f2 =~ s/%20/ /g; $f2 =~ s/&/&/g; if (! ($f2 =~ m@^(.*)/$@s ? -d $1 : -f $f2)) { push @errors, "$filename: $f2 does not exist"; next; } print STDERR "$progname: $filename: link: $f2\n" if ($verbose > 2); validate_image ($filename, $f2, undef, $opts) if ($this_images_p); } if (@errors) { my $err = shift @errors; foreach (@errors) { $err .= "\n$progname: $_"; } error ($err); } } sub error($) { my ($err) = @_; print STDERR "$progname: $err\n"; exit 1; } sub usage() { print STDERR "usage: $progname [--verbose] [--urls] [--images] files...\n"; exit 1; } sub main() { my @files = (); my $urls_p = 0; my $images_p = 0; while ($_ = $ARGV[0]) { shift @ARGV; if ($_ eq "--verbose") { $verbose++; } elsif (m/^-v+$/) { $verbose += length($_)-1; } elsif (m/^--?urls?$/s) { $urls_p++; } elsif (m/^--?images?$/s) { $images_p++; } elsif (m/^-./) { usage; } else { push @files, $_; } } usage unless ($#files >= 0); foreach my $file (@files) { my $body = ''; if ($file eq '-') { print STDERR "$progname: reading stdin\n" if ($verbose); local $/ = undef; # read entire file $body = <>; $file = 'stdin'; } elsif (-f $file) { open (my $in, '<', $file) || error ("$file: $!"); print STDERR "$progname: reading $file\n" if ($verbose); local $/ = undef; # read entire file $body = <$in>; close $in; } elsif ($file =~ m/^https?:/) { print STDERR "$progname: reading $file\n" if ($verbose); $LWP::Simple::ua->agent("$progname/$version"); $body = LWP::Simple::get ($file); error ("no data: $file") if (!$body || $body =~ m/^\s*$/s); } else { error ("not a file or URL: $file"); } validate_html ($file, $body); validate_links ($file, $body, $urls_p, $images_p); } } main(); exit 0;