#!/usr/bin/perl -w # Copyright © 2003-2011 Jamie Zawinski # # Permission to use, copy, modify, distribute, and sell this software and its # documentation for any purpose is hereby granted without fee, provided that # the above copyright notice appear in all copies and that both that # copyright notice and this permission notice appear in supporting # documentation. No representations are made about the suitability of this # software for any purpose. It is provided "as is" without express or # implied warranty. # # Created: 21-Dec-2003. require 5; use diagnostics; use strict; my $progname = $0; $progname =~ s@.*/@@g; my $version = q{ $Revision: 1.32 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/; my $verbose = 0; # The HTML validator generates an error if we use a tag that isn't # in this list (in order to detect typos.) # "1" means balanced tag; "2" means no close tag expected. # my %allowed_tags = ( "A" => 1, "ABBR" => 1, "AREA" => 2, "B" => 1, "BASE" => 2, "BLOCKQUOTE" => 1, "BODY" => 1, "BR" => 2, "DD" => 1, "DIV" => 1, "DL" => 1, "DT" => 1, "FORM" => 1, "FRAMESET" => 1, "FRAME" => 2, "H1" => 1, "H2" => 1, "H3" => 1, "H4" => 1, "HEAD" => 1, "HR" => 2, "HTML" => 1, "I" => 1, "IMG" => 2, "INPUT" => 2, "LI" => 1, "LINK" => 2, "MAP" => 1, "MENUBASE" => 2, "META" => 2, "NOBR" => 1, "NOFRAMES" => 1, "NOSCRIPT" => 1, "OL" => 1, "OPTION" => 1, "P" => 2, "PRE" => 1, "SCRIPT" => 1, "SELECT" => 1, "SPAN" => 1, "STRIKE" => 1, "STYLE" =>1, "SUP" => 1, "TABLE" => 1, "TEXTAREA" => 1, "TD" => 1, "TH" => 1, "TITLE" => 1, "TR" => 1, "TT" => 1, "U" => 1, "UL" => 1, "WBR" => 2, "BLINK" => 1, "SUB" => 1, "SUP" => 1, "MARQUEE" => 1, "OBJECT" => 1, "PARAM" => 1, "EMBED" => 1, "HYPE" => 2, "CANVAS" => 1, "IFRAME", => 1, # "FONT" => 1, # "LJ" => 2, "LJ-CUT" => 2, "LJ-TEMPLATE" => 1, "LJ-EMBED" => 1, # "LJ-POLL" => 1, "LJ-PQ" => 1, "LJ-PI" => 1, # "FB:SWF" => 1, ); # Does some simple syntax-checking on the HTML: makes sure tags are # balanced, etc. # sub validate_html($$) { my ($filename, $html) = @_; my $lineno = 1; my @stack = (); my $debug = ($verbose > 1); # Kludge for doctype: turn it into a comment. # $html =~ s@^(\s*]*>)@@s; # Kludge for PHP embeds: turn them into comments. # # $html =~ s@(<\?.*?\?>)@@gs; # Kludge for self-closed tags like : map to # $html =~ s@(<([a-z:]+)\b[^<>]*?)/>@$1>@gsi; # lose text inside comments (but keep the newlines, for line numbering)... # $html =~ s@()@{ my ($a, $b, $c) = ($1, $2, $3); $b =~ s/^.*$//gm; "$a$b$c"; }@gse; $html =~ s/()//g; # lose comment tags themselves # Simpleminded check for stray ampersands and mis-typed entities. { my $ents = $html; $ents =~ s/&/\001&/gi; $ents =~ s/\bHREF=\"[^\"]+\"//gsi; # kludge my @ents = split(m/\001/, $ents); shift @ents; foreach (@ents) { if (! m/^&([a-z]+\d*|#\d\d+|#x[\da-f][\da-f]);/si) { s/^([^\s]*\s+[^\s<>]*)\b.*$/$1/s; error ("$filename: $lineno: non-entity ampersand: \"$_\""); } } } $html =~ s/\n/\001 /gs; $html =~ s/\s]+)\s*([^<>]*)/; next unless defined ($tag); $tag = uc($tag); error ("$filename: $lineno: broken tag: <$tag>") if (m/^<[^<>]*(<|$)/ || # < closed by < or EOL m/^<[^<>\"]*\"[^<>\"]*(>|$)/); # only one " before > or EOL $tag = 'P' if ($tag eq '/P'); # kludge... # $tag = 'LJ-CUT' if ($tag eq '/LJ-CUT'); # kludge... my $code = $allowed_tags{$tag}; if ($tag =~ m@^/@) { # closing a tag my $otag = pop @stack; print STDERR "$lineno: POP " . ('. ' x ($#stack+1)) . "/$otag\n" if ($debug); if (!defined ($otag)) { error ("$filename: $lineno: extranious <$tag>"); } elsif ($tag ne "/$otag") { my $t2 = $tag; $t2 =~ s@^/@@; $code = $allowed_tags{$t2}; if ($code && $code == 2) { error ("$filename: $lineno: unexpected close-tag form: <$tag>"); } else { error ("$filename: $lineno: <$otag> closed by <$tag>"); } } } elsif (! $code) { error ("$filename: $lineno: unknown tag <$tag>"); } else { # if ($tag eq 'TD' && $attrs !~ m/\bALIGN=/) { # print STDERR "$filename: $lineno: <$tag> has no ALIGN attribute\n"; # } my $otag = $stack[$#stack] || ''; if (($tag eq 'TD' && $otag ne 'TR') || ($tag eq 'TH' && $otag ne 'TR') || ($tag eq 'TR' && $otag ne 'TABLE') || ($tag eq 'DT' && $otag ne 'DL') || ($tag eq 'DD' && $otag ne 'DL') || ($tag eq 'FRAME' && $otag ne 'FRAMESET') || ($otag eq 'TR' && $tag ne 'TD' && $tag ne 'TH') || ($otag eq 'TABLE' && $tag ne 'TR') || ($otag eq 'FRAMESET' && $tag ne 'FRAME' && $tag ne 'FRAMESET') || ($otag eq 'OPTION' && $tag eq 'OPTION') || ($otag eq 'LI' && $tag eq 'LI') || ($otag eq 'A' && $tag eq 'A') || ($otag eq 'A' && $tag eq 'UL') || ($otag eq 'A' && $tag eq 'OL') || ($otag ne 'HEAD' && $tag eq 'STYLE') ) { error ("$filename: $lineno: <$tag> not allowed inside <$otag>"); } if ($code == 2) { # ignore these -- don't insist on closing tags. print STDERR "$lineno: SKIP " . ('. ' x ($#stack+1)) . "$tag\n" if ($debug); } else { # opening a new tag print STDERR "$lineno: PUSH " . ('. ' x ($#stack+1)) . "$tag\n" if ($debug); push @stack, $tag; } } } if ($#stack >= 0) { error ("$filename: $lineno: unclosed tags: <" . join (">, <", @stack) . ">"); } } # Checks for broken A and IMG tags; only checks local files (relative links). # sub validate_links($$) { my ($filename, $html) = @_; return if ($filename eq 'header.html'); # Kludge. $html =~ s///gs; foreach (split (m/]*)>/si; my ($tag, $opts) = ($1, $2); my ($kind, $q, $url) = ($opts =~ m/(SRC|HREF|ONCLICK)=(["'])([^<>\"]+)\2/si); next if ($opts =~ m/\b(ONCLICK)=/si); if (! $url) { print STDERR "$progname: menuify: $filename: skip: $tag $opts\n" if ($verbose > 3); next if ($opts =~ m/\b(NAME|TITLE)=/si); error ("$filename: unparsable $tag tag: $opts"); } next if ($url =~ m@^(https?|mailto|webcal|about|ftp|news|javascript):@si); # next if ($url =~ m@/$@si); next if ($url =~ '^/favicon\.'); next if ($url eq '/jwz.css'); # Kludge next if ($url eq '/gallery.css'); # Kludge my $f2 = $filename; $f2 =~ s@/[^/]*$@/@s; $f2 =~ s@^[^/]*$@@s; $f2 .= $url; 1 while ($f2 =~ s@[^/]+[^./]/\.\./@@s); $f2 =~ s@\#.*$@@s; $f2 =~ s@\?.*$@@s; next if ($f2 eq ''); # points at "./" next if ($f2 eq '/'); error ("$filename: too many ..'s in \"$url\"") if ($f2 =~ m@\.\./@s); # Kludge next if ($f2 =~ m@^(calendar|flyers|gallery|backstage/log)(/\.)?/latest\.html$@s); next if ($f2 eq 'calendar/index.html'); next if ($f2 eq 'webcast/archive/'); next if ($f2 eq 'webcast/mixtapes/'); next if ($f2 eq 'backstage/src/archiver/archive.rss'); next if ($f2 eq '/dnalounge.css'); $f2 =~ s/%20/ /g; $f2 =~ s/&/&/g; error ("$filename: $f2 does not exist") unless ($f2 =~ m@^(.*)/$@s ? -d $1 : -f $f2); print STDERR "$progname: menuify: $filename: link: $f2\n" if ($verbose > 2); } } sub error($) { my ($err) = @_; print STDERR "$progname: $err\n"; exit 1; } sub usage() { print STDERR "usage: $progname [--verbose] files...\n"; exit 1; } sub main() { my @files = (); while ($_ = $ARGV[0]) { shift @ARGV; if ($_ eq "--verbose") { $verbose++; } elsif (m/^-v+$/) { $verbose += length($_)-1; } elsif (m/^-./) { usage; } else { push @files, $_; } } usage unless ($#files >= 0); foreach my $file (@files) { error ("$file: not a file") unless -f $file; open (my $in, '<', $file) || error ("$file: $!"); print STDERR "$progname: reading $file\n" if ($verbose); local $/ = undef; # read entire file my $body = <$in>; close $in; validate_html ($file, $body); validate_links ($file, $body); } } main(); exit 0;