#!/usr/bin/perl -w # Copyright © 2003-2008 Jamie Zawinski # # Permission to use, copy, modify, distribute, and sell this software and its # documentation for any purpose is hereby granted without fee, provided that # the above copyright notice appear in all copies and that both that # copyright notice and this permission notice appear in supporting # documentation. No representations are made about the suitability of this # software for any purpose. It is provided "as is" without express or # implied warranty. # # Created: 21-Dec-2003. require 5; use diagnostics; use strict; my $progname = $0; $progname =~ s@.*/@@g; my $version = q{ $Revision: 1.18 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/; my $verbose = 0; # The HTML validator generates an error if we use a tag that isn't # in this list (in order to detect typos.) # "1" means balanced tag; "2" means no close tag expected. # my %allowed_tags = ( "A" => 1, "AREA" => 2, "B" => 1, "BASE" => 2, "BLOCKQUOTE" => 1, "BODY" => 1, "BR" => 2, "DD" => 1, "DIV" => 1, "DL" => 1, "DT" => 1, "FONT" => 1, "FORM" => 1, "FRAMESET" => 1, "FRAME" => 2, "H1" => 1, "H2" => 1, "H3" => 1, "H4" => 1, "HEAD" => 1, "HR" => 2, "HTML" => 1, "I" => 1, "IMG" => 2, "INPUT" => 2, "LI" => 1, "LINK" => 2, "MAP" => 1, "MENUBASE" => 2, "META" => 2, "NOBR" => 1, "NOFRAMES" => 1, "NOSCRIPT" => 1, "OL" => 1, "OPTION" => 1, "P" => 2, "PRE" => 1, "SCRIPT" => 1, "SELECT" => 1, "SPAN" => 1, "STRIKE" => 1, "STYLE" =>1, "SUP" => 1, "TABLE" => 1, "TEXTAREA" => 1, "TD" => 1, "TH" => 1, "TITLE" => 1, "TR" => 1, "TT" => 1, "U" => 1, "UL" => 1, "WBR" => 2, "LJ" => 2, "LJ-CUT" => 2, "LJ-TEMPLATE" => 1, "BLINK" => 1, "SUB" => 1, "SUP" => 1, "MARQUEE" => 1, "LJ-POLL" => 1, "LJ-PQ" => 1, "LJ-PI" => 1, "OBJECT" => 1, "PARAM" => 1, "EMBED" => 1, "LJ-EMBED" => 1, "HYPE" => 2 ); # Does some simple syntax-checking on the HTML: makes sure tags are # balanced, etc. # sub validate_html($$) { my ($filename, $html) = @_; my $lineno = 1; my @stack = (); my $debug = ($verbose > 1); # Kludge for doctype: turn it into a comment. # $html =~ s@^(\s*]*>)@@s; # lose text inside comments (but keep the newlines, for line numbering)... # $html =~ s@()@{ my ($a, $b, $c) = ($1, $2, $3); $b =~ s/^.*$//gm; "$a$b$c"; }@gse; $html =~ s/()//g; # lose comment tags themselves # Simpleminded check for stray ampersands and mis-typed entities. { my $ents = $html; $ents =~ s/&/\001&/gi; $ents =~ s/\bHREF=\"[^\"]+\"//gsi; # kludge my @ents = split(m/\001/, $ents); shift @ents; foreach (@ents) { if (! m/^&([a-z]+\d*|#\d\d+);/si) { s/^([^\s]*\s+[^\s<>]*)\b.*$/$1/s; error ("$filename: $lineno: non-entity ampersand: \"$_\""); } } } $html =~ s/\n/\001 /gs; $html =~ s/\s]+)\s*([^<>]*)/; next unless defined ($tag); $tag =~ tr/a-z/A-Z/; error ("$filename: $lineno: broken tag: <$tag>") if (m/^<[^<>]*(<|$)/ || # < closed by < or EOL m/^<[^<>\"]*\"[^<>\"]*(>|$)/); # only one " before > or EOL $tag = 'P' if ($tag eq '/P'); # kludge... $tag = 'LJ-CUT' if ($tag eq '/LJ-CUT'); # kludge... my $code = $allowed_tags{$tag}; if ($tag =~ m@^/@) { # closing a tag my $otag = pop @stack; print STDERR "$lineno: POP " . ('. ' x ($#stack+1)) . "/$otag\n" if ($debug); if (!defined ($otag)) { error ("$filename: $lineno: extranious <$tag>"); } elsif ($tag ne "/$otag") { my $t2 = $tag; $t2 =~ s@^/@@; $code = $allowed_tags{$t2}; if ($code && $code == 2) { error ("$filename: $lineno: unexpected close-tag form: <$tag>"); } else { error ("$filename: $lineno: <$otag> closed by <$tag>"); } } } elsif (! $code) { error ("$filename: $lineno: unknown tag <$tag>"); } else { # if ($tag eq 'TD' && $attrs !~ m/\bALIGN=/) { # print STDERR "$filename: $lineno: <$tag> has no ALIGN attribute\n"; # } my $otag = $stack[$#stack] || ''; if (($tag eq 'TD' && $otag ne 'TR') || ($tag eq 'TH' && $otag ne 'TR') || ($tag eq 'TR' && $otag ne 'TABLE') || ($tag eq 'FRAME' && $otag ne 'FRAMESET') || ($otag eq 'TR' && $tag ne 'TD' && $tag ne 'TH') || ($otag eq 'TABLE' && $tag ne 'TR') || ($otag eq 'FRAMESET' && $tag ne 'FRAME' && $tag ne 'FRAMESET') || ($otag eq 'OPTION' && $tag eq 'OPTION') || ($otag eq 'LI' && $tag eq 'LI') || ($otag eq 'DD' && $tag eq 'DT') || ($otag eq 'DT' && $tag eq 'DD') || ($otag eq 'A' && $tag eq 'A') || ($otag eq 'A' && $tag eq 'UL') || ($otag eq 'A' && $tag eq 'OL') || ($otag ne 'HEAD' && $tag eq 'STYLE') ) { error ("$filename: $lineno: <$tag> not allowed inside <$otag>"); } if ($code == 2) { # ignore these -- don't insist on closing tags. print STDERR "$lineno: SKIP " . ('. ' x ($#stack+1)) . "$tag\n" if ($debug); } else { # opening a new tag print STDERR "$lineno: PUSH " . ('. ' x ($#stack+1)) . "$tag\n" if ($debug); push @stack, $tag; } } } if ($#stack >= 0) { error ("$filename: $lineno: unclosed tags: <" . join (">, <", @stack) . ">"); } } sub error($) { my ($err) = @_; print STDERR "$progname: $err\n"; exit 1; } sub usage() { print STDERR "usage: $progname [--verbose] files...\n"; exit 1; } sub main() { my @files = (); while ($_ = $ARGV[0]) { shift @ARGV; if ($_ eq "--verbose") { $verbose++; } elsif (m/^-v+$/) { $verbose += length($_)-1; } elsif (m/^-./) { usage; } else { push @files, $_; } } usage unless ($#files >= 0); foreach my $file (@files) { local *IN; open (IN, "<$file") || error ("$file: $!"); print STDERR "$progname: reading $file\n" if ($verbose); my $body = ''; while () { $body .= $_; } close IN; validate_html ($file, $body); } } main(); exit 0;