#!/usr/bin/perl -w # Copyright © 2000 Jamie Zawinski , all rights reserved. # # Permission to use, copy, modify, distribute, and sell this software and its # documentation for any purpose is hereby granted without fee, provided that # the above copyright notice appear in all copies and that both that # copyright notice and this permission notice appear in supporting # documentation. No representations are made about the suitability of this # software for any purpose. It is provided "as is" without express or # implied warranty. # # Generates summaries of Unicode charset tables (e.g., for comparing and # contrasting Latin1 versus CP-1252.) # # Files you need: # ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT # ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT # ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT # http://www.w3.org/TR/html4/sgml/entities.html # # Created: 23-Jul-00. require 5; use diagnostics; use strict; my $progname = $0; $progname =~ s@.*/@@g; my $version = q{ $Revision: 1.1 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/; my $verbose = 0; my %name_map; sub get_short_names { my ($file) = @_; open (IN, "<$file") || die "$progname: opening $file: $!\n"; my $line = -1; my $body = ""; while () { $body .= $_; } close IN; $_ = $body; s/<[^>]+>//g; s/ / /g; s/<//g; s/&/&/g; s/[\r\n]+/ /gs; s/\n". "\n". "\n". "\n". "\n". "\n". "\n". "\n". "\n". "\n". "\n". "\n". "\n". "\n". "\n"; foreach (split (/\n/)) { $line++; next unless m/^.*/>/; if (! m@@){ print STDERR "$progname: $file \#$line unparsable: $_\n"; exit 1; } my ($entity, $n, $uname) = ($1, $2, $3); $uname =~ s@^([^=]+)=.*$@$1@; $uname =~ s@^([^,]+),.*$@$1@; $uname =~ s@\s*$@@; $uname =~ tr [a-z] [A-Z]; $uname =~ s/^APPLE LOGO$/Apple logo/; # Baah... $name_map{$uname} = $entity; $name_map{$entity} = $uname; } $name_map{""} = "NULL"; $name_map{"NULL"} = "NULL"; } sub map_name { my ($cname, $char) = @_; $_ = $cname; my $cap = m/capital/i; tr [A-Z] [a-z]; s/^latin //; s/^(capital|small|modifier) (letter|ligature) (.*)$/$3/; s/^vulgar //; s/^fraction //; s/ with /-/; s/ sign//; s/ mark//; s/ accent//; s/ indicator//; s/-pointing//; s/ quotation/ quote/; if ($cap) { s/(^[a-z]+)/\U$1/; } s/^([a-zA-Z])-/$1/; return $_; # $_ = $name_map{$cname}; # if (defined ($_)) { # $cname = $_; # } elsif ($char > 127) { # print STDERR "$progname: unknown character \"$cname\"\n"; # } } # returns an array (really, a list) of the character names in the file. # The Nth element of the list has character N's name in it. # sub parse_file { my ($name, $file) = @_; my @chars; local *IN; open (IN, "<$file") || die "$progname: opening $file: $!\n"; my $line = -1; # Read the file first so we can deal with fucking mac line terminators... my $body = ""; while () { $body .= $_; } close IN; $body =~ s/\r\r?\n/\n/gs; $body =~ s/\r/\n/gs; $_ = $body; foreach (split (/\n/)) { $line++; next if (m/^$/); next if (m/^\s*\#/); if (! m/^0x([0-9A-Z]+)\s+(0x([0-9A-Z]+))?\s*\#\s*([^\#]+)/i) { print STDERR "$progname: $file \#$line unparsable: $_\n"; exit 1; } my ($char, $uchar, $cname) = ($1, $3, $4); $uchar = 0 unless $uchar; $char = hex ($char); $uchar = hex ($uchar); $cname =~ s/^(null||undefined)$//i; $cname =~ s/ \([A-Za-z]+\)$//; $chars[$char] = map_name ($cname, $char); } $chars[0] = $name; return @chars; } # Takes a set of references to lists as arguments, and prints a comparison # of the characters in them. # sub compare_charsets { my @sets = @_; my $nsets = $#sets; my $good_summary = ""; my $bad_summary = ""; for (my $i = 0; $i <= 255; $i++) { next if ($i < 128); # don't compare ASCII characters. my @names; foreach my $set (@sets) { my @chars = @$set; my $c = $chars[$i]; $c = "" unless $c; push @names, $c; } my $ok = 1; for (my $i = 1; $i <= $#names; $i++) { if ($names[0] ne $names[$i]) { $ok = 0; last; } } if ($ok) { next if ($names[0] eq "NULL" || $names[0] eq ""); my $s = sprintf ("\t| %-24s %c %3d %02X %03o |\n", $names[0], $i, $i, $i, $i); $good_summary .= $s; } else { my @indexes; my $name = ""; my @n2 = @names; while ($name eq "NULL" || $name eq "") { $name = shift @n2; } foreach my $set2 (@sets) { my @chars2 = @$set2; my $ok = 0; for (my $j = 0; $j <= 255; $j++) { my $n = $chars2[$j]; $n = "" unless $n; if ($n eq $name) { push @indexes, $j; $ok = 1; last; } } if (! $ok) { push @indexes, 0; } } my $chars = ""; my $ords = ""; foreach my $j (@indexes) { if ($j == 0) { $chars .= "-"; $ords .= " --- -- ---"; } else { $chars .= sprintf ("%c", $j); $ords .= sprintf (" %3d %02X %03o", $j, $j, $j); } } my $s = sprintf ("| %-24s %s%s |\n", $name, $chars, $ords); $bad_summary .= $s; } } print "Problematic Characters:\n"; my $sep = "+-----------------------------"; my $sep2 = "| Name "; my $sep3 = "| "; foreach (@sets) { $sep .= "-"; $sep2 .= " "; $sep3 .= " "; } foreach (@sets) { my @chars = @$_; $sep .= "-------------"; $sep2 .= sprintf ("%11s ", $chars[0]); $sep3 .= " Dec/Hex/Oct"; } $sep .= "-+\n"; $sep2 .= " |\n"; $sep3 .= " |\n"; print $sep; print $sep2; print $sep3; print $sep; print "$bad_summary"; print $sep; print "\n\nIdentical Characters:\n"; print "\t+--------------------------------------------+\n"; print "\t| Name - Dec/Hex/Oct |\n"; print "\t+--------------------------------------------+\n"; print "$good_summary"; print "\t+--------------------------------------------+\n"; } sub main { # get_short_names "/tmp/entities.html"; my @latin1 = parse_file ("Latin1", "/tmp/8859-1.TXT"); my @windows = parse_file ("CP-1252", "/tmp/CP1252.TXT"); my @macroman = parse_file ("MacRoman", "/tmp/ROMAN.TXT"); compare_charsets (\@latin1, \@windows, \@macroman); # compare_charsets (\@latin1, \@windows); # compare_charsets (\@windows, \@macroman); } main;