#!/usr/bin/perl -w # Copyright © 2002, 2004, 2005 Jamie Zawinski # # Permission to use, copy, modify, distribute, and sell this software and its # documentation for any purpose is hereby granted without fee, provided that # the above copyright notice appear in all copies and that both that # copyright notice and this permission notice appear in supporting # documentation. No representations are made about the suitability of this # software for any purpose. It is provided "as is" without express or # implied warranty. # # Created: 1-Jun-2002. # # Takes an HTML document and converts it to the body of a # multipart/alternative mail message, with a text/plain first part, # and a text/html second part. # # The conversion of HTML to plain text handles most interesting tags: # it does nested indentation for UL, OL, BLOCKQUOTE, etc; it handles PRE; # it handles character entities; it wraps paragraphs. # # It also handles
: any text inside that scope will # have "> " prepended to the beginning of the text/plain line. # # Quoted-printable encoding will be used when necessary. When QP is used, # lines are broken at word boundaries instead of merely every 72 characters. # # This doesn't handle UTF-8 -- see the comment above the simplify_utf8() # function for what needs to be done. require 5; use diagnostics; use strict; use Text::Wrap; my $progname = $0; $progname =~ s@.*/@@g; my $version = q{ $Revision: 1.4 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/; my $verbose = 0; my %entity_table = ( "quot" => '"', "amp" => '&', "lt" => '<', "gt" => '>', "nbsp" => ' ', "iexcl" => '¡', "cent" => '¢', "pound" => '£', "curren" => '¤', "yen" => '¥', "brvbar" => '¦', "sect" => '§', "uml" => '¨', "copy" => '©', "ordf" => 'ª', "laquo" => '«', "not" => '¬', "shy" => '­', "reg" => '®', "macr" => '¯', "deg" => '°', "plusmn" => '±', "sup2" => '²', "sup3" => '³', "acute" => '´', "micro" => 'µ', "para" => '¶', "middot" => '·', "cedil" => '¸', "sup1" => '¹', "ordm" => 'º', "raquo" => '»', "frac14" => '¼', "frac12" => '½', "frac34" => '¾', "iquest" => '¿', "Agrave" => 'À', "Aacute" => 'Á', "Acirc" => 'Â', "Atilde" => 'Ã', "Auml" => 'Ä', "Aring" => 'Å', "AElig" => 'Æ', "Ccedil" => 'Ç', "Egrave" => 'È', "Eacute" => 'É', "Ecirc" => 'Ê', "Euml" => 'Ë', "Igrave" => 'Ì', "Iacute" => 'Í', "Icirc" => 'Î', "Iuml" => 'Ï', "ETH" => 'Ð', "Ntilde" => 'Ñ', "Ograve" => 'Ò', "Oacute" => 'Ó', "Ocirc" => 'Ô', "Otilde" => 'Õ', "Ouml" => 'Ö', "times" => '×', "Oslash" => 'Ø', "Ugrave" => 'Ù', "Uacute" => 'Ú', "Ucirc" => 'Û', "Uuml" => 'Ü', "Yacute" => 'Ý', "THORN" => 'Þ', "szlig" => 'ß', "agrave" => 'à', "aacute" => 'á', "acirc" => 'â', "atilde" => 'ã', "auml" => 'ä', "aring" => 'å', "aelig" => 'æ', "ccedil" => 'ç', "egrave" => 'è', "eacute" => 'é', "ecirc" => 'ê', "euml" => 'ë', "igrave" => 'ì', "iacute" => 'í', "icirc" => 'î', "iuml" => 'ï', "eth" => 'ð', "ntilde" => 'ñ', "ograve" => 'ò', "oacute" => 'ó', "ocirc" => 'ô', "otilde" => 'õ', "ouml" => 'ö', "divide" => '÷', "oslash" => 'ø', "ugrave" => 'ù', "uacute" => 'ú', "ucirc" => 'û', "uuml" => 'ü', "yacute" => 'ý', "thorn" => 'þ', "yuml" => 'ÿ', "ndash" => '-', "mdash" => "--" ); # Convert HTML character entities to their Latin1 equivalents. # sub de_entify($) { my ($text) = @_; # decimal entities $text =~ s/(&\#(\d+);?)/chr($2)/gexi; # named entities $text =~ s/(&([a-z]+);?)/ { my $c = $entity_table{$2}; print STDERR "$progname: warning: unknown HTML character entity \"$1\"\n" unless $c; ($c ? $c : "[$2]"); } /gexi; $text =~ s/\240/ /g; # nbsp return $text; } # Does a simplistic converstion of HTML to plain-text. # sub html_to_text($;$) { my ($html, $columns) = @_; $columns = 72 unless $columns; 1 while ($html =~ s///); # nuke comments $html =~ s/^\s+//gs; $html =~ s/\s+$//gs; $html = "$html\n"; # simplifies matches # first pass: convert
, since it's a pain in the ass.
  #
  {
    $html =~ s@(]*>)\n@$1@gs;   # swallow \n after 
    $html =~ s@\n(]*>)@$1@gs;  # swallow \n before 
my $pre = 0; my $html2 = ''; foreach (split (/]*>)(.*)$/s; if ($tag =~ m/^PRE\b/i) { $pre++; $body = "\001$body" if ($pre == 1); # kludge to mark pre blocks. } elsif ($pre && $tag =~ m@^/PRE\b@i) { $pre--; } if ($pre) { $body =~ s@\n@&\#10;@gs; # (don't use BR, so we don't compress it) $body =~ s@\t@ @gs; # FIXME: handle tab stops $body =~ s@ @\240@gs; # space -. nbsp } $html2 .= "<$tag$body"; } $html = $html2; } # # Now handle the more normal tags... # my $indent_tags = 'UL|OL|DL|BLOCKQUOTE'; my $swallow_tags = 'TITLE|SCRIPT|STYLE'; # tags with implicit

around them my $ptags = "PRE|H\\d|TABLE|$indent_tags"; $html =~ s@<($ptags)\b@

<$1@gio; $html =~ s@]*)>@

@gio; my $btags = 'LI|DT|DD|TR|TD'; # tags with implicit
before them $html =~ s@<($btags)\b@
<$1@gio; $html =~ s@\s+@ @gs; # compress all whitespace $html =~ s@

@

@sig; $html =~ s@(
\s*)(

-->

$html =~ s@(]*>\s*)+@$1@sig; # compress consecutive

s $html =~ s@(]*>)@

@sig; #

-->

$html =~ s@\s+(
)@$1@sig; # strip whitespace before
$html =~ s@(
)\s+@$1@sig; # strip whitespace after
$html =~ s@^\s+@@si; $html =~ s@\s+$@@si; my $indent = 0; my $cite = 0; my $swallow = 0; my $text = ''; foreach my $hpara (split (/]*>\s*/i, $html)) { my $para = ''; my $prefix2 = ''; foreach (split (/]*>)?(.*)$/s; $tag = '' unless $tag; if ($tag =~ m/^LI\b/i) { $body =~ s/^\s+//g; $body = "* $body"; $prefix2 = " "; } elsif ($tag =~ m@^($indent_tags)\b@io) { $indent++; if ($tag =~ m@BLOCKQUOTE\b.*TYPE\s*=\s*\"?CITE\b@i) { $cite = $indent; } } elsif ($indent && $tag =~ m@^/($indent_tags)\b@io) { $indent--; $prefix2 = ''; $cite = 0 if ($cite > $indent); } elsif ($tag =~ m@^($swallow_tags)\b@io) { $swallow++; } elsif ($swallow && $tag =~ m@^/($swallow_tags)\b@io) { $swallow--; $prefix2 = ''; } elsif ($tag =~ m@^IMG\b@io) { # my $alt = "[IMAGE]"; my $alt = ''; if ($tag =~ m@\bALT=\"([^\"]*)\"@ || $tag =~ m@\bALT=([^\"\s]*)\b@) { $alt = $1; } $body = "$alt$body"; } $body = '' if ($swallow); # inside a tag whose body is discarded $para .= $body; } $para =~ s/^\s+//g; my $pre_p = ($para =~ s/^\001//); # kludgey PRE marker... # wrap the paragraph unless it is empty, or was PRE. my $prefix = (" " x $indent); $prefix = "> $prefix" if ($cite); $prefix2 = $prefix . $prefix2; if ($pre_p || $para =~ m/^\s*$/) { $para = $prefix . $para; } else { $Text::Wrap::columns = $columns; $para = wrap ($prefix, $prefix2, $para); } $text .= "\n$para"; } $text .= "\n\n"; $text = de_entify ($text); $text =~ s/[ \t]+$//gm; # strip spaces from ends of lines... $text =~ s/^\n+//gs; # clean up any vertical whitespace mistakes... $text =~ s/\n\n+/\n\n/gs; # blah. kludge to delete consecutive blank quoted lines. $text =~ s/(\n>){3,}/$1$1/gs; return $text; } # Returns the charset used in the HTML document. # sub guess_charset($) { my ($html) = @_; if ($html =~ m/(]*>)/i) { $_ = $1; my ($charset) = m/CONTENT\s*=\s*\"[^\"]*\bCHARSET\s*=\s*\"?([^\"]+)\"?/i; return $charset if $charset; } return "ISO-8859-1"; } # The MIME standard says that the content-type on a part should be the # smallest charset that can represent the text. Therefore, if the # document is written in Latin1 but only contains ASCII characters, # the charset should be marked as US-ASCII. # # Similarly, a document should only be mailed as UTF-8 if it actually # contained characters from disjoint charsets: if the document contains # only Latin1 characters, it should be coded in Latin1. # # #### Someone needs to write this function to convert UTF-8 documents # #### to a form suitable for shipping through the mail. # sub simplify_utf8($) { my ($text) = @_; my $charset = "ISO-8859-1"; #### bzzt, wrong. return ($charset, $text); } # Construct a multipart/* mime message from the arguments. # arguments are: # # # [ ]+ # sub mime_pack(@) { my (@args) = @_; my $body = ""; my $mp_type = shift @args; my $boundary = sprintf ("%08x.%08x%08x", time(), rand(0xFFFFFF), rand(0xFFFFFF)); $body .= "Content-Type: $mp_type; boundary=\"$boundary\"\r\n"; $body .= "\r\n"; $body .= "This is a MIME-encapsulated message.\r\n"; while ($#args >= 0) { my $type = shift @args; my $enc = shift @args; my $cset = shift @args; my $part = shift @args; $part =~ s/\r\n/\n/g; # canonicalize to CRLF linebreaks $part =~ s/\r/\n/g; $part =~ s/\n/\r\n/g; $body .= "\r\n--$boundary\r\n"; $body .= "Content-Type: $type"; $body .= "; charset=\"$cset\"" if ($cset); $body .= "\r\n"; $body .= "Content-Transfer-Encoding: $enc\r\n" if ($enc); $body .= "Content-Length: " . length($part) . "\r\n"; $body .= "\r\n"; $body .= $part; } $body .= "\r\n--$boundary--\r\n"; return $body; } # Convert the text to quoted-printable. # sub quoted_printable($) { my ($text) = @_; # convert ctl chars, high-bit chars, and "=" to "=XX". $text =~ s/([=\000-\010\013\014\016-\037\177-\377])/ sprintf("=%02X", ord($1)) /gex; if ($text =~ m/^[^\n]{73}/m) { # if there are any long lines, wrap them. my @lines = (); foreach my $line (split(/\n/, $text)) { my $text2 = ''; my $col = 0; foreach my $chunk (split (/(\n|[ \t]+)/, $line)) { my $L = length($chunk); if ($col + $L >= 72) { $col = 0; $text2 .= "=\n"; } $col += $L; # take care of any words that are themselves too long $chunk = join ("=\n", split (/([^\n]{71})/, $chunk)); $text2 .= $chunk; } push @lines, $text2; } $text = join("\n", @lines); $text =~ s/(=\n)(=\n)+/$1/gs; # eliminate redundancy $text =~ s/^(=\n)//gm; } return $text; } # If the given text requires encoding to make it through mail, # then convert it to quoted-printable. Returns: # # , , # # The returned charset will be the provided charset, or "US-ASCII" # if it turns out that that will work. # sub mime_encode($$) { my ($text, $charset) = @_; my $enc = (($text =~ m/\200-\377/) ? "8-bit" : "7-bit"); $charset = "ISO-8859-1" unless ($charset); if (! ($text =~ m/\200-\377/)) { # how nice, only ASCII characters! $charset = "US-ASCII"; # Speak English or die. } if ($charset =~ m/^UTF-8$/i) { ($charset, $text) = simplify_utf8 ($text); } my $need_to_encode_p = (($text =~ m/[\000]/) || # if there are any nulls, encode. ($text =~ m/^[^\n]{900}/m)); # if any line is >900 chars, encode. if ($need_to_encode_p) { $text = quoted_printable ($text); $enc = "quoted-printable"; } return ($text, $enc, $charset); } # Construct and return a multipart/alternative message from the # given files, which are assumed to contain HTML. (The contents # of the files are all concatenated together.) # sub make_html_multalt(@) { my (@files) = @_; my $html = ""; foreach my $file (@files) { local *IN; open (IN, "<$file") || error ("$file: $!"); print STDERR "$progname: reading $file\n" if ($verbose); local $/ = undef; # read entire file $html = ; close IN; } my $text = html_to_text ($html); my $cs = guess_charset ($html); my ($html_encoded, $html_encoding, $html_charset) = mime_encode ($html, $cs); my ($text_encoded, $text_encoding, $text_charset) = mime_encode ($text, $cs); return mime_pack("multipart/alternative", "text/plain", $text_encoding, $text_charset, $text_encoded, "text/html", $html_encoding, $html_charset, $html_encoded); } sub error($) { my ($err) = @_; print STDERR "$progname: $err\n"; exit 1; } sub usage() { print STDERR "usage: $progname [--verbose] html-files ... > message\n"; exit 1; } sub main() { my @files; while ($_ = $ARGV[0]) { shift @ARGV; if ($_ eq "--verbose") { $verbose++; } elsif (m/^-v+$/) { $verbose += length($_)-1; } elsif (m/^-./) { usage; } else { push @files, $_; } } usage if ($#files < 0); my $msg = make_html_multalt (@files); print STDOUT "$msg\n"; } main(); exit 0;