#!/usr/bin/perl -w # Copyright © 2002-2008 Jamie Zawinski # Constructs a web page out of a set of RSS files. # # Permission to use, copy, modify, distribute, and sell this software and its # documentation for any purpose is hereby granted without fee, provided that # the above copyright notice appear in all copies and that both that # copyright notice and this permission notice appear in supporting # documentation. No representations are made about the suitability of this # software for any purpose. It is provided "as is" without express or # implied warranty. # # Created: 21-Nov-2002. require 5; use diagnostics; use strict; use POSIX; my $progname = $0; $progname =~ s@.*/@@g; my $version = q{ $Revision: 1.25 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/; my $verbose = 0; my $max_rss_file_entries = 300; my $max_total_entries = 4000; my $html_title = "jwz portal"; my $html_header = ("\n" . "$html_title\n" . "\n" . "\n" . "\n" . "\n" . "

$html_title

\n" ); my %entity_table = ( "apos" => '\'', "quot" => '"', "amp" => '&', "lt" => '<', "gt" => '>', "nbsp" => ' ', "iexcl" => '¡', "cent" => '¢', "pound" => '£', "curren" => '¤', "yen" => '¥', "brvbar" => '¦', "sect" => '§', "uml" => '¨', "copy" => '©', "ordf" => 'ª', "laquo" => '«', "not" => '¬', "shy" => '­', "reg" => '®', "macr" => '¯', "deg" => '°', "plusmn" => '±', "sup2" => '²', "sup3" => '³', "acute" => '´', "micro" => 'µ', "para" => '¶', "middot" => '·', "cedil" => '¸', "sup1" => '¹', "ordm" => 'º', "raquo" => '»', "frac14" => '¼', "frac12" => '½', "frac34" => '¾', "iquest" => '¿', "Agrave" => 'À', "Aacute" => 'Á', "Acirc" => 'Â', "Atilde" => 'Ã', "Auml" => 'Ä', "Aring" => 'Å', "AElig" => 'Æ', "Ccedil" => 'Ç', "Egrave" => 'È', "Eacute" => 'É', "Ecirc" => 'Ê', "Euml" => 'Ë', "Igrave" => 'Ì', "Iacute" => 'Í', "Icirc" => 'Î', "Iuml" => 'Ï', "ETH" => 'Ð', "Ntilde" => 'Ñ', "Ograve" => 'Ò', "Oacute" => 'Ó', "Ocirc" => 'Ô', "Otilde" => 'Õ', "Ouml" => 'Ö', "times" => '×', "Oslash" => 'Ø', "Ugrave" => 'Ù', "Uacute" => 'Ú', "Ucirc" => 'Û', "Uuml" => 'Ü', "Yacute" => 'Ý', "THORN" => 'Þ', "szlig" => 'ß', "agrave" => 'à', "aacute" => 'á', "acirc" => 'â', "atilde" => 'ã', "auml" => 'ä', "aring" => 'å', "aelig" => 'æ', "ccedil" => 'ç', "egrave" => 'è', "eacute" => 'é', "ecirc" => 'ê', "euml" => 'ë', "igrave" => 'ì', "iacute" => 'í', "icirc" => 'î', "iuml" => 'ï', "eth" => 'ð', "ntilde" => 'ñ', "ograve" => 'ò', "oacute" => 'ó', "ocirc" => 'ô', "otilde" => 'õ', "ouml" => 'ö', "divide" => '÷', "oslash" => 'ø', "ugrave" => 'ù', "uacute" => 'ú', "ucirc" => 'û', "uuml" => 'ü', "yacute" => 'ý', "thorn" => 'þ', "yuml" => 'ÿ', "ndash" => '-', "mdash" => "--" ); # converts HTML entities back to the raw characters. # sub html_unquote($) { my ($html) = @_; if ($html =~ m@^\s*\s*$@s) { # bad craziness $html = $1; } else { $html =~ s/&([a-z\d]+);/ { my $e = $entity_table{$1}; if (defined ($e)) { $e; } else { print STDERR "$progname: unknown entity &$1;\n"; "&$1;"; } } /xige; } return $html; } # Read the HTML file, if it exists, and return a list of the entries in it. # See also, rss_to_html, which is the generator for the thing we're parsing. # sub parse_portal_html($) { my ($html_file) = @_; return () unless (-f $html_file); local *IN; open (IN, "<$html_file") || error ("$html_file: $!"); my $body = ''; while () { $body .= $_; } close IN; $body =~ s@^.*?(]*\bCLASS=\"entry\"[^<>]*>)@$1@is; # strip head $body =~ s@\s*]*\bCLASS=\"footer\"[^<>]*>.*$@@is; # strip tail $body =~ s@(]*\bCLASS=\"entry\"[^<>]*>)@\001\001\001$1@gsi; my @hentries = split (/\001\001\001/, $body); my @hentries2 = (); # this crud is to strip out empty entries foreach (@hentries) { next if (m/^$/); push @hentries2, $_; if ($verbose > 4) { my $e = "$_"; $e =~ s/\n/\\n/gs; print STDERR "$progname: parsed HTML entry: $e\n"; } } return @hentries2; } # Parses an RSS file and returns a list of RSS-item objects. # sub parse_rss($) { my ($file) = @_; local *IN; open (IN, "<$file") || error ("$file: $!"); my $body = ''; while () { $body .= $_; } close IN; if ($body =~ m/^\s*$/gs) { print STDERR "$progname: $file is empty: skipping.\n"; return (); } if ($body =~ m/^\s*()@$2@is; # strip (and save) head my $head = $1; #$body =~ s@\s*.*$@@is; # strip tail $body =~ s@(<(ITEM|ENTRY))@\001\001\001$1@gsi; $_ = $head; m@(.*?)@is || error ("$file: unparsable channel title"); my $chan_title = $1; my $chan_url = undef; if (m@(.*?)@is) { $chan_url = $1; } if (m@]*?\bHREF=\"([^\"<>]+)\"@is) { $chan_url = $1; } if (m@(.*?)@is) { $_ = $1; my ($url) = m@(.*?)@is; my ($w) = m@(.*?)@is; my ($h) = m@(.*?)@is; if ($url) { $url = "/is); next if (m//is); next if (m/(.*?)@is || error ("$file: unparsable item (title)"); my ($title) = m@(.*?)@is; $title = '' unless defined ($title); my ($author) = m@(.*?)@is; $author = '' unless defined ($author); m@<(LINK|GUID)>(.*?)@is || m@<(LINK[^<>]*?)\bHREF=\"([^\"<>]+)\"@is || error ("$file: unparsable item url: $title"); my $url = $2; my $desc = ''; if (m@<(DESCRIPTION|CONTENT)\b[^<>]*>(.*?)@is) { $desc = $2; } my $itemref = { 'title' => $title, 'author' => $author, 'ctitle' => $chan_title, 'curl' => $chan_url, 'url' => $url, 'body' => $desc, }; print STDERR "$progname: $file: parse: " . ($title || $url) . "\n" if ($verbose > 2); push @items, $itemref; } return @items; } # returns true if the given RSS entry is present in the list of HTML entries. # sub entry_present($@) { my ($rss_entry, @html_entries) = @_; my $rss_url = $rss_entry->{'url'}; my $rss_body = html_unquote($rss_entry->{'body'}); error ("no url in rss entry: " . $rss_entry->{'title'}) unless ($rss_url); # For comparison purposes, strip all tags and non-alphabetics; downcase. $rss_body =~ s@<[^<>]*>@@gsi; $rss_body =~ s@[^a-z]@@gsi; $rss_body = lc($rss_body); foreach (@html_entries) { my ($body) = m@]*\bCLASS=\"rss_body\"[^<>]*>(.*?)@s; my ($url) = m@]*\bCLASS=\"entry_url\"[^<>]*HREF=\"([^\"<>]*)\">@i; if (! $url) { s/\n/\\n/g; s/^(.{100}).*$/$1/s; error ("no url in html entry: $_"); } # For comparison purposes, strip all tags and non-alphabetics; downcase. $body =~ s@<[^<>]*>@@gsi; $body =~ s@[^a-z]@@gsi; $body = lc($body); my $url_p = ($url eq $rss_url); my $body_p = ($body eq $rss_body); if ($url_p && $body_p) { print STDERR "$progname: URL+body present: $url\n" if ($verbose > 2); # both match - this is a dup. return 1; } elsif ($body_p) { print STDERR "$progname: body present but URL different: $url\n" if ($verbose > 2); # URL changed, but body is the same - this is a dup. return 1; } elsif ($url_p) { print STDERR "$progname: URL present but body different: $url\n" if ($verbose > 2); # URL is a dup, but body changed - this is probably a dup. return 1; } } print STDERR "$progname: new entry: " . ($rss_entry->{'title'} || $rss_entry->{'url'}) . "\n" if ($verbose > 3); return 0; } # Write the HTML entries to the file. # sub write_html($$@) { my ($file, $new_count, @hentries) = @_; my $body = ''; my $count = 0; my $head = $html_header; my $date = strftime ("
%a, %e %b %l:%M%p", localtime); $head =~ s@(]*>)(.*?)(]*>)@$1$2$date$3@; $body .= "$head\n"; foreach my $hentry (@hentries) { $body .= "$hentry\n"; $count++; } $body =~ s/(\n\n)\n+/$1/gs; local *OUT; open (OUT, ">$file") || error ("$file: $!"); print OUT $body; close OUT || error ("$file: $!"); print STDERR "$progname: wrote $file ($count entries, $new_count new)\n" if ($verbose); } # Converts an RSS entry object to an HTML string. # Note that parse_portal_html parses this text. # sub rss_to_html($) { my ($entry) = @_; my $title = $entry->{'title'}; my $ctitle = $entry->{'ctitle'}; my $url = $entry->{'url'}; my $curl = $entry->{'curl'}; my $body = $entry->{'body'}; my $author = $entry->{'author'}; my $cdate = strftime ("%a, %e %b
%l:%M%p", localtime); $body = html_unquote ($body); $title = html_unquote ($title); $title = "$title" if ($url); $ctitle = "$ctitle" if ($curl); $author .= ": " if $author; $body = ("
\n" . " \n" . " \n" . " \n" . "
" . "$ctitle" . "$cdate" . "$author" . "$title" . "
" . "
$body
\n" . "
\n" . "

\n\n"); return $body; } # Read each of the RSS input files, and update the output html file. # If an entry is already present in the HTML file, it is not added again. # New entries are added at the top. # sub portalize($@) { my ($outfile, @infiles) = @_; print STDERR "$progname: reading $outfile...\n" if ($verbose > 1); my @old_entries = parse_portal_html ($outfile); my @new_entries = (); my $changed_p = 0; foreach my $file (@infiles) { my @rss_entries = parse_rss ($file); if ($#rss_entries > $max_rss_file_entries) { print STDERR "$progname: $file: $#rss_entries entries; ". "truncating to $max_rss_file_entries.\n" if ($verbose > 2); @rss_entries = @rss_entries[0 .. $max_rss_file_entries-1]; } foreach my $entry (@rss_entries) { if (entry_present ($entry, @old_entries)) { print STDERR "$progname: $file: skipping: " . ($entry->{'title'} || $entry->{'url'}) . "\n" if ($verbose > 2); } else { print STDERR "$progname: $file: adding: " . ($entry->{'title'} || $entry->{'url'}) . "\n" if ($verbose > 1); push @new_entries, rss_to_html ($entry); $changed_p++; } } } if ($#old_entries + $#new_entries + 1 > $max_total_entries) { my $n = ($max_total_entries - $#new_entries - 2); $n = 0 if ($n < 0); print STDERR "$progname: $outfile: " . ($#old_entries + $#new_entries + 1) . " total entries; ". "truncating to $n old entries.\n" if ($verbose > 2); @old_entries = @old_entries[0 .. $n]; if ($#old_entries + $#new_entries + 1 > $max_total_entries) { print STDERR "$progname: $outfile: truncating to $n new entries.\n" if ($verbose > 2); @new_entries = @new_entries[0 .. $max_total_entries-1]; } } if ($changed_p) { write_html ($outfile, $#new_entries+1, (@new_entries, @old_entries)); } else { print STDERR "$progname: $outfile unchanged\n" if ($verbose > 1); } } sub error($) { my ($e) = @_; print STDERR "$progname: $e\n"; exit 1; } sub usage() { print STDERR "usage: $progname [--verbose] output.html [ input.rss ... ]\n"; exit 1; } sub main() { my @infiles = (); my $outfile = undef; while ($_ = $ARGV[0]) { shift @ARGV; if ($_ eq "--verbose") { $verbose++; } elsif (m/^-v+$/) { $verbose += length($_)-1; } elsif (m/^-./) { usage; } elsif (!defined ($outfile)) { $outfile = $_; } else { push @infiles, $_; } } usage unless ($#infiles >= 0); portalize ($outfile, @infiles); } main; exit 0;