#!/usr/bin/perl -w # cheesegrater -- scrapes HTML from web sites into RSS feeds. # Copyright © 2002-2007 Jamie Zawinski # # Permission to use, copy, modify, distribute, and sell this software and its # documentation for any purpose is hereby granted without fee, provided that # the above copyright notice appear in all copies and that both that # copyright notice and this permission notice appear in supporting # documentation. No representations are made about the suitability of this # software for any purpose. It is provided "as is" without express or # implied warranty. # # Created: 20-Nov-2002. # # RSS doc: http://my.netscape.com/publish/formats/rss-spec-0.91.html require 5; #use diagnostics; # this screws up error messages when using eval/die. use strict; use POSIX; my $progname = $0; $progname =~ s@.*/@@g; my $version = q{ $Revision: 1.104 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/; my $progclass = 'CheeseGrater'; my $progurl = 'http://www.jwz.org/cheesegrater/'; my $verbose = 0; $ENV{PATH} .= ":/opt/local/bin:/sw/bin"; my $html_cache_dir = ".html"; my $rss_output_dir = ".rss"; my $max_entries = 150; my $rss_lang = "en"; my $rss_webmaster = "webmaster\@jwz.org"; my $rss_editor = $rss_webmaster; # How to parse sites. Entries in the table are: # # "URL" => [ parse_function, "site name", "site description", # "site logo image url", image_width, image_height, # expirey_minutes # ] # # The logo url is optional. # If the url has been checked within expirey_minutes, it's not checked again. # my %filter_table = ( "http://www.mitchclem.com/nothingnice/" => [ \&do_nothingnice, "Nothing Nice to Say", "Nothing Nice to Say", "http://www.mitchclem.com/nothingnice/images/nn2s_header.gif", 278, 100, 60 * 6 ], "http://www.daniellecorsetto.com/gws.html" => [ \&do_girlswithslingshots, "Girls With Slingshots", "Girls With Slingshots", "http://www.daniellecorsetto.com/images/gwsmenu/gwslogoheader.jpg", 172, 57, 60 * 6 ], "http://www.thismodernworld.com/" => [ \&do_thismodernworld_blog, "This Modern World", "This Modern World weblog, by Tom Tomorrow", "http://images.salon.com/comics/tomo/2002/10/28/tomo/lc.gif", 58, 50, 60 ], "http://dir.salon.com/topics/tom_tomorrow/" => [ \&do_thismodernworld_comic, "This Modern World", "This Modern World comic, by Tom Tomorrow", "http://images.salon.com/comics/tomo/2002/10/28/tomo/lc.gif", 58, 50, 60 ], "http://www.workingforchange.com/column_lst.cfm?AuthrId=43" => [ \&do_thismodernworld_comic2, "This Modern World", "This Modern World comic, by Tom Tomorrow", "http://www.workingforchange.com/webgraphics/WFC/sparky3.gif", 47, 46, 60 * 6 ], "http://www.straightdope.com/columns/" => [ \&do_straightdope, "The Straight Dope", "The Straight Dope, by Cecil Adams", undef, 0, 0, 60 * 6 ], "http://antwrp.gsfc.nasa.gov/apod/" => [ \&do_apod_inline, "Astronomy Picture of the Day", "Each day a different image or photograph of our " . "fascinating universe is featured, along with a " . "brief explanation written by a professional " . "astronomer.", "http://www.nasa.gov/images/hotnasa.gif", 78, 68, 60 * 6 ], "http://www.redmeat.com/redmeat/meatlocker/" => [ \&do_redmeat, "Red Meat", "Red Meat, from the secret files of Max Cannon", "http://www.redmeat.com/redmeat/images/rm_nav2.gif", 57, 108, 60 * 6 ], "http://www.space.com/news/" => [ \&do_space_com, "Space Dot Com", "Space news from space.com.", undef, 0, 0, 60 * 6 ], "http://www.catandgirl.com/" => [ \&do_catandgirl_inline, "Cat and Girl", "A small Girl, a large anthropomorphic Cat, a few " . "wacky adventures and some pretentious conversation. " . "Also Beatnik Vampires and Joseph Beuys. " . "Updated every monday.", undef, 0, 0, 60 * 6 ], "http://www.wtbw.net/geisha/" => [ \&do_geisha_asobi, "Geisha asobi blog", "Geisha asobi blog, by Asobi Tsuchiya", undef, 0, 0, 60 ], "http://www.pennandteller.com/03/coolstuff/roadpenn.html" => [ \&do_penn, "Penniphile", "Penn Jillette, the taller, louder half of Penn and Teller", undef, 0, 0, 60 * 6 ], "http://www.mnftiu.cc/mnftiu.cc/war.html" => [ \&do_gywo, "Get Your War On", "Get Your War On", "http://www.mnftiu.cc/mnftiu.cc/images/gywo_cover.gif", 120, 81, 60 * 6 ], "http://slashdot.org/" => [ \&do_slashdot, "Slashdot", # "Slashdot: News for \"nerds.\" Stuff that \"matters.\"", "Nothing to see here. Move along." . "<P>You want <lj user=\"slashdot\">.", undef, 0, 0, 60 ], "http://www.linkfilter.net/" => [ \&do_linkfilter, "LinkFilter", "A better-formatted (screen-scraped) feed of this site.", undef, 0, 0, 60 ], "http://www.creaturesinmyhead.com/creature.php" => [ \&do_creaturesinmyhead, "The Creatures in my Head", "By Andrew Bell.", undef, 0, 0, 60 * 6 ], "http://www.asofterworld.com/" => [ \&do_asofterworld, "a softer world", "a softer world", undef, 0, 0, 60 * 6 ], "http://www.bobharris.com/" => [ \&do_bobharris, "Bob Harris", "Bob Harris", undef, 0, 0, 60 ], "http://videos.antville.org/" => [ \&do_antville_videos, "videos.antville.org", "videos.antville.org", undef, 0, 0, 60 ], # This site has a feed at http://shes.aflightrisk.org/index.rdf # but it truncates at the first paragraph. "http://shes.aflightrisk.org/" => [ \&do_flightrisk, "She's a Flight Risk", "An International Fugitive.", undef, 0, 0, 60 ], "http://www.doodie.com/" => [ \&do_doodie, "doodie.com", "Shit, Poop and Crap Cartoons.", undef, 0, 0, 60 * 6 ], "http://feeds.feedburner.com/crooksandliars/YaCP" => [ \&do_crooks, "Crooks and Liars", "Crooks and Liars, minus 'Music' and 'Open Threads'", undef, 0, 0, 60 ], ); ############################################################################# my $inside_eval_p = 0; sub error($) { my ($e) = @_; if ($inside_eval_p) { # perl's exception handling sucks die $e; } else { print STDERR "$progname: $e\n"; exit 1; } } sub capitalize($) { my ($s) = @_; $s =~ s/_/ /g; # capitalize words, from the perl faq... $s =~ s/((^\w)|(\s\w))/\U$1/g; $s =~ s/([\w\']+)/\u\L$1/g; # lowercase the rest # conjuctions and other small words get lowercased $s =~ s/\b((a)|(and)|(in)|(is)|(it)|(of)|(the)|(for)|(on)|(to))\b/\L$1/ig; # initial and final words always get capitalized, regardless $s =~ s/^(\w)/\u$1/; $s =~ s/(\s)(\S+)$/$1\u\L$2/; return $s; } # expands the first URL relative to the second. # sub expand_url($$) { my ($url, $base) = @_; return ($url) unless defined($url); $url =~ s/^\s+//gs; # lose whitespace at front and back $url =~ s/\s+$//gs; $url =~ s@^//@http://@; # slashdot does this stupidity if (! ($url =~ m/^[a-z]+:/)) { $base =~ s@(\#.*)$@@; # strip anchors $base =~ s@(\?.*)$@@; # strip arguments $base =~ s@/[^/]*$@/@; # take off trailing file component my $tail = ''; if ($url =~ s@(\#.*)$@@) { $tail = $1; } # save anchors if ($url =~ s@(\?.*)$@@) { $tail = "$1$tail"; } # save arguments my $base2 = $base; $base2 =~ s@^([a-z]+:/+[^/]+)/.*@$1@ # if url is an absolute path if ($url =~ m@^/@); my $ourl = $url; $url = $base2 . $url; $url =~ s@/\./@/@g; # expand "." 1 while ($url =~ s@/[^/]+/\.\./@/@g); # expand ".." $url .= $tail; # put anchors/args back print STDERR "$progname: relative URL: $ourl --> $url\n" if ($verbose > 5); } else { print STDERR "$progname: absolute URL: $url\n" if ($verbose > 6); } return $url; } # converts all relative URLs in SRC= or HREF= to absolute URLs, # relative to the given base. # sub expand_urls($$) { my ($html, $base) = @_; return '' unless defined($html); $html =~ s/ '¡', "\xC2\xA2" => '¢', "\xC2\xA3" => '£', "\xC2\xA4" => '¤', "\xC2\xA5" => '¥', "\xC2\xA6" => '¦', "\xC2\xA7" => '§', "\xC2\xA8" => '¨', "\xC2\xA9" => '©', "\xC2\xAA" => 'ª', "\xC2\xAB" => '«', "\xC2\xAC" => '¬', "\xC2\xAD" => '­', "\xC2\xAE" => '®', "\xC2\xAF" => '¯', "\xC2\xB0" => '°', "\xC2\xB1" => '±', "\xC2\xB2" => '²', "\xC2\xB3" => '³', "\xC2\xB4" => '´', "\xC2\xB5" => 'µ', "\xC2\xB6" => '¶', "\xC2\xB7" => '·', "\xC2\xB8" => '¸', "\xC2\xB9" => '¹', "\xC2\xBA" => 'º', "\xC2\xBB" => '»', "\xC2\xBC" => '¼', "\xC2\xBD" => '½', "\xC2\xBE" => '¾', "\xC2\xBF" => '¿', "\xC3\x80" => 'À', "\xC3\x81" => 'Á', "\xC3\x82" => 'Â', "\xC3\x83" => 'Ã', "\xC3\x84" => 'Ä', "\xC3\x85" => 'Å', "\xC3\x86" => 'Æ', "\xC3\x87" => 'Ç', "\xC3\x88" => 'È', "\xC3\x89" => 'É', "\xC3\x8A" => 'Ê', "\xC3\x8B" => 'Ë', "\xC3\x8C" => 'Ì', "\xC3\x8D" => 'Í', "\xC3\x8E" => 'Î', "\xC3\x8F" => 'Ï', "\xC3\x90" => 'Ð', "\xC3\x91" => 'Ñ', "\xC3\x92" => 'Ò', "\xC3\x93" => 'Ó', "\xC3\x94" => 'Ô', "\xC3\x95" => 'Õ', "\xC3\x96" => 'Ö', "\xC3\x97" => '×', "\xC3\x98" => 'Ø', "\xC3\x99" => 'Ù', "\xC3\x9A" => 'Ú', "\xC3\x9B" => 'Û', "\xC3\x9C" => 'Ü', "\xC3\x9D" => 'Ý', "\xC3\x9E" => 'Þ', "\xC3\x9F" => 'ß', "\xC3\xA0" => 'à', "\xC3\xA1" => 'á', "\xC3\xA2" => 'â', "\xC3\xA3" => 'ã', "\xC3\xA4" => 'ä', "\xC3\xA5" => 'å', "\xC3\xA6" => 'æ', "\xC3\xA7" => 'ç', "\xC3\xA8" => 'è', "\xC3\xA9" => 'é', "\xC3\xAA" => 'ê', "\xC3\xAB" => 'ë', "\xC3\xAC" => 'ì', "\xC3\xAD" => 'í', "\xC3\xAE" => 'î', "\xC3\xAF" => 'ï', "\xC3\xB0" => 'ð', "\xC3\xB1" => 'ñ', "\xC3\xB2" => 'ò', "\xC3\xB3" => 'ó', "\xC3\xB4" => 'ô', "\xC3\xB5" => 'õ', "\xC3\xB6" => 'ö', "\xC3\xB7" => '÷', "\xC3\xB8" => 'ø', "\xC3\xB9" => 'ù', "\xC3\xBA" => 'ú', "\xC3\xBB" => 'û', "\xC3\xBC" => 'ü', "\xC3\xBD" => 'ý', "\xC3\xBE" => 'þ', "\xC3\xBF" => 'ÿ', "\xE2\x80\x93" => '--', "\xE2\x80\x94" => '--', "\xE2\x80\x98" => '`', "\xE2\x80\x99" => '\'', "\xE2\x80\x9C" => "``", "\xE2\x80\x9D" => "''", "\xE2\x80\xA6" => '...', ); # Convert any Unicode characters to Latin1 if possible. # Unconvertable bytes are left alone. # sub de_unicoddle($) { my ($text) = @_; foreach my $key (keys (%unicode_latin1_table)) { my $val = $unicode_latin1_table{$key}; $text =~ s/$key/$val/gs; } return $text; } # Runs wget to pull an updated copy of the given site into the cache # directory, unless we've done so very recently (according to the # expirey/freshness value in this URL's configuration.) # sub pull_html($) { my ($url) = @_; my $now = time; # find the expirey of this URL. This will error if it's an unknown URL. # my $expirey = undef; { my ($fn, $title, $desc, $rss_img, $rss_img_w, $rss_img_h); ($fn, $title, $desc, $rss_img, $rss_img_w, $rss_img_h, $expirey) = get_filter_data ($url); error ("no expirey for $url") unless (defined ($expirey)); } $_ = $url; s@/+$@@; s@^http://@@gi; s@[^-_a-z\d]@_@gi; my $file = $_; my $hfile = "$html_cache_dir/$_"; # check the expirey, and if we've checked this URL recently, don't # re-download the content. # { my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, $atime,$mtime,$ctime,$blksize,$blocks) = stat($hfile); if (!defined ($mtime)) { print STDERR "$progname: $hfile does not exist\n" if ($verbose > 3); } elsif ($size < 512) { print STDERR "$progname: $hfile is only $size bytes\n" if ($verbose > 3); } else { my $minutes_ago = int (($now - $mtime) / 60); if ($minutes_ago < $expirey) { print STDERR "$progname: $hfile: modified < $expirey ($minutes_ago) " . "minutes ago.\n" if ($verbose > 2); return $file; } else { print STDERR "$progname: $hfile last modified $minutes_ago " . "minutes ago.\n" if ($verbose > 3); } } } # my @cmd = ("wget", "-O", $hfile, "-N"); my @cmd = ("wget", "-O", $hfile); push @cmd, ($verbose > 2 ? "-nv" : "-q"); push @cmd, "--debug" if ($verbose > 4); push @cmd, $url; print "$progname: executing: " . join (" ", @cmd) . "\n" if ($verbose > 2); if (system (@cmd) == 0) { # Touch the file so that its date is the current time, so that we # know when we last polled it -- rather than the last date it had # on the server. (There doesn't seem to be an option to make # wget do this.) # utime ($now, $now, $hfile) || error ("$hfile: touch: $!"); print STDERR "$progname: wrote $hfile\n" if ($verbose > 2); } else { print STDERR "$progname: error retrieving $url\n"; unlink $hfile; $file = undef; } return $file; } # returns the parse rules associated with this URL: # ( parse_function, "site name", "site description", # "site logo image url", image_width, image_height, # expirey_minutes ) # sub get_filter_data($) { my ($url) = @_; my $ref = $filter_table{$url}; error ("unknown URL: $url") unless defined ($ref); return @{$ref}; } # Parses the given HTML into log entries, based on the parse function # associated with the URL. Returns a list of the form: # # ( "site-title" "site-desc" "site-logo-img" img_width img_height # "entry-url-1" "entry-date-1" "entry-title-1" "entry-body-1" # "entry-url-2" "entry-date-2" "entry-title-2" "entry-body-2" # ... ) # sub split_entries($$) { my ($url, $html) = @_; $html = de_unicoddle ($html); # convert UTF8 to Latin1 my ($fn, $title, $desc, $rss_img, $rss_img_w, $rss_img_h, $expirey) = get_filter_data ($url); print STDERR "$progname: parsing \"$url\" with \"$title\" rules\n" if ($verbose > 3); my @entries = &$fn ($url, $html); return ($title, $desc, $rss_img, $rss_img_w, $rss_img_h, @entries); } # Parses the given HTML file into log entries, based on the parse function # associated with the URL. Writes an RSS file into the cache directory. # Does not change the existing RSS file if there have been no changes. # sub convert_to_rss($$) { my ($url, $html_file) = @_; my $rss_file = "$html_file.rss"; $html_file = "$html_cache_dir/$html_file"; $rss_file = "$rss_output_dir/$rss_file"; my $html = ""; local *IN; open (IN, "<$html_file") || error ("$html_file: $!"); while () { $html .= $_; } close IN; # Check to see whether this file seems to be empty. # Strip out all HTML comments and tags, compress whitespace, # and count how many characters are left. # $_ = $html; 1 while (s@@ @gsi); s@<(SCRIPT)\b[^<>]*>.*?@@gsi; s/<[^<>]+>//gsi; s/\s+/ /gsi; error ("$html_file is empty") if ($html =~ m/^\s*$/s); error ("$html_file is almost empty") if (length($_) < 400 && (! ($url =~ m/(softerworld|doodie|nothingnice)/))); my $items = ''; my @entries = split_entries ($url, $html); my $rss_title = shift @entries; my $rss_desc = shift @entries; my $rss_img = shift @entries; my $rss_img_w = shift @entries; my $rss_img_h = shift @entries; my $count = 0; while ($#entries >= 0) { my $eurl = shift @entries; my $date = shift @entries; my $title = shift @entries; my $body = shift @entries; $eurl = expand_url ($eurl, $url); $date = expand_urls ($date, $url); $title = expand_urls ($title, $url); $body = expand_urls ($body, $url); $date =~ s/&/&/g; # de-HTMLify $date =~ s//>/g; $title =~ s/&/&/g; # de-HTMLify $title =~ s//>/g; $body =~ s/&/&/g; # de-HTMLify $body =~ s//>/g; $eurl =~ s/&/&/g; my $item = ("\n" . " $title\n" . " $eurl\n" . " \n" . " $body\n" . " \n" . "\n"); $item =~ s/^/ /gm; print STDERR "$progname: entry: " . ($title || $eurl) . "\n" if ($verbose > 3); if (++$count < $max_entries) { $items .= $item; } } error ("$html_file: no entries parsed!") if ($count <= 0); if ($verbose > 2) { if ($count > $max_entries) { print STDERR "$progname: $count entries (trimmed to $max_entries)\n"; } else { print STDERR "$progname: $count entries\n"; } } my $pubdate = strftime ("%a, %e %b %Y %H:%M:%S GMT", gmtime); my $builddate = $pubdate; my $rurl = $url; $rurl =~ s/&/&/g; my $rss = ("\n" . "\n" . "\n" . "\n" . " \n" . " $progclass $version -- $progurl\n" . " $rss_title\n" . " $rurl\n" . " $rss_desc\n" . " $rss_lang\n" . " $rss_webmaster\n" . # " $rss_editor\n" . " $pubdate\n" . " $builddate\n" . ($rss_img ? (" \n" . " $rss_title\n" . " $rss_img\n" . " $rss_img_w\n" . " $rss_img_h\n" . " $rurl\n" . " \n") : "") . $items . " \n" . "\n"); # de-Windows-ify. (convert common CP-1252 to ISO-8859-1.) # $rss =~ s/\205/ --/gs; $rss =~ s/\221/\`/gs; $rss =~ s/\222/\'/gs; $rss =~ s/\223/``/gs; $rss =~ s/\224/''/gs; $rss =~ s/\225/*/gs; $rss =~ s/\226/-/gs; $rss =~ s/\227/ --/gs; $rss =~ s/\230/~/gs; $rss =~ s/\240/ /gs; # nbsp $rss =~ s/\201/E/gs; # euro symbol? # strip out other unknowns, since some RSS parsers are super anal about it. $rss =~ s/[\000-\010\013-\037\177-\237]/?/gs; my $body = $rss; my $nbody = "$body"; my $obody = ""; local *IN; if (open (IN, "<$rss_file")) { while () { $obody .= $_; } close IN; } # strip the dates out of both files, for comparison purposes # $nbody =~ s@<([a-z]+Date)>(.*?)@<$1>...@gsi; $obody =~ s@<([a-z]+Date)>(.*?)@<$1>...@gsi; if ($nbody eq $obody) { print STDERR "$progname: $rss_file unchanged\n" if ($verbose > 2); } else { local *OUT; open (OUT, ">$rss_file") || error ("$rss_file: $!"); print OUT $body || error ("$rss_file: $!"); close OUT || error ("$rss_file: $!");; print STDERR "$progname: wrote $rss_file\n" if ($verbose); } } # Downloads the given URL, and updates the RSS file if necessary. # sub scrape($) { my ($url) = @_; @_ = eval { $inside_eval_p = 1; my $html_file = pull_html ($url); return unless defined ($html_file); convert_to_rss ($url, $html_file); return (); }; $inside_eval_p = 0; if ($@) { print STDERR "\n" if ($verbose); print STDERR "$progname: ERROR: " . join(' ', $@) . "\n"; print STDERR "\n" if ($verbose); $@ = undef; return 1; } else { return 0; } } ############################################################################# # # Site-specific parse functions # These are referenced by the %filter_table at the top of the file. # ############################################################################# sub do_thismodernworld_blog($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments # s@^.*?]*\bCLASS=\"posts\"[^<>]*>\s*@@is || # error ("unable to trim head in $url"); # s@\s*]*\bCLASS=\"mt\"[^<>]*>.*$@@is || # error ("unable to trim tail in $url"); s@(]*\bCLASS=\"post\"[^<>]*>)@\n\001\001\001\n$1@gi; my @sec1 = split (/\n\001\001\001\n/s); my @sec2 = (); shift @sec1; pop @sec1; foreach (@sec1) { next if (m/^\s*$/s); s/[\r\n]/ /gs; # next if (m/^(<[pb]>\s*)*Attention Tom-Mart Shoppers/i); # kludge... # next if (m/^(<[pb]>\s*)*Support this site!\s*)*New design in the store:\s*)*THE GREAT BIG BOOK OF /i); # kludge... # next unless (m/^\s*]*CLASS=\"postdate\"[^<>]*>.*?\s*@@is; s@]*CLASS=\"postdate\"[^<>]*>.*?\s*@@is; # # lose "posts" class # s@\s*]*CLASS=\"posts\"[^<>]*>\s*@@is; # s@^\s*]*?\bREL=\"([^<>\"]+)\"[^<>]*>\s*\s*@@is || # error ("unparsable entry (anchor) in $url"); # my $anchor = $1; s@]*>@@is || error ("unparsable entry (anchor) in $url"); my $anchor = $1; my $date = ""; s@]*?\bCLASS=\"posttitle\"[^<>]*>\s*(.*?)\s*\s*@@is || error ("unparsable entry (title) in $url"); my $title = $1; $title =~ s@@@gi; m@]*>[^a-z<>]*\blink\b\s*@i || error ("unparsable entry (link) in $url"); my $eurl = $1; # loose footer crud s@]*CLASS=\"postfoot\"[^<>]*>.*?\s*@@is; # lose all DIVs s@]*>\s*@@gis; # lose trailing P and /DIV 1 while (s@\s*]*>\s*$@@is); s@\s*\s*$@@is; # lose trailing crap on last entry, so it doesn't update every # time something falls off the log. # s@.*?$@@is; s@\s+@ @gsi; my $body = $_; push @sec2, ($eurl, $date, $title, $body); } return @sec2; } # from salon.com, which no longer works sub do_thismodernworld_comic($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s@[\r\n]+@ @gs; s@(]*\bHREF=\"([^\"]+)\"[^<>]*>@@i || error ("unparsable entry (href) in $url"); my $eurl = $1; my $date = ""; s@]*>.*?@@gi; s@<[^<>]*>@ @g; s@^\s*@@s; s@\s*$@@s; my @text = split (/\s\s+/); # my $title = "$text[0]: $text[1]"; my $title = "$text[1]"; $eurl =~ s@/index\.html$@/@; my $iurl = $eurl; $iurl .= "story.jpg"; my $body = ""; push @sec, ($eurl, $date, $title, $body); } return @sec; } # from workingforchange.com sub do_thismodernworld_comic2($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s@]*>\s*@@gsi; # lost most tags s@ @ @g; s@[\r\n]+@ @gs; s@(]*\bHREF=\"([^<>\"]+)\"[^<>]*>\s* This\s+Modern\s+World:\s*([^<>]+)(.*)@xi; my $eurl = $1; my $title = $2; my $rest = $3; $_ = $rest; my ($mm, $dd, $yy) = m@\b(\d\d?)\.(\d\d?)\.(\d\d)\b@; error ("noo date?") unless ($yy); my $date = sprintf("%02d-%02d-%04d", $yy+2000, $mm, $dd); $mm = sprintf("%02d", $mm); $dd = sprintf("%02d", $dd+1); # don't ask me, man... my $iurl = ("http://workingforchange.speedera.net/" . "www.workingforchange.com/webgraphics/wfc/" . "TMW$mm-$dd-$yy.jpg"); my $body = ""; push @sec, ($eurl, $date, $title, $body); } return @sec; } sub do_straightdope($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s@^.*?>\s*Current Columns\s*<[^<>]*>@@is || error ("unable to trim head in $url"); s@\s*]*>@@si; s@]*>\s*@@gsi; # lost most tags s@[\r\n]+@ @gs; s@(]*\bHREF=\"([^\"]+)\"[^<>]*>(.*?)(.*)$@@i || error ("unparsable entry (href) in $url"); my $eurl = $1; my $date = $2; my $title = $3; my $rtitle = $date; my $rbody = $title; my $body = "$rbody"; unshift @sec, ($eurl, $date, $rtitle, $body); } return @sec; } # generate a single-entry RSS file with an inline image. # sub do_apod_inline($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s/[\r\n]+/ /gs; s@^.*?discover the cosmos.*?

\s*@@si || error ("unable to trim head in $url"); s@\s*(<[^<>]+>\s*)*Tomorrow\'s picture:.*?$@@ || error ("unable to trim tail in $url"); s@^(\s*<[^<>]+>\s*)+@@s; # lose leading tags s@^\s*((\d{4})[- ]([a-z]+)[- ](\d\d?))\b\s*(<(BR|P)>\s*)*@@i || error ("$url: unable to find date"); my $date = $1; my $year = $2; my $month = $3; my $dotm = $4; s@]*>\s*@@gsi; # lose table tags s@@

@gsi; s@( ]*> \s* ]*> \s* \s* ) ( .* )$@

$1
$2
@six; s/[\r\n]+/ /gs; s/\s+/ /gs; my $body = $_; my %m = ( "Jan" => 1, "Feb" => 2, "Mar" => 3, "Apr" => 4, "May" => 5, "Jun" => 6, "Jul" => 7, "Aug" => 8, "Sep" => 9, "Oct" => 10, "Nov" => 11, "Dec" => 12); $month =~ s/^(...).*/$1/; $month = $m{$month} || error ("unparsable month: $month"); my $eurl = $url . sprintf ("ap%02d%02d%02d.html", $year % 100, $month, $dotm); s@<(P|BR)\b[^<>]*>@\n@gsi; # expand newlines s@<[^<>]*>@@g; # lose other tags my ($title) = m@^\s*(.*?)\s*$@m; return ($eurl, $date, $title, $body); } sub do_redmeat($$) { my ($url, $html) = @_; $url =~ s@/[^/]*$@/@; # take off last path component $_ = $html; 1 while (s@@ @gsi); # lose comments s@[\r\n]+@ @gs; s@^.*?(]*\bHREF=\"([^\"]+)\"[^<>]*>(.*?).*$@@i || error ("unparsable entry in $url"); my $eurl = $1; my $title = $2; $_ = $eurl; my ($date) = m@\b(\d{4}-\d{2}-\d{2})\b@; $date = '' unless $date; $eurl =~ s@/index\.html?$@/@i; my $rbody = ("
" . "" . "
"); my $body = "$rbody"; push @sec, ($eurl, $date, $title, $body); } return @sec; } sub do_space_com($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments 1 while (s@]*>.*?]*>@@gsi); # lose javascript s@[\r\n]+@ @gs; s@(]*>@@gsi; m@^.*]*\bHREF=\"([^\"]+)\"[^<>]*>@i || error ("unparsable entry (url) in $url"); my $eurl = $1; m@^.*]*\bSRC=\"([^\"]+)\"[^<>]*>@i || error ("unparsable entry (image) in $url"); my $img = $1; next if ($eurl =~ m@(/navigation|/template|doubleclick)@i); next if ($img =~ m@(/navigation|/template|doubleclick)@i); # now let's munge the HTML a little... s@(]*>@@igs; $_ .= "
"; s@>>+@>>@gsi; # dummies my $body = $_; my $date = ''; # note: could parse this from $eurl pathname s@<(BR|P)\b[^<>]*>@\n@gsi; # put newlines back s@<[^<>]*>@@gs; # lose all tags s@\n.*$@@s; # delete all but first line my $title = $_; next if ($eurl =~ m/\bads\.space\.com/i); push @sec, ($eurl, $date, $title, $body); } return @sec; } # generate a single-entry RSS file with an inline image of the latest cartoon. # sub do_catandgirl_inline($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s/[\r\n]+/ /gs; s@(<(A|IMG)\b)@\n$1@gsi; my $number = 0; my ($eurl, $title, $body); foreach (split (/\n/)) { if (m@^]*\bHREF=\"(view\.(cgi|php)\?)(loc=)?(\d+)\"@i) { my $base = $1; my $n = $4; if ($n > $number) { $number = $n + 1; $eurl = $base . $number; } } elsif (m@^]*\bSRC=\"([^<>\"]+)\"[^<>]*\bALT=\"([^<>\"]+)\"@i) { my ($img, $alt) = ($1, $2); s/>.*$/>/s; if ($img =~ m@\barchive/@) { $body = $_; $title = $alt; } } } error ("couldn't find image number? ($number) in $url") unless ($number > 123); error ("couldn't find image in $url") unless (defined ($body)); my $date = ''; return ($eurl, $date, $title, $body); } sub do_geisha_asobi($$) { my ($url, $html) = @_; $_ = $html; s@\r\n@\n@gs; s@\r@\n@gs; s@^.*@@is || error ("unable to trim head in $url"); s@.*$@@is || error ("unable to trim tail in $url"); s@^.*?()@$1@is || error ("unable to trim head-2 in $url"); s@(\s*$1@gsi; # make start/end differ s@.*?@@gis; s@\s+@ @gs; # lose newlines, compress whitespace s@]*>@\n@gsi; my @sec = (); foreach (split ('\n', $_)) { next if (m/^\s*$/s); s@]*>@@gsi; # lose fonts s@(]*?)\s*\bTARGET=\"?[^<>\"]+\"@$1@gsi; # lose TARGETs in A s/\s+/ /gs; s/^\s+//gs; s/\s+$//gs; my $body = $_; m@\bposted by .* (on|at)\s+\"]+)\">([^<>]+)@i || error ("unparsable time in $url"); my ($eurl, $date) = ($2, $3); $date =~ s/\s+$//g; $date =~ s/^\s+//g; # try for a title on the first line my $title = $body; $title =~ s@<(BR|P|IMG)\b[^<>]*>@\n@gsi; $title =~ s@]*>@ @gs; $title =~ s@^[ \t]*@@; $title =~ s@[ \t]*$@@; $title =~ s@\n.*$@@s; $title =~ s@\s+$@@s; push @sec, ($eurl, $date, $title, $body); } return @sec; } sub do_penn($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s/\s+/ /gs; s@@@gi; # bogosity s/(])/\n$1/gi; s@( 0 && m@\"]+)\"@i; error ("unable to find url: $url") unless ($eurl); $_ =~ s@<[^<>]+>@@g; my ($title, $date) = m@^(.+)(\d\d?[\s/]+\d\d?[\s/]+\d+)[^a-z\d]+$@; error ("unable to find title: $url $_") unless ($title); error ("unable to find date: $url") unless ($date); $title =~ s@^\s+@@s; $title =~ s@[^a-z]+$@@s; $date =~ s@^\s+@@s; $date =~ s@\s+$@@s; my $body = "$title ($date)"; push @sec, ($eurl, $date, $title, $body); } return @sec; } sub do_gywo($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s/\s+/ /gs; s/(])/\n$1/gi; s@(\"]+)\"@i; next unless ($eurl && $eurl =~ m/\bwar(\d+)\.html$/); my $n = $1; my $title = "get your war on page $n"; my $date = ""; my $body = "$title"; unshift @sec, ($eurl, $date, $title, $body); } return @sec; } sub do_slashdot($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s/\s+/ /gs; s/(
)/\n$1/gsi; s/^[^\n]*\n//si; # lose head s/\n[^\n]*$//si; # lose tail my @sec = (); foreach (split (/\n/)) { s@.*?
@@si; s@.*?@@si; my ($title) = m@(.*?)@si; my ($body) = m@(.*?)@si; next unless defined ($title); $_ = $title; s@]+/?\"[^<>]*>.*?@@si; # lose first one my ($eurl) = m@\"]+)\"@si; error ("unable to find entry URL in $url") unless defined ($eurl); $title =~ s@<[^<>]*>@@gs; $body =~ s@

@@gsi; $body =~ s@]*>@ @gsi; $title =~ s/\s+/ /gsi; $body =~ s/\s+/ /gsi; $title =~ s/(^\s+|\s+$)//gsi; $body =~ s/(^\s+|\s+$)//gsi; 1 while ($body =~ s@\s*]*>\s*$@@gsi); my $date = ''; push @sec, ($eurl, $date, $title, $body); } return @sec; } sub do_linkfilter($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s@ @ @gs; s@\s+@ @gsi; s@\s*(]*?\bCLASS=\"td-head\">)\s*@\n$1@gsi; my @sec1 = split (/\n/); my @sec2 = (); shift @sec1; foreach (@sec1) { next if (m/^\s*$/s); s@^\s*]*>\s*@@si || error ("no TD in entry in $url"); s@^\s*]*?\bHREF=\"([^<>\"]+)\"[^<>]*>\s*(.*?)\s*@@is || error ("unparsable entry (anchor) in $url"); my $eurl = $1; my $title = $2; $eurl =~ s@;cmd=go$@@; # bah. s@(]*>@$1P>@gsi; # fuck SPAN # lose the category and submitter links s@]*>.*?\s*@@gsi; # lose leading P, BR, and DIV 1 while (s@^\s*]*>\s*@@is); # compact

s@(]*>\s*)+@

@gsi; my ($date) = m@submitted(.*?)
@si; s@^Link\b(.*?)
\s*@@gsi || error ("no date line in $url"); $date =~ s@^.* on @@; $date =~ s@\s*\.?\s*\(.*$@@; s@\s+\bONCLICK=\"[^\"]+\"@@gsi; # you chumps. undo link-tracking BS. s@]*>\s*(http:[^<>\"]+)\s*\s*@@si; my $turl = $1; my $oturl = $turl; error ("no url found in $url") unless defined ($turl); # FUCK! assholes! we have to use their redirector if the link was # long, because they truncate it. if ($turl =~ m/\.\.\.$/) { $turl = "$eurl;cmd=go"; } # lose trailing Comments links s@]*>\s*Comments\b.*$@@si; # lose trailing P, BR, and DIV 1 while (s@\s*]*>\s*$@@is); my $body = $_; $body = "$oturl

$body"; push @sec2, ($eurl, $date, $title, $body); } return @sec2; } sub do_creaturesinmyhead($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s@ @ @gs; s@\s+@ @gsi; my ($img, $title) = m@(]*>) \s* (.*?) @xsi; error ("no image in $url") unless defined ($img); $img =~ s/\s+ALT\s*=\s*\"[^<>\"]*\"\s*/ /gsi; $title =~ s@<[^<>]*>@ @gsi; $title =~ s@\s+@ @g; $title =~ s@^\s+@@g; $title =~ s@\s+$@@g; my ($date) = ($img =~ m@creatures/(\d{6})[^\d]@); error ("no date in $img") unless defined ($date); my $eurl = "/creature.php?date=$date"; $date =~ s@^(\d\d)(\d\d)(\d\d)$@$1/$2/$3@; $title =~ s@^[-\s\d/]+:\s*@@gs; my $body = "

" . "$img
$title

"; my @sec = ($eurl, $date, $title, $body); return @sec; } # generate a single-entry RSS file with an inline image of the latest cartoon. # sub do_asofterworld($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s@\s+@ @gsi; my ($img, $eurl) = m@(\"]*?\.jpg)\"[^<>]*>)@si; my ($title) = ($img =~ m@\bTITLE=\"([^<>\"]*)\"@si); my $date = ''; error ("couldn't find url in $url") unless ($eurl); error ("couldn't find title in $url") unless ($title); $eurl = $url; return ($eurl, $date, $title, $img); } sub do_bobharris($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s@^.*?(]*\bCLASS=\"contentpaneopen\"[^<>]*>)@$1@is || error ("unable to trim head in $url"); s@\s*]*\bCLASS=\"pagenav\"[^<>]*>.*$@@is || error ("unable to trim tail in $url"); s/\s+/ /gs; s@(]*\bCLASS=\"contentheading\"[^<>]*>)@\n$1@gi; my @sec1 = split (/\n/s); my @sec2 = (); shift @sec1; foreach (@sec1) { next if (m/^\s*$/s); # m/[\"\']([^<>\"\']+?task=view[^<>\"\']+)[\"\']/ || m/\"]+)\"[^<>]*class=\"contentpagetitle\"/si || error ("no href in $url"); my $href = $1; s@]*\bCLASS=\"createdate\"[^<>]*>\s*(.*?)\s*@
@si || error ("no date in $url"); my $date = $1; s@CLASS=\"contentheading\"[^<>]*>\s*(.*?)]*>/ /gsi; $title =~ s/\s+/ /gsi; $title =~ s/^\s+|\s+$//gsi; s@]*>@@gsi; s@]*>.*?@@gsi; s@]*?\bSRC=[\"'][^<>]*?/(components|tooltips)/[^<>]*>@@gsi; #s@]*>@@gsi; s@
@
@gsi; s@
\s*
@

@gsi; s@<(/P|P */)>@

@gsi; s@

(\s*)+@

@gsi; s@\b(CLASS|TARGET)=\"[^\"]+\"@@gsi; s@^\s*\s*@@gsi; my $body = $_; $href =~ s/&/&/g; next if ($title =~ m/\bpudublog/i); # fuck this shit push @sec2, ($href, $date, $title, $body); } return @sec2; } sub do_antville_videos($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s/\s+/ /gs; s@(]*\bCLASS=\"storyDate)@\n$1@gi; s@]*\bCLASS=\"storyDate\"[^<>]*>\s*(.*?)\s*@@si || error ("no date in $url"); my $date = $1; $date =~ s@]*>@@gs; s@]*\bCLASS=\"storyTitle\"[^<>]*>\s*(.*?)\s*@@si || error ("no title in $url"); my $title = $1; $title =~ s@]*>@@gs; m@]*\bHREF=\"([^<>\"]*?/stories/\d[^<>\"]*?)\"@si || error ("no url in $url"); my $href = $1; s@\(?\s*]*>\s*\d*\s*comments?!?\s*\s*\)?\s*@@gsi; s@ @ @gs; s@\b(CLASS|TARGET)=\"[^\"]+\"@@gsi; s@@

@gsi; s@
@
@gsi; s@
\s*
@

@gsi; s@<(/P|P */)>@

@gsi; s@

(\s*)+@

@gsi; s@\s+@ @gsi; my $body = $_; $href =~ s/&/&/g; push @sec2, ($href, $date, $title, $body); } return @sec2; } sub do_flightrisk($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s@^.*?(\s*(.*?)\s*\s*@@gsi; s@(\s*(\s*)?(\s*)?\s*$@@si; my @sec1 = split (/\n/s); my @sec2 = (); shift @sec1; foreach (@sec1) { next if (m/^\s*$/s); s@]*>@@gsi; m@\s*(.*?)\s*@si || error ("no title in $url"); my $title = $1; m@

[^<>]*\"]+)\"@si || error ("no href in $url"); my $href = $1; s@\s*[^<>]*\s*@@gsi; s@\s*\"]+\">\s*\s*@@gsi; s@\s*
.*?
\s*@@gsi; s@\s*]*>\s*@@gsi; s@^\s*]*>\s*(.*)\s*
\s*$@$1@si; 1 while s@\s*(

| )\s*$@@gsi; s@^(\s*)+\s*@@gsi; s@\s*(\s*)+$@@gsi; my $body = $_; my $date = ''; push @sec2, ($href, $date, $title, $body); } return @sec2; } sub do_doodie($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s/[ \t]+/ /gs; s/\r\n/\n/gs; my ($target) = m@(]*>@@gsi; $target =~ s@^\s+|\s+$@@gsi; $target =~ s@\n@
@gsi; # my ($date) = ($img =~ m/\.(\d+)$/s); # error ("no date in $img") unless ($date); # my $href = "/index.php?date=$date"; my $href = $url; my $body = (""); $target =~ s@
@ -- @gsi; return ($href, '', $target, $body); } sub do_nothingnice($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s@@@gsi; s/[ \t]+/ /gs; s/\r\n/\n/gs; my ($img) = m@(]*?SRC=\"[^<>\"]*?/comics/[^<>\"]*\">)@si; error ("$url unparsable") unless ($img); my ($n) = m@/(\d+)/">\s*Previous@si; $n++; my $href = "$n/"; my ($title) = m@\[\s*(.*?)\s*\]@si; my $body = (""); return ($href, '', $title, $body); } sub do_girlswithslingshots($$) { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s@@@gsi; s/[ \t]+/ /gs; s/\r\n/\n/gs; my ($img) = m@(]*?SRC=\"[^<>\"]*?images/gws/[^<>\"]*\"[^<>]*>)@si; error ("$url unparsable") unless ($img); my ($title) = ($img =~ m@/(GWS\d+)\.@si); my $href = "/$title.html"; my $body = (""); return ($href, '', $title, $body); } sub do_crooks($$) { my ($url, $html) = @_; $_ = $html; s/()/\001/gs; my @sec1 = split(m/\001/s); my @sec2 = (); shift @sec1; foreach (@sec1) { next if (m/^\s*$/s); my ($href) = m@(.*?)@si; my ($date) = m@(.*?)@si; my ($title) = m@(.*?)@si; my ($body) = m@(.*?)@si; $body =~ s@^\s*<\!\[CDATA\[(.*)\]\]>\s*$@$1@si; next if ($title =~ m/Open Thread|Music Club|Blog Round/i); push @sec2, ($href, $date, $title, $body); } return @sec2; } ############################################################################# # # Command line and glue. # ############################################################################# sub usage() { print STDERR "usage: $progname [--verbose] [urls...]\n"; exit 1; } sub main() { my @urls = (); my $file = undef; while ($_ = $ARGV[0]) { shift @ARGV; if ($_ eq "--verbose") { $verbose++; } elsif (m/^-v+$/) { $verbose += length($_)-1; } elsif (m/^-./) { usage; } elsif (m/^http:/) { push @urls, $_; } else { usage; } } usage unless ($#urls >= 0); error "$rss_output_dir: output directory does not exist" unless (-d $rss_output_dir); error "$html_cache_dir: cache directory does not exist" unless (-d $html_cache_dir); my $err_count = 0; foreach (@urls) { $err_count += scrape ($_); } exit ($err_count); } main; exit 0;