#!/opt/local/bin/perl -w
# Copyright © 2019-2022 Jamie Zawinski
#
# Permission to use, copy, modify, distribute, and sell this software and its
# documentation for any purpose is hereby granted without fee, provided that
# the above copyright notice appear in all copies and that both that
# copyright notice and this permission notice appear in supporting
# documentation.  No representations are made about the suitability of this
# software for any purpose.  It is provided "as is" without express or
# implied warranty.
#
# Rewrite the links in an HTML file to point to the Wayback Machine instead
# of the original site.  Attempts to use a contemporaneous version from the
# archive based on the file date (or earliest git date) of the HTML file.
#
# Created:  1-Feb-2019.

require 5;
use diagnostics;
use strict;

use POSIX;
use LWP::UserAgent;
use JSON::Any;
use HTML::Entities;

use open ":encoding(utf8)";

my $progname = $0; $progname =~ s@.*/@@g;
my ($version) = ('$Revision: 1.7 $' =~ m/\s(\d[.\d]+)\s/s);

my $verbose = 1;
my $debug_p = 0;

# Don't waybackify these
my $exclude_domains = join ('|',
                            ('archive.org',     # Required.
                             'youtube.com',     # Wayback doesn't save videos.
                             'vimeo.com',       # Wayback doesn't save videos.
                             'wikipedia.org',   # Sites with stable URLs.
                             'jwz.org',
                             'dnalounge.com',
                             'dnapizza.com',
                             'codeword-sf.com',
                             'mcom.com',
                             'patreon.com',     # I need live links to these
                             'google.com',
                             'googleapis.com',
                             'facebook.com',
                             'irs.gov',
                            ));


# URL-encode any characters that are not safe in a URL.
#
sub url_quote($) {
  my ($u) = @_;
  return $u unless defined($u);
  $u =~ s|([^-a-zA-Z0-9.\@_\r\n])|sprintf("%%%02X", ord($1))|ge;
  return $u;
}

# Decode %XX escapes (and +) back to characters.
#
sub url_unquote($) {
  my ($url) = @_;
  return $url unless defined($url);
  $url =~ s/[+]/ /g;
  $url =~ s/%([a-z0-9]{2})/chr(hex($1))/ige;
  return $url;
}

sub html_quote($) {
  my ($s) = @_;
  return undef unless defined($s);
  return HTML::Entities::encode_entities ($s,
    # Match characters to be encoded:
    # Exclude &=046 <=074 >=076
    '^\t\n\040-\045\047-\073\075\077-\176');
}

# Convert any HTML entities to Unicode characters.
#
sub html_unquote($) {
  my ($s) = @_;
  return undef unless defined($s);
  return HTML::Entities::decode_entities ($s);
}


# Returns the Wayback URL closest to $date.
#
sub wayback_url($$) {
  my ($url, $date) = @_;

  $date = 1970 unless $date;
  $date =~ s/^(\d{4})-(\d\d)-(\d\d)$/$1$2$3/;
  error ("unparsable date: $date") unless ($date =~ m/^(\d{4}|\d{8})$/s);

  my $host = $2 if ($url =~ m@^(https?:)?//([^/:]+)@si);

  if (! $host) {
    print STDERR "$progname: skipping $url\n" if ($verbose > 1);
    return undef;
  } elsif ($host =~ m/\b($exclude_domains)$/sio) {
    print STDERR "$progname: excluding $url\n" if ($verbose > 1);
    return undef;
  }

  my $anchor = undef;
  if ($url =~ m/^(.+?)\#(.*)$/) {
    ($url, $anchor) = ($1, $2);
  }

  my $url2 = ('https://archive.org/wayback/available' .
              '?url=' . url_quote($url) .
              '&timestamp=' . $date);
  my $ua = LWP::UserAgent->new;
  $ua->agent("$progname/$version");

  print STDERR "$progname: loading $url2\n" if ($verbose > 1);
  my $res = $ua->get ($url2);
  my $ret = ($res && $res->code) || 'null';
  error ("$url2: $ret") unless ($ret eq '200');

  $ret = undef;
  eval { $ret = JSON::Any->new->jsonToObj ($res->content); };
  error ("unparsable JSON: $url2: " . $res->content) unless $ret;

  if ($ret &&
      $ret->{archived_snapshots} &&
      $ret->{archived_snapshots}->{closest} &&
      $ret->{archived_snapshots}->{closest}->{url}) {

    print STDERR "$progname: WARNING: archive status " .
                 $ret->{archived_snapshots}->{closest}->{status} .
                 " for $url\n"
      unless ($ret->{archived_snapshots}->{closest}->{status} eq '200');

    my $url2 = $ret->{archived_snapshots}->{closest}->{url};
    $url2 =~ s/\#.*$//s;    # WTF?
    $url2 =~ s/\?+$//s;     # WTF?

    # Sometimes the Wayback Machine returns crazy shit.  E.g.,
    #   http://www.compuserve.com/ timestamp=20010604
    #   => http://www.iomp.org:80artin@compuserve.com
    # So make sure the returned URL appears to be archiving the URL
    # that we asked for.
    #
    my ($cmp1, $cmp2) = ($url, $url2);
    foreach ($cmp1, $cmp2) {
      s@^https?://web\.archive\.org/web/\d+/@@s;
      s@^https?://@@s;
      s@^([^:/]+):\d+(/|$)@$1$2@s;
      s/^www\d*\.//s;
      s@//+@/@gs;
      $_ = url_unquote($_);
      s@/+$@@s;
    }
    if (lc($cmp1) ne lc($cmp2)) {
      print STDERR "$progname: WARNING: mismatch:\n\t$cmp1\n\t$cmp2\n" .
                   "\n\t$url\n\t$url2\n\n";
      return undef;
    }

    $url2 .= "#$anchor" if $anchor;
    $url2 =~ s/^http:/https:/s;     # WTF guys

    # Lose the redundant ":80" or ":443" in some archived URLs.
    $url2 =~ s@^(https?://.*?/)(https?://[^:/]+):(80|443)@$1$2@si;

    # Quote the embedded http colon.
    $url2 =~ s@^(https?://.*?/)(https?):@$1$2%3A@si;

    print STDERR "$progname: $url -> $url2\n" if ($verbose > 1);
    return $url2;

  } else {
    print STDERR "$progname: no archive: $url\n" if ($verbose > 1);
    return undef;
  }
}


# Returns the date to use for a file, as YYYYMMDD: the date of its oldest
# git log entry if available, otherwise its modification time.
#
sub file_date($$) {
  my ($file, $git_p) = @_;

  my $f2 = $file;
  $f2 =~ s/([^-_A-Za-z\d.,])/\\$1/gs;

  # Date of oldest git log entry.
  if ($git_p) {
    my $cmd = "git log --reverse --pretty=format:%aI $f2 | head -1";
    my $ret = `$cmd 2>/dev/null`;
    return "$1$2$3" if ($ret && $ret =~ m/^(\d{4})-(\d\d)-(\d\d)/s);
    print STDERR "WARNING: no git date: $file\n" if ($verbose);
  }

  # Otherwise file date.
  my @st = stat($file);
  error ("$file does not exist") unless @st;
  return strftime ("%Y%m%d", localtime($st[9]));
}


# Rewrites each HREF in the file to point at a contemporaneous Wayback
# Machine snapshot, writing the file back in place if anything changed.
#
sub waybackify($$$) {
  my ($file, $def_date, $git_p) = @_;

  my $body = '';
  open (my $in, '<:utf8', $file) || error ("$file: $!");
  print STDERR "$progname: reading $file\n" if ($verbose > 1);
  {
    local $/ = undef;  # read entire file
    while (<$in>) { $body .= $_; }
  }
  close $in;

  my $date = ($def_date ? $def_date : file_date($file, $git_p));

  if (!$def_date && $body =~ m@©\s*(\d{4})\b@si) {
    my $year = $1;
    if ("${year}9999" lt $date) {
      $date = "${year}0101";    # Believe copyright notice in HTML if older
    }
  }

  my $obody = $body;
  $body =~ s@\b ( HREF \s* = \s* ) ( [\"\'] ) ( .*? ) ( \2 )
            @{ my $a = "$1$2";
               my $b = $4;
               my $url  = $3;
               my $url2 = wayback_url (html_unquote ($url), $date);
               $url = html_quote($url2) if $url2;
               "$a$url$b";
             }@gsexi;

  if ($body eq $obody) {
    print STDERR "$progname: $file unchanged\n" if ($verbose > 1);
  } else {
    my $tmp = "$file.tmp";
    open (my $out, '>:utf8', $tmp) || error ("$tmp: $!");
    print $out $body;
    close $out;
    if ($debug_p) {
      system ("diff", "-u", $file, $tmp);
      unlink ($tmp);
    } else {
      if (! rename ($tmp, $file)) {
        unlink ($tmp);
        error ("mv $tmp $file: $!");
      } else {
        print STDERR "$progname: wrote $file\n" if ($verbose);
      }
    }
  }
}


sub error($) {
  my ($err) = @_;
  print STDERR "$progname: $err\n";
  exit 1;
}

sub usage() {
  print STDERR "usage: $progname [--verbose] html-files ...\n";
  print STDERR "       $progname [--verbose] --year YEAR --url URL\n";
  exit 1;
}

sub main() {
  my @files = ();
  my $git_p = 1;
  my @urls = ();
  while (@ARGV) {
    $_ = shift @ARGV;
    if    (m/^--?verbose$/)  { $verbose++; }
    elsif (m/^-v+$/)         { $verbose += length($_)-1; }
    elsif (m/^--?debug$/)    { $debug_p++; }
    elsif (m/^--?q(uiet)?$/) { $verbose = 0; }
    elsif (m/^--?git$/)      { $git_p++; }
    elsif (m/^--?no-git$/)   { $git_p = 0; }
    elsif (m/^--?year$/)     { push @files, ['year', shift @ARGV]; }
    elsif (m/^--?url$/)      { push @files, ['url',  shift @ARGV]; }
    elsif (m/^-./)           { usage; }
    elsif (m/^https?:/)      { push @files, ['url',  $_]; }
    else                     { push @files, ['file', $_]; }
  }

  usage unless (@files);

  my $year = undef;
  foreach my $f (@files) {
    my ($type, $ff) = @$f;
    if ($type eq 'year') {
      $year = $ff;
    } elsif ($type eq 'file') {
      waybackify ($ff, $year, $git_p);
    } elsif ($type eq 'url') {
      $exclude_domains =~ s@\|facebook\.com@@s;   # Kludge
      my $u2 = wayback_url ($ff, $year) || $ff;
      print "$u2\n";
    }
  }
}

main();
exit 0;
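
# Example invocations, matching the usage() text above.  These are
# illustrative only: the script name, file name, and site are hypothetical.
#
#   ./waybackify.pl --verbose archive/index.html
#       Rewrite the HREFs in archive/index.html in place, picking snapshots
#       near the file's earliest git date (or its mtime with --no-git).
#
#   ./waybackify.pl --year 2005 --url http://example.com/
#       Print the Wayback Machine URL closest to 2005 for the given URL
#       without modifying any files.  Note that --year must come before
#       --url, since the arguments are processed in order.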