#!/usr/bin/perl -w # Copyright © 2005-2010 Jamie Zawinski # # Permission to use, copy, modify, distribute, and sell this software and its # documentation for any purpose is hereby granted without fee, provided that # the above copyright notice appear in all copies and that both that # copyright notice and this permission notice appear in supporting # documentation. No representations are made about the suitability of this # software for any purpose. It is provided "as is" without express or # implied warranty. # # Downloads an archive of your Livejournal, saving each of your LJ entries # into its own file. # # Options: # # --verbose Be louder. # --quiet Or not. # --debug Don't actually write any files (run diff). # --since DATE Only look at entries that were modified after DATE, # which can be "DD-MMM-YYYY", "N days", etc. Note that # any change to an entry, including changing tags, # resets its modification date. # --before DATE Only look at entries that were modified before DATE. # # --comments Download comments too. # # Note: this is *very* expensive, since there is no way # to download only the comments of a particular post or # date range. So even if you're only downloading posts # from the last week, this has to download all comments # since the beginning of time. # # --wordpress Instead of writing each entry into a file, generate a # single XML file on stdout in "WordPress eXtended RSS" # format. This can be used to import your LJ into WP. # # If the output file is very large, you might need to # increase 'post_max_size' and 'upload_max_filesize' in # /etc/php.ini before WordPress will take it. # # --lock Instead of downloading entries, set them on the # server "comments locked" mode, so that no new comments # can be made. (Note that locking/unlocking resets # modification date.) # --unlock Unlock instead. # # Usage: # # ljgrabber.pl -v --since 30d Download updated copies of anything # modified this month. # # ljgrabber.pl -v --lock --before 30d Lock any entries modified more than # a month ago. # # ljgrabber.pl -v --lock --before 30d --since 38d # # Lock any entries modified between 30 and 38 days ago # (do this if you are running it weekly from cron, so # that it doesn't have to examine *every* old entry!) # # Created: 26-Jun-2005. require 5; use diagnostics; use strict; use FileHandle; use POSIX qw(locale_h mktime strftime); use LWP::UserAgent; use LWP::Simple; use DateTime::Format::W3CDTF; my $progname = $0; $progname =~ s@.*/@@g; my $version = q{ $Revision: 1.17 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/; my $verbose = 1; my $debug_p = 0; my $ljuser = $ENV{USER}; # what is your LJ user name? # This logs in to LJ by grabbing your LJ cookie out of Safari's cookie DB. # If you aren't logged in, or don't use Safari, you're on your own. # my $cookie_file = "$ENV{HOME}/Library/Cookies/Cookies.plist"; # Where to put the archive. # my $output_dir = "$ENV{HOME}/Documents/LJ"; sub get_cookie() { local *IN; if (open (IN, "<$cookie_file")) { local $/ = undef; # read entire file my $body = ; close IN; foreach (split (m//, $body)) { my ($domain) = m@Domain\s*([^<>]+)@si; my ($name) = m@Name\s*([^<>]+)@si; my ($value) = m@Value\s*([^<>]+)@si; next unless ($domain && $domain eq '.www.livejournal.com'); next unless ($name eq 'ljmastersession'); return $value; } } error ("no ljmastersession cookie in $cookie_file"); } sub url_quote($) { my ($u) = @_; $u =~ s|([^-a-zA-Z0-9.\@/_\r\n])|sprintf("%%%02X", ord($1))|ge; return $u; } sub url_unquote($) { my ($u) = @_; $u =~ s/[+]/ /g; $u =~ s/%([a-z0-9]{2})/chr(hex($1))/ige; return $u; } sub html_unquote($) { my ($h) = @_; $h =~ s/<//gs; $h =~ s/&/&/gs; return $h; } # Returns true if the two files differ (by running "cmp") # sub cmp_files($$) { my ($file1, $file2) = @_; my @cmd = ("cmp", "-s", "$file1", "$file2"); print STDERR "$progname: executing \"" . join(" ", @cmd) . "\"\n" if ($verbose > 3); system (@cmd); my $exit_value = $? >> 8; my $signal_num = $? & 127; my $dumped_core = $? & 128; error ("$cmd[0]: core dumped!") if ($dumped_core); error ("$cmd[0]: signal $signal_num!") if ($signal_num); return $exit_value; } sub diff_files($$) { my ($file1, $file2) = @_; my @cmd = ("diff", "-U2", "--unidirectional-new-file", "$file1", "$file2"); print STDERR "$progname: executing \"" . join(" ", @cmd) . "\"\n" if ($verbose > 3); system (@cmd); my $exit_value = $? >> 8; my $signal_num = $? & 127; my $dumped_core = $? & 128; error ("$cmd[0]: core dumped!") if ($dumped_core); error ("$cmd[0]: signal $signal_num!") if ($signal_num); return $exit_value; } # If the two files differ: # mv file2 file1 # else # rm file2 # sub rename_or_delete($$) { my ($file, $file_tmp) = @_; my $changed_p = cmp_files ($file, $file_tmp); if ($changed_p && $debug_p) { print STDOUT "\n" . ('#' x 79) . "\n"; diff_files ("$file", "$file_tmp"); $changed_p = 0; } if ($changed_p) { if (!rename ("$file_tmp", "$file")) { unlink "$file_tmp"; error ("mv $file_tmp $file: $!"); } # print STDERR "$progname: wrote $file\n"; print STDERR "wrote $file\n" if ($verbose); } else { unlink "$file_tmp" || error ("rm $file_tmp: $!\n"); print STDERR "$file unchanged\n" if ($verbose > 1); print STDERR "$progname: rm $file_tmp\n" if ($verbose > 3); } } # Write the given body to the file, but don't alter the file's # date if the new content is the same as the existing content. # sub write_file_if_changed($$) { my ($outfile, $body) = @_; local *OUT; my $file_tmp = "$outfile.tmp"; open(OUT, ">$file_tmp") || error ("$file_tmp: $!"); (print OUT $body) || error ("$file_tmp: $!"); close OUT || error ("$file_tmp: $!"); rename_or_delete ("$outfile", "$file_tmp"); } sub ljcmd($$;%) { my ($mode, $cookie, %extras) = @_; my $url = 'http://www.livejournal.com/interface/flat'; my $ua = LWP::UserAgent->new; $ua->default_header ('X-LJ-Auth' => "cookie"); $ua->default_header ('Cookie' => "ljsession=$cookie"); my %opts = ( 'mode' => $mode, 'user' => $ljuser, 'auth_method' => 'cookie', 'ver' => '1' ); foreach my $k (keys (%extras)) { $opts{$k} = $extras{$k}; } if ($verbose > 4) { print STDERR "$progname: posting:\n\n"; foreach my $k (keys %opts) { my $v = $opts{$k}; print STDERR "\t$k:\t\t$v\n"; } print STDERR "\n"; } my $res = $ua->post ($url, \%opts); my $ret = ($res && $res->code) || 'null'; error ("$url: bad response: $ret") unless ($ret eq '200'); return $res->content || ''; } sub get_friends_groups() { my $cookie = get_cookie(); my $url = 'http://www.livejournal.com/interface/flat'; print STDERR "$progname: getting friends groups...\n" if ($verbose > 1); my $data = ljcmd ('getfriendgroups', $cookie); if ($verbose > 4) { my $x = $data; $x =~ s/^/\t/gm; print STDERR "$progname: result:\n$x\n"; } $data =~ s/([^\n]*)\n([^\n]*)\n/$1\t$2\n/gs; if ($data =~ m/^errmsg\t([^\n]*)$/m) { error ("LJ error: $1"); } elsif ($data !~ m/^success\tOK$/m) { error ("LJ error: unknown error"); } my @groups; $groups[0] = 'friends'; foreach (split (/\n/, $data)) { if (m/^frgrp_(\d+)_name\t(.*)$/s) { $groups[$1] = url_unquote($2); } } return @groups; } # Livejournal does this stupid "ljtoys.com" nonsense where they move all # of your posted EMBED and OBJECT tags off to another server, replacing # them with an IFRAME that loads that page. This goes out to that other # server and reconstructs the original EMBED/OBJECT tag. Without this, # the actual embedded video (or whatever) would be lost. # sub fix_ljembed($$$) { my ($body, $url, $cookie) = @_; # Bail out early if no embeds. return $body unless ($body =~ m@new; $ua->default_header ('X-LJ-Auth' => "cookie"); $ua->default_header ('Cookie' => "ljsession=$cookie"); my $res = $ua->get ($url); my $ret = ($res && $res->code) || 'null'; error ("$url: bad response: $ret") unless ($ret eq '200'); my $raw = $res->content || ''; my %embeds; $raw =~ s@(]+)"[^<>]*?name="embed_\d+_(\d+)\b)@{ # Extract the URLs and lj-embed numbers of all iframes in the HTML. my ($a, $u, $n) = ($1, $2, $3); $u = html_unquote($u); # Get the body of the underlying iframed lj-toys.com URL. $u = LWP::Simple::get ($u); error ("$url: null data") unless $u; $u =~ s!^.*\s*(.*?)\s*.*$!$1!si; # Store it in the table. $embeds{$n} = $u; print STDERR "$progname: $url: embed $n = $u\n" if ($verbose > 4); $a; }@gsexi; # Replace each with the underlying embed from lj-toys.com. # my $count = 0; my $fixed = 0; $body =~ s@()@{ my ($old, $id) = ($1, $2); my $embed = $embeds{$id}; if ($embed) { $fixed++; } else { $embed = $old; } $count++; $embed; }@gsexi; if ($fixed == $count) { print STDERR "(updated embeds) " if ($verbose > 1); } else { print STDERR "(FAIL! updated $fixed of $count embeds) " if ($verbose > 1); } return $body; } # Convert Livejournal-specific markup to something that works on Wordpress. # sub munge_lj_html($) { my ($s) = @_; # Newlines but no markup? must be "preformatted". Convert to BR. # if ($s =~ m/\n[ \t]*\n/si && $s !~ m//gs; } # Convert newlines inside of PRE to BR. # $s =~ s@(]*>)(.*?)()@{ my ($a, $b, $c) = ($1, $2, $3); $b =~ s/\n/
/gsi; "$a$b$c"; }@gsexi; # Delete all other newlines. # We need to do that because WP chokes on, e.g., ]*>)(.*?)()@{ my ($a, $b, $c) = ($1, $2, $3); $b =~ s/

/\n\n/gsi; $b =~ s/
/\n/gsi; "$a$b$c"; }@gsexi; # Convert all tags to normal A HREF links back to LJ. # $s =~ s@]+)\s*["']?\s*/?> @$1@gsix; # Convert self-links to LJ posts to /blog/?p=... # $s =~ s@(["'])http://$ljuser\.livejournal\.com/([0-9]+)\.html (?:\?thread=\d+)? (["'#]) @$1/blog/?p=$2$3@gsix; # Convert self-links to our LJ account to / # $s =~ s@(["'])http://$ljuser\.livejournal\.com/(["'])@$1/$2@gsix; # Convert links to /tags pages to point locally. # $s =~ s@(["'])http://$ljuser\.livejournal\.com/(?:tag/|\?tag=)([^"'<>]+)@{ my ($a, $b) = ($1, $2); $b =~ s/(\+|%20)/-/gs; "$a/blog/tag/$b/" }@gsexi; # LJ Polls. $s =~ s@ @

[ LJ Poll $1 ]

@gsix; # and are unnecessary. $s =~ s@\s*\s*@@gsi; # All other LJ tags become literal. # $s =~ s@<(lj[^<>]*)>@<$1>@gsi; return $s; } sub get_entries($$$$$$@) { my ($since, $before, $lock_p, $comments_p, $wordpress_p, $redirect_p, @items) = @_; my %items; my $cookie = get_cookie(); my $url = 'http://www.livejournal.com/interface/flat'; my $lastsync = $since; my @all_items = (); my $comments = get_comments() if $comments_p; if (! defined($lock_p) && !$redirect_p && ! -d $output_dir) { mkdir ($output_dir) || error ("$output_dir: $!"); } foreach (@items) { $items{$_} = 1; } my @groups = get_friends_groups(); if ($#items < 0) { while (1) { print STDERR "$progname: getting item IDs since " . ($lastsync ? $lastsync : "the beginning") . "...\n" if ($verbose > 1); my %e; $e{lastsync} = $lastsync if $lastsync; my $data = ljcmd ('syncitems', $cookie, %e); if ($verbose > 4) { my $x = $data; $x =~ s/^/\t/gm; print STDERR "$progname: result:\n$x\n"; } $data =~ s/([^\n]*)\n([^\n]*)\n/$1\t$2\n/gs; if ($data =~ m/^errmsg\t([^\n]*)$/m) { error ("LJ error: $1"); } elsif ($data !~ m/^success\tOK$/m) { error ("LJ error: unknown error"); } my %itemids; my %itemdates; my $count = 0; foreach (split (/\n/, $data)) { if (m/^sync_(\d+)_item\t[LC]-(\d+)$/s) { $itemids{$1} = $2; print STDERR "$progname: id $1 = $2\n" if ($verbose > 4); $count++; } elsif (m/^sync_(\d+)_time\t(.+)$/s) { if (defined($itemids{$1})) { $itemdates{$1} = $2; } if (!defined($lastsync) || $lastsync lt $2) { $lastsync = $2; } } } if ($count == 0) { print STDERR "$progname: no more syncitems.\n" if ($verbose > 3); last; } foreach my $id (sort {$a <=> $b} (keys %itemids)) { my $item = $itemids{$id}; my $date = $itemdates{$id}; if (!$before || $date le $before) { # string comparing dates... push @items, $item unless $items{$item}; $items{$item} = 1; } else { print STDERR "$progname: skip item $item ($date)\n" if ($verbose > 4); } } undef %itemids; undef %itemdates; if ($before && $lastsync gt $before) { # string comparing dates... print STDERR "$progname: past 'before' point with $lastsync\n" if ($verbose > 3); last; } } print STDERR "$progname: " . ($#items + 1) . " items.\n" if ($verbose > 1); } foreach my $item (sort {$b <=> $a} @items) { print STDERR "$progname: getting item $item... " if ($verbose > 1); my $data = ljcmd ('getevents', $cookie, 'selecttype' => 'one', 'lineendings' => 'unix', 'itemid' => $item ); if ($verbose > 4) { my $x = $data; $x =~ s/^/\t/gm; print STDERR "$progname: result:\n$x\n"; } $data =~ s/([^\n]*)\n([^\n]*)\n/$1\t$2\n/gs; if ($data =~ m/^errmsg\t([^\n]*)$/m) { error ("LJ error: $1"); } elsif ($data eq '') { error ("LJ error: null response"); } elsif ($data !~ m/^success\tOK$/m) { error ("LJ error: unknown error:\n$data"); } # WTF! error ("no data for item $item: LJ database fucked?") if ($data =~ m/^events_count\t0$/m); my %props; my @propskeys; my @propsvals; my $body = undef; foreach (split (/\n/, $data)) { if (m/^events_1_event\t(.*)$/s) { $body = url_unquote($1); } elsif (m/^events_1_([a-z]+)\t(.*)$/s) { $props{$1} = $2; } elsif (m/^prop_(\d+)_name\t(.*)$/s) { $propskeys[$1] = $2; } elsif (m/^prop_(\d+)_value\t(.*)$/s) { $propsvals[$1] = $2; } } # my $propcount = 0; for (my $i = 0; $i <= $#propskeys; $i++) { # this protocol is so stupid my $key = $propskeys[$i]; my $val = $propsvals[$i]; next unless defined ($key); $props{$key} = $val; # $propcount++; } # error ("LJ error: no properties for item $item") # unless ($propcount > 0); # error ("LJ error: no date for item $item") # unless ($props{eventtime}); if ($verbose > 4) { print STDERR "$progname: props:\n\n"; foreach my $k (keys %props) { my $v = $props{$k}; print STDERR "\t$k:\t\t$v\n"; } print STDERR "\n"; } my $id; if ($props{url}) { ($id) = ($props{url} =~ m@/(\d+)\.html$@s); } else { $id = $item >> 8; error ("no url, unknown item id") unless $id; } $props{comments} = $comments->{$item} if $comments; if (defined ($lock_p)) { # altering the entry on the server if ($props{opt_lockcomments} && $lock_p) { print STDERR "$id already locked\n" if ($verbose > 1); } elsif (!$props{opt_lockcomments} && !$lock_p) { print STDERR "$id already unlocked\n" if ($verbose > 1); } else { my %post = ( 'lineendings' => 'unix', 'itemid' => $item, 'opt_lockcomments' => $lock_p, 'current_music' => '', 'current_location' => '', ); $post{event} = $body if $body; $post{subject} = $props{subject} if $props{subject}; $post{security} = $props{security} if $props{security}; $post{allowmask} = $props{allowmask} if $props{allowmask}; # If we are locking comments, turn off the "no comments" flag # (which was the old way to lock, but also made comments invisible) $props{opt_nocomments} = 0 if ($lock_p && $props{opt_nocomments}); foreach my $key (keys %props) { next if ($key =~ m/^(itemid|subject|eventtime|revnum|interface| url|security|allowmask|anum|revtime)$/six); my $val = $props{$key}; $post{"prop_$key"} = $val; } my $which = $lock_p ? "locking" : "unlocking"; if ($debug_p) { print STDERR "not $which item $id...\n"; } else { print STDERR "$which item $id...\n" if ($verbose); my $data = ljcmd ('editevent', $cookie, %post); if ($verbose > 4) { my $x = $data; $x =~ s/^/\t/gm; print STDERR "$progname: result:\n$x\n"; } $data =~ s/([^\n]*)\n([^\n]*)\n/$1\t$2\n/gs; if ($data =~ m/^errmsg\t([^\n]*)$/m) { error ("$item: LJ error: $1"); } elsif ($data eq '') { # error ("$item: LJ error: null response"); print STDERR "LJ ERROR: NULL RESPONSE\n"; } elsif ($data !~ m/^success\tOK$/m) { error ("$item: LJ error: unknown error:\n$data"); } } } } elsif ($redirect_p) { # altering the entry on the server my $new_site = "$ljuser.org"; my $new_url = "http://www.$new_site/blog/"; my $new_url2 = "$new_url?p=$id"; my $new_body = ("

\n" . "
\n" . " 301\n" . " \n" . "


\n" . " This blog has moved to $new_site.
\n" . " This post has been archived\n" . " here.
\n" . " Please update your links.
\n" . "


\n" . "
\n" . " 301\n" . "
\n" . "
"); $new_body =~ s/\s+/ /gs; my %post = ( 'lineendings' => 'unix', 'itemid' => $item, # 'opt_lockcomments' => "0", 'opt_nocomments' => "1", 'event' => $new_body, 'subject' => '301 Moved Permanently', 'taglist' => '301', 'current_music' => '', 'current_location' => '', ); if ($debug_p) { print STDERR "not redirecting item $item to $new_url2\n"; } else { print STDERR "redirecting item $item to $new_url2\n" if ($verbose); my $data = ljcmd ('editevent', $cookie, %post); if ($verbose > 4) { my $x = $data; $x =~ s/^/\t/gm; print STDERR "$progname: result:\n$x\n"; } $data =~ s/([^\n]*)\n([^\n]*)\n/$1\t$2\n/gs; if ($data =~ m/^errmsg\t([^\n]*)$/m) { error ("$item: LJ error: $1"); } elsif ($data eq '') { # error ("$item: LJ error: null response"); print STDERR "LJ ERROR: NULL RESPONSE\n"; } elsif ($data !~ m/^success\tOK$/m) { error ("$item: LJ error: unknown error:\n$data"); } } } else { # downloading the entry if ($props{security} && $props{security} eq 'usemask') { my $mask = $props{allowmask}; error ("security is usemask with no mask") unless $mask; my @g = (); for (my $bit = 0; $bit < 32; $bit++) { push @g, $groups[$bit] if ($mask & (1 << $bit)); } $props{security} = join (", ", @g); } $props{subject} = '' unless $props{subject}; $body = fix_ljembed ($body, $props{url}, $cookie); if ($wordpress_p) { $props{body} = $body; push @all_items, \%props; } else { # Writing to a file. my $hdrs = ("Subject: $props{subject}\n" . "Date: $props{eventtime}\n" . "URL: $props{url}\n"); $hdrs .= "Tags: $props{taglist}\n" if defined ($props{taglist}); $hdrs .= "Music: $props{current_music}\n" if defined ($props{current_music}); $hdrs .= "Location: $props{current_location}\n" if defined ($props{current_location}); $hdrs .= "Security: $props{security}\n" if defined ($props{security}); $body = "$hdrs\n$body\n"; $body .= format_comments ($wordpress_p, $props{comments}) if ($comments_p); print STDERR "NOCOMM " if ($props{opt_nocomments} && ($verbose > 1)); print STDERR "locked " if ($props{opt_lockcomments} && ($verbose > 1)); write_file_if_changed ("$output_dir/$id", $body); } } } if ($wordpress_p) { wordpressify (@all_items); } } sub format_comments_1($@); sub format_comments($$) { my ($wordpress_p, $comm) = @_; my @root = (); foreach my $id (keys (%$comm)) { my $c = $comm->{$id}; my $date = $c->{date}; next unless defined($date); # must be a deleted comment? my ($yyyy, $mm, $dd, $h, $m, $s) = ($date =~ m@^(\d{4})-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)Z$@s); error ("$id: unparsable comment date: $date") unless $yyyy; my $w3c = DateTime::Format::W3CDTF->new; $date = $w3c->parse_datetime($date); $date = $date->epoch(); $c->{date} = $date; my $p = $c->{parent} ? $comm->{$c->{parent}} : undef; if ($p) { my $L = $p->{children}; my @L = ($L ? @$L : ()); push @L, $c; $p->{children} = \@L; } else { push @root, $c; } } my $output = format_comments_1 ($wordpress_p, @root); if (! $wordpress_p) { $output =~ s/[ \t]+$//gm; $output =~ s/\n\n+$/\n/gs; if ($output) { $output = ("\n
\n" . "


\n" . "$output\n

\n"); $output =~ s/^/ /gm; } } else { $output =~ s/^/ /gm; } return $output; } sub format_comments_1($@) { my ($wordpress_p, @comments) = @_; my $output = ''; foreach my $c (sort { $a->{date} <=> $b->{date} } @comments) { my $body = $c->{body}; $body =~ s/\s+$//s; $body =~ s/[ \t]+$//gm; $body =~ s/\n/
/gs; # Ok, WTF LJ. Why is this coming through as quoted HTML? # HTML is allowed in comments! # $body =~ s/<//gs; $body =~ s/'/'/gs; $body =~ s/"/"/gs; $body =~ s/&/&/gs; if (!$wordpress_p) { # Plain old HTML output my $subj = $c->{subject}; $output .= ("

\n" . "From: " . ($c->{name} || 'anonymous') . "
\n" . ($subj ? "Subject: $subj
\n" : "") . "Date: " . strftime ("%a, %d %b %Y %I:%M %p", localtime($c->{date})) . "

\n" . "\n$body\n" . "
\n\n"); } else { my $author = $c->{name}; my $url = ($author ? "http://$author.livejournal.com/" : ''); $output .= ("\n" . rss_prop('wp:comment_id', $c->{id}) . rss_prop('wp:comment_author', ($author || 'anonymous')) . rss_prop('wp:comment_author_email', '') . rss_prop('wp:comment_author_url', $url) . rss_prop('wp:comment_author_IP', '') . rss_prop('wp:comment_date', strftime ("%Y-%m-%d %H:%M:%S", localtime ($c->{date}))) . rss_prop('wp:comment_date_gmt', strftime ("%Y-%m-%d %H:%M:%S", gmtime ($c->{date}))) . rss_prop('wp:comment_content', $body) . rss_prop('wp:comment_approved', '1') . rss_prop('wp:comment_type', '') . rss_prop('wp:comment_parent', $c->{parent}) . rss_prop('wp:comment_user_id', $c->{poster}) . "\n"); } my $L = $c->{children}; my @L = ($L ? @$L : ()); if ($#L >= 0) { my $o = format_comments_1 ($wordpress_p, @L); if ($o && !$wordpress_p) { $o = "
    \n$o\n
"; $o =~ s/^/ /gm; } $output .= $o; } } return $output; } # There doesn't seem to be a way to download the comments of a particular # post! This is very lame, because it means that even if we're only # downloading posts from the last week, we still need to download every # comment since the beginning of time. # # This grabs all of them and returns a hash. Key is entry_id, value # is a hash of the comments on that entry. # sub get_comments() { my $cookie = get_cookie(); my $maxid = 1; my $start = 0; my %comments; my %users; my %by_entry; while ($start < $maxid) { my $url = ("http://www.livejournal.com/export_comments.bml" . "?get=comment_meta" . "&startid=$start"); print STDERR "$progname: getting comment metadata $start-$maxid...\n" if ($verbose > 1); my $ua = LWP::UserAgent->new; $ua->default_header ('X-LJ-Auth' => "cookie"); $ua->default_header ('Cookie' => "ljsession=$cookie"); my $res = $ua->get ($url); my $ret = ($res && $res->code) || 'null'; error ("$url: bad response: $ret") unless ($ret eq '200'); my $body = $res->content || ''; my ($max2) = ($body =~ m@(\d+)@s); error ("no maxid in result: $url") unless $max2; my ($next) = ($body =~ m@(\d+)@s); $maxid = $max2 if ($max2 > $maxid); $next = $maxid+1 unless defined($next); $start = $next if ($start < $next); my ($cs) = ($body =~ m@(.*?)@si); my ($us) = ($body =~ m@(.*?)@si); $cs = '' unless $cs; $us = '' unless $us; foreach my $cc (split (/]+)'/s); error ("unparsable comment: $cc") unless ($id); $poster = -1 unless $poster; # deleted user, maybe? my %cc; $cc{id} = $id; $cc{poster} = $poster; $cc{state} = $state if $state; $comments{$id} = \%cc; } foreach my $uu (split (/']+)'/s); $users{$id} = $user; } } $start = 0; while ($start < $maxid) { my $url = ("http://www.livejournal.com/export_comments.bml" . "?get=comment_body" . "&startid=$start"); print STDERR "$progname: getting comments $start-$maxid...\n" if ($verbose > 1); my $ua = LWP::UserAgent->new; $ua->default_header ('X-LJ-Auth' => "cookie"); $ua->default_header ('Cookie' => "ljsession=$cookie"); my $res = $ua->get ($url); my $ret = ($res && $res->code) || 'null'; error ("$url: bad response: $ret") unless ($ret eq '200'); my $body = $res->content || ''; my ($cs) = ($body =~ m@(.*?)@si); $cs = '' unless $cs; foreach my $cc (split (/]+)>@s); my ($id) = ($cc1 =~ m@\bid='(\d+)'@si); my ($jid) = ($cc1 =~ m@\bjitemid='(\d+)'@si); my ($poster) = ($cc1 =~ m@\bposterid='(\d+)'@si); my ($parent) = ($cc1 =~ m@\bparentid='(\d+)'@si); my ($subj) = ($cc =~ m@(.*?)@si); my ($body) = ($cc =~ m@(.*?)@si); my ($date) = ($cc =~ m@(.*?)@si); my $ch = $comments{$id}; $ch->{jitemid} = $jid; $ch->{name} = ($poster ? $users{$poster} : undef); $ch->{parent} = $parent if defined($parent); $ch->{subject} = munge_lj_html($subj) if defined($subj); $ch->{body} = munge_lj_html($body || ''); $ch->{date} = $date; my $etable = $by_entry{$jid}; if (!defined($etable)) { my %t; $etable = \%t; $by_entry{$jid} = $etable; } $etable->{$id} = $ch; $start = $id if ($start < $id); } } return \%by_entry; } sub rss_prop($$) { my ($key, $val) = @_; $val = '' unless defined($val); $val = "" if ($val =~ m/[<>&]/s); my $k2 = $key; $k2 =~ s/\s.*$//s; return " <$key>$val\n"; } sub rss_meta($$) { my ($key, $val) = @_; return (defined($val) ? (" \n" . " " . rss_prop('wp:meta_key', $key) . " " . rss_prop('wp:meta_value', $val) . " \n") : ""); } sub wordpressify(@) { my (@entries) = @_; my %all_tags; foreach my $e (@entries) { my ($id) = ($e->{url} =~ m@/(\d+)\.html$@s); my $tags = ''; foreach my $tag (split(/\s*,\s*/, ($e->{taglist} || ''))) { my $slug = lc($tag); $slug =~ s/['"]//gs; $slug =~ s/[^-a-z\d_]+/_/gs; $slug =~ s/^_+|_+$//gs; $slug =~ s/_+/_/gs; $all_tags{$tag} = $slug; $tags .= rss_prop('category domain="tag"', $tag); $tags .= rss_prop('category domain="tag" nicename="' . $slug . '"', $tag); } my $date = $e->{eventtime}; # "2002-12-01 20:41:00" my ($yyyy, $mm, $dd, $h, $m, $s) = ($date =~ m@^(\d{4})-(\d\d)-(\d\d) (\d\d):(\d\d):(\d\d)$@s); error ("$id: unparsable date: $date") unless $yyyy; $date = mktime ($s, $m, $h, $dd, $mm-1, $yyyy-1900, 0, 0, -1); my $pubdate = strftime ("%a, %d %b %Y %H:%M:%S %Z", localtime ($date)); my $wpdate = strftime ("%Y-%m-%d %H:%M:%S", localtime ($date)); my $wpgmt = strftime ("%Y-%m-%d %H:%M:%S", gmtime ($date)); my $postname = lc($e->{subject}); $postname =~ s/['"]//gs; $postname =~ s/[^-a-z\d_]+/-/gs; $postname =~ s/^-+|-+$//gs; $postname =~ s/-+/-/gs; $postname =~ s/^(.{20}[^-]+).*$/$1/gs; $postname =~ s/^(.{30}).*$/$1/gs; my $comments = $e->{comments}; my $item = ("\n" . rss_prop('title', munge_lj_html($e->{subject})) . rss_prop('link', $e->{url}) . rss_prop('pubDate', $pubdate) . rss_prop('dc:creator', $ljuser) . $tags . rss_prop('category', 'Uncategorized') . rss_prop('category domain="category" nicename="uncategorized"', 'Uncategorized') . rss_prop('guid isPermaLink="true"', $e->{url}) . rss_prop('description', '') . rss_prop('content:encoded', munge_lj_html($e->{body})) . rss_prop('excerpt:encoded', '') . rss_prop('wp:post_id', $id) . rss_prop('wp:post_date', $wpdate) . rss_prop('wp:post_date_gmt', $wpgmt) . rss_prop('wp:comment_status', ($e->{opt_nocomments} || $e->{opt_lockcomments} ? 'closed' : 'open')) . rss_prop('wp:ping_status', 'open') . rss_prop('wp:post_name', $postname) . rss_prop('wp:status', ($e->{security} ? 'private' : 'publish')) . rss_prop('wp:post_parent', '0') . rss_prop('wp:menu_order', '0') . rss_prop('wp:post_type', 'post') . rss_prop('wp:post_password', '') . rss_prop('wp:is_sticky', '0') . rss_meta('LJ_URL', $e->{url}) . rss_meta('music', $e->{current_music}) . rss_meta('location', $e->{current_location}) . rss_meta('security', $e->{security}) . ($comments ? format_comments (1, $comments) : '') . "\n"); $item =~ s/^/ /gm; $e = $item; } my $ljurl = "http://$ljuser.livejournal.com/"; my $pubdate = strftime ("%a, %d %b %Y %H:%M:%S %Z", localtime (time)); my $output = ' ' . $ljuser . ' ' . $ljurl . ' ' . $ljuser . '\'s Livejournal ' . $pubdate . ' ' . "$progname $version" . ' en 1.0 ' . $ljurl . ' ' . $ljurl . ' uncategorized Uncategorized ' . "\n"; foreach my $tag (sort keys (%all_tags)) { my $slug = $all_tags{$tag}; my $tt = ("\n" . rss_prop('wp:tag_slug', $slug) . rss_prop('wp:tag_name', $tag) . "\n"); $tt =~ s/^/ /gm; $output .= $tt; } $output .= join("", @entries); $output .= " \n\n"; print STDOUT $output; } my $month_re = ('\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|' . 'may|june?|july?|aug(?:ust)?|sep(?:tember)?|sept|' . 'oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\b'); my %monthvals = ( 'jan' => 1, 'january' => 1, 'february' => 2, 'feb' => 2, 'march' => 3, 'mar' => 3, 'april' => 4, 'apr' => 4, 'may' => 5, 'jun' => 6, 'june' => 6, 'jul' => 7, 'july' => 7, 'august' => 8, 'aug' => 8, 'sep' => 9, 'sept' => 9, 'september' => 9, 'oct' => 10, 'october' => 10, 'nov' => 11, 'november' => 11, 'dec' => 12, 'december' => 12 ); sub parse_date($) { my ($date) = @_; usage() unless $date; my ($y, $m, $d); my ($hh, $mm, $ss) = (0, 0, 0); # YYYY-MM-DD if ($date =~ m/^(\d{4})[-\s]+(\d\d)[-\s]+(\d\d)$/s) { ($y,$m,$d) = ($1,$2,$3); # DD-MMM-YYYY } elsif ($date =~ m/^(\d\d?)[-\s]+($month_re)[-\s]+(\d{4})$/si) { ($d,$m,$y) = ($1,$2,$3); $m = $monthvals{lc($m)}; # "30d", "30 days", "1 year". } elsif ($date =~ m/^(\d+)\s*(m(on(ths?)?)?|d(ays?)?|y(ears?)?)$/si) { my ($n, $t) = ($1, $2); $t =~ s/^(.).*$/$1/s; $n *= ($t eq 'd' ? 1 : $t eq 'm' ? 30 : 365); my $tt = time(); $tt -= ($n * 60 * 60 * 24); ($ss, $mm, $hh, $d, $m, $y) = localtime ($tt); $m++; $y += 1900; } error ("unparsable date: $date") unless $m; $date = sprintf ("%04d-%02d-%02d %02d:%02d:%02d", $y, $m, $d, $hh, $mm, $ss); return $date; } sub error($) { my ($err) = @_; print STDERR "$progname: $err\n"; exit 1; } sub usage() { print STDERR "usage: $progname [--verbose] [--debug] [--since DATE] [--before DATE] [--lock | --unlock] [--wordpress] [ entry-numbers... ]\n"; exit 1; } sub main() { my @items = (); my ($since, $before, $lock_p, $comments_p, $wordpress_p, $redirect_p); while ($#ARGV >= 0) { $_ = shift @ARGV; if (m/^--?verbose$/s) { $verbose++; } elsif (m/^-v+$/) { $verbose += length($_)-1; } elsif (m/^--?q(uiet)?$/s) { $verbose = 0; } elsif (m/^--?debug$/s) { $debug_p++; } elsif (m/^--?since$/s) { $since = parse_date (shift @ARGV); } elsif (m/^--?before$/s) { $before = parse_date (shift @ARGV); } elsif (m/^--?lock$/s) { $lock_p = 1; } elsif (m/^--?unlock$/s) { $lock_p = 0; } elsif (m/^--?comments$/s) { $comments_p = 1; } elsif (m/^--?wordpress$/s) { $wordpress_p = 1; } elsif (m/^--?redirect$/s) { $redirect_p = 1; } elsif (m/^-./) { usage; } elsif (m/^\d+$/) { push @items, $_; } else { usage; } } usage() if ($wordpress_p && defined($lock_p)); usage() if ($comments_p && defined($lock_p)); # foreach (@items) { $_ = $_ >> 8; } get_entries ($since, $before, $lock_p, $comments_p, $wordpress_p, $redirect_p, @items); } main(); exit 0;