#!/usr/bin/perl -w # Copyright © 2006-2013 Jamie Zawinski # # Permission to use, copy, modify, distribute, and sell this software and its # documentation for any purpose is hereby granted without fee, provided that # the above copyright notice appear in all copies and that both that # copyright notice and this permission notice appear in supporting # documentation. No representations are made about the suitability of this # software for any purpose. It is provided "as is" without express or # implied warranty. # # Bulk-downloads images from a variety of web photo galleries. Works on: # # - Flickr # - Facebook # - Picasa # - OvaHere # - SmugMug, SF Station # - SF Weekly, LA Weekly, Village Voice # - Zen Folio # - Google Drive # - SkyDrive # # To customize: add entries to the @handlers list. # # Install "exiftool" to make downloaded file names to be properly sorted # by the date the photo was taken. # # When downloading from Facebook, it uses your Safari cookies to log in. # If you want to use a browser other than Safari, code needs to be written. # # Created: 29-Dec-2006. require 5; use diagnostics; use strict; use LWP::Simple; use HTTP::Cookies; use Date::Parse; my $progname = $0; $progname =~ s@.*/@@g; my $version = q{ $Revision: 1.34 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/; my $verbose = 1; my $debug_p = 0; my @handlers = ( { url => '^https?://[a-z.]*flickr\.com/', list => \&flickr_list }, { url => '^https?://[a-z.]*facebook\.com/', list => \&facebook_list }, { url => '^https?://picasaweb\.google\.com/', list => \&picasa_list }, { url => '^https?://[a-z.]*ovahere\.com/', list => \&ovahere_list }, { body => 'class="smugmug', # Any site running Smugmug list => \&smug_list }, { body => "class='vvmlb'", # sfweekly.com, villagevoice.com, etc. list => \&weekly_list }, { body => 'cdn\.zenfolio\.net/', # Any site running Zen Folio list => \&zenfolio_list }, { url => '^https?://drive\.google\.com/', list => \&googledrive_list }, { url => '^https?://skydrive\.live\.com/', list => \&skydrive_list }, ); sub url_unquote($) { my ($u) = @_; $u =~ s/[+]/ /g; $u =~ s/%([a-z0-9]{2})/chr(hex($1))/ige; return $u; } # Find a handler for the URL and run it. # sub download_gallery($) { my ($url) = @_; $LWP::Simple::ua->agent ("$progname/$version"); my $body = undef; foreach my $h (@handlers) { my $m1 = $h->{url}; my $m2 = $h->{body}; error ("must specify either 'url' or 'body' pattern") unless ($m1 || $m2); my $matchedp = 0; if ($m1 && $url =~ m/$m1/six) { print STDERR "$progname: matched URL: $m1\n" if ($verbose > 2); $matchedp = 1; } if (!$matchedp && $m2) { $body = LWP::Simple::get($url) unless $body; if ($body =~ m/$m2/six) { print STDERR "$progname: matched body: $m2\n" if ($verbose > 2); $matchedp = 1; } } if ($matchedp) { my ($title, $imgs) = $h->{list} ($url, $body); my $i = 0; my $dir = pick_directory ($url, $title); $dir =~ s@/+$@@s; if (! -d $dir) { print STDERR "$progname: mkdir $dir/\n" if ($verbose); mkdir ($dir) unless ($debug_p); } my @ofiles = (); foreach my $img (@$imgs) { next unless $img; $i++; my ($suf) = ($img =~ m@\.([^/.]+)$@si); error ("no suffix: $img") unless $suf; $img =~ s@\#.*$@@s; # lose anchor my $file = sprintf("%s/%03d.%s", $dir, $i, $suf); if ($debug_p) { print STDERR "$progname: not downloading: $img\n"; } else { unlink $file; if ($h->{down}) { $h->{down} ($img, $file); } else { print STDERR "$progname: downloading: $img\n" if ($verbose > 1); LWP::Simple::getstore ($img, $file); } if (! -f $file) { print STDERR "$progname: ERROR: $url: unsaved!\n"; } else { print STDERR "$progname: wrote $file\n" if ($verbose); parse_exif ($file); } } push @ofiles, $file; } rename_by_date (@ofiles); return; } } error ("unrecognized URL: $url"); } # Convert a gallery's title to a sensible directory name. # sub pick_directory($$) { my ($url, $title) = @_; ($title) = ($url =~ m@([^/]+)/*$@si) unless $title; $title =~ s@ - .+?$@@si; $title = lc($title); $title =~ s/[^a-z\d]+/_/gsi; $title =~ s/_+/_/gsi; $title =~ s/^_+|_+$//gsi; return $title; } # Set file mtime to EXIF's time. # sub parse_exif($) { my ($file) = @_; my $cmd = "exiftool -q -'DateTimeOriginal>FileModifyDate' '$file'"; print STDERR "$progname: exec: $cmd\n" if ($verbose > 2); $cmd .= ' 2>&-' unless ($verbose > 2); # exiftool won't shut up safe_system ($cmd); } # Rename all of the files to be sequential by creation date. # sub rename_by_date(@) { my (@files) = @_; print STDERR "$progname: renaming by date...\n" if ($verbose); print STDERR "\n" if ($debug_p || $verbose > 1); my %dates; foreach my $f (@files) { my $date = (stat($f))[9]; $dates{$f} = ($date || -1); # error ("$f does not exist") unless ($date || $debug_p); if (! $date) { print STDERR "$f does not exist\n" unless ($debug_p); next; } } # Find the target file names in the new order # @files = sort { $dates{$a} <=> $dates{$b} } @files; my $i = 1; my %rename; my %swap; foreach my $f (@files) { my ($dir, $suf) = ($f =~ m@^(.*?)[^/]+\.([^/.]+)$@s); $rename{$f} = sprintf("%s%03d.%s", $dir, $i, $suf); $swap{$f} = sprintf("%s.%03d.%s", $dir, $i, $suf); $i++; } # Rename to dot files # foreach my $f1 (@files) { next unless ($debug_p || $dates{$f1} > 0); my $f2 = $swap{$f1}; rename ($f1, $f2) || error ("mv $f1 $f2: $!") unless $debug_p; print STDERR "$progname: mv $f1 $f2\n" if ($debug_p || $verbose > 1); } # Rename them back # print STDERR "\n" if ($debug_p || $verbose > 1); foreach my $f0 (@files) { next unless ($debug_p || $dates{$f0} > 0); my $f1 = $swap{$f0}; my $f2 = $rename{$f0}; rename ($f1, $f2) || error ("mv $f1 $f2: $!") unless $debug_p; print STDERR "$progname: mv $f1 $f2\n" if ($debug_p || $verbose > 1); } } # Like system() but respects error codes. # sub safe_system(@) { my @cmd = @_; system (@cmd); my $exit_value = $? >> 8; my $signal_num = $? & 127; my $dumped_core = $? & 128; error ("$cmd[0]: core dumped!") if ($dumped_core); error ("$cmd[0]: signal $signal_num!") if ($signal_num); # error ("$cmd[0]: exited with $exit_value!") if ($exit_value); return $exit_value; } # expands the first URL relative to the second. # sub expand_url($$) { my ($url, $base) = @_; $url =~ s/^\s+//gs; # lose whitespace at front and back $url =~ s/\s+$//gs; $url =~ s@^//@http://@; # amazingly, this is legal if (! ($url =~ m/^[a-z]+:/)) { $base =~ s@(\#.*)$@@; # strip anchors $base =~ s@(\?.*)$@@; # strip arguments $base =~ s@/[^/]*$@/@; # take off trailing file component my $tail = ''; if ($url =~ s@(\#.*)$@@) { $tail = $1; } # save anchors if ($url =~ s@(\?.*)$@@) { $tail = "$1$tail"; } # save arguments my $base2 = $base; $base2 =~ s@^([a-z]+:/+[^/]+)/.*@$1@ # if url is an absolute path if ($url =~ m@^/@); my $ourl = $url; $url = $base2 . $url; $url =~ s@/\./@/@g; # expand "." 1 while ($url =~ s@/[^/]+/\.\./@/@g); # expand ".." $url .= $tail; # put anchors/args back print STDERR "$progname: relative URL: $ourl --> $url\n" if ($verbose > 6); } else { print STDERR "$progname: absolute URL: $url\n" if ($verbose > 7); } return $url; } # Use Safari cookies when accessing Facebook. Fucking Facebook. # (If you use Firefox, this code will need to be different.) # BEGIN { eval 'use Foundation;' } sub load_safari_cookies($$) { my ($ua, $match) = @_; my $jar = HTTP::Cookies->new(); my $storage = eval 'NSHTTPCookieStorage->sharedHTTPCookieStorage'; if (!$storage) { print STDERR "$progname: unable to load Safari cookies.\n"; return; } my $count = 0; my $enum = $storage->cookies->objectEnumerator; while (my $cookie = $enum->nextObject) { last unless $$cookie; my $domain = $cookie->domain->cString; my $name = $cookie->name->cString; my $path = $cookie->path->cString; my $value = $cookie->value->cString; my $secure = 0; my $version = 0; my $port = undef; my $maxage = 100000; next unless ($domain =~ m/$match/si); print STDERR "$progname: cookie:" . " domain=$domain" . " name=$name" . " path=$path" . " value=$value" . "\n" if ($verbose > 2); $jar->set_cookie ($version, $name, $value, $path, $domain, $port, $path, $secure, $maxage); $count++; } $ua->cookie_jar ($jar); print STDERR "$progname: loaded $count Safari cookies\n" if ($verbose > 1); } ###################################################################### Flickr sub flickr_list($$) { my ($base_url, $body) = @_; my $title = undef; my @imgs = (); my $url = $base_url; # /photos/NAME/NNN/in/set-MMM/ -> /photos/NAME/sets/MMM/ $url =~ s@^(.*)/\d+/in/set-(\d+)/?$@$1/sets/$2@si; $url =~ m@/(sets|date-taken)/@ || error ("this only works on \"/sets/\" or \"/date-taken/\" URLs: $url"); $url =~ s@/with/\d+/?$@/@si; my ($dir) = ($url =~ m@/([^/]+/sets/[^/]+)/?(\?[^/]+)?$@); ($dir) = ($url =~ m@/([^/]+/archives/date-taken/\d+/\d+)/@) unless $dir; error ("unparsable directory: $url") unless $dir; $dir =~ s@(/sets|/archives/date-taken)/@_@s; $dir =~ s@/@_@gs; $dir =~ s@\?.*$@@s; print STDERR "$progname: parsing $url\n" if ($verbose); $body = (LWP::Simple::get($url) || '') unless $body; error ("must be signed in for $url") if ($body =~ m/You must be signed in to see this content/i); if (! defined($title)) { ($title) = ($body =~ m@]*>\s*(.*?)\s*@si); my ($fn) = ($body =~ m@"first_name":\s*"([^\"]+)"@si); my ($ln) = ($body =~ m@"last_name":\s*"([^\"]+)"@si); $fn = "$fn $ln" if ($fn && $ln); $title = "$fn: $title" if $fn; } ($body) = ($body =~ m@var yconf = {(.*?)@si); error ("unparsable JSON in $url") unless $body; $body =~ s/\s+/ /gs; $body =~ s/("o":)/\n$1/gs; my $count = 0; my @lines = split (/\n/, $body); shift @lines; foreach (@lines) { my ($url2) = m@"url":\s*"(http.+?)"@s; next unless $url2; $url2 =~ s@\\@@gs; push @imgs, $url2; $count++; print STDERR "$progname: queue: $url2\n" if ($verbose > 2); } error ("no images on $url") unless ($count > 0); return ($title, \@imgs); } # This crud is no longer needed with the 2013 Flickr redesign: # #sub flickr_bogus_image_p($) { # my ($img) = @_; # my $in; # return 1 unless open ($in, '<', $img); # #error ("$img: $!"); # my $head = ''; # sysread ($in, $head, 100); # close $in; # $head = substr($head, 0, 6); # return 1 if ($head =~ m/GIF8[7,9]a/); # return 0; #} # # #sub flickr_down($$$) { # my ($url, $file) = @_; # # # Instead of returning an error code when you ask for an image size that # # doesn't exist, Flickr gives you a GIF that says "Image Unavailable". # # So try them all until we get a non-GIF... # # # my @suffixes = ("_o", # original, > 1600 -- different hash # "_h", # large, 1600 max -- different hash # "_b", # large, 1024 max # "_c", # medium, 800 max # "_z", # medium, 640 max # "", # medium, 500 max # "_n", # small, 320 max # "_m", # small, 240 max # # "_t", # thumb, 100 max # # "_s", # square, 75x75 # ); # my ($urlA, $urlB) = ($url =~ m@^(.*)(\.[^/.]+)$@); # my ($fileA, $fileB) = ($file =~ m@^(.*)(\.[^/.]+)$@); # foreach my $suf (@suffixes) { # $url = $urlA . $suf . $urlB; # unlink ($file); # # print STDERR "$progname: downloading: $url\n" if ($verbose > 1); # LWP::Simple::getstore ($url, $file); # # return unless (flickr_bogus_image_p ($file)); # # print STDERR "$progname: bogus: $url\n" if ($verbose > 2); # unlink $file; # } # # error ("no images: $urlA$urlB"); #} ###################################################################### Smugmug sub smug_list($$) { my ($base_url, $body) = @_; my $title = undef; my @imgs = (); my $url = $base_url; $url =~ s@#.*$@@s; my ($host) = ($url =~ m@^(http://[^/]+)@si); my ($dir) = ($url =~ m@/(\d+_[\dA-Z]+)$@si); error ("no id in $url") unless $dir; # Smugmug's galleries are a gigantic pain in the ass to parse, so instead # we parse their RSS feed -- however, some Smugmug sites (maybe the "Pro" # ones?) do not give us RSS at all, and in that case, we're just fucked. $url = ("$host/hack/feed.mg" . "?Type=gallery" . "&Data=$dir" . "&ImageCount=9999" . "&Paging=0" . "&format=atom10"); print STDERR "$progname: parsing $url\n" if ($verbose); $body = (LWP::Simple::get($url) || ''); error ("RSS feed is disabled: $url") unless $body; my @iimgs; $body =~ s!([^<>]+)!{ push @iimgs, $1; }!gsexi; ($title) = ($body =~ m@]*>\s*(.*?)\s*@si); my ($by) = ($body =~ m@photos? by ([^<>]+)@si); $by =~ s/ See event.*//si if $by; $title = "$by: $title" if $by; my %done; foreach my $img (@iimgs) { next unless ($img =~ m/\.jpg$/s); $img =~ s/-\d\.jpg$/.jpg/s; $img =~ s/-Th\.jpg$/-O.jpg/s; $img =~ s@/Th/@/O/@s; next if ($done{$img}); $done{$img} = 1; push @imgs, $img; print STDERR "$progname: queue: $img\n" if ($verbose > 2); } return ($title, \@imgs); } #################################################################### Zen Folio sub zenfolio_list($$) { my ($base_url, $body) = @_; my $title = undef; my @imgs = (); my $url = $base_url; $url =~ s@#.*$@@s; my ($host) = ($url =~ m@^(http://[^/]+)@si); my ($dir) = ($url =~ m@/([\dA-Z]+)/?$@si); error ("no id in $url") unless $dir; my ($rss) = ($body =~ m@href="([^<>\"]+/recent\.rss)"@si); error ("no RSS in $url") unless $rss; print STDERR "$progname: parsing $rss\n" if ($verbose); $body = (LWP::Simple::get($rss) || ''); my @iimgs; $body =~ s!]*? url="([^<>\"]+)"!{ push @iimgs, $1; }!gsexi; ($title) = ($body =~ m@]*>\s*(.*?)\s*@si); my ($by) = ($body =~ m@photos? by ([^<>]+)@si); $by =~ s/ See event.*//si if $by; $title = "$by: $title" if $by; my %done; foreach my $img (@iimgs) { next unless ($img =~ m/\.jpg$/s); next if ($done{$img}); $done{$img} = 1; push @imgs, $img; print STDERR "$progname: queue: $img\n" if ($verbose > 2); } return ($title, \@imgs); } ###################################################################### SFWeekly sub weekly_list($$) { my ($base_url, $body) = @_; my $title = undef; my @imgs = (); my $url = $base_url; print STDERR "$progname: parsing $url\n" if ($verbose); $body = (LWP::Simple::get($url) || '') unless $body; ($title) = ($body =~ m@]*>\s*(.*?)\s*@si) unless defined ($title); $title =~ s@ - Slideshows$@@si; $title =~ s@ - San Francisco$@@si; my ($imgs) = ($body =~ m@photoData:\s*{(.*)}},@si); error ("No photoData in $url") unless $imgs; foreach (split (/\}/, $imgs)) { s/\\//gs; my ($img) = m/"Photo":"(.*?)"/si; next unless $img; $img =~ s@\.\d+\.jpg$@.0.jpg@s; # Get the larger sized image $img = expand_url ($img, $base_url); push @imgs, $img; print STDERR "$progname: queue: $img\n" if ($verbose > 2); } return ($title, \@imgs); } ###################################################################### OvaHere sub ovahere_list($$) { my ($base_url, $body) = @_; my $title = undef; my @imgs = (); my $url = $base_url; $url =~ m@gallerydetail@i || error ("this only works on \"gallerydetail\" URLs: $url"); while ($url) { print STDERR "$progname: parsing $url\n" if ($verbose); $body = (LWP::Simple::get($url) || '') unless $body; ($title) = ($body =~ m@]*>\s*(.*?)\s*@si) unless $title; $body =~ s/\s+/ /gs; $body =~ s/]*SRC=\"([^<>\"]+)\"@si; next unless $url2; next unless ($url2 =~ m@\.jpg$@); next unless ($url2 =~ m@/Gallery/@); ($url2 =~ s@__w\d\d+\.@__w1000.@s) || error ("unable to edit size in URL: $url2"); push @imgs, $url2; print STDERR "$progname: queue: $url2\n" if ($verbose > 2); } # Get the next page too. if ($body =~ m@(]*VALUE="Next"[^<>]*>)@si) { my $u2 = $url; if ($u2 =~ m/PageNum=(\d+)/) { my $p = $1; $p++; $u2 =~ s@(PageNum=)\d+@$1$p@s; } else { $u2 .= "&PageNum=2"; } $url = $u2; } else { $url = undef; } $body = undef; } return ($title, \@imgs); } ###################################################################### Facebook sub facebook_list($$) { my ($url, $body) = @_; $url =~ m@media/set@i || error ("this only works on \"media/set/\" URLs: $url"); # Use the mobile site, because it gives us real HTML instead of AJAX fuckery. $url =~ s@^(https://)www\.@$1m.@si; print STDERR "$progname: parsing $url\n" if ($verbose); # Fucking Facebook. load_safari_cookies ($LWP::Simple::ua, '\bfacebook\.com$'); # { # my $ua = $LWP::Simple::ua; # $ua->add_handler("request_send", sub { shift->dump; return }); # $ua->add_handler("response_done", sub { shift->dump; return }); # } $body = (LWP::Simple::get($url) || '') unless $body; error ("$1") if ($body =~ m/(This content is currently unavailable)/si); error ("$1") if ($body =~ m/class="main_message">(You must log in[^<>]*)/si); error ("null response") unless $body; $body =~ s@\\u003c@>@gsi; my @imgs = (); my ($title) = ($body =~ m@]*>\s*(.*?)\s*@si); error ("no title? that's unlikely.") unless $title; my ($by) = ($body =~ m@ by ]*>(.*?)@si); $title = "$by: $title" if ($by); $title =~ s/ \| .*//s; my $ourl = $url; my $start = 0; my %dup; while (1) { my $count = 0; $body =~ s/ 2); } last if ($count == 0); $start += $count; $url = "$ourl&s=$start"; $body =~ s@\\u003c@>@gsi; print STDERR "$progname: parsing $url\n" if ($verbose); $body = (LWP::Simple::get($url) || ''); } return ($title, \@imgs); } ###################################################################### Picasa sub picasa_list($$) { my ($base_url, $body) = @_; my @imgs = (); my $url = $base_url; print STDERR "$progname: parsing $url\n" if ($verbose); $body = (LWP::Simple::get($url) || '') unless $body; if (! ($body =~ m@]*? \s+ type="application/rss\+xml" [^<>]*? \s+ href="([^<>\"]+)"@sxi)) { error ("no RSS link in $url"); } $url = $1; print STDERR "$progname: parsing $url\n" if ($verbose); $body = (LWP::Simple::get($url) || ''); my ($title) = ($body =~ m@]*>\s*(.*?)\s*@si); my ($author) = ($body =~ m@([^<>]+)@si); $title = "$title, $author" if $author; $body =~ s/\s+/ /gs; $body =~ s/(]*? \b url=[\'\"] ([^\'\"]+) @six); next unless $url2; next unless ($url2 =~ m@\.(jpg|png)$@); push @imgs, $url2; } return ($title, \@imgs); } ############################################################### Google Drive sub googledrive_list($$) { my ($base_url, $body) = @_; my ($title) = ($body =~ m@(.*?)@si); my ($data) = ($body =~ m@\bvar data\b(.*?)};@s); error ("no data") unless $data; ($data) = ($data =~ m@viewerItems:\s*\[(.*?\])\s*\]@s); error ("no viewerItems") unless $data; $data =~ s/\s+/ /gs; my @imgs = (); my %done; foreach my $line (split(/\]\s*,/, $data)) { my ($img) = ($line =~ m@"(https?:.*?)"@si); next unless $img; next if ($done{$img}); $done{$img} = 1; $img .= '#.jpg'; # needs a suffix push @imgs, $img; print STDERR "$progname: queue: $img\n" if ($verbose > 2); } return ($title, \@imgs); } ################################################################## SkyDrive sub skydrive_list($$) { my ($base_url, $body) = @_; my ($title) = ($body =~ m@(.*?)@si); if (! $title) { ($title) = ($body =~ m@"creatorName":\s*"(.*?)"@s); my ($d) = ($body =~ m@"displayCreationDate":\s*"(.*?)"@s); $title .= " $d" if $d; } if ($title) { $title =~ s@/@-@gs; $title =~ s/\\//gs; } my ($data) = ($body =~ m@\bvar primedResponse\b(.*?)};@s); error ("no data") unless $data; $data =~ s/\s+/ /gs; $data =~ s/("download")/\n$1/gs; my @imgs = (); my %done; foreach my $line (split(/\n/, $data)) { my ($img) = ($line =~ m@"(https?:.*?)"@si); next unless $img; $img =~ s/\\//gs; next unless ($img =~ m/\bdownload\b/s); $img =~ s/\?.*$//s; next if ($done{$img}); $done{$img} = 1; $img .= '#.jpg'; # needs a suffix push @imgs, $img; print STDERR "$progname: queue: $img\n" if ($verbose > 2); } return ($title, \@imgs); } ###################################################################### sub error($) { my ($err) = @_; print STDERR "$progname: $err\n"; exit 1; } sub usage() { print STDERR "usage: $progname [--verbose] [--debug] gallery-url ...\n"; exit 1; } sub main() { my @urls; while ($#ARGV >= 0) { $_ = shift @ARGV; if (m/^--?verbose$/) { $verbose++; } elsif (m/^-v+$/) { $verbose += length($_)-1; } elsif (m/^--?debug$/) { $debug_p++; } elsif (m/^-./) { usage; } else { push @urls, $_; } } usage unless ($#urls >= 0); foreach my $url (@urls) { download_gallery ($url); } } main(); exit 0;