- 可以关掉脚本重新运行,会判断已完成的部分节省时间。
- 没开 keep_alive,提取效率偏低
Mojo::UserAgent 版,提高效率,请跳转 3楼
Code: [show] | [select all]
=info
Author: 523066680/vicyang
Date: 2018-11
=cut
use Encode;
use LWP::UserAgent;
use Mojo::DOM;
use File::Slurp;
use File::Basename qw/basename/;
use File::Path qw/mkpath/;
STDOUT->autoflush(1);
our $theme = "chemo";
our $wdir = "D:/temp/wallpaper_zol/$theme";
our $main = "http://desk.zol.com.cn";
my $ua = LWP::UserAgent->new();
our @headers = (
"Host" => "desk.zol.com.cn",
"User-Agent" => "Firefox/63.0",
);
mkpath $wdir unless -e $wdir;
chdir $wdir;
# 获取所有主题链接
my @items;
my $iter = 1;
while ( get_item( $main ."/${theme}/${iter}.html", \@items ) >= 1 )
{
$iter++;
}
# 遍历页面、提取图片
my $idx = 1;
for my $item ( @items )
{
printf "[%03d/%d] %s %s\n", $idx++ , $#items+1, $item->{link}, $item->{title};
get_pages( $item->{link}, $item->{title} );
}
sub get_item
{
my ( $link, $ref ) = @_;
my $res = try_to_get( $link );
my $dom = Mojo::DOM->new( $res->content );
for my $e ( $dom->find(".photo-list-padding")->each )
{
printf "%s %s\n", $e->at("a")->attr("href"), $e->at("span")->attr("title");
push @$ref, {'link' => $main . $e->at("a")->attr("href"),
'title' => $e->at("span")->attr("title") };
}
# 判断是否为最后一页
if ( defined $dom->at("#pageNext") ) { return 1 }
else { return 0 }
}
# --- Get each pages of item --- #
sub get_pages
{
my ($link, $title) = @_;
my $res = try_to_get( $link );
my $dom = Mojo::DOM->new( $res->content );
my $path = "${wdir}/${title}";
mkpath $path unless -e $path;
chdir $path;
# 图片数量
my $pics = $dom->at(".photo-list-box li i")->text;
$pics=~s/[^\d]//; #去除斜杠
my @files = glob "*.jpg";
if ( $#files+1 == $pics ) {
printf "Images already exist\n";
return;
}
for my $e ($dom->find(".photo-list-box a")->each )
{
#printf "%s\n", $e->attr("href");
get_pic( $main . $e->attr("href") );
}
}
sub get_pic
{
my ( $link ) = @_;
my $res = try_to_get( $link );
return unless (defined $res);
my $dom = Mojo::DOM->new( $res->content );
my $pic_url;
my $pic_name;
my $obj = $dom->at(".wallpaper-down dd a");
my $sub_url;
while (1)
{
$sub_url = $obj->attr("href");
# 某些图片没有提供指定分辨率的链接
if ( $sub_url !~/\.html/ ) {
printf "Did not found picture url, skip %s\n", $sub_url;
return;
}
$pic_name = basename($sub_url);
$pic_name =~ s/\.html/\.jpg/i;
printf "%s\n", $pic_name;
return if ( -e $pic_name );
my $res = try_to_get( "${main}${sub_url}" );
return unless (defined $res);
my $dom = Mojo::DOM->new( $res->content );
my $dl = $ua->mirror( $dom->at("")->attr("src"), $pic_name );
# 如果下载失败就选择下一个分辨率的图片
if ( $dl->code != 502 ) { last }
else { $obj = $obj->next }
}
}
sub try_to_get
{
our @headers;
my $link = shift;
my $ua = LWP::UserAgent->new();
my $res;
my $retry = 0;
do
{
$res = $ua->get( $link, @headers );
if ( $retry > 0 and $retry < 5 ) { print "Retry times: $retry\n"; }
elsif ( $retry > 5 ) { print "False\n"; return undef }
$retry++;
}
until ( $res->is_success );
return $res;
}