[Perl]批量爬取ZOL壁纸 - 车模系列

There's more than one way to do it!
https://metacpan.org http://perlmonks.org
回复
头像
523066680
Administrator
Administrator
帖子: 573
注册时间: 2016年07月19日 12:14
联系:

[Perl]批量爬取ZOL壁纸 - 车模系列

帖子 523066680 »

LWP::UserAgent 版
  • 可以关掉脚本重新运行,会判断已完成的部分节省时间。
  • 没开 keep_alive,提取效率偏低
运行环境: Straberry Perl 5.24

Mojo::UserAgent 版,提高效率,请跳转 3楼
=info
Author: 523066680/vicyang
Date: 2018-11
=cut

use Encode;
use LWP::UserAgent;
use Mojo::DOM;
use File::Slurp;
use File::Basename qw/basename/;
use File::Path qw/mkpath/;
STDOUT->autoflush(1);

our $theme = "chemo";
our $wdir = "D:/temp/wallpaper_zol/$theme";
our $main = "http://desk.zol.com.cn";
my $ua = LWP::UserAgent->new();
our @headers = (
"Host" => "desk.zol.com.cn",
"User-Agent" => "Firefox/63.0",
);

mkpath $wdir unless -e $wdir;
chdir $wdir;

# 获取所有主题链接
my @items;
my $iter = 1;
while ( get_item( $main ."/${theme}/${iter}.html", \@items ) >= 1 )
{
$iter++;
}

# 遍历页面、提取图片
my $idx = 1;
for my $item ( @items )
{
printf "[%03d/%d] %s %s\n", $idx++ , $#items+1, $item->{link}, $item->{title};
get_pages( $item->{link}, $item->{title} );
}

sub get_item
{
my ( $link, $ref ) = @_;
my $res = try_to_get( $link );
my $dom = Mojo::DOM->new( $res->content );

for my $e ( $dom->find(".photo-list-padding")->each )
{
printf "%s %s\n", $e->at("a")->attr("href"), $e->at("span")->attr("title");
push @$ref, {'link' => $main . $e->at("a")->attr("href"),
'title' => $e->at("span")->attr("title") };
}
# 判断是否为最后一页
if ( defined $dom->at("#pageNext") ) { return 1 }
else { return 0 }
}

# --- Get each pages of item --- #

sub get_pages
{
my ($link, $title) = @_;
my $res = try_to_get( $link );
my $dom = Mojo::DOM->new( $res->content );

my $path = "${wdir}/${title}";
mkpath $path unless -e $path;
chdir $path;

# 图片数量
my $pics = $dom->at(".photo-list-box li i")->text;
$pics=~s/[^\d]//; #去除斜杠

my @files = glob "*.jpg";
if ( $#files+1 == $pics ) {
printf "Images already exist\n";
return;
}

for my $e ($dom->find(".photo-list-box a")->each )
{
#printf "%s\n", $e->attr("href");
get_pic( $main . $e->attr("href") );
}
}

sub get_pic
{
my ( $link ) = @_;
my $res = try_to_get( $link );
return unless (defined $res);

my $dom = Mojo::DOM->new( $res->content );
my $pic_url;
my $pic_name;

my $obj = $dom->at(".wallpaper-down dd a");
my $sub_url;

while (1)
{
$sub_url = $obj->attr("href");
# 某些图片没有提供指定分辨率的链接
if ( $sub_url !~/\.html/ ) {
printf "Did not found picture url, skip %s\n", $sub_url;
return;
}

$pic_name = basename($sub_url);
$pic_name =~ s/\.html/\.jpg/i;
printf "%s\n", $pic_name;
return if ( -e $pic_name );

my $res = try_to_get( "${main}${sub_url}" );
return unless (defined $res);

my $dom = Mojo::DOM->new( $res->content );
my $dl = $ua->mirror( $dom->at("")->attr("src"), $pic_name );

# 如果下载失败就选择下一个分辨率的图片
if ( $dl->code != 502 ) { last }
else { $obj = $obj->next }
}
}

sub try_to_get
{
our @headers;
my $link = shift;
my $ua = LWP::UserAgent->new();
my $res;
my $retry = 0;
do
{
$res = $ua->get( $link, @headers );
if ( $retry > 0 and $retry < 5 ) { print "Retry times: $retry\n"; }
elsif ( $retry > 5 ) { print "False\n"; return undef }
$retry++;
}
until ( $res->is_success );

return $res;
}
头像
523066680
Administrator
Administrator
帖子: 573
注册时间: 2016年07月19日 12:14
联系:

Re: [Perl]批量爬取ZOL壁纸 - 车模系列

帖子 523066680 »

在尝试改为Mojo::UserAgent 的时候。发现官网给出的 save_to 方法是无效的

代码: 全选

# Follow redirects to download Mojolicious from GitHub
$ua->max_redirects(5)
  ->get('https://www.github.com/mojolicious/mojo/tarball/master')
  ->result->save_to('/home/sri/mojo.tar.gz');
暂时用 write_file 函数把 $res->body 写到文件
头像
523066680
Administrator
Administrator
帖子: 573
注册时间: 2016年07月19日 12:14
联系:

Mojo::UserAgent

帖子 523066680 »

=info Author: 523066680/vicyang Date: 2018-11 =cut use Encode; use Mojo::UserAgent; use Mojo::DOM; use File::Slurp; use File::Basename qw/basename/; use File::Path qw/mkpath/; STDOUT->autoflush(1); our $theme = "meishi"; our $wdir = "F:/Wallpaper/zol/$theme"; our $main = "http://desk.zol.com.cn"; our $ua = Mojo::UserAgent->new(); our @headers = ( "Host" => "desk.zol.com.cn", "User-Agent" => "Firefox/63.0", ); mkpath $wdir unless -e $wdir; chdir $wdir; # 获取所有主题链接 my @items; my $iter = 1; while ( get_item( $main ."/${theme}/${iter}.html", \@items ) >= 1 ) { $iter++; } # 遍历页面、提取图片 my $idx = 1; for my $item ( @items ) { printf "[%03d/%d] %s %s\n", $idx++ , $#items+1, $item->{link}, $item->{title}; get_pages( $item->{link}, $item->{title} ); } sub get_item { my ( $link, $ref ) = @_; my $res = try_to_get( $link ); my $dom = $res->dom; for my $e ( $dom->find(".photo-list-padding")->each ) { printf "%s %s\n", $e->at("a")->attr("href"), $e->at("span")->attr("title"); push @$ref, {'link' => $main . $e->at("a")->attr("href"), 'title' => $e->at("span")->attr("title") }; } # 判断是否为最后一页 if ( defined $dom->at("#pageNext") ) { return 1 } else { return 0 } } # --- Get each pages of item --- # sub get_pages { my ($link, $title) = @_; my $res = try_to_get( $link ); my $dom = $res->dom; my $path = "${wdir}/${title}"; mkpath $path unless -e $path; chdir $path; # 图片数量 my $pics = $dom->at(".photo-list-box li i")->text; $pics=~s/[^\d]//; #去除斜杠 my @files = glob "*.jpg"; if ( $#files+1 == $pics ) { printf "Images already exist\n"; return; } for my $e ($dom->find(".photo-list-box a")->each ) { #printf "%s\n", $e->attr("href"); get_pic( $main . $e->attr("href") ); } } sub get_pic { my ( $link ) = @_; my $res = try_to_get( $link ); return unless (defined $res); my $dom = $res->dom; my $pic_url; my $pic_name; my $obj = $dom->at(".wallpaper-down dd a"); my $sub_url; while (1) { $sub_url = $obj->attr("href"); # 某些图片没有提供指定分辨率的链接 if ( $sub_url !~/\.html/ ) { printf "Did not found picture url, skip %s\n", $sub_url; return; } $pic_name = basename($sub_url); $pic_name =~ s/\.html/\.jpg/i; printf "%s\n", $pic_name; return if ( -e $pic_name ); my $res = try_to_get( "${main}${sub_url}" ); return unless (defined $res); my $dom = $res->dom; my $res = $ua->get( $dom->at("")->attr("src") )->result; # 如果下载失败就选择下一个分辨率的图片 if ( $res->code == 502 ) { $obj = $obj->next; next; } write_file( $pic_name, {binmode=>":raw"}, $res->body ); last; } } sub try_to_get { our ($ua, @headers); my $link = shift; my $res; my $retry = 0; do { $res = $ua->get( $link )->result; if ( $retry > 0 and $retry < 5 ) { print "Retry times: $retry\n"; } elsif ( $retry > 5 ) { print "False\n"; return undef } $retry++; } until ( $res->is_success ); return $res; }
xueshumeng
初来炸道
初来炸道
帖子: 1
注册时间: 2019年07月09日 13:38
联系:

Re: [Perl]批量爬取ZOL壁纸 - 车模系列

帖子 xueshumeng »

如果不限制下载频率,错误发生的次数是不是会激增
回复

在线用户

正浏览此版面之用户: 没有注册用户 和 0 访客