[Perl]批量抓取电影资料

There's more than one way to do it!
https://metacpan.org http://perlmonks.org
回复
头像
523066680
Administrator
Administrator
帖子: 573
注册时间: 2016年07月19日 12:14
联系:

[Perl]批量抓取电影资料

帖子 523066680 »

不做过多的解释~ 有空会继续完善
=info
电影资料下载 V0.6
Code by 523066680@163.com
2017-03
=cut

use utf8;
use Encode;
use IO::Handle;
use LWP::Simple 'getstore';
use LWP::UserAgent;
STDOUT->autoflush(1);
binmode(STDOUT, ":encoding(gbk)");

our $main = "http://www.bttiantangs.com";
our $ua = LWP::UserAgent->new;
$ua->agent("Mozilla/5.0");

our $title = "科幻"; #Unicode

our $fold = "F:\\Temp\\BTtiantangs";
mkdir $fold if ( ! -e $fold );
chdir $fold;

PAGELIST:
{
my $page = getContent( "$main/sb/$title.html" );

#Get max page number
my $last;
if ( $page =~/a href="[^"^\d]+(\d+)\.html" class="extend"/i )
{
$last = $1;
}
else
{
error("获取最后页码失败\n");
}

for my $n ( 0 .. $last )
{
print "Page : $n\n";
getMoviesFromPage( "$main/sb/$title/$n.html" );
}
}

sub getMoviesFromPage
{
my $url = shift;
my $page = getContent( $url );

#获取每个电影子页面的链接和电影名称
my ($pgcode, $name);
my $txtfile;
while ($page =~s/a href="([^"]+)" rel[^>]+>([^<]+)<//)
{
($pgcode, $name) = ($1, encode('gbk', $2) );

$name =~tr/\/:*?<>|/_/; #去除特殊字符
printf("%s - %s\n", $1, $2);

next if ( ( -e "$name.txt" ) and (-s "$name.txt" > 1024) );
getDetail( $main .$pgcode, $name );
}
}

sub getDetail
{
my ($url, $name) = (shift, shift);
my $page = getContent( $url );

#获取磁力链接
my @links = ();
getLinks( \$page, \@links );

#获取封面
my $pic = getPic(\$page);

#获取资源信息 关键字 post_content
my $info = getInfo(\$page);

getstore($pic, "$name.jpg") if ( $pic );

open WRT, ">:encoding(utf8)", "$name.txt";
print WRT $pic, "\n";
print WRT $info, "\n\n";

for my $ref ( @links )
{
print WRT sprintf("%s - %s\n%s\n\n",
$ref->{'type'},
$ref->{'name'},
$ref->{'link'}
);
}
close WRT;
}

sub getLinks
{
my ( $ref, $aref ) = (shift, shift);

while ($$ref =~s/a href=.*<em>([^<]+)<\/em>.*href="([^"]+)"[^>]+>([^<]+)<//i)
{
push @{$aref}, { 'type' => $1, 'name' => $3, 'link' => $2 };
}
}

sub getInfo
{
my $ref = shift;

#先把换行合并,然后去掉post_content前面的所有内容
$$ref =~s/\r?\n//g;
$$ref =~s/^(.+)post_content"\>//;

#去除 </div> 之后的内容
$$ref =~s/<\/div>.*//;

$$ref=~s/<[^>]+>//g;
$$ref=~s/(\s+)?◎/\n◎/g;
$$ref=~s/(\s+)?\&nbsp;\&nbsp;/\n /ig;

return $$ref;
}

sub getPic
{
my $ref = shift;
if ( $$ref =~ /post_content.*(http.*\.jpg)/i )
{
return $1;
}
else
{
print "JPG NOT FOUND!\n";
return undef;
}
}

sub getContent
{
my $url = shift;
my $res = $ua->get( $url );
if (! $res->is_success )
{
error("Failed\n");
}

return decode('utf8', $res->content() );
}

sub error
{
printf shift;
<STDIN>;
exit;
}
头像
523066680
Administrator
Administrator
帖子: 573
注册时间: 2016年07月19日 12:14
联系:

[Perl]批量抓取电影资料 V0.8

帖子 523066680 »

考虑到部分韩文日文编码范围的文字,改为 使用 Win32::Unicode 操作文件。
=info
bttiantangs 电影资料下载
Code by 523066680@163.com
2017-03
=cut

use utf8;
use Encode;
use IO::Handle;
use Win32::Unicode;
use LWP::Simple qw/get getstore/;
use LWP::UserAgent;
STDOUT->autoflush(1);
binmode(STDOUT, ":encoding(gbk)");

our $main = "http://www.bttiantangs.com";
our $ua = LWP::UserAgent->new;
$ua->agent("Mozilla/5.0");

#type: 科幻 恐怖 爱情 剧情 战争 动画 TV (top/3d/index.html)
our $title = "剧情"; #Unicode

our $fold = "F:\\Temp\\BTtiantangs";
mkdir $fold if ( ! -e $fold );
chdir $fold;

our $htmlFold = "html";
mkdir $htmlFold if ( ! -e $htmlFold );

PAGELIST:
{
my $page = getContent( "$main/sb/$title.html" );

#Get max page number
my $last;
if ( $page =~/a href="[^"^\d]+(\d+)\.html" class="extend"/i )
{
$last = $1;
}
else
{
error("获取最后页码失败\n");
}

print "from 0 to $last\n";
for my $n ( 0 .. $last )
{
print "Page : $n\n";
getMoviesFromPage( "$main/sb/$title/$n.html" );
}
print "End\n";
<STDIN>;
}

sub getMoviesFromPage
{
my $url = shift;
my $page = getContent( $url );

#获取每个电影子页面的链接和电影名称
my ($pgcode, $name);
my $txtfile;
while ($page =~s/a href="([^"]+)" rel[^>]+>([^<]+)<//)
{
($pgcode, $name) = ($1, $2);

$name =~tr/\/:*?<>|"/_/; #去除特殊字符
printf("%s - %s\n", $1, $2);

next if ( file_type('e', $name.".txt" ) and (file_size($name .".txt") > 1024) );
getDetail( $main .$pgcode, $name );
}
}

sub getDetail
{
my ($url, $name) = (shift, shift);
my $page = getContent( $url );

saveHtml( $url, \$page ); # 在 $page 替换修改之前保存html

#获取磁力链接
my @links = ();
getLinks( \$page, \@links );

#获取封面
my $pic = getPic(\$page) || "No Pic";

#获取资源信息 关键字 post_content
my $info = getInfo(\$page);

my $fh = Win32::Unicode::File->new;
my $res;
if ( $pic ne "No Pic" )
{
$res = $ua->get($pic);
if ( $res->is_success )
{
$fh->open('>:raw', $name.".jpg" ) or warn "jpg $!";
print $fh $res->content();
$fh->close;
}
}

$fh->open('>:utf8', $name.".txt" ) or die "txt $!";

print $fh $pic, "\r\n";
print $fh $info, "\r\n\r\n";

for my $ref ( @links )
{
printf $fh ("%s - %s\r\n%s\r\n\r\n",
$ref->{'type'},
$ref->{'name'},
$ref->{'link'});
}
$fh->close;
}

sub saveHtml
{
my ($url, $content_ref) = (shift, shift);
my $fname;
my $content;

$url=~/\/(\d+.html)$/;
$fname = $1;
return if ( (-e $fname) and ( -s $fname > 10240 ) );

open WRT, ">:utf8", "./html/$fname";
print WRT $$content_ref;
close WRT;
}

sub getLinks
{
my ( $ref, $aref ) = (shift, shift);

while ($$ref =~s/a href=.*<em>([^<]+)<\/em>.*href="([^"]+)"[^>]+>([^<]+)<//i)
{
push @{$aref}, { 'type' => $1, 'name' => $3, 'link' => $2 };
}
}

sub getInfo
{
my $ref = shift;

#先把换行合并,然后去掉post_content前面的所有内容
$$ref =~s/\r?\n//g;
$$ref =~s/^(.+)post_content"\>//;

#去除 </div> 之后的内容
$$ref =~s/<\/div>.*//;

$$ref=~s/<[^>]+>//g;
$$ref=~s/(\s+)?◎/\r\n◎/g;
$$ref=~s/(\s+)?\&nbsp;\&nbsp;/\r\n /ig;

return $$ref;
}

sub getPic
{
my $ref = shift;
if ( $$ref =~ /post_content.*(http.*\.jpg)/i )
{
return $1;
}
else
{
print "JPG NOT FOUND!\n";
return undef;
}
}

sub getContent
{
my $url = shift;
my $res;
my $times = -1;
my $maxtimes = 5;

do
{
$res = $ua->get( $url );
$times++;

print "Retry : $times\n" if ($times > 0);
}
until ( $res->is_success or ($times > $maxtimes) );

error("Failed to get $url\n") if ( $times > $maxtimes );

return decode('utf8', $res->content() );
}

sub error
{
printf shift;
<STDIN>;
exit;
}
头像
523066680
Administrator
Administrator
帖子: 573
注册时间: 2016年07月19日 12:14
联系:

对已下载的资料进行分类整理

帖子 523066680 »

use utf8;
use Encode;
use File::Copy;
use Win32::Unicode;
use IO::Handle;
STDOUT->autoflush(1);
#binmode(STDOUT, ":encoding(gbk)");

our $fold = "F:\\Temp\\BTtiantangs";
our $fh = Win32::Unicode::File->new;
our $dh = Win32::Unicode::Dir->new;
chdir $fold;

our @files;

my $skip = 0;
my $rate;
my $path;
my $jpg;
my %info;

$dh->open( $fold );
while (my $f = $dh->fetch() )
{
if ( $f =~ /\.txt/ )
{
$skip = getRate( $f, \%info );
next if ($skip eq 'skip');

$jpg = $f;
$jpg =~s/\.txt$/\.jpg/;

#为不同类型创建副本
for my $n ( 0 .. $#{$info{'types'}} )
{
$path = join("/",
$info{'rate'},
$info{'types'}[$n],
);

mkpathW( $path );
copyW( $f, "$path/".$info{'year'}." ".$f );
copyW( $jpg, "$path/".$info{'year'}." ".$jpg );
}
}
}

sub getRate
{
my ($file, $href) = (shift, shift);
my $all;

$fh->open('<:utf8', $file);
$all = $fh->slurp;
$fh->close;

return "skip" if ( $all=~/此剧集/ );
return "skip" if ( not $all=~/magnet\:\?/i );

#类型
if ($all =~ /类  型 (.*+)/ )
{
$href->{'types'} = [split(" ", $1)];
}

if ($all =~ /年  代 (\d+)/ )
{
$href->{'year'} = $1;
}

if ( $all=~/豆瓣评分\s+(\d+)\.\d/ )
{
$href->{'rate'} = $1;
}
else
{
$href->{'rate'} = 'NoRate';
}

return "";
}
回复

在线用户

正浏览此版面之用户: 没有注册用户 和 2 访客