脚本应保存为 UTF-8 编码格式。另外,坦白说,人教网(PEP)上扫描的图片分辨率太低了……
书的列表在这个数组中,从必修1到选修4-9
my @booklist = qw/
bx1 bx2 bx3 bx4 bx5 xx11 xx12 xx21
xx22 xx23 xx31 xx33 xx34 xx41 xx42 xx44
xx45 xx46 xx47 xx49 /;
下载文件所在的目录设置
$WORKDIR = "D:\\Book\\Math\\" . $book;
- DownMathBook.pl
#    Code: 523066680
#    Date: 2016-06
use v5.16;
use warnings;
use utf8;
use Encode;
use LWP::UserAgent;
use LWP::Simple qw/getstore get/;
use IO::Handle;

# Show progress lines immediately instead of when the buffer fills.
STDOUT->autoflush(1);

# Base URL of the e-textbook section; each book's index page is
# $website . <book id> . "/".
our $website = "http://www.pep.com.cn/gzsx/jszx_1/czsxtbjxzy/xkbsyjc/dzkb/";
our $bookpage;     # index URL of the book currently being processed
our $WORKDIR;      # local target directory of the current book (read by get_picture)
our $page1maps;    # page code that corresponds to book page 1 (read by get_picture)

my ($path, $begin, $end);
my @booklist = qw/
    bx1  bx2  bx3  bx4  bx5  xx11 xx12 xx21
    xx22 xx23 xx31 xx33 xx34 xx41 xx42 xx44
    xx45 xx46 xx47 xx49 /;

# Index of the first book to download. 10 starts at "xx31"; set it to 0 to
# download the whole list.
my $first_book = 10;

for my $book ( @booklist[ $first_book .. $#booklist ] )
{
    print "Now is downloading: $book\n";
    $bookpage = $website . $book . "/";
    $WORKDIR  = "D:\\Book\\Math\\" . $book;
    CreatePath($WORKDIR);

    ($path, $begin, $end) = get_pgnum_range($bookpage);

    # Without a link prefix and a code range there is nothing to download;
    # skip this book rather than loop over undefined bounds.
    unless ( defined $path && defined $begin && defined $end )
    {
        warn "No page links found for $book, skipping\n";
        next;
    }
    print "Path: $path, $begin to $end\n";

    # Page code that corresponds to page 1 of the book.
    $page1maps = get_who_map_page1($bookpage);
    get_picture( $bookpage, $path, $begin, $end );
}

# "pause" is a cmd.exe builtin, so only run it on Windows.
system("pause") if $^O eq 'MSWin32';
# get_pgnum_range($bookpage)
#
# Fetches the index page of one book and collects the numeric page codes of
# every link of the form  href="./<prefix>_<code>.htm"
# (for example ./201102/t20110217_1021412.htm).
#
# Returns a three-element list ($path, $lowest_code, $highest_code), where
# $path is the text between "./" and the code in the last matching link.
# All three values are undef when the page cannot be fetched or contains no
# matching link.
sub get_pgnum_range
{
    my $bookpage = shift;

    # Original author's note: LWP::Simple hands back Unicode text, whereas
    # LWP::UserAgent hands back GB2312.
    my $all = get($bookpage);
    return ( undef, undef, undef ) unless defined $all;    # fetch failed

    # If the page mentions a cover ("封面"), drop everything up to and
    # including its last occurrence so that earlier links are ignored.
    $all =~ s/.*封面//s;

    my @pglist;
    my $path;
    for my $line ( split /\r?\n/, $all )
    {
        # Only the first matching link on each line is considered.
        if ( $line =~ /href="\.\/([^"]*_)(\d+)\.htm"/ )
        {
            $path = $1;
            push @pglist, $2;
        }
    }

    # Page codes are numbers; the default string sort would put "1000"
    # before "999" and return the wrong range.
    @pglist = sort { $a <=> $b } @pglist;

    # For an empty list both subscripts evaluate to undef.
    return ( $path, $pglist[0], $pglist[-1] );
}
# get_who_map_page1($bookpage)
#
# Fetches the book's index page and returns the numeric page code of the
# link whose text starts the first chapter or lecture ("第一章" / "第一讲").
# The caller uses it as the code corresponding to book page 1.
#
# Dies when the page cannot be fetched or no such link is found.
sub get_who_map_page1
{
    my $bookpage = shift;

    my $all = get($bookpage);
    die "failed to fetch $bookpage\n" unless defined $all;

    # Join all lines so the link and its caption can be matched even when
    # they sit on different lines.
    $all =~ s/\r?\n//g;

    # <digits>_<code>.htm, then any run of non-dot characters, then the
    # caption of the first chapter/lecture.
    if ( $all =~ /\d+_(\d+)\.htm[^.]+第一(?:章|讲)/ )
    {
        return $1;
    }

    die "first page code not found! ";
}
# get_picture($bookpage, $path, $begin, $end)
#
# Walks the page codes from $end down to $begin, fetches each sub-page
# "$bookpage$path<code>.htm", extracts the first <IMG src=./....jpg> it
# contains and stores that image in $WORKDIR.
#
# Reads the package globals $page1maps and $WORKDIR.
#
# Naming of the stored files:
#   * codes <= $page1maps are numbered sequentially: 001.jpg, 002.jpg, ...
#   * codes >  $page1maps keep the image's original file name.
#
# Sub-pages that cannot be fetched or contain no image are skipped.
# Dies when an image download does not return a 2xx HTTP status.
sub get_picture
{
    my ($bookpage, $path, $begin, $end) = @_;
    our $page1maps;
    our $WORKDIR;
    my $count = 0;    # running page number for the numbered files

    # Original author's note: the site numbers pages in reverse, so the
    # next book page has the next lower page code.
    for ( my $n = $end; $n >= $begin; $n-- )
    {
        my $subpage = $bookpage . $path . $n . ".htm";
        my $all = get($subpage) or next;

        # The quotes around the src value are optional on this site.
        next unless $all =~ /IMG\s+src="?\.\/([^".]*\.jpg)"?/i;
        my $pic = $1;

        # The image lives next to the sub-page: swap the last URL segment.
        ( my $picurl = $subpage ) =~ s/[^\/]+$/$pic/;

        my $fname;
        if ( $n <= $page1maps )
        {
            $count++;
            $fname = sprintf( "%03d.jpg", $count );
        }
        else
        {
            $fname = $pic;
        }

        # getstore returns the HTTP status code, which is always a true
        # value, so the success range has to be checked explicitly.
        my $status = getstore( $picurl, $WORKDIR . "\\" . $fname );
        die "download of $picurl failed (HTTP status $status)\n"
            unless $status >= 200 && $status < 300;

        print "$pic\n";
    }
    return;
}
# CreatePath($path)
#
# Creates every missing directory along $path. Both "\" and "/" are accepted
# as separators; the directories are created using "/".
#
# The first component may be a drive ("D:"), empty (an absolute path such as
# "/tmp/x") or an ordinary directory name (a relative path). Only in the last
# case is it created as well.
#
# A failed mkdir is reported with warn; the sub itself never dies.
sub CreatePath
{
    my $path = shift;
    return unless defined $path && length $path;

    my @parts = split /[\\\/]/, $path;
    return unless @parts;

    my $main = shift @parts;

    # A relative path starts with a real directory, which must exist before
    # anything can be created beneath it.
    if ( length $main && $main !~ /:\z/ && !-d $main )
    {
        mkdir($main) or -d $main or warn "mkdir $main failed: $!\n";
    }

    for my $s (@parts)
    {
        next unless length $s;    # tolerate doubled separators
        $main .= "/" . $s;
        next if -d $main;
        mkdir($main) or -d $main or warn "mkdir $main failed: $!\n";
    }
    return;
}