[Perl] Crawler - scraping and accumulating order records from a platform


Post by 523066680 »

These are order records from a certain platform. Because the pages fetched online overlap with one another, the code is a bit longer than usual: it has to detect the overlapping part.
Each run merges the newly fetched data into the accumulated data, which is saved as a .perldb file, dumped with the Storable module.
=info
    Crawl order records and accumulate the data
    Author: 523066680/vicyang
    Date: 2019-03

    Approach: load all previously saved data, fetch all online data, then merge the two.
=cut

use Modern::Perl;
use Encode;
use File::Slurp;
use Storable;
use FindBin;
use lib "$FindBin::Bin/../Function";
use GetOrders;
STDOUT->autoflush(1);

my $item = "2050788225";
my $dbfile = "Brief_${item}.perldb";

my $old = get_original( $dbfile );
my $online = GetOrders::get_online_orders( $item );

printf "orders in db: %d\n", 1+$#$old;
printf "orders online: %d\n", 1+$#$online;

say "Merging";
GetOrders::merge( $old, $online );
printf "Dumping, items: %d\n", $#$old+1;
store $old, $dbfile;
say "Done";
exit;

# Load the previously accumulated data from the Storable dump, or start empty.
sub get_original
{
    my $dbfile = shift;
    my $data = -e $dbfile ? retrieve( $dbfile ) : [];
    return $data;
}
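
As a quick sanity check of what has been accumulated, the .perldb file can be loaded back with Storable and the last few records printed. This is only a minimal sketch (it assumes the file already holds at least five records, and uses the field layout written by read_json in the module below):

use Modern::Perl;
use Storable;
use Date::Format;

my $db = retrieve( "Brief_2050788225.perldb" );
printf "total records: %d\n", scalar @$db;

# Each record is [ timestamp, countryCode, quantity, name, buyerAccountPointLeval ]
for my $r ( @{$db}[ -5 .. -1 ] )
{
    printf "%s  %-4s x%-3d %s\n",
        time2str( "%Y-%m-%d", $r->[0] ), $r->[1], $r->[2], $r->[3];
}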
Module code (path ..\Function\GetOrders.pm):
package GetOrders;
use Modern::Perl;
use Mojo::UserAgent;
use Mojo::DOM;
use Date::Parse;
use Date::Format;

our $main = "https://feedback.aliexpress.com";
our $url = "https://feedback.aliexpress.com/display/evaluationProductDetailAjaxService.htm";
our $ua = Mojo::UserAgent->new();
our @headers = (
        "Host" => "www.aliexpress.com",
        "User-Agent" => "Firefox/63.0",
    );

sub get_online_orders
{
    my ($item) = @_;
    my $orders = [];
    my %args = (
        "productId" => $item,
        "type" => "default",
        "page" => "1"
    );

    my $pgcode = 1;
    my $total = 1;
    my $parts;
    my @all;
    # Fetch page by page in order; collect each page's records in @all
    while ( $pgcode <= $total and $pgcode <= 10 )    # fetch at most 10 pages
    {
        printf "Current Page %d\n", $pgcode;
        $args{'page'} = $pgcode;
        my $res = $ua->get( $url, form => \%args )->result;
        redo unless $res->is_success;    # retry the same page on a failed response
        ($parts, $total) = read_json( $res, $pgcode );
        # $parts comes back in ascending (date) order, but successive pages cover
        # older and older time ranges, so unshift reverses the page order.
        unshift @all, $parts;
        $pgcode++;
    }

    # Merge page by page, in ascending date order
    merge( $orders, $_ ) for @all;

    return $orders;
}

# merge is a reusable function; it can also be called from outside the module.
sub merge
{
    my ( $orders, $parts ) = @_;
    if ( $#$orders < 0 ) {
        @$orders = @$parts;
    } else {
        # Find the position where the last two accumulated records reappear
        # consecutively in $parts; everything after that point is new data.
        my $sect = 0;
        for my $id ( 0 .. $#$parts-1 ) {
            if ( $parts->[$id][0] == $orders->[-2][0]  and 
                 $parts->[$id+1][0] == $orders->[-1][0] )
            {
                $sect = $id+2;
                last;
            }
        }
        #printf "Sect: %d\n", $sect;
        push @$orders, @{$parts}[$sect .. $#$parts];
    }
}

sub read_json
{
    my ($res, $page) = @_;
    my $node = $res->json;
    my $data = [];

    # If the returned page number doesn't match the requested one, return no records
    # and a total of 0 so that the caller's paging loop stops cleanly.
    return ( [], 0 ) if $node->{page}->{current} != $page;
    for my $e ( @{$node->{records}} )
    {
        # The records come back newest-first; unshift reverses them into ascending order
        unshift @$data, [
                str2time($e->{date}),
                $e->{countryCode},
                $e->{quantity},
                $e->{name},
                $e->{buyerAccountPointLeval},
            ];
    }
    
    return ( $data, $node->{page}->{total} );
}

1;
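
For anyone who wants to see what merge actually does with the overlapping part, here is a tiny self-contained demo using fake records. It is only a sketch: just the timestamp in column 0 matters for the overlap check, and the use lib line assumes the same directory layout as the main script above.

use Modern::Perl;
use FindBin;
use lib "$FindBin::Bin/../Function";   # same layout as the main script
use GetOrders;

# Fake records: only column 0 (the timestamp) is used by the overlap check.
my $orders = [ [100], [200], [300] ];            # already accumulated
my $page   = [ [200], [300], [400], [500] ];     # newly fetched page, overlaps on 200/300

GetOrders::merge( $orders, $page );
say join " ", map { $_->[0] } @$orders;          # 100 200 300 400 500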