[Perl]批量抓取(非阻塞)跨境电商平台市场大盘数据

There's more than one way to do it!
https://metacpan.org http://perlmonks.org
回复
头像
523066680
Administrator
Administrator
帖子: 573
注册时间: 2016年07月19日 12:14
联系:

[Perl]批量抓取(非阻塞)跨境电商平台市场大盘数据

帖子 523066680 »

环境 Win10, Strawberry Perl
Mojolicious 的非阻塞请求本身是不限制请求量的,为了避免同一时间发出过多请求,额外安装模块 Mojo::UserAgent::Role::Queued,用于限制同一时间的请求数。
因为非阻塞请求是给 $ua 传一个回调函数,这种情况参数是固定的($ua, $tx),为了给回调函数增加一个参数(文件名),用了闭包实现匿名函数传参
$ua->get( $url, form => $args, closure->($file) ) ;

代码: 全选

# Build the completion callback for one non-blocking request.
#
# The user agent invokes its callback with a fixed ($ua, $tx) argument list,
# so the destination file name is carried in through this closure instead.
#
#   $file - path the pretty-printed JSON response is written to
#
# Returns a code ref to be passed as the last argument of $ua->get(...).
sub closure ($file) 
{
    return sub ($ua, $tx) {
        printf "%s\n", $file;

        # A failed request must not leave a file behind: get_and_dump skips
        # every file that already exists, so a bad dump would never be
        # fetched again on the next run.
        if ( my $err = $tx->error ) {
            my $code = defined $err->{code} ? "$err->{code} " : '';
            warn "$file: request failed: $code$err->{message}\n";
            return;
        }

        # json() yields undef when the body cannot be decoded (for example an
        # HTML page instead of the expected API answer).
        my $json = $tx->res->json;
        unless ( defined $json ) {
            warn "$file: response is not valid JSON, nothing written\n";
            return;
        }

        write_file( $file, to_json( $json, {pretty => 1, utf8 => 1} ) );
        return;
    };
}
Login 模块是一个黑盒,总之是登录平台用的(脚本中通过 Login::login_by_cookies($ua) 调用)。

代码: 全选

use Login;

代码: 全选

=info
    AliExpress 市场大盘数据
    Author: 523066680/vicyang
    2020-04
=cut

use File::Slurp;
use utf8;
use Encode;
use Modern::Perl;
use Mojo::UserAgent -signatures;
use JSON qw/from_json to_json/;
use List::Util qw/sum/;
use Try::Tiny;
use DateTime;
STDOUT->autoflush(1);

use FindBin;
use lib encode('gbk',"D:/发货辅助/lib");
use Login;

# --- User agent and event loop -------------------------------------------
# The Queued role caps how many requests are in flight at once (max_active).
my $ua = Mojo::UserAgent->new()->with_roles('+Queued');
my $loop = Mojo::IOLoop->singleton;
# NOTE(review): per the Mojo::IOLoop documentation these two settings limit
# accepted (incoming) connections; they look redundant for a pure client
# script -- confirm before removing.
$loop   = $loop->max_accepts(5);
$loop   = $loop->max_connections(5);
$ua->max_active(3); 

$ua->request_timeout(10);
Login::login_by_cookies($ua);

# --- Reporting period -----------------------------------------------------
# The month is stated once, in $firstday; $lastday and the output directory
# are derived from it so the three can never disagree.
our $firstday = DateTime->new(year => 2021, month => 11, day => 1);
our $lastday = DateTime->last_day_of_month(
    year  => $firstday->year(),
    month => $firstday->month(),
);
#our $lastday = DateTime->last_day_of_month(year => 2021, month => 10, day => 26);
our $wdir = sprintf("./Data_%d%02d_Month", $firstday->year(), $firstday->month());

# Fail right away if the output directory cannot be created, instead of
# letting every later file write fail inside a request callback.
unless ( -d $wdir ) {
    mkdir $wdir or die "Cannot create directory $wdir: $!\n";
}

my $content = read_file( "Category.json" );
my $data = from_json( $content );

# Id of the first-level category whose tree is fetched.
my $root_id = 44;

# Fetch the data of every sub-category.
recur_tree( $data->{$root_id}{child}, $root_id, 1 );

# Fetch the data of the first-level category itself.
getjson( 0, $root_id, $data->{$root_id}{label}, 0 );

# Walk the category tree depth-first, printing each category and queueing
# its downloads.
#
#   $node     - hash ref mapping category id => { label => ..., child => ... }
#   $parentID - id of the category that owns $node
#   $lv       - depth of $node; used for indentation and passed to getjson
sub recur_tree
{
    my ( $node, $parentID, $lv ) = @_;

    # Two spaces of indentation per tree level.
    my $indent = " " x ( $lv * 2 );

    foreach my $id ( keys %$node )
    {
        my $label = $node->{$id}{label};

        printf "%s%s %s\n", $indent, u2gbk($label), $id;
        getjson( $parentID, $id, $label, $lv );

        # Leaf categories carry no "child" entry.
        next unless exists $node->{$id}{child};
        recur_tree( $node->{$id}{child}, $id, $lv+1 );
    }
}

# Every request above was issued with a callback; enter the event loop so
# they get processed, unless the loop is already running.
$loop->is_running or $loop->start;

# Queue the three market-dashboard downloads for one category.
#
#   $parentID - id of the parent category; 0 is sent to the API as -9999
#   $cateID   - category id, also the prefix of the output file names
#   $name     - category label (accepted for the callers, not used here)
#   $level    - category level sent as cateLevel
#
# Reads the file-level $firstday, $lastday, $wdir and $ua. Each download is
# handed to get_and_dump, which skips files that already exist.
sub getjson
{
    my ($parentID, $cateID, $name, $level) = @_;

    my $api = 'https://sycm.aliexpress.com/api/market-dashboard';
    my $dateRange = sprintf "%s|%s", $firstday->ymd('-'), $lastday->ymd('-');

    # Parameters shared by all three endpoints.
    my %args = (
        'dateType' => 'month',
        'dateRange' => $dateRange,
        'country' => 'ALL',
        'platform' => 'ALL',
        'cateId' => $cateID,
        'cateLevel' => $level,
        'parentCateId' => $parentID == 0 ? -9999 : $parentID,
        #'_' => time(),
    );

    # The country breakdown needs two extra parameters on top of the shared set.
    my %country_args = (
        %args,
        'orderBy' => "uvIndex",
        'indexCode' => "uvIndex,visitedItemCnt,vstItemPercent,supplyDemandIndex,payPerBuyerAmt,itemAddCartBuyerCnt,wishlistBuyerCnt",
    );

    # [ file suffix, endpoint, form parameters ] in the order they are queued.
    # Every request gets its own hash, so no queued request can be affected by
    # parameters added for a later one.
    my @requests = (
        [ 'core',        "$api/core-indicators",                   { %args } ],          # summary
        [ 'trend',       "$api/indicator-trend",                   { %args } ],          # trend
        [ 'country_cst', "$api/country-constitute/core-indicators", { %country_args } ], # by country
    );

    for my $request (@requests)
    {
        my ($suffix, $url, $form) = @$request;
        get_and_dump( $ua, $url, $form, "${wdir}/${cateID}_${suffix}.json" );
    }

    return;
}

# Queue one non-blocking GET whose JSON answer is dumped to $file.
#
#   $ua   - user agent the request is issued on
#   $url  - endpoint to query
#   $args - hash ref passed as the request's form parameters
#   $file - destination path; its existence marks the download as done
#
# Does nothing when $file already exists, so a run can be restarted without
# fetching finished downloads again.
sub get_and_dump
{
    my ($ua, $url, $args, $file) = @_;
    return if -e $file;

    # Plain call syntax on purpose: `closure->($file)` is parsed differently
    # depending on whether `closure` is already declared at this point (once
    # declared it means closure()->($file)), so it depended on the order of
    # the definitions in the file.
    return $ua->get( $url, form => $args, closure($file) );
}

# Build the completion callback for one non-blocking request.
#
# The user agent invokes its callback with a fixed ($ua, $tx) argument list,
# so the destination file name is carried in through this closure instead.
#
#   $file - path the pretty-printed JSON response is written to
#
# Returns a code ref to be passed as the last argument of $ua->get(...).
sub closure ($file) 
{
    return sub ($ua, $tx) {
        printf "%s\n", $file;

        # A failed request must not leave a file behind: get_and_dump skips
        # every file that already exists, so a bad dump would never be
        # fetched again on the next run.
        if ( my $err = $tx->error ) {
            my $code = defined $err->{code} ? "$err->{code} " : '';
            warn "$file: request failed: $code$err->{message}\n";
            return;
        }

        # json() yields undef when the body cannot be decoded (for example an
        # HTML page instead of the expected API answer).
        my $json = $tx->res->json;
        unless ( defined $json ) {
            warn "$file: response is not valid JSON, nothing written\n";
            return;
        }

        write_file( $file, to_json( $json, {pretty => 1, utf8 => 1} ) );
        return;
    };
}

# Encode a character string as GBK bytes.
sub gbk
{
    my ($text) = @_;
    return encode('gbk', $text);
}
# Encode a character string as UTF-8 bytes.
sub utf8
{
    my ($text) = @_;
    return encode('utf8', $text);
}
# Convert UTF-8 encoded bytes to GBK encoded bytes.
sub u2gbk
{
    my ($bytes) = @_;
    my $text = decode('utf8', $bytes);
    return encode('gbk', $text);
}
回复

在线用户

正浏览此版面之用户: 没有注册用户 和 0 访客