一直以来都听说python可以爬虫,近日老大让我将微信端发布在微信小蚂蚁上的文章的链接、类型、缩略图、概述等迁移到当前项目,一条条迁的话肯定是一脸懵逼,老大提示可以通过爬虫来搞。之前其实写过简单的脚本去爬取陕西省手机号码的列表,这次想在网上找找现成的轮子。
第一天入门了一个PHPSpider的爬虫框架,但在实际使用中看了半天手册,愣是在用xpath获取指定元素属性值这一步被难住了,在各群发了一圈问题,有位仁兄提醒可以试试QueryList。该爬虫框架通过composer安装后,在TP5中直接use就可以使用。但有一点需要注意:该框架会依赖symfony/var-dumper包,在TP中安装后会接管原来的var_dump输出,很是困扰。
直接上代码
<?php
/**
* @authors ZL
* @email 987968469@qq.com
* @date 2018-08-15 15:45:53
*/
namespace app\spider\controller;
use think\Controller;
use think\Db;
use QL\QueryList;
use app\admin\service\AppService;
/* NOTE: the crawler framework bundles symfony/var-dumper, which overrides TP5's
   default var_dump output. Remove it when not needed and reinstall on demand with:
   composer require jaeger/querylist:~V4 */
/**
 * Imports the "zhishui" (water governance) articles published on the xmypage
 * WeChat mini-site into the local lyhz_article table, downloading each
 * article's thumbnail to the local uploads directory along the way.
 *
 * Requires jaeger/querylist (~V4) installed via composer.
 */
class WechatZhiShui extends Controller {

    /**
     * Crawl all paginated listing pages, extract article metadata
     * (url / thumbnail / title / summary), download the thumbnails, and
     * insert every article not already present into lyhz_article.
     *
     * @return string 'finish' once the import run completes
     */
    public function index(){
        // Crawling 70+ pages plus image downloads takes a while: no time limit.
        set_time_limit(0);

        // Article id list for the listing endpoint. The original code let a raw
        // newline sit inside this single-quoted string, so every request URL
        // carried a literal "\n"; concatenating the two halves fixes that.
        $ids = ',309105,308610,308608,308274,306754,305431,303463,303462,303461,303460,301926,301297,300920,300535,299762,299761,299727,298771,298769,298204,298186,298185,298183,298182,298180,298178,298176,298175,298174,297998,297589,297582,297584,297308,297302,297298,297295,296297,296211,295922,295513,295919,295511,295496,295141,294802,294800,294798,294043,294042,293907,293151,293149,292998,292801,292799,292537,292535,292300,292298,292158,292163,292090,291494,291064,290970,290830,290794,290663,290241,290237,290231,290228,290226,289822,289454,289206,289140,288827,287781,287181,287178,286795,286792,286734,286404,286401,286282,285782,285780,285779,285652,285650,285466,284974,284972,284486,284255,284067,284065,283625,283622,283621,283445,283443,283439,283243,283240,282967,281985,281934,281932,281607,281340,281338,279935,279815,279813,279809,279806,279252,279259,279257,279253,278350,277501,277499,277277,276704,276614,276591,276588,276430,275583,274631,274093,273704,273702,273656,273068,272648,272646,272620,272531,272529,272183,272181,272179,272176,272174,272108,272106,272015,272011,272009,271670,271564,271562,270817,270815,270593,270589,270485,270483,270244,270119,270095,270025,270021,269858,269856,269789,269787,269362,269358,269344,269341,269275,269274,268495,268130,268129,268128,267789,267649,267791,267647,267437,267432,267307,267305,267302,267301,267298,267297,266993,266991,266462,266119,266116,266115,266113,266111,265964,265962,265595,265593,265339,265338,265248,265244,264968,264848,263915,263234,263233,263232,263130,261473,261470,261270,261268,260585,260093,260095,260091,258970,258395,258264,258262,258266,257956,257460,257458,257456,257454,256626,255830,255577,255576,255147,255145,255098,254692,254113,254110,254118,253922,253596,254125,253457,253453,253451,253446,252557,251856,251855,251257,251168,250984,250982,250977,250975,249913,249493,249989,249988,249990,249982,249987,249974,249975,249977,249976,249969,249968,249967,249966'
              . ',235018,232373,231723,231719,230407';

        // Build the set of paginated listing URLs to collect.
        $urls = [];
        for ($i = 1; $i < 72; $i++) {
            $urls[] = 'http://www.xmypage.com/getmore.php?artical=' . $ids . '&page=' . $i;
        }

        // One shared rule set: CSS selector + attribute to extract per field.
        $ql = QueryList::rules([
            'url'     => ['a.list_item', 'href'],
            'path'    => ['img.img', 'src'],
            'title'   => ['h2.title', 'text'],
            'summary' => ['p.desc', 'text'],
        ]);

        $data = [];
        foreach ($urls as $url) {
            // Every listing page gets the rule set defined above.
            $each_page_data = $ql->get($url)->query()->getData()->all();
            $data = array_merge($data, array_values($each_page_data));
            // Free the parsed Document to keep memory bounded across 70+ pages.
            $ql->destruct();
        }

        // Thumbnail storage, partitioned by crawl date.
        $dir_public = 'uploads/images/article/' . date('Y-m-d') . '/';
        $dir_root   = ROOT_PATH . 'public' . DS . $dir_public;
        // BUG FIX: the original ran mkdir() on the *relative* $dir_public but
        // wrote files to the *absolute* $dir_root, so when the working directory
        // differs the writes target a directory that was never created. Create
        // the absolute path instead.
        if (!is_dir($dir_root)) {
            mkdir($dir_root, 0777, true);
        }
        // Logs live next to the images (the original recomputed the same path).
        $log_root = $dir_root;

        // Known bad rows in this crawl: two empty entries plus one article the
        // crawler could not retrieve.
        unset($data[217]);
        unset($data[242]);
        unset($data[243]);

        $AppService = new AppService;
        $i = 0;
        foreach ($data as $value) {
            // Deduplicate by source url: skip articles already imported.
            $article = Db::table('lyhz_article')->where('url', $value['url'])->find();
            if (!$article) {
                // Log each new row so failures can be located afterwards.
                // 320 = JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
                $log_info = 'index:' . $i . '——' . json_encode($value, 320);
                file_put_contents($log_root . '爬虫导出出错.txt', $log_info . PHP_EOL, FILE_APPEND);

                // Unique local file name per loop iteration so images are not
                // overwritten; the remote URLs expose no usable extension
                // (pathinfo/parse_url yields nothing here), so default to .jpg.
                $file_name = sha1(microtime(true)) . '.jpg';
                $image = file_get_contents($value['path']);
                // BUG FIX: only persist the image when the download succeeded;
                // the original wrote `false` (an empty file) on failure.
                if ($image !== false) {
                    file_put_contents($dir_root . $file_name, $image);
                }
                $path = $dir_public . $file_name;

                // Persist the article record via the shared application service.
                $param = [
                    'url'            => $value['url'],
                    'title'          => $value['title'],
                    'path'           => $path,
                    'summary'        => $value['summary'],
                    'title_bold'     => 1,
                    'summary_hidden' => 0,
                    'label'          => '',
                    'label_bg'       => '#0074dd',
                    'label_color'    => '#fff',
                    'picture_show'   => 'left',
                    'is_recommend'   => 0,
                    'type_id'        => 2,
                ];
                $AppService->getArticleAdd($param);
            }
            $i++;
        }
        // Typo fix: was 'finsh'.
        return 'finish';
    }
}
网友评论