QueryList PHP爬虫初学者指南-CSDN博客

这篇博客介绍了QueryList的入门使用，包括规则库的编写和如何采集文章页面的内容，帮助初学者掌握PHP爬虫的基本操作。

QueryList入门

规则库的编写

// Rule library template: each entry maps a rule name to a two-element array
// holding the jQuery selector and the attribute to collect from the match.
$rules = array(
    '规则名' => array('jQuery选择器','要采集的属性'),
    '规则名2' => array('jQuery选择器','要采集的属性'),
    // ... add further rules here, one entry per field to collect
);

利用编写好的规则库，采集对应的元素

<?php
namespace app\index\controller;
//require 'vendor/autoload.php';

//导入QueryList的库
use QL\QueryList;

class Index
{
    /**
     * Demonstrates rule-based collection against an inline HTML fragment.
     *
     * Builds a rule set, hands the fragment and the rules to
     * QueryList::Query(), and prints the `data` property of the returned
     * object with print_r().
     */
    public function index()
    {
        // Inline HTML fragment that serves as the collection target.
        $markup = <<<STR
<div id="one">
    <div class="two">
        <a href="http://querylist.cc">QueryList官网</a>
        <img src="http://querylist.com/1.jpg" alt="这是图片">
        <img src="http://querylist.com/2.jpg" alt="这是图片2">
    </div>
    <span>其它的<b>一些</b>文本</span>
</div>      
STR;

        // Collection rules: rule name => [jQuery selector, attribute to collect].
        $selectors = [
            // Plain text inside the element whose id is "one".
            'text' => ['#one', 'text'],
            // href of the hyperlink directly under the element with class "two".
            'link' => ['.two>a', 'href'],
            // src of the second image directly under the element with class "two".
            'img' => ['.two>img:eq(1)', 'src'],
            // HTML content of the span element.
            'other' => ['span', 'html'],
        ];

        // Query(content to collect from, collection rules); the collected
        // rows are read from the `data` property of the returned object.
        $query = QueryList::Query($markup, $selectors);
        $collected = $query->data;

        // Dump the collected result.
        print_r($collected);
    }
}

结果

Array ( [0] => Array ( [text] => QueryList官网 其它的一些文本 
[link] => http://querylist.cc 
[img] => http://querylist.com/2.jpg
[other] => 其它的<b>一些</b>文本 ) )

采集文章页

<?php
namespace app\index\controller;
//require 'vendor/autoload.php';

use QL\QueryList;


class Index
{
    /**
     * Collects the title, publish date and body of a single article page.
     *
     * Passes the page URL and the rule set to QueryList::Query() and prints
     * the `data` property of the returned object with print_r().
     */
    public function demo2()
    {
        // Address of the page to collect from.
        $pageUrl = 'https://www.cnbeta.com/articles/tech/779841.htm';

        // Collection rules: rule name => [jQuery selector, attribute to collect].
        $fields = [
            // Article title.
            'title' => ['.title>h1', 'text'],
            // Publish date.
            'date' => ['.meta>span:eq(0)', 'text'],
            // Article body, kept as HTML.
            'content' => ['#artibody', 'html'],
        ];

        $query = QueryList::Query($pageUrl, $fields);
        $article = $query->data;

        print_r($article);
    }
}