php与python实现的线程池多线程爬虫功能示例


Posted in PHP on October 12, 2016

本文实例讲述了php与python实现的线程池多线程爬虫功能。分享给大家供大家参考,具体如下:

多线程爬虫可以用于抓取内容,并且能够提升抓取性能。这里我们来看php与python线程池多线程爬虫的例子,代码如下:

php例子

<?php
/**
 * Pool worker that lazily creates one cURL handle and hands it to the
 * tasks executed on it.
 */
class Connect extends Worker // worker pattern
{
    /**
     * Reusable cURL handle.
     *
     * Note that the link is stored statically, which for pthreads, means thread local.
     */
    protected static $ch;

    public function __construct()
    {
    }

    /**
     * Return the cURL handle, creating and configuring it on first use.
     *
     * @return resource|false The configured handle (whatever curl_init() returned).
     */
    public function getConnection()
    {
        if (!self::$ch) {
            self::$ch = curl_init();
            curl_setopt(self::$ch, CURLOPT_TIMEOUT, 2);          // give up after 2 seconds
            curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, 1);   // return the body instead of printing it
            curl_setopt(self::$ch, CURLOPT_HEADER, 0);           // body only, no response headers
            curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);      // no signal-based timeouts inside threads
            curl_setopt(self::$ch, CURLOPT_USERAGENT, "Firefox");
            curl_setopt(self::$ch, CURLOPT_FOLLOWLOCATION, 1);   // follow redirects
        }
        /* do some exception/error stuff here maybe */
        return self::$ch;
    }

    /**
     * Close the cURL handle if one is open.
     *
     * The handle is cleared afterwards so that a later getConnection()
     * creates a fresh one instead of returning a closed handle, and so
     * that calling this twice (or before any connection exists) is safe.
     */
    public function closeConnection()
    {
        if (self::$ch) {
            curl_close(self::$ch);
            self::$ch = null;
        }
    }
}
/**
 * One unit of work for the pool: fetch a single URL through the worker's
 * cURL handle and print a one-line status for it.
 */
class Query extends Threaded
{
    /** URL this task fetches. */
    protected $url;

    /** Return value of curl_exec() for the URL (false on failure). */
    protected $result;

    /**
     * @param string $url URL to fetch.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Fetch the URL, report its status and keep the page as the result.
     *
     * Uses $this->worker, which is expected to provide getConnection()
     * (the Connect worker when run through the pool in this file).
     */
    public function run()
    {
        $ch = $this->worker->getConnection();
        curl_setopt($ch, CURLOPT_URL, $this->url);
        $page = curl_exec($ch);
        $info = curl_getinfo($ch);
        $error = curl_error($ch);
        $this->deal_data($this->url, $page, $info, $error);
        $this->result = $page;
    }

    /**
     * Print "OK" for an HTTP 200 response, otherwise the failure reason.
     *
     * @param string $url   URL that was fetched.
     * @param mixed  $page  Response body (unused here).
     * @param array  $info  curl_getinfo() result; 'http_code' is read.
     * @param string $error curl_error() text, empty when cURL itself succeeded.
     */
    function deal_data($url, $page, $info, $error)
    {
        // The label is the second dot-separated piece of the URL; fall back
        // to the whole URL when it contains no "." so no undefined offset
        // is read.
        $parts = explode(".", $url);
        $id = isset($parts[1]) ? $parts[1] : $url;
        if ($info['http_code'] != 200) {
            // A non-200 response without a transport error (e.g. a 404)
            // leaves $error empty; report the status code instead of
            // printing a blank message.
            $reason = ($error !== '') ? $error : 'HTTP ' . $info['http_code'];
            $this->show_msg($id, $reason);
        } else {
            $this->show_msg($id, "OK");
        }
    }

    /**
     * Print one tab-separated status line.
     *
     * @param string $id  Label for the URL.
     * @param string $msg Status text.
     */
    function show_msg($id, $msg)
    {
        echo $id."\t$msg\n";
    }

    /**
     * @return mixed The stored curl_exec() result.
     */
    public function getResult()
    {
        return $this->result;
    }
}
/**
 * Submit one Query per configured URL to a pool of Connect workers and
 * shut the pool down once everything has been submitted.
 */
function check_urls_multi_pthreads()
{
    // URLs to fetch, each mapped to a display name.
    global $check_urls;
    $check_urls = array('http://xxx.com' => "xx网");

    // Pool of 10 threads, each one a Connect worker.
    $workers = new Pool(10, "Connect", array());
    foreach (array_keys($check_urls) as $target) {
        $workers->submit(new Query($target));
    }
    $workers->shutdown();
}

check_urls_multi_pthreads();
python 多线程
def handle(sid):
    """Process the crawl job identified by ``sid``.

    Placeholder: the crawler's data processing is meant to run inside
    this function.
    """
    pass
from threading import Thread


class MyThread(Thread):
    """Thread that calls ``handle`` with the id it was created with."""

    def __init__(self, sid):
        """Remember ``sid``; the thread is not started here."""
        Thread.__init__(self)
        self.sid = sid

    def run(self):
        """Thread body: process this thread's id."""
        # ``run`` must take ``self``; Thread.start() invokes it as a bound
        # method and the id is read from the instance.
        handle(self.sid)
# Start ten worker threads with ids 1..10, then wait for all of them.
threads = []
for i in range(1, 11):  # range works on Python 2 and 3; xrange is Python 2 only
    t = MyThread(i)
    threads.append(t)
    t.start()
for t in threads:
    t.join()

python 线程池爬虫:

from queue import Queue
from threading import Thread, Lock
import urllib.parse
import socket
import re
import time
# Held by the fetcher threads while they read and update seen_urls.
lock = Lock()
# Every path discovered so far; seeded with the root path.
seen_urls = {'/'}
class Fetcher(Thread):
  """Daemon worker thread that crawls paths taken from a shared queue.

  Each task is a URL path. The worker requests it from HOST:PORT with a
  plain HTTP/1.0 GET, extracts the same-site links from an HTML response,
  queues the ones not seen before and records them in the module-level
  ``seen_urls`` set (guarded by the module-level ``lock``).
  """

  # The single site being crawled; links to any other host are dropped.
  HOST = 'localhost'
  PORT = 3000

  # Target of an href attribute, quoted or unquoted.
  _HREF_RE = re.compile(r'''(?i)href=["']?([^\s"'<>]+)''')
  # Blank line separating the HTTP header block from the body.
  _HEADER_END = b'\r\n\r\n'

  def __init__(self, tasks):
    """Store the task queue and start the thread immediately.

    Args:
      tasks: queue of URL paths; must offer ``get``, ``put`` and
        ``task_done``.
    """
    Thread.__init__(self)
    self.tasks = tasks
    self.daemon = True
    self.start()

  def run(self):
    """Process tasks forever; the thread ends with the process (daemon)."""
    while True:
      url = self.tasks.get()
      try:
        print(url)
        links = self.parse_links(url, self._fetch(url))
        with lock:
          for link in links.difference(seen_urls):
            self.tasks.put(link)
          seen_urls.update(links)
      except (OSError, ValueError) as exc:
        # Network failures and un-encodable/unparsable URLs affect only
        # this task; report them and keep the worker alive.
        print('error: {}: {}'.format(url, exc))
      finally:
        # Always account for the task, otherwise Queue.join() would wait
        # forever after a failure.
        self.tasks.task_done()

  def _fetch(self, url):
    """Return the raw HTTP response (headers and body) for ``url``.

    Raises:
      OSError: on connection or transfer failure.
      UnicodeEncodeError: if ``url`` contains non-ASCII characters.
    """
    request = 'GET {} HTTP/1.0\r\nHost: {}\r\n\r\n'.format(url, self.HOST)
    chunks = []
    # The context manager closes the socket even when an error occurs.
    with socket.socket() as sock:
      sock.connect((self.HOST, self.PORT))
      # sendall: plain send() may transmit only part of the request.
      sock.sendall(request.encode('ascii'))
      while True:
        chunk = sock.recv(4096)
        if not chunk:  # empty read: the server closed the connection
          break
        chunks.append(chunk)
    return b''.join(chunks)

  def parse_links(self, fetched_url, response):
    """Return the set of same-site paths linked from an HTML response.

    Args:
      fetched_url: path the response was fetched from; relative links
        are resolved against it.
      response: raw HTTP response bytes.

    Returns:
      A set of path strings (no query, no fragment). Empty when the
      response is empty or is not ``text/html``.
    """
    if not response:
      print('error: {}'.format(fetched_url))
      return set()
    if not self._is_html(response):
      return set()
    links = set()
    for href in set(self._HREF_RE.findall(self.body(response))):
      try:
        absolute = urllib.parse.urljoin(fetched_url, href)
        parts = urllib.parse.urlparse(absolute)
      except ValueError:
        # Malformed link (e.g. an unbalanced IPv6 bracket): skip it
        # rather than abandon the whole page.
        continue
      if parts.scheme not in ('', 'http', 'https'):
        continue
      # ``hostname`` is already lower-cased and stripped of the port.
      # Compare for equality: a membership test against the bare string
      # 'localhost' would also accept substrings such as 'local'.
      host = parts.hostname
      if host and host != self.HOST:
        continue
      # urlparse has already removed the fragment from ``path``; a bare
      # host such as http://localhost has an empty path, meaning the root.
      links.add(parts.path or '/')
    return links

  def body(self, response):
    """Return the response body as text ('' when there is no body).

    Undecodable bytes are replaced instead of raising, so a page that is
    not valid UTF-8 can still be scanned for links.
    """
    payload = response.partition(self._HEADER_END)[2]
    return payload.decode('utf-8', errors='replace')

  def _is_html(self, response):
    """Return True when the Content-Type header starts with text/html."""
    head = response.partition(self._HEADER_END)[0]
    headers = {}
    # latin-1 maps every byte to a character, so decoding cannot fail.
    # The first line is the status line, not a header.
    for line in head.decode('latin-1').split('\r\n')[1:]:
      # Split on the first colon only: header values may contain ': '.
      name, colon, value = line.partition(':')
      if colon:
        headers[name.strip().lower()] = value.strip()
    return headers.get('content-type', '').lower().startswith('text/html')
class ThreadPool:
  """Fixed number of Fetcher threads fed from one shared task queue."""

  def __init__(self, num_threads):
    """Create the task queue and launch ``num_threads`` fetchers on it."""
    shared_queue = Queue()
    self.tasks = shared_queue
    for _worker in range(num_threads):
      # A Fetcher starts itself in its constructor, so no reference to it
      # needs to be kept here.
      Fetcher(shared_queue)

  def add_task(self, url):
    """Queue ``url`` for fetching."""
    self.tasks.put(url)

  def wait_completion(self):
    """Block until every queued task has been marked done."""
    self.tasks.join()
if __name__ == '__main__':
  # Crawl from the root path with four fetchers, then report how many
  # URLs were seen and how long the crawl took.
  started_at = time.time()
  crawler = ThreadPool(4)
  crawler.add_task("/")
  crawler.wait_completion()
  summary = '{} URLs fetched in {:.1f} seconds'
  print(summary.format(len(seen_urls), time.time() - started_at))

希望本文所述对大家PHP与Python程序设计有所帮助。

PHP 相关文章推荐
提升PHP执行速度全攻略(上)
Oct 09 PHP
PHP获取网卡地址的代码
Apr 09 PHP
php下删除一篇文章生成的多个静态页面
Aug 08 PHP
用Php编写注册后Email激活验证的实例代码
Mar 11 PHP
推荐10个提供免费PHP脚本下载的网站
Dec 31 PHP
php中使用key,value,current,next和prev函数遍历数组的方法
Mar 17 PHP
PHP下载生成的csv文件及问题总结
Aug 06 PHP
PHP实现补齐关闭的HTML标签
Mar 22 PHP
yii2中结合gridview如何使用modal弹窗实例代码详解
Jun 12 PHP
laravel实现批量更新多条记录的方法示例
Oct 22 PHP
PHP实现正则表达式分组捕获操作示例
Feb 03 PHP
微信小程序和php的登录实现
Apr 01 PHP
php实现的SSO单点登录系统接入功能示例分析
Oct 12 #PHP
php用户密码加密算法分析【Discuz加密算法】
Oct 12 #PHP
基于php实现的php代码加密解密类完整实例
Oct 12 #PHP
php fseek函数读取大文件两种方法
Oct 12 #PHP
PHP从二维数组得到N层分类树的实现代码
Oct 11 #PHP
php 无限分类 树形数据格式化代码
Oct 11 #PHP
PHP简单判断iPhone、iPad、Android及PC设备的方法
Oct 11 #PHP
You might like
php disk_free_space 返回目录可用空间
2010/05/10 PHP
php读取文件内容至字符串中,同时去除换行、空行、行首行尾空格(Zjmainstay原创)
2012/07/31 PHP
分享8个最佳的代码片段在线测试网站
2013/06/29 PHP
浅析十款PHP开发框架的对比
2013/07/05 PHP
Laravel 5框架学习之Laravel入门和新建项目
2015/04/07 PHP
PHP滚动日志的代码实现
2015/06/10 PHP
详解PHP编码转换函数应用技巧
2016/10/22 PHP
php数组指针操作详解
2017/02/14 PHP
JavaScript 联动的无限级封装类,数据采用非Ajax方式,随意添加联动
2010/06/29 Javascript
js字符编码函数区别分析
2011/12/28 Javascript
jquery zTree异步加载简单实例分享
2013/02/05 Javascript
Jquery中$.post和$.ajax的用法小结
2015/04/28 Javascript
理解javascript异步编程
2016/01/27 Javascript
JavaScript6 let 新语法优势介绍
2016/07/15 Javascript
Jquery Easyui菜单组件Menu使用详解(15)
2016/12/18 Javascript
js微信支付实现代码
2016/12/22 Javascript
Vue.js tab实现选项卡切换
2017/05/16 Javascript
原生js调用json方法总结
2018/02/22 Javascript
node puppeteer(headless chrome)实现网站登录
2018/05/09 Javascript
python求众数问题实例
2014/09/26 Python
深入理解Python中命名空间的查找规则LEGB
2015/08/06 Python
深入解析Python编程中super关键字的用法
2016/06/24 Python
Tensorflow中的placeholder和feed_dict的使用
2018/07/09 Python
python实现Zabbix-API监控
2018/09/17 Python
使用Python进行体育竞技分析(预测球队成绩)
2019/05/16 Python
Anaconda3+tensorflow2.0.0+PyCharm安装与环境搭建(图文)
2020/02/18 Python
Python实现链表反转的方法分析【迭代法与递归法】
2020/02/22 Python
python实现的分层随机抽样案例
2020/02/25 Python
如何在python中判断变量的类型
2020/07/29 Python
HTML5输入框下拉菜单功能的示例代码
2020/09/08 HTML / CSS
风险评估实施方案
2014/03/09 职场文书
医学生自荐信范文
2015/03/05 职场文书
工作收入证明范本
2015/06/12 职场文书
党课主持词大全
2015/06/30 职场文书
某某幼儿园的教育教学管理调研分析报告
2019/11/29 职场文书
nginx刷新页面出现404解决方案(亲测有效)
2022/03/18 Servers