Python多线程、异步+多进程爬虫实现代码


Posted in Python onFebruary 17, 2016

安装Tornado
省事点可以直接用grequests库,下面用的是tornado的异步client。 异步用到了tornado,根据官方文档的例子修改得到一个简单的异步爬虫类。可以参考下最新的文档学习下。
pip install tornado

异步爬虫

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import time
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues
import traceback


class AsySpider(object):
  """A simple class of asynchronous spider."""
  def __init__(self, urls, concurrency=10, **kwargs):
    urls.reverse()
    self.urls = urls
    self.concurrency = concurrency
    self._q = queues.Queue()
    self._fetching = set()
    self._fetched = set()

  def fetch(self, url, **kwargs):
    fetch = getattr(httpclient.AsyncHTTPClient(), 'fetch')
    return fetch(url, **kwargs)

  def handle_html(self, url, html):
    """handle html page"""
    print(url)

  def handle_response(self, url, response):
    """inherit and rewrite this method"""
    if response.code == 200:
      self.handle_html(url, response.body)

    elif response.code == 599:  # retry
      self._fetching.remove(url)
      self._q.put(url)

  @gen.coroutine
  def get_page(self, url):
    try:
      response = yield self.fetch(url)
      print('######fetched %s' % url)
    except Exception as e:
      print('Exception: %s %s' % (e, url))
      raise gen.Return(e)
    raise gen.Return(response)

  @gen.coroutine
  def _run(self):
    @gen.coroutine
    def fetch_url():
      current_url = yield self._q.get()
      try:
        if current_url in self._fetching:
          return

        print('fetching****** %s' % current_url)
        self._fetching.add(current_url)

        response = yield self.get_page(current_url)
        self.handle_response(current_url, response)  # handle reponse

        self._fetched.add(current_url)

        for i in range(self.concurrency):
          if self.urls:
            yield self._q.put(self.urls.pop())

      finally:
        self._q.task_done()

    @gen.coroutine
    def worker():
      while True:
        yield fetch_url()

    self._q.put(self.urls.pop())  # add first url

    # Start workers, then wait for the work queue to be empty.
    for _ in range(self.concurrency):
      worker()

    yield self._q.join(timeout=timedelta(seconds=300000))
    assert self._fetching == self._fetched

  def run(self):
    io_loop = ioloop.IOLoop.current()
    io_loop.run_sync(self._run)


class MySpider(AsySpider):

  def fetch(self, url, **kwargs):
    """重写父类fetch方法可以添加cookies,headers,timeout等信息"""
    cookies_str = "PHPSESSID=j1tt66a829idnms56ppb70jri4; pspt=%7B%22id%22%3A%2233153%22%2C%22pswd%22%3A%228835d2c1351d221b4ab016fbf9e8253f%22%2C%22_code%22%3A%22f779dcd011f4e2581c716d1e1b945861%22%7D; key=%E9%87%8D%E5%BA%86%E5%95%84%E6%9C%A8%E9%B8%9F%E7%BD%91%E7%BB%9C%E7%A7%91%E6%8A%80%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8; think_language=zh-cn; SERVERID=a66d7d08fa1c8b2e37dbdc6ffff82d9e|1444973193|1444967835; CNZZDATA1254842228=1433864393-1442810831-%7C1444972138"  # 从浏览器拷贝cookie字符串
    headers = {
      'User-Agent': 'mozilla/5.0 (compatible; baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
      'cookie': cookies_str
    }
    return super(MySpider, self).fetch(  # 参数参考tornado文档
      url, headers=headers, request_timeout=1
    )

  def handle_html(self, url, html):
    print(url, html)


def main():
  urls = []
  for page in range(1, 100):
    urls.append('http://www.baidu.com?page=%s' % page)
  s = MySpider(urls)
  s.run()


if __name__ == '__main__':
  main()

可以继承这个类,塞一些url进去,然后重写handle_page处理得到的页面。

异步+多进程爬虫
还可以再变态点,加个进程池,使用了multiprocessing模块。效率飕飕的,

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import time
from multiprocessing import Pool
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues


class AsySpider(object):
  """A simple class of asynchronous spider."""
  def __init__(self, urls, concurrency):
    urls.reverse()
    self.urls = urls
    self.concurrency = concurrency
    self._q = queues.Queue()
    self._fetching = set()
    self._fetched = set()

  def handle_page(self, url, html):
    filename = url.rsplit('/', 1)[1]
    with open(filename, 'w+') as f:
      f.write(html)

  @gen.coroutine
  def get_page(self, url):
    try:
      response = yield httpclient.AsyncHTTPClient().fetch(url)
      print('######fetched %s' % url)
    except Exception as e:
      print('Exception: %s %s' % (e, url))
      raise gen.Return('')
    raise gen.Return(response.body)

  @gen.coroutine
  def _run(self):

    @gen.coroutine
    def fetch_url():
      current_url = yield self._q.get()
      try:
        if current_url in self._fetching:
          return

        print('fetching****** %s' % current_url)
        self._fetching.add(current_url)
        html = yield self.get_page(current_url)
        self._fetched.add(current_url)

        self.handle_page(current_url, html)

        for i in range(self.concurrency):
          if self.urls:
            yield self._q.put(self.urls.pop())

      finally:
        self._q.task_done()

    @gen.coroutine
    def worker():
      while True:
        yield fetch_url()

    self._q.put(self.urls.pop())

    # Start workers, then wait for the work queue to be empty.
    for _ in range(self.concurrency):
      worker()
    yield self._q.join(timeout=timedelta(seconds=300000))
    assert self._fetching == self._fetched

  def run(self):
    io_loop = ioloop.IOLoop.current()
    io_loop.run_sync(self._run)


def run_spider(beg, end):
  urls = []
  for page in range(beg, end):
    urls.append('http://127.0.0.1/%s.htm' % page)
  s = AsySpider(urls, 10)
  s.run()


def main():
  _st = time.time()
  p = Pool()
  all_num = 73000
  num = 4  # number of cpu cores
  per_num, left = divmod(all_num, num)
  s = range(0, all_num, per_num)
  res = []
  for i in range(len(s)-1):
    res.append((s[i], s[i+1]))
  res.append((s[len(s)-1], all_num))
  print res

  for i in res:
    p.apply_async(run_spider, args=(i[0], i[1],))
  p.close()
  p.join()

  print time.time()-_st


if __name__ == '__main__':
  main()

多线程爬虫
线程池实现.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import Queue
import sys
import requests
import os
import threading
import time

class Worker(threading.Thread):  # 处理工作请求
  def __init__(self, workQueue, resultQueue, **kwds):
    threading.Thread.__init__(self, **kwds)
    self.setDaemon(True)
    self.workQueue = workQueue
    self.resultQueue = resultQueue


  def run(self):
    while 1:
      try:
        callable, args, kwds = self.workQueue.get(False)  # get task
        res = callable(*args, **kwds)
        self.resultQueue.put(res)  # put result
      except Queue.Empty:
        break

class WorkManager:  # 线程池管理,创建
  def __init__(self, num_of_workers=10):
    self.workQueue = Queue.Queue()  # 请求队列
    self.resultQueue = Queue.Queue()  # 输出结果的队列
    self.workers = []
    self._recruitThreads(num_of_workers)

  def _recruitThreads(self, num_of_workers):
    for i in range(num_of_workers):
      worker = Worker(self.workQueue, self.resultQueue)  # 创建工作线程
      self.workers.append(worker)  # 加入到线程队列


  def start(self):
    for w in self.workers:
      w.start()

  def wait_for_complete(self):
    while len(self.workers):
      worker = self.workers.pop()  # 从池中取出一个线程处理请求
      worker.join()
      if worker.isAlive() and not self.workQueue.empty():
        self.workers.append(worker)  # 重新加入线程池中
    print 'All jobs were complete.'


  def add_job(self, callable, *args, **kwds):
    self.workQueue.put((callable, args, kwds))  # 向工作队列中加入请求

  def get_result(self, *args, **kwds):
    return self.resultQueue.get(*args, **kwds)


def download_file(url):
  #print 'beg download', url
  requests.get(url).text


def main():
  try:
    num_of_threads = int(sys.argv[1])
  except:
    num_of_threads = 10
  _st = time.time()
  wm = WorkManager(num_of_threads)
  print num_of_threads
  urls = ['http://www.baidu.com'] * 1000
  for i in urls:
    wm.add_job(download_file, i)
  wm.start()
  wm.wait_for_complete()
  print time.time() - _st

if __name__ == '__main__':
  main()

这三种随便一种都有很高的效率,但是这么跑会给网站服务器不小的压力,尤其是小站点,还是有点节操为好。

Python 相关文章推荐
Python的Flask框架中使用Flask-Migrate扩展迁移数据库的教程
Jun 14 Python
python itchat实现微信好友头像拼接图的示例代码
Aug 14 Python
Python排序搜索基本算法之选择排序实例分析
Dec 09 Python
python切片及sys.argv[]用法详解
May 25 Python
python3 selenium自动化 frame表单嵌套的切换方法
Aug 23 Python
python 模拟贷款卡号生成规则过程解析
Aug 30 Python
python使用 request 发送表单数据操作示例
Sep 25 Python
jupyter note 实现将数据保存为word
Apr 14 Python
opencv python 图片读取与显示图片窗口未响应问题的解决
Apr 24 Python
详解python 条件语句和while循环的实例代码
Dec 28 Python
手残删除python之后的补救方法
Jun 26 Python
Python写情书? 10行代码展示如何把情书写在她的照片里
Apr 21 Python
玩转python爬虫之爬取糗事百科段子
Feb 17 #Python
玩转python爬虫之正则表达式
Feb 17 #Python
玩转python爬虫之URLError异常处理
Feb 17 #Python
玩转python爬虫之cookie使用方法
Feb 17 #Python
Python 爬虫爬取指定博客的所有文章
Feb 17 #Python
Using Django with GAE Python 后台抓取多个网站的页面全文
Feb 17 #Python
python实现RSA加密(解密)算法
Feb 17 #Python
You might like
PHP 删除文件与文件夹操作 unlink()与rmdir()这两个函数的使用
2011/07/17 PHP
腾讯微博提示missing parameter errorcode 102 错误的解决方法
2014/12/22 PHP
深入讲解PHP的对象注入(Object Injection)
2017/03/01 PHP
PHP时间函数使用详解
2019/03/21 PHP
php设计模式之享元模式分析【星际争霸游戏案例】
2020/03/23 PHP
javascript实现 在光标处插入指定内容
2007/05/25 Javascript
JS事件Event元素(兼容IE,Firefox,Chorme)
2012/11/01 Javascript
js下拉菜单语言选项简单实现
2013/09/23 Javascript
在javascript中如何得到中英文混合字符串的长度
2014/01/17 Javascript
基于iframe实现类似于ajax的页面无刷新
2014/05/31 Javascript
js控制文本框只输入数字和小数点的方法
2015/03/10 Javascript
jQuery拖拽通过八个点改变div大小
2020/11/29 Javascript
vue 组件中添加样式不生效的解决方法
2018/07/06 Javascript
简述pm2常用命令集合及配置文件说明
2019/05/30 Javascript
swiper Scrollbar滚动条组件详解
2019/09/08 Javascript
JS浏览器BOM常见操作实例详解
2020/04/27 Javascript
vue-video-player实现实时视频播放方式(监控设备-rtmp流)
2020/08/10 Javascript
Python基于PycURL实现POST的方法
2015/07/25 Python
Python中关键字nonlocal和global的声明与解析
2017/03/12 Python
pandas.DataFrame 根据条件新建列并赋值的方法
2018/04/08 Python
Pandas读取MySQL数据到DataFrame的方法
2018/07/25 Python
利用Django-environ如何区分不同环境
2018/08/26 Python
python实现贪吃蛇游戏
2020/03/21 Python
Python While循环语句实例演示及原理解析
2020/01/03 Python
Python使用shutil模块实现文件拷贝
2020/07/31 Python
Python Pygame实现俄罗斯方块
2021/02/19 Python
amazeui树节点自动展开折叠面板并选中第一个树节点的实现
2020/08/24 HTML / CSS
英国50岁以上人群的交友网站:Ourtime
2018/03/28 全球购物
阿联酋航空丹麦官方网站:Emirates DK
2019/08/25 全球购物
校园新闻广播稿
2014/01/10 职场文书
学校大课间活动方案
2014/01/30 职场文书
会务接待方案
2014/02/27 职场文书
艾滋病宣传活动总结
2014/05/08 职场文书
mysql 8.0.24 安装配置方法图文教程
2021/05/12 MySQL
JS实现数组去重的11种方法总结
2022/04/04 Javascript
MySQL 逻辑备份 into outfile
2022/05/15 MySQL