python实现多线程采集的2个代码例子


Posted in Python on July 07, 2014

代码一:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
 
import threading
import Queue
import sys
import urllib2
import re
import MySQLdb
 
#
# Database connection settings
#
DB_HOST = '127.0.0.1'
DB_USER = "XXXX"
DB_PASSWD = "XXXXXXXX"
DB_NAME = "xxxx"

#
# Crawler settings
#
THREAD_LIMIT = 3  # number of spider threads started by workerbee()
jobs = Queue.Queue(5)  # pending-job queue, bounded to 5 items
singlelock = threading.Lock()  # lock shared by the main thread and the workers
info = Queue.Queue()  # unbounded queue where workers put [id, title] results
 
def workerbee(inputlist):
    """Start the worker threads, enqueue every job and wait for completion.

    Starts ``THREAD_LIMIT`` ``spider`` threads, then puts each item of
    ``inputlist`` on the bounded ``jobs`` queue, waiting up to 5 seconds for
    a free slot.  An item that cannot be queued in that time is dropped
    after a notice is printed.  Finally blocks on ``jobs.join()`` until
    every queued job has been marked done.

    Args:
        inputlist: iterable of job items; each one is handed unchanged to a
            worker through the ``jobs`` queue.
    """
    for x in xrange(THREAD_LIMIT):
        print('Thead {0} started.'.format(x))
        t = spider()
        t.start()
    for i in inputlist:
        try:
            jobs.put(i, block=True, timeout=5)
        except Queue.Full:
            # Only a full queue is an expected failure here; anything else
            # (KeyboardInterrupt included) must propagate instead of being
            # misreported as "queue is full".
            with singlelock:
                print("The queue is full !")

    # Take the lock so this message does not interleave with worker output.
    with singlelock:
        print("Waiting for threads to finish.")
    # Blocks until task_done() has been called once per queued job.
    jobs.join()
 
def getTitle(url,time=10):
    """Fetch ``url`` and return the text of its first ``<title>`` element.

    Args:
        url: address passed to ``urllib2.urlopen``.
        time: timeout, in seconds, passed to ``urllib2.urlopen``.

    Returns:
        The text captured between the first ``<title>`` and ``</title>``
        that appear on the same line, exactly as read (no decoding).

    Raises:
        IndexError: if the page contains no matching ``<title>...</title>``.
        Any exception raised by ``urllib2.urlopen`` or ``read()``.
    """
    response = urllib2.urlopen(url, timeout=time)
    try:
        html = response.read()
    finally:
        # Close the connection even when read() fails part-way through.
        response.close()
    match = re.search(r'<title>(.*?)</title>', html)
    if match is None:
        # Same exception type the old ``findall(...)[0]`` produced, but with
        # a message that says what actually went wrong.
        raise IndexError('no <title> element found in {0!r}'.format(url))
    return match.group(1)
 
class spider(threading.Thread):
    """Worker thread that turns queued ``[id, url]`` jobs into titles.

    Each job is taken from ``jobs``, its page title is fetched with
    ``getTitle(job[1])`` and ``[job[0], title]`` is put on ``info``.
    The thread exits once ``jobs`` has stayed empty for one second.
    """

    def run(self):
        while True:
            try:
                job = jobs.get(True, 1)
            except Queue.Empty:
                # Nothing arrived within a second: treat the work as done.
                break
            try:
                # No lock around the fetch: Queue is already thread-safe, and
                # holding a global lock here would serialize every download.
                title = getTitle(job[1])
                info.put([job[0], title], block=True, timeout=5)
            except Exception as exc:
                # A bad page must not kill the worker; report it and move on.
                with singlelock:
                    print('Failed to process job {0!r}: {1!r}'.format(job, exc))
            finally:
                # Always acknowledge the job, otherwise jobs.join() in the
                # main thread would wait forever after a failure.
                jobs.task_done()
 
if __name__ == '__main__':
    # Load pending (id, url) rows from MySQL, crawl them, then print every
    # [id, title] result the workers collected.
    con = None
    urls = []
    try:
        con = MySQLdb.connect(DB_HOST, DB_USER, DB_PASSWD, DB_NAME)
        cursor = con.cursor()
        cursor.execute('SELECT id,url FROM `table_name` WHERE `status`=0 LIMIT 10')
        urls.extend([record[0], record[1]] for record in cursor.fetchall())
        workerbee(urls)
        while True:
            if info.empty():
                break
            print(info.get())
    finally:
        # Close the connection only if connect() actually succeeded.
        if con:
            con.close()

代码二:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
#Filename:robot.py
 
import threading,Queue,sys,urllib2,re
#
# Settings
#
THREAD_LIMIT = 3        # number of worker threads
jobs = Queue.Queue(5)      # job queue, capped at 5 pending items
singlelock = threading.Lock()    # lock shared by the main thread and the workers
 
# Pages to crawl, one URL per entry.
urls = [
  'http://games.sina.com.cn/w/n/2013-04-28/1634703505.shtml',
  'http://games.sina.com.cn/w/n/2013-04-28/1246703487.shtml',
  'http://games.sina.com.cn/w/n/2013-04-28/1028703471.shtml',
  'http://games.sina.com.cn/w/n/2013-04-27/1015703426.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1554703373.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1512703346.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1453703334.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1451703333.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1445703329.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1434703322.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1433703321.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1433703320.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1429703318.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1429703317.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1409703297.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1406703296.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1402703292.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1353703286.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1348703284.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1327703275.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1239703265.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1238703264.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1231703262.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1229703261.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1228703260.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1223703259.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1218703258.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1202703254.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1159703251.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1139703233.shtml',
]
 
def workerbee(inputlist):
  """Start the worker threads, enqueue every job and wait for completion.

  Starts ``THREAD_LIMIT`` ``spider`` threads, then puts each item of
  ``inputlist`` on the bounded ``jobs`` queue, waiting up to 5 seconds for
  a free slot.  An item that cannot be queued in that time is dropped
  after a notice is printed.  Finally blocks on ``jobs.join()`` until
  every queued job has been marked done.

  Args:
    inputlist: iterable of job items; each one is handed unchanged to a
      worker through the ``jobs`` queue.
  """
  for x in xrange(THREAD_LIMIT):
    print('Thead {0} started.'.format(x))
    t = spider()
    t.start()
  for i in inputlist:
    try:
      jobs.put(i, block=True, timeout=5)
    except Queue.Full:
      # Only a full queue is an expected failure here; anything else
      # (KeyboardInterrupt included) must propagate instead of being
      # misreported as "queue is full".
      with singlelock:
        print("The queue is full !")

  # Take the lock so this message does not interleave with worker output.
  with singlelock:
    print("Waiting for threads to finish.")
  # Blocks until task_done() has been called once per queued job.
  jobs.join()
 
def getTitle(url,time=10):
  """Fetch ``url`` and return its first ``<title>`` text re-encoded as UTF-8.

  Args:
    url: address passed to ``urllib2.urlopen``.
    time: timeout, in seconds, passed to ``urllib2.urlopen``.

  Returns:
    The text captured between the first ``<title>`` and ``</title>`` that
    appear on the same line, decoded as GB2312 (undecodable bytes are
    replaced) and encoded as UTF-8.

  Raises:
    IndexError: if the page contains no matching ``<title>...</title>``.
    Any exception raised by ``urllib2.urlopen`` or ``read()``.
  """
  response = urllib2.urlopen(url, timeout=time)
  try:
    html = response.read()
  finally:
    # Close the connection even when read() fails part-way through.
    response.close()
  match = re.search(r'<title>(.*?)</title>', html)
  if match is None:
    # Same exception type the old ``findall(...)[0]`` produced, but with
    # a message that says what actually went wrong.
    raise IndexError('no <title> element found in {0!r}'.format(url))
  return match.group(1).decode('gb2312', 'replace').encode('utf-8')
 
class spider(threading.Thread):
  """Worker thread that fetches and prints the title of each queued URL.

  Each job taken from ``jobs`` is passed to ``getTitle`` and the result is
  printed.  The thread exits once ``jobs`` has stayed empty for one second.
  """

  def run(self):
    while True:
      try:
        job = jobs.get(True, 1)
      except Queue.Empty:
        # Nothing arrived within a second: treat the work as done.
        break
      try:
        # Fetch outside the lock so downloads really run in parallel;
        # the lock is only needed to keep printed lines from interleaving.
        title = getTitle(job)
        with singlelock:
          print('This {0} is {1}'.format(job, title))
      except Exception as exc:
        # A bad page must not kill the worker; report it and move on.
        with singlelock:
          print('Failed to process job {0!r}: {1!r}'.format(job, exc))
      finally:
        # Always acknowledge the job, otherwise jobs.join() in the main
        # thread would wait forever after a failure.
        jobs.task_done()
 
# Script entry point: hand the module-level URL list to workerbee().
if __name__ == '__main__':
  workerbee(urls)
Python 相关文章推荐
Python中为什么要用self探讨
Apr 14 Python
python字符串对其居中显示的方法
Jul 11 Python
Python输出带颜色的字符串实例
Oct 10 Python
matlab中实现矩阵删除一行或一列的方法
Apr 04 Python
python中不能连接超时的问题及解决方法
Jun 10 Python
python实现K近邻回归,采用等权重和不等权重的方法
Jan 23 Python
Python3安装psycopy2以及遇到问题解决方法
Jul 03 Python
face++与python实现人脸识别签到(考勤)功能
Aug 28 Python
使用pandas实现连续数据的离散化处理方式(分箱操作)
Nov 22 Python
Python面向对象程序设计之静态方法、类方法、属性方法原理与用法分析
Mar 23 Python
python实现四人制扑克牌游戏
Apr 22 Python
pandas提升计算效率的一些方法汇总
May 30 Python
Python程序员开发中常犯的10个错误
Jul 07 #Python
python采用requests库模拟登录和抓取数据的简单示例
Jul 05 #Python
浅析python 中__name__ = '__main__' 的作用
Jul 05 #Python
python在windows下实现备份程序实例
Jul 04 #Python
python调用短信猫控件实现发短信功能实例
Jul 04 #Python
Python实现类继承实例
Jul 04 #Python
Django集成百度富文本编辑器uEditor攻略
Jul 04 #Python
You might like
php trim 去除空字符的定义与语法介绍
2010/05/31 PHP
使用composer 安装 laravel框架的方法图文详解
2019/08/02 PHP
再次更新!MSClass (Class Of Marquee Scroll通用不间断滚动JS封装类 Ver 1.6)
2007/02/05 Javascript
YUI的Tab切换实现代码
2010/04/11 Javascript
jquery浏览器滚动加载技术实现方案
2014/06/03 Javascript
原生javascript实现DIV拖拽并计算重复面积
2015/01/02 Javascript
基于jQuery实现自动轮播旋转木马特效
2015/11/02 Javascript
总结javascript中的六种迭代器
2016/08/16 Javascript
jQuery实现的购物车物品数量加减功能代码
2016/11/16 Javascript
js+div+css下拉导航菜单完整代码分享
2016/12/28 Javascript
微信小程序实现瀑布流布局与无限加载的方法详解
2017/05/12 Javascript
对于input 框限定输入值为浮点型的js代码
2017/09/25 Javascript
Vue CLI 3搭建vue+vuex最全分析(推荐)
2018/09/27 Javascript
浅谈ECMAScript 中的Array类型
2019/06/10 Javascript
vuex + keep-alive实现tab标签页面缓存功能
2019/10/17 Javascript
js里面的变量范围分享
2020/07/18 Javascript
[01:50]WODOTA制作 DOTA2中文宣传片《HERO》
2013/04/28 DOTA
[36:20]完美世界DOTA2联赛PWL S3 access vs Rebirth 第一场 12.17
2020/12/18 DOTA
详解Python当中的字符串和编码
2015/04/25 Python
Python使用pylab库实现画线功能的方法详解
2017/06/08 Python
[原创]pip和pygal的安装实例教程
2017/12/07 Python
PyTorch搭建多项式回归模型(三)
2019/05/22 Python
python实现对服务器脚本敏感信息的加密解密功能
2019/08/13 Python
python爬虫爬取幽默笑话网站
2019/10/24 Python
python中count函数简单用法
2020/01/05 Python
纯CSS3实现地球自转实现代码(图文教程附送源码)
2012/12/26 HTML / CSS
全球知名巧克力品牌:Godiva
2016/07/22 全球购物
俄罗斯宠物用品网上商店:ZooMag
2019/12/12 全球购物
岗位职责的定义
2013/11/10 职场文书
大专应届生个人的自我评价
2013/11/21 职场文书
文秘档案管理岗位职责
2014/03/06 职场文书
买卖协议书范本
2014/04/21 职场文书
企业文明单位申报材料
2014/05/16 职场文书
责任书范本
2014/08/25 职场文书
学院党的群众路线教育实践活动第一阶段情况汇报
2014/10/25 职场文书
学校国庆节活动总结
2015/03/23 职场文书