python实现多线程采集的2个代码例子


Posted in Python on July 07, 2014

代码一:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
 
import threading
import Queue
import sys
import urllib2
import re
import MySQLdb
 
#
# Database connection settings.
# These four values are passed, in this order, to MySQLdb.connect() in the
# __main__ block below.
# NOTE(review): the user/password/database values look like placeholders --
# replace them before running.
#
DB_HOST = '127.0.0.1'
DB_USER = "XXXX"
DB_PASSWD = "XXXXXXXX"
DB_NAME = "xxxx"
 
#
# Runtime settings shared by workerbee() and the spider threads.
#
# Number of spider threads started by workerbee().
THREAD_LIMIT = 3
# Pending jobs; bounded to 5 entries, so put() blocks while it is full.
jobs = Queue.Queue(5)
# Lock shared by workerbee() and the spider threads.
singlelock = threading.Lock()
# Unbounded queue of results; the __main__ block drains and prints it.
info = Queue.Queue()
 
def workerbee(inputlist):
    """Start the worker threads and hand them every job in *inputlist*.

    THREAD_LIMIT ``spider`` threads are started first, then each element of
    *inputlist* is put on the bounded ``jobs`` queue.  A job that cannot be
    queued within 5 seconds is reported and dropped.  The function returns
    once every queued job has been marked done.

    :param inputlist: iterable of jobs; each job is passed unchanged to the
        workers through the ``jobs`` queue.
    """
    for x in xrange(THREAD_LIMIT):
        print('Thead {0} started.'.format(x))
        t = spider()
        t.start()
    for i in inputlist:
        try:
            jobs.put(i, block=True, timeout=5)
        except Queue.Full:
            # Only a put() timeout means "queue full"; any other exception
            # (KeyboardInterrupt included) must propagate instead of being
            # misreported.
            with singlelock:
                print("The queue is full !")

    # Take the lock so the message is not interleaved with worker output.
    with singlelock:
        print("Waiting for threads to finish.")
    # Blocks until task_done() has been called once for every queued job.
    jobs.join()
 
def getTitle(url,time=10):
    """Download *url* and return the text of its ``<title>`` element.

    :param url: address of the page to fetch.
    :param time: socket timeout in seconds, passed to ``urllib2.urlopen``.
    :returns: the raw (undecoded) title with surrounding whitespace removed,
        or an empty string when the page has no ``<title>`` element.
    :raises urllib2.URLError: when the page cannot be fetched.
    """
    response = urllib2.urlopen(url, timeout=time)
    try:
        html = response.read()
    finally:
        # Close the connection even when read() fails.
        response.close()
    # Case-insensitive, tolerate attributes on the tag, and let the title
    # span several lines.
    match = re.search(r'<title\b[^>]*>(.*?)</title>', html, re.I | re.S)
    if match is None:
        return ''
    # title = match.group(1).decode('gb2312','replace').encode('utf-8')
    return match.group(1).strip()
 
class spider(threading.Thread):
    """Worker thread that resolves page titles for jobs from ``jobs``.

    Each job is an ``[id, url]`` pair; the result ``[id, title]`` is put on
    the ``info`` queue.
    """

    def run(self):
        """Process jobs until the queue has stayed empty for one second.

        Every job taken from the queue is marked done exactly once, even if
        fetching its title fails, so ``jobs.join()`` cannot hang on a failed
        download.  Failures are reported on stderr and the worker carries on
        with the next job.
        """
        while 1:
            try:
                job = jobs.get(True, 1)
            except Queue.Empty:
                # Nothing arrived for a second: assume the work is finished.
                break
            try:
                # No lock is held here: both queues are thread-safe, and
                # holding a lock during the download would serialise the
                # workers.
                title = getTitle(job[1])
                info.put([job[0], title], block=True, timeout=5)
            except Exception as exc:
                with singlelock:
                    sys.stderr.write(
                        'Failed to process {0!r}: {1}\n'.format(job, exc))
            finally:
                jobs.task_done()
 
# Script entry point: load up to 10 pending (id, url) rows from MySQL, fetch
# their titles with the worker threads, then print the collected results.
if __name__ == '__main__':
    con = None
    urls = []
    try:
        con = MySQLdb.connect(DB_HOST, DB_USER, DB_PASSWD, DB_NAME)
        cur = con.cursor()
        cur.execute('SELECT id,url FROM `table_name` WHERE `status`=0 LIMIT 10')
        # Each job handed to the workers is an [id, url] pair.
        urls.extend([record[0], record[1]] for record in cur.fetchall())
        workerbee(urls)
        # workerbee() returns after all jobs are done; drain the results.
        while not info.empty():
            print(info.get())
    finally:
        # Close the connection whether or not anything above failed.
        if con:
            con.close()

代码二:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
#Filename:robot.py
 
import threading,Queue,sys,urllib2,re
#
# Runtime settings
#
THREAD_LIMIT = 3        # number of worker threads
jobs = Queue.Queue(5)      # job queue, bounded to 5 entries
singlelock = threading.Lock()    # lock shared by workerbee() and the spider threads
 
# Pages whose titles are fetched; the __main__ block passes this list to workerbee().
urls = ['http://games.sina.com.cn/w/n/2013-04-28/1634703505.shtml','http://games.sina.com.cn/w/n/2013-04-28/1246703487.shtml','http://games.sina.com.cn/w/n/2013-04-28/1028703471.shtml','http://games.sina.com.cn/w/n/2013-04-27/1015703426.shtml','http://games.sina.com.cn/w/n/2013-04-26/1554703373.shtml','http://games.sina.com.cn/w/n/2013-04-26/1512703346.shtml','http://games.sina.com.cn/w/n/2013-04-26/1453703334.shtml','http://games.sina.com.cn/w/n/2013-04-26/1451703333.shtml','http://games.sina.com.cn/w/n/2013-04-26/1445703329.shtml','http://games.sina.com.cn/w/n/2013-04-26/1434703322.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703321.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703320.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703318.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703317.shtml','http://games.sina.com.cn/w/n/2013-04-26/1409703297.shtml','http://games.sina.com.cn/w/n/2013-04-26/1406703296.shtml','http://games.sina.com.cn/w/n/2013-04-26/1402703292.shtml','http://games.sina.com.cn/w/n/2013-04-26/1353703286.shtml','http://games.sina.com.cn/w/n/2013-04-26/1348703284.shtml','http://games.sina.com.cn/w/n/2013-04-26/1327703275.shtml','http://games.sina.com.cn/w/n/2013-04-26/1239703265.shtml','http://games.sina.com.cn/w/n/2013-04-26/1238703264.shtml','http://games.sina.com.cn/w/n/2013-04-26/1231703262.shtml','http://games.sina.com.cn/w/n/2013-04-26/1229703261.shtml','http://games.sina.com.cn/w/n/2013-04-26/1228703260.shtml','http://games.sina.com.cn/w/n/2013-04-26/1223703259.shtml','http://games.sina.com.cn/w/n/2013-04-26/1218703258.shtml','http://games.sina.com.cn/w/n/2013-04-26/1202703254.shtml','http://games.sina.com.cn/w/n/2013-04-26/1159703251.shtml','http://games.sina.com.cn/w/n/2013-04-26/1139703233.shtml']
 
def workerbee(inputlist):
  """Start the worker threads and hand them every job in *inputlist*.

  THREAD_LIMIT ``spider`` threads are started first, then each element of
  *inputlist* is put on the bounded ``jobs`` queue.  A job that cannot be
  queued within 5 seconds is reported and dropped.  The function returns
  once every queued job has been marked done.

  :param inputlist: iterable of jobs; each job is passed unchanged to the
      workers through the ``jobs`` queue.
  """
  for x in xrange(THREAD_LIMIT):
    print('Thead {0} started.'.format(x))
    t = spider()
    t.start()
  for i in inputlist:
    try:
      jobs.put(i, block=True, timeout=5)
    except Queue.Full:
      # Only a put() timeout means "queue full"; any other exception
      # (KeyboardInterrupt included) must propagate instead of being
      # misreported.
      with singlelock:
        print("The queue is full !")

  # Take the lock so the message is not interleaved with worker output.
  with singlelock:
    print("Waiting for threads to finish.")
  # Blocks until task_done() has been called once for every queued job.
  jobs.join()
 
def getTitle(url,time=10):
  """Download *url* and return its ``<title>`` text re-encoded as UTF-8.

  The page is assumed to be GB2312-encoded; bytes that cannot be decoded
  are replaced rather than raising.

  :param url: address of the page to fetch.
  :param time: socket timeout in seconds, passed to ``urllib2.urlopen``.
  :returns: the UTF-8 encoded title with surrounding whitespace removed, or
      an empty string when the page has no ``<title>`` element.
  :raises urllib2.URLError: when the page cannot be fetched.
  """
  response = urllib2.urlopen(url, timeout=time)
  try:
    html = response.read()
  finally:
    # Close the connection even when read() fails.
    response.close()
  # Case-insensitive, tolerate attributes on the tag, and let the title
  # span several lines.
  match = re.search(r'<title\b[^>]*>(.*?)</title>', html, re.I | re.S)
  if match is None:
    return ''
  return match.group(1).strip().decode('gb2312','replace').encode('utf-8')
 
class spider(threading.Thread):
  """Worker thread that prints the page title of every URL taken from ``jobs``."""

  def run(self):
    """Process jobs until the queue has stayed empty for one second.

    Every job taken from the queue is marked done exactly once, even if
    fetching its title fails, so ``jobs.join()`` cannot hang on a failed
    download.  Failures are reported on stderr and the worker carries on
    with the next job.
    """
    while 1:
      try:
        job = jobs.get(True, 1)
      except Queue.Empty:
        # Nothing arrived for a second: assume the work is finished.
        break
      try:
        # Download without holding the lock so the workers really run in
        # parallel; the lock only protects the console output.
        title = getTitle(job)
        with singlelock:
          print('This {0} is {1}'.format(job, title))
      except Exception as exc:
        with singlelock:
          sys.stderr.write('Failed to process {0!r}: {1}\n'.format(job, exc))
      finally:
        jobs.task_done()
 
# Script entry point: hand every address in ``urls`` to the worker threads.
if __name__ == '__main__':
  workerbee(urls)
Python 相关文章推荐
安装Python的web.py框架并从hello world开始编程
Apr 25 Python
栈和队列数据结构的基本概念及其相关的Python实现
Aug 24 Python
Python连接MySQL并使用fetchall()方法过滤特殊字符
Mar 13 Python
深入理解Python中的*重复运算符
Oct 28 Python
Python callable()函数用法实例分析
Mar 17 Python
python数据结构之线性表的顺序存储结构
Sep 28 Python
Python实现链表反转的方法分析【迭代法与递归法】
Feb 22 Python
Python通过zookeeper实现分布式服务代码解析
Jul 22 Python
Python QT组件库qtwidgets的使用
Nov 02 Python
Ubuntu20下的Django安装的方法步骤
Jan 24 Python
解决pytorch 模型复制的一些问题
Mar 03 Python
Selenium浏览器自动化如何上传文件
Apr 06 Python
Python程序员开发中常犯的10个错误
Jul 07 #Python
python采用requests库模拟登录和抓取数据的简单示例
Jul 05 #Python
浅析python 中__name__ = '__main__' 的作用
Jul 05 #Python
python在windows下实现备份程序实例
Jul 04 #Python
python调用短信猫控件实现发短信功能实例
Jul 04 #Python
Python实现类继承实例
Jul 04 #Python
Django集成百度富文本编辑器uEditor攻略
Jul 04 #Python
You might like
PHP 数字左侧自动补0
2008/03/31 PHP
在PHP中设置、使用、删除Cookie的解决方法
2013/05/06 PHP
实例介绍PHP删除数组中的重复元素
2019/03/03 PHP
Mootools 1.2教程(3) 数组使用简介
2009/09/14 Javascript
Jquery进度条插件 Progress Bar小问题解决
2011/07/12 Javascript
基于jquery库的tab新形式使用
2012/11/16 Javascript
JQuery入门——移除绑定事件unbind方法概述及应用
2013/02/05 Javascript
js获取当前日期代码适用于网页头部
2013/06/27 Javascript
jQuery中attr()和prop()在修改checked属性时的区别
2014/07/18 Javascript
NodeJS制作爬虫全过程(续)
2014/12/22 NodeJs
使用Jquery实现每日签到功能
2015/04/03 Javascript
JavaScript实现Base64编码转换
2016/04/23 Javascript
node.js插件nodeclipse安装图文教程
2020/10/19 Javascript
vux uploader 图片上传组件的安装使用方法
2018/05/15 Javascript
Electron中实现大文件上传和断点续传功能
2018/10/28 Javascript
vue将单页面改造成多页面应用的方法
2018/11/25 Javascript
解决layer弹出层的内容页点击按钮跳转到新的页面问题
2019/09/14 Javascript
python实现监控linux性能及进程消耗性能的方法
2014/07/25 Python
使用python实现简单五子棋游戏
2019/06/18 Python
python列表,字典,元组简单用法示例
2019/07/11 Python
Python函数的定义方式与函数参数问题实例分析
2019/12/26 Python
一款纯css3实现的圆形旋转分享按钮旋转角度可自己调整
2014/09/02 HTML / CSS
英国美术用品购物网站:Cass Art
2019/10/08 全球购物
世界上最伟大的马产品:Equiderma
2020/01/07 全球购物
精彩的广告词
2014/03/19 职场文书
《第一次抱母亲》教学反思
2014/04/16 职场文书
2014乡镇党委副书记对照检查材料思想汇报
2014/10/09 职场文书
滞留工资返还协议书
2014/10/19 职场文书
初中作文评语
2014/12/25 职场文书
体育活动总结
2015/02/04 职场文书
2015年12.4全国法制宣传日活动总结
2015/03/24 职场文书
python如何做代码性能分析
2021/04/26 Python
基于Python实现的购物商城管理系统
2021/04/27 Python
教你如何使用Python实现二叉树结构及三种遍历
2021/06/18 Python
SpringCloud之@FeignClient()注解的使用方式
2021/09/25 Java/Android
win10如何更改appdata文件夹的默认位置?
2022/07/15 数码科技