python实现多线程采集的2个代码例子


Posted in Python onJuly 07, 2014

代码一:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
 
import threading
import Queue
import sys
import urllib2
import re
import MySQLdb
 
#
# 数据库变量设置
#
DB_HOST = '127.0.0.1'
DB_USER = "XXXX"
DB_PASSWD = "XXXXXXXX"
DB_NAME = "xxxx"
 
#
# 变量设置
#
THREAD_LIMIT = 3
jobs = Queue.Queue(5)
singlelock = threading.Lock()
info = Queue.Queue()
 
def workerbee(inputlist):
    for x in xrange(THREAD_LIMIT):
        print 'Thead {0} started.'.format(x)
        t = spider()
        t.start()
    for i in inputlist:
        try:
            jobs.put(i, block=True, timeout=5)
        except:
            singlelock.acquire()
            print "The queue is full !"
            singlelock.release()
 
    # Wait for the threads to finish
    singlelock.acquire()        # Acquire the lock so we can print
    print "Waiting for threads to finish."
    singlelock.release()        # Release the lock
    jobs.join()              # This command waits for all threads to finish.
    # while not jobs.empty():
    #   print jobs.get()
 
def getTitle(url,time=10):
    response = urllib2.urlopen(url,timeout=time)
    html = response.read()
    response.close()
    reg = r'<title>(.*?)</title>'
    title = re.compile(reg).findall(html)
    # title = title[0].decode('gb2312','replace').encode('utf-8')
    title = title[0]
    return title
 
class spider(threading.Thread):
    def run(self):
        while 1:
            try:
                job = jobs.get(True,1)
                singlelock.acquire()
                title = getTitle(job[1])
                info.put([job[0],title], block=True, timeout=5)
                # print 'This {0} is {1}'.format(job[1],title)
                singlelock.release()
                jobs.task_done()
            except:
                break;
 
if __name__ == '__main__':
    con = None
    urls = []
    try:
        con = MySQLdb.connect(DB_HOST,DB_USER,DB_PASSWD,DB_NAME)
        cur = con.cursor()
        cur.execute('SELECT id,url FROM `table_name` WHERE `status`=0 LIMIT 10')
        rows = cur.fetchall()
        for row in rows:
            # print row
            urls.append([row[0],row[1]])
        workerbee(urls)
        while not info.empty():
            print info.get()
    finally:
        if con:
            con.close()

代码二:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
#Filename:robot.py
 
import threading,Queue,sys,urllib2,re
#
# 变量设置
#
THREAD_LIMIT = 3        #设置线程数
jobs = Queue.Queue(5)      #设置队列长度
singlelock = threading.Lock()    #设置一个线程锁,避免重复调用
 
urls = ['http://games.sina.com.cn/w/n/2013-04-28/1634703505.shtml','http://games.sina.com.cn/w/n/2013-04-28/1246703487.shtml','http://games.sina.com.cn/w/n/2013-04-28/1028703471.shtml','http://games.sina.com.cn/w/n/2013-04-27/1015703426.shtml','http://games.sina.com.cn/w/n/2013-04-26/1554703373.shtml','http://games.sina.com.cn/w/n/2013-04-26/1512703346.shtml','http://games.sina.com.cn/w/n/2013-04-26/1453703334.shtml','http://games.sina.com.cn/w/n/2013-04-26/1451703333.shtml','http://games.sina.com.cn/w/n/2013-04-26/1445703329.shtml','http://games.sina.com.cn/w/n/2013-04-26/1434703322.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703321.shtml','http://games.sina.com.cn/w/n/2013-04-26/1433703320.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703318.shtml','http://games.sina.com.cn/w/n/2013-04-26/1429703317.shtml','http://games.sina.com.cn/w/n/2013-04-26/1409703297.shtml','http://games.sina.com.cn/w/n/2013-04-26/1406703296.shtml','http://games.sina.com.cn/w/n/2013-04-26/1402703292.shtml','http://games.sina.com.cn/w/n/2013-04-26/1353703286.shtml','http://games.sina.com.cn/w/n/2013-04-26/1348703284.shtml','http://games.sina.com.cn/w/n/2013-04-26/1327703275.shtml','http://games.sina.com.cn/w/n/2013-04-26/1239703265.shtml','http://games.sina.com.cn/w/n/2013-04-26/1238703264.shtml','http://games.sina.com.cn/w/n/2013-04-26/1231703262.shtml','http://games.sina.com.cn/w/n/2013-04-26/1229703261.shtml','http://games.sina.com.cn/w/n/2013-04-26/1228703260.shtml','http://games.sina.com.cn/w/n/2013-04-26/1223703259.shtml','http://games.sina.com.cn/w/n/2013-04-26/1218703258.shtml','http://games.sina.com.cn/w/n/2013-04-26/1202703254.shtml','http://games.sina.com.cn/w/n/2013-04-26/1159703251.shtml','http://games.sina.com.cn/w/n/2013-04-26/1139703233.shtml']
 
def workerbee(inputlist):
  for x in xrange(THREAD_LIMIT):
    print 'Thead {0} started.'.format(x)
    t = spider()
    t.start()
  for i in inputlist:
    try:
      jobs.put(i, block=True, timeout=5)
    except:
      singlelock.acquire()
      print "The queue is full !"
      singlelock.release()
 
  # Wait for the threads to finish
  singlelock.acquire()    # Acquire the lock so we can print
  print "Waiting for threads to finish."
  singlelock.release()    # Release the lock
  jobs.join()       # This command waits for all threads to finish.
  # while not jobs.empty():
  #  print jobs.get()
 
def getTitle(url,time=10):
  response = urllib2.urlopen(url,timeout=time)
  html = response.read()
  response.close()
  reg = r'<title>(.*?)</title>'
  title = re.compile(reg).findall(html)
  title = title[0].decode('gb2312','replace').encode('utf-8')
  return title
 
class spider(threading.Thread):
  def run(self):
    while 1:
      try:
        job = jobs.get(True,1)
        singlelock.acquire()
        title = getTitle(job)
        print 'This {0} is {1}'.format(job,title)
        singlelock.release()
        jobs.task_done()
      except:
        break;
 
if __name__ == '__main__':
  workerbee(urls)
Python 相关文章推荐
python中的内置函数getattr()介绍及示例
Jul 20 Python
举例讲解Python程序与系统shell交互的方式
Apr 09 Python
Python 中 list 的各项操作技巧
Apr 13 Python
matplotlib调整子图间距,调整整体空白的方法
Aug 03 Python
对Python中的条件判断、循环以及循环的终止方法详解
Feb 08 Python
在python plt图表中文字大小调节的方法
Jul 08 Python
python获取指定日期范围内的每一天,每个月,每季度的方法
Aug 08 Python
pytorch 图像预处理之减去均值,除以方差的实例
Jan 02 Python
python GUI库图形界面开发之PyQt5拖放控件实例详解
Feb 25 Python
pycharm软件实现设置自动保存操作
Jun 08 Python
Python局部变量与全局变量区别原理解析
Jul 14 Python
python logging 重复写日志问题解决办法详解
Aug 04 Python
Python程序员开发中常犯的10个错误
Jul 07 #Python
python采用requests库模拟登录和抓取数据的简单示例
Jul 05 #Python
浅析python 中__name__ = '__main__' 的作用
Jul 05 #Python
python在windows下实现备份程序实例
Jul 04 #Python
python调用短信猫控件实现发短信功能实例
Jul 04 #Python
Python实现类继承实例
Jul 04 #Python
Django集成百度富文本编辑器uEditor攻略
Jul 04 #Python
You might like
php中get_headers函数的作用及用法的详细介绍
2013/04/27 PHP
PHP编写学校网站上新生注册登陆程序的实例分享
2016/03/21 PHP
php中加密解密DES类的简单使用方法示例
2020/03/26 PHP
jQuery 瀑布流 绝对定位布局(二)(延迟AJAX加载图片)
2012/05/23 Javascript
jquery阻止冒泡事件使用模拟事件
2013/09/06 Javascript
javascript history对象(历史记录)使用方法(实现浏览器前进后退)
2014/01/07 Javascript
JavaScript制作的可折叠弹出式菜单示例
2014/04/04 Javascript
使用GruntJS构建Web程序之构建篇
2014/06/04 Javascript
window.open()实现post传递参数
2015/03/12 Javascript
jQuery菜单插件用法实例
2015/07/25 Javascript
JavaScript实现字符串与日期的互相转换及日期的格式化
2016/03/07 Javascript
AngularJS控制器继承自另一控制器
2016/05/09 Javascript
bootstarp modal框居中显示的实现代码
2017/02/18 Javascript
JavaScript该如何学习 怎样轻松学习JavaScript
2017/06/12 Javascript
Vue computed计算属性的使用方法
2017/07/14 Javascript
react router 4.0以上的路由应用详解
2017/09/21 Javascript
ES6扩展运算符用法实例分析
2017/10/31 Javascript
微信小程序使用wxParse解析html的方法示例
2019/01/17 Javascript
pm2发布node配置文件ecosystem.json详解
2019/05/15 Javascript
npm 语义版本控制详解
2019/09/10 Javascript
jquery传参及获取方式(两种方式)
2020/02/13 jQuery
Python处理文本文件中控制字符的方法
2017/02/07 Python
详解Python 2.6 升级至 Python 2.7 的实践心得
2017/04/27 Python
django自带调试服务器的使用详解
2019/08/29 Python
python 函数嵌套及多函数共同运行知识点讲解
2020/03/03 Python
Python作用域与名字空间原理详解
2020/03/21 Python
TUMI马来西亚官方网站:国际领先的高品质商旅箱包品牌
2018/04/26 全球购物
英国羊绒服装购物网站:Pure Collection
2018/10/22 全球购物
制定岗位职责的原则
2013/11/08 职场文书
初三物理教学反思
2014/01/21 职场文书
小学生感恩演讲稿
2014/04/25 职场文书
统计员岗位职责范本
2015/04/14 职场文书
公司出差管理制度范本
2015/08/05 职场文书
mysql分表之后如何平滑上线详解
2021/11/01 MySQL
javascript的setTimeout()使用方法总结
2021/11/20 Javascript
vue封装数字翻牌器
2022/04/20 Vue.js