Python爬取国外天气预报网站的方法


Posted in Python onJuly 10, 2015

本文实例讲述了Python爬取国外天气预报网站的方法。分享给大家供大家参考。具体如下:

crawl_weather.py如下:

#encoding=utf-8
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
lang = "fr"
count = 0
class Location:
  # Location(False, "中国", "北京", "zh")
  # Location(True, "", "亚洲", "zh")
  def __init__(self, is_beyond_country, country_name, loc_name, lang):
    self.country_name = country_name
    self.loc_name = loc_name
    self.lang = lang
    self.is_beyond_country = is_beyond_country
prn_lock = threading.RLock()
def GetLocationURLs(url, recursive):
  global count
  if url.find("weather-forecast") != -1:
    count = count + 1
    if count % 500 == 0:
      prn_lock.acquire()
      print "count:%d" % (count)
      prn_lock.release()
    return [url]
  page = urllib2.urlopen(url).read()
  time.sleep(0.01)
  #"<h6><a href=\"http://www.accuweather.com/zh/browse-locations/afr\"><em>Africa</em></a></h6>"
  pattern = "<h6><a href=\"(.*)\"><em>(.*)</em></a></h6>"
  locs = re.findall(pattern, page)
  locs = [(url, name) for url, name in locs if url.find("browse-locations") != -1 or url.find("weather-forecast") != -1]
  if not recursive:
    urls = [url for url, name in locs]
    return urls
  urls = []
  for _url, _name in locs:
    lst = GetLocationURLs(_url, True)
    urls.extend(lst)
  return urls
#entry_url = "http://www.accuweather.com/zh/browse-locations"
entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang)
#regions = ["afr", "ant", "arc", "asi", "cac", "eur", "mea", "nam", "ocn", "sam"]
#regions = ["eur"]
#region_urls = [ "%s/%s" % (entry_url, reg) for reg in regions]
#region_urls = ["http://www.accuweather.com/zh/browse-locations/eur/fr"]
sub_urls = GetLocationURLs(entry_url, False)
print len(sub_urls)
print sub_urls
q = Queue()
location_urls = []
ThreadNum = 5
lock = threading.RLock()
for url in sub_urls:
  q.put(url)
def working():
  while True:
    url = q.get()
    lst = GetLocationURLs(url, True)
    print "%s %d urls " % (url, len(lst))
    lock.acquire()
    location_urls.extend(lst)
    lock.release()
    q.task_done()
for i in range(ThreadNum):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
q.join()  
fp = open('locations.txt', "w")
fp.write("\n".join(location_urls))
fp.close()
#for url in location_urls:
#  print url
#location_urls = GetLocationURLs(entry_url)
'''
def Fetch(url):
  try:
    print url
    web_path = url[0]
    local_name = url[1]   
    print "web_path:", web_path
    print "local_name:", local_name
    sContent = urllib2.urlopen(web_path).read()
    savePath = "D:\\Course\\NLP_Manning\\%s" % (local_name)
    print savePath
    file = open(savePath,'wb')
    file.write(sContent)
    file.close()
    print savePath + " saved";
  except:
    pass;
def working():
  while True:
    url = q.get()
    Fetch(url)
    sleep(10)
    q.task_done()
#root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
page = urllib2.urlopen(root_url).read()
for i in range(NUM):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
urls = copy.deepcopy(ppt_urls)
urls.extend(srt_urls)
urls.extend(video_urls)
print len(ppt_urls)
print len(srt_urls)
print len(video_urls)
print len(urls)
for url in urls:
  q.put(url)
q.join()
'''
'''
root_url = "http://www.accuweather.com/zh/cn/andingmen/57494/weather-forecast/57494"
page = urllib2.urlopen(root_url).read()
print page
'''

FetchLocation.py如下:

#encoding=utf-8
import sys
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
from xml.dom import minidom
import HTMLParser
import datetime
q = Queue()
locks = [threading.RLock() for i in range(2)]
ThreadNumber = 20
locations = {}
conds = {}
def FindCountryBreadCrumbs(page):
  lines = page.splitlines()
  count = 0
  start = -1
  opened = False
  for line in lines:
    if line.find("<ul id=\"country-breadcrumbs\">") != -1:
      start = count
      opened = True
    if opened and line.find("</ul>") != -1:
      end = count
      opened = False
    count = count + 1
  return "\n".join(lines[start: (end + 1)])
def GetText(nodelist):
  rc = []
  for node in nodelist:
    if node.nodeType == node.TEXT_NODE:
      rc.append(HTMLParser.HTMLParser().unescape(node.data))
  return ''.join(rc)
def FindCondition(page):
  pat = "<span class=\"cond\">(.*?)</span>"
  cds = re.findall(pat, page)
  cds = [HTMLParser.HTMLParser().unescape(cd).encode("utf-8") for cd in cds]
  return cds  
def ExtractInfo(url):
  try:
    page = urllib2.urlopen(url).read()
  except Exception, e:
    return []
  text = FindCountryBreadCrumbs(page)
  text = HTMLParser.HTMLParser().unescape(text)
  dom = minidom.parseString(text.encode("utf-8"))
  locs = []
  lis = dom.getElementsByTagName("li")
  for li in lis:
    adr_list = li.getElementsByTagName("a")
    if adr_list:
      locs.append(GetText(adr_list[0].childNodes).encode("utf-8"))
    strs = li.getElementsByTagName("strong")
    if strs:
      locs.append(GetText(strs[0].childNodes).encode("utf-8"))
  cds = FindCondition(page)
  return locs, cds
def AddMap(lst, m):
  for x in lst:
    if m.get(x) == None:
      m[x] = 1
def working():
  while True:
    urls = q.get()
    #print len(urls)
    m = {}
    m2 = {}
    count = 0
    for url in urls:
      count = count + 1
      #print "%d/%d" % (count, len(urls))
      locs, cds = ExtractInfo(url)
      AddMap(locs, m)
      AddMap(cds, m2)
    locks[1].acquire()
    AddMap(m.keys(), locations)
    AddMap(m2.keys(), conds)
    locks[1].release()
    q.task_done()
def main():
  if len(sys.argv) < 2:
    exit()
  loc_path = sys.argv[1]
  fp = open(loc_path, "r")
  urls = [line.strip() for line in fp]
  fp.close()
  #urls = urls[0:1000]
  blocks = len(urls) / ThreadNumber + 1
  for start in range(0, len(urls), blocks):
    end = start + blocks
    if end > len(urls):
      end = len(urls)
    q.put(urls[start:end])
  for i in range(ThreadNumber):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()
  q.join()
  fp = open("location_name.fr", "w")
  fp.write("\n".join(locations.keys()))
  fp.close()
  fp = open("conditions.fr", "w")
  fp.write("\n".join(conds.keys()))
  fp.close()
if __name__ == '__main__':
  main()

希望本文所述对大家的python程序设计有所帮助。

Python 相关文章推荐
python实现多线程采集的2个代码例子
Jul 07 Python
编写简单的Python程序来判断文本的语种
Apr 07 Python
Python编码爬坑指南(必看)
Jun 10 Python
Python 通过URL打开图片实例详解
Jun 01 Python
解决python写入mysql中datetime类型遇到的问题
Jun 21 Python
对numpy Array [: ,] 的取值方法详解
Jul 02 Python
Python魔法方法详解
Feb 13 Python
python 设置输出图像的像素大小方法
Jul 04 Python
Flask框架请求钩子与request请求对象用法实例分析
Nov 07 Python
python爬取本站电子书信息并入库的实现代码
Jan 20 Python
Django修改app名称和数据表迁移方案实现
Sep 17 Python
Python3的进程和线程你了解吗
Mar 16 Python
Python实现比较两个文件夹中代码变化的方法
Jul 10 #Python
python简单文本处理的方法
Jul 10 #Python
Python实现把json格式转换成文本或sql文件
Jul 10 #Python
Python中的一些陷阱与技巧小结
Jul 10 #Python
Python中的fileinput模块的简单实用示例
Jul 09 #Python
Python中的anydbm模版和shelve模版使用指南
Jul 09 #Python
python冒泡排序简单实现方法
Jul 09 #Python
You might like
浏览器关闭后,能继续执行的php函数(ignore_user_abort)
2012/08/01 PHP
PHP中file_exists使用中遇到的问题小结
2016/04/05 PHP
ExtJs3.0中Store添加 baseParams 的Bug
2010/03/10 Javascript
Colortip基于jquery的信息提示框插件在IE6下面的显示问题修正方法
2010/12/06 Javascript
JavaScript栏目列表隐藏/显示简单实现
2013/04/03 Javascript
js中的事件捕捉模型与冒泡模型实例分析
2015/01/10 Javascript
jQuery网页右侧广告跟随滚动代码分享
2020/04/20 Javascript
bootstrap实现弹窗和拖动效果
2016/01/03 Javascript
第四章之BootStrap表单与图片
2016/04/25 Javascript
js+html5实现canvas绘制椭圆形图案的方法
2016/05/21 Javascript
JavaScript字符集编码与解码详谈
2017/02/02 Javascript
JQuery EasyUI的一些常用组件
2017/07/12 jQuery
Angularjs 事件指令详细整理
2017/07/27 Javascript
Vue中多个元素、组件的过渡及列表过渡的方法示例
2019/02/13 Javascript
实现一个 Vue 吸顶锚点组件方法
2019/07/10 Javascript
基于layPage插件实现两种分页方式浅析
2019/07/27 Javascript
Vue引入Stylus知识点总结
2020/01/16 Javascript
JavaScript设计模式--桥梁模式引入操作实例分析
2020/05/23 Javascript
详解vue 组件注册
2020/11/20 Vue.js
[33:19]完美世界DOTA2联赛PWL S2 PXG vs InkIce 第一场 11.26
2020/11/30 DOTA
搭建Python的Django框架环境并建立和运行第一个App的教程
2016/07/02 Python
python os.path模块常用方法实例详解
2018/09/16 Python
python 利用pyttsx3文字转语音过程详解
2019/09/25 Python
python实现发送form-data数据的方法详解
2019/09/27 Python
python图的深度优先和广度优先算法实例分析
2019/10/26 Python
基于Python实现ComicReaper漫画自动爬取脚本过程解析
2019/11/11 Python
详解python opencv、scikit-image和PIL图像处理库比较
2019/12/26 Python
python第三方库学习笔记
2020/02/07 Python
使用Python pip怎么升级pip
2020/08/11 Python
CSS3轻松实现圆角效果
2017/11/09 HTML / CSS
英国户外装备和冒险服装零售商:alloutdoor
2018/01/30 全球购物
阿姆斯特丹杜莎夫人蜡像馆官方网站:Madame Tussauds Amsterdam
2019/03/12 全球购物
生日派对邀请函
2014/01/13 职场文书
生产部厂长助理职位说明书
2014/03/03 职场文书
教师党的群众路线学习心得体会
2014/11/04 职场文书
淘宝客服专员岗位职责
2015/04/07 职场文书