Python爬取国外天气预报网站的方法


Posted in Python onJuly 10, 2015

本文实例讲述了Python爬取国外天气预报网站的方法。分享给大家供大家参考。具体如下:

crawl_weather.py如下:

#encoding=utf-8
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
lang = "fr"
count = 0
class Location:
  # Location(False, "中国", "北京", "zh")
  # Location(True, "", "亚洲", "zh")
  def __init__(self, is_beyond_country, country_name, loc_name, lang):
    self.country_name = country_name
    self.loc_name = loc_name
    self.lang = lang
    self.is_beyond_country = is_beyond_country
prn_lock = threading.RLock()
def GetLocationURLs(url, recursive):
  global count
  if url.find("weather-forecast") != -1:
    count = count + 1
    if count % 500 == 0:
      prn_lock.acquire()
      print "count:%d" % (count)
      prn_lock.release()
    return [url]
  page = urllib2.urlopen(url).read()
  time.sleep(0.01)
  #"<h6><a href=\"http://www.accuweather.com/zh/browse-locations/afr\"><em>Africa</em></a></h6>"
  pattern = "<h6><a href=\"(.*)\"><em>(.*)</em></a></h6>"
  locs = re.findall(pattern, page)
  locs = [(url, name) for url, name in locs if url.find("browse-locations") != -1 or url.find("weather-forecast") != -1]
  if not recursive:
    urls = [url for url, name in locs]
    return urls
  urls = []
  for _url, _name in locs:
    lst = GetLocationURLs(_url, True)
    urls.extend(lst)
  return urls
#entry_url = "http://www.accuweather.com/zh/browse-locations"
entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang)
#regions = ["afr", "ant", "arc", "asi", "cac", "eur", "mea", "nam", "ocn", "sam"]
#regions = ["eur"]
#region_urls = [ "%s/%s" % (entry_url, reg) for reg in regions]
#region_urls = ["http://www.accuweather.com/zh/browse-locations/eur/fr"]
sub_urls = GetLocationURLs(entry_url, False)
print len(sub_urls)
print sub_urls
q = Queue()
location_urls = []
ThreadNum = 5
lock = threading.RLock()
for url in sub_urls:
  q.put(url)
def working():
  while True:
    url = q.get()
    lst = GetLocationURLs(url, True)
    print "%s %d urls " % (url, len(lst))
    lock.acquire()
    location_urls.extend(lst)
    lock.release()
    q.task_done()
for i in range(ThreadNum):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
q.join()  
fp = open('locations.txt', "w")
fp.write("\n".join(location_urls))
fp.close()
#for url in location_urls:
#  print url
#location_urls = GetLocationURLs(entry_url)
'''
def Fetch(url):
  try:
    print url
    web_path = url[0]
    local_name = url[1]   
    print "web_path:", web_path
    print "local_name:", local_name
    sContent = urllib2.urlopen(web_path).read()
    savePath = "D:\\Course\\NLP_Manning\\%s" % (local_name)
    print savePath
    file = open(savePath,'wb')
    file.write(sContent)
    file.close()
    print savePath + " saved";
  except:
    pass;
def working():
  while True:
    url = q.get()
    Fetch(url)
    sleep(10)
    q.task_done()
#root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
page = urllib2.urlopen(root_url).read()
for i in range(NUM):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
urls = copy.deepcopy(ppt_urls)
urls.extend(srt_urls)
urls.extend(video_urls)
print len(ppt_urls)
print len(srt_urls)
print len(video_urls)
print len(urls)
for url in urls:
  q.put(url)
q.join()
'''
'''
root_url = "http://www.accuweather.com/zh/cn/andingmen/57494/weather-forecast/57494"
page = urllib2.urlopen(root_url).read()
print page
'''

FetchLocation.py如下:

#encoding=utf-8
import sys
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
from xml.dom import minidom
import HTMLParser
import datetime
q = Queue()
locks = [threading.RLock() for i in range(2)]
ThreadNumber = 20
locations = {}
conds = {}
def FindCountryBreadCrumbs(page):
  lines = page.splitlines()
  count = 0
  start = -1
  opened = False
  for line in lines:
    if line.find("<ul id=\"country-breadcrumbs\">") != -1:
      start = count
      opened = True
    if opened and line.find("</ul>") != -1:
      end = count
      opened = False
    count = count + 1
  return "\n".join(lines[start: (end + 1)])
def GetText(nodelist):
  rc = []
  for node in nodelist:
    if node.nodeType == node.TEXT_NODE:
      rc.append(HTMLParser.HTMLParser().unescape(node.data))
  return ''.join(rc)
def FindCondition(page):
  pat = "<span class=\"cond\">(.*?)</span>"
  cds = re.findall(pat, page)
  cds = [HTMLParser.HTMLParser().unescape(cd).encode("utf-8") for cd in cds]
  return cds  
def ExtractInfo(url):
  try:
    page = urllib2.urlopen(url).read()
  except Exception, e:
    return []
  text = FindCountryBreadCrumbs(page)
  text = HTMLParser.HTMLParser().unescape(text)
  dom = minidom.parseString(text.encode("utf-8"))
  locs = []
  lis = dom.getElementsByTagName("li")
  for li in lis:
    adr_list = li.getElementsByTagName("a")
    if adr_list:
      locs.append(GetText(adr_list[0].childNodes).encode("utf-8"))
    strs = li.getElementsByTagName("strong")
    if strs:
      locs.append(GetText(strs[0].childNodes).encode("utf-8"))
  cds = FindCondition(page)
  return locs, cds
def AddMap(lst, m):
  for x in lst:
    if m.get(x) == None:
      m[x] = 1
def working():
  while True:
    urls = q.get()
    #print len(urls)
    m = {}
    m2 = {}
    count = 0
    for url in urls:
      count = count + 1
      #print "%d/%d" % (count, len(urls))
      locs, cds = ExtractInfo(url)
      AddMap(locs, m)
      AddMap(cds, m2)
    locks[1].acquire()
    AddMap(m.keys(), locations)
    AddMap(m2.keys(), conds)
    locks[1].release()
    q.task_done()
def main():
  if len(sys.argv) < 2:
    exit()
  loc_path = sys.argv[1]
  fp = open(loc_path, "r")
  urls = [line.strip() for line in fp]
  fp.close()
  #urls = urls[0:1000]
  blocks = len(urls) / ThreadNumber + 1
  for start in range(0, len(urls), blocks):
    end = start + blocks
    if end > len(urls):
      end = len(urls)
    q.put(urls[start:end])
  for i in range(ThreadNumber):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()
  q.join()
  fp = open("location_name.fr", "w")
  fp.write("\n".join(locations.keys()))
  fp.close()
  fp = open("conditions.fr", "w")
  fp.write("\n".join(conds.keys()))
  fp.close()
if __name__ == '__main__':
  main()

希望本文所述对大家的python程序设计有所帮助。

Python 相关文章推荐
Python入门篇之正则表达式
Oct 20 Python
使用Python脚本和ADB命令实现卸载App
Feb 10 Python
python实现Windows电脑定时关机
Jun 20 Python
Python3导入CSV文件的实例(跟Python2有些许的不同)
Jun 22 Python
Python面向对象程序设计OOP深入分析【构造函数,组合类,工具类等】
Jan 05 Python
Python树莓派学习笔记之UDP传输视频帧操作详解
Nov 15 Python
用python画一只可爱的皮卡丘实例
Nov 21 Python
对python中 math模块下 atan 和 atan2的区别详解
Jan 17 Python
python爬虫库scrapy简单使用实例详解
Feb 10 Python
Python基础类继承重写实现原理解析
Apr 03 Python
python - asyncio异步编程
Apr 06 Python
Python采集股票数据并制作可视化柱状图
Apr 04 Python
Python实现比较两个文件夹中代码变化的方法
Jul 10 #Python
python简单文本处理的方法
Jul 10 #Python
Python实现把json格式转换成文本或sql文件
Jul 10 #Python
Python中的一些陷阱与技巧小结
Jul 10 #Python
Python中的fileinput模块的简单实用示例
Jul 09 #Python
Python中的anydbm模版和shelve模版使用指南
Jul 09 #Python
python冒泡排序简单实现方法
Jul 09 #Python
You might like
php中数据的批量导入(csv文件)
2006/10/09 PHP
php中看实例学正则表达式
2006/12/25 PHP
php 动态执行带有参数的类方法
2009/04/10 PHP
PHP操作MongoDB时的整数问题及对策说明
2011/05/02 PHP
php一次性删除前台checkbox多选内容的方法
2013/09/22 PHP
PDO防注入原理分析以及注意事项
2015/02/25 PHP
PHP也能干大事 随机函数
2015/04/14 PHP
php封装一个异常的处理类
2017/06/08 PHP
PHP给源代码加密的几种方法汇总(推荐)
2018/02/06 PHP
javascript 进阶篇3 Ajax 、JSON、 Prototype介绍
2012/03/14 Javascript
jquery的冒泡事件的阻止与允许(三种实现方法)
2013/02/01 Javascript
jQuery调用AJAX时Get和post公用的乱码解决方法实例说明
2013/06/04 Javascript
JavaScript拆分字符串时产生空字符的解决方案
2014/09/26 Javascript
Nodejs学习笔记之NET模块
2015/01/13 NodeJs
javascript发送短信验证码实现代码
2015/11/12 Javascript
JS中对象与字符串的互相转换详解
2016/05/20 Javascript
JavaScript中英文字符长度统计方法示例【按照中文占2个字符】
2017/01/17 Javascript
使用JavaScript开发跨平台的桌面应用详解
2017/07/27 Javascript
springmvc接收jquery提交的数组数据代码分享
2017/10/28 jQuery
基于JavaScript实现抽奖系统
2018/01/16 Javascript
vue-cli脚手架config目录下index.js配置文件的方法
2018/03/13 Javascript
js实现图片3D轮播效果
2019/09/21 Javascript
[44:09]DOTA2上海特级锦标赛A组小组赛#1 EHOME VS MVP.Phx第二局
2016/02/25 DOTA
[59:42]Secret vs Alliacne 2019国际邀请赛小组赛 BO2 第一场 8.15
2019/08/17 DOTA
Python实现将目录中TXT合并成一个大TXT文件的方法
2015/07/15 Python
Pycharm远程调试openstack的方法
2017/11/21 Python
Python一句代码实现找出所有水仙花数的方法
2018/11/13 Python
Django自定义用户登录认证示例代码
2019/06/30 Python
python 匿名函数与三元运算学习笔记
2020/10/23 Python
Python用摘要算法生成token及检验token的示例代码
2020/12/01 Python
Python try except finally资源回收的实现
2021/01/25 Python
Otticanet意大利:最顶尖的世界名牌眼镜, 能得到打折季的价格
2019/03/10 全球购物
Paper Cape官网:美国婴儿和儿童服装品牌
2019/11/02 全球购物
什么是表空间(tablespace)和系统表空间(System tablespace)
2013/02/25 面试题
单方离婚协议书范本2014
2014/10/28 职场文书
2016学雷锋优秀志愿者事迹材料
2016/02/25 职场文书