Python抓取聚划算商品分析页面获取商品信息并以XML格式保存到本地


Posted in Python onFebruary 23, 2018

本文实例为大家分享了Python抓取聚划算商品页面获取商品信息并保存的具体代码,供大家参考,具体内容如下

#!/user/bin/python 
# -*- coding: gbk -*- 
#Spider.py 
 
import urllib2 
import httplib 
import StringIO 
import gzip 
import re 
import chardet 
import sys 
import os 
import datetime 
from xml.dom.minidom import Document 
from BeautifulSoup import BeautifulSoup 
 
## 这段代码是用于解决控制台打印汉字报错的问题 
reload(sys) 
sys.setdefaultencoding("utf8") 
##################################################### 
 
## debug模式开关,开启后可以看到Http请求的头部信息以及debug日志 
DEBUG = 1 
NO_DEBUG = 0 
httplib.HTTPConnection.debuglevel = DEBUG 
## 是否显示爬取网页源代码开关 
showSrcCode = False 
## 压缩方式 
ZIP_TYPE = "gzip" 
 
fileName = "auctions" 
location = "d://spiderData/" 
 
## header 
headerConfig = {"User-Agent":"taobao-yanyuan.qzs", "Accept-encoding":ZIP_TYPE} 
##################################################### 
 
 
#############class SpiderConfig ##################### 
class SpiderConfig: 
 """ 
  configuration for spider name and url 
 """ 
 def __init__(self, name, url): 
  self.name = name 
  self.url = url 
##################################################### 
 
##############class SpiderAuctionDomain############## 
class SpiderAuctionDomain: 
 """ 
  Store information with auctions spidered by python 
 """ 
 title = "" 
 url = "" 
 img = "" 
 price = "" 
 
 def __init__(self): 
  pass 
 
##################################################### 
 
########class SpiderDefaultErrorHandler############## 
class SpiderDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler): 
 def http_error_default(self, req, fp, code, msg, hdrs): 
  """ 
   default error process handler for spider 
  """ 
  result = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp) 
  result.status = code 
  result.url = req.get_full_url() 
 
  print "<", result.url, "Exception code :", result.status, ">" 
 
  return result 
##################################################### 
 
#############class SpiderHandler##################### 
class SpiderHandler: 
 """ 
  spider handler 
 """ 
 
 def spider(self, spiderConfig): 
  try: 
   request = urllib2.Request(spiderConfig.url) 
 
   ## configure request hreader 
   for key,val in headerConfig.items(): 
    request.add_header(key, val) 
 
   ## build opener 
   opener = urllib2.build_opener(SpiderDefaultErrorHandler()) 
 
   ## open request 
   openRequest = opener.open(request) 
 
   ## read data 
   spiderData = openRequest.read() 
 
   ## close 
   opener.close() 
 
   if 0 == len(spiderData): 
    return 
 
   if ZIP_TYPE== openRequest.headers.get("Content-Encoding"): 
    spiderData = SpiderHandler.gzipData(self, spiderData) 
 
   if httplib.HTTPConnection.debuglevel == DEBUG and showSrcCode: 
    print spiderData 
 
   # parse html 
   SpiderHandler.parse(self, spiderData) 
 
  except Exception,x: 
   print "spider process Exception:", x 
 
 
 
 def parse(self, spiderData): 
  """ 
   parse html content 
  """ 
 
  if httplib.HTTPConnection.debuglevel == DEBUG: 
   charsetAnalyze = chardet.detect(spiderData) 
   print "analyze spider data encode :",charsetAnalyze["encoding"] 
 
  print "执行解析", fileName 
 
  soup = BeautifulSoup(spiderData) 
  encode = soup.originalEncoding 
 
  encoding = lambda x : x.encode(encode) 
 
  if httplib.HTTPConnection.debuglevel == DEBUG: 
   print "识别到编码:", encode 
   title = soup.head.title.string 
   print encoding(title) 
 
  spiderContents = soup.findAll(name="div", attrs={"class":"main-box avil"}) 
  auctions = ["%s" % s for s in spiderContents] 
 
  if auctions is None: 
   return 
 
  auctionList = [] 
 
  for auc in auctions: 
   auctionDomain = SpiderAuctionDomain() 
   # parse auction link 
   links = re.search(re.compile(r'<a href=[\"|\']http://ju.taobao.com/tg/life_home.htm\?item_id=([^>]*)[\"|\']', re.IGNORECASE), auc) 
   if links is not None : 
    auctionDomain.link = encoding("http://ju.taobao.com/tg/life_home.htm?item_id=" + "".join(["%s" % s for s in links.groups() if len(s) > 0])) 
 
   #parse auction title 
   titles = re.search(re.compile(r"([^>]*)</a></h2>", re.IGNORECASE), auc) 
   if titles is not None: 
    auctionDomain.title = encoding("".join(["%s" % t for t in titles.groups() if len(t) > 0])) 
 
   #parse auction price 
   price = re.search(re.compile(r"<strong class=\"J_juPrices\".*</b>([^<]*)</strong>", re.IGNORECASE), auc) 
   if price is not None: 
    auctionDomain.price = "".join(["%s" % p for p in price.groups() if len(p) > 0]) 
 
   #parse image url 
   imgs = re.search(re.compile(r"<img src=[\'\"]([^>]*)[\'\"]", re.IGNORECASE), auc) 
   if imgs is not None: 
    auctionDomain.img = "".join(["%s" % i for i in imgs.groups() if len(i) > 0]) 
 
   auctionList.append(auctionDomain) 
 
  print "成功解析商品信息:" 
  for a in auctionList: 
   print "--->",a.title 
 
  # sort auction list 
  auctionList = SpiderHandler.sortAuctionList(self, auctionList) 
 
  # save in file 
  SpiderHandler.save(self, auctionList) 
 
  print "解析完成" 
 
  pass 
 
 def sortAuctionList(self, auctionList): 
  """ 
   冒泡排序,按照价格排序 
  """ 
  length = len(auctionList) 
  if length < 2: 
   return auctionList 
  else: 
   for i in range(length-1): 
    for j in range(length - i -1): 
     if float(auctionList[j].price) > float(auctionList[j+1].price): 
      auctionList[j], auctionList[j+1] = auctionList[j+1], auctionList[j] 
  return auctionList 
  pass 
 
 def save(self, auctionList): 
  if auctionList is not None: 
   doc = Document() 
 
   auctions = doc.createElement("auctions") 
   doc.appendChild(auctions) 
 
   for auc in auctionList: 
    auction = doc.createElement("auction") 
    auctions.appendChild(auction) 
 
    SpiderHandler.generateXML(self, doc, auction, "title", auc.title) 
    SpiderHandler.generateXML(self, doc, auction, "price", auc.price) 
    SpiderHandler.generateXML(self, doc, auction, "img", auc.img) 
    SpiderHandler.generateXML(self, doc, auction, "link", auc.link) 
 
   if False == os.path.exists(location): 
    os.mkdir(location) 
 
   file = open(location+fileName+".xml", 'w') 
   file.write(doc.toprettyxml()) 
   file.close() 
 
   if httplib.HTTPConnection.debuglevel == DEBUG: 
    print doc.toprettyxml() 
 
 def generateXML(self, doc, f, name, txt): 
  c = doc.createElement(name) 
  f.appendChild(c) 
  c.appendChild(doc.createTextNode(txt)) 
 
 def gzipData(self, spiderData): 
  """ 
   get data from gzip 
  """ 
  if 0 == len(spiderData): 
   return spiderData 
  spiderDataStream = StringIO.StringIO(spiderData) 
  spiderData = gzip.GzipFile(fileobj=spiderDataStream).read() 
  return spiderData 
##################################################### 
 
if __name__ == "__main__": 
 nowtime = lambda:datetime.datetime.strftime(datetime.datetime.now(),"%Y年%m月%d日 %H时%m分%S秒") 
 
 needSpiderUrl = {"suzhou":"http://ju.taobao.com/suzhou", 
      "hangzhou":"http://ju.taobao.com/hangzhou", 
      "shanghai":"http://ju.taobao.com/shanghai", 
      "beijing":"http://ju.taobao.com/beijing", 
      "chengdu":"http://ju.taobao.com/chengdu"} 
 
 configList = [] 
 for k,v in needSpiderUrl.items(): 
  spiderConfig = SpiderConfig(k, v) 
  configList.append(spiderConfig) 
 
 spiderHandler = SpiderHandler() 
 
 print "爬虫执行开始时间:",nowtime() 
 for spiderConfig in configList: 
  fileName = spiderConfig.name 
  spiderHandler.spider(spiderConfig) 
 
 print "爬虫执行完毕时间:",nowtime()

更多内容请参考专题《python爬取功能汇总》进行学习。

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持三水点靠木。

Python 相关文章推荐
python解析模块(ConfigParser)使用方法
Dec 10 Python
Python实现的RSS阅读器实例
Jul 25 Python
详解python3实现的web端json通信协议
Dec 29 Python
linux平台使用Python制作BT种子并获取BT种子信息的方法
Jan 20 Python
Python实现将SQLite中的数据直接输出为CVS的方法示例
Jul 13 Python
分享几道你可能遇到的python面试题
Jul 24 Python
利用Python实现在同一网络中的本地文件共享方法
Jun 04 Python
python opencv 读取图片 返回图片某像素点的b,g,r值的实现方法
Jul 03 Python
Django Celery异步任务队列的实现
Jul 24 Python
python GUI库图形界面开发之PyQt5拖放控件实例详解
Feb 25 Python
jupyter 中文乱码设置编码格式 避免控制台输出的解决
Apr 20 Python
PyQt5实现登录页面
May 30 Python
Python各类图像库的图片读写方式总结(推荐)
Feb 23 #Python
python自动发邮件库yagmail的示例代码
Feb 23 #Python
Python KMeans聚类问题分析
Feb 23 #Python
浅谈python爬虫使用Selenium模拟浏览器行为
Feb 23 #Python
python kmeans聚类简单介绍和实现代码
Feb 23 #Python
python MysqlDb模块安装及其使用详解
Feb 23 #Python
Python实现k-means算法
Feb 23 #Python
You might like
基于OpenCV的PHP图像人脸识别技术
2009/10/11 PHP
thinkPHP模板中函数的使用方法示例
2016/11/30 PHP
thinkPHP框架实现图像裁剪、缩放、加水印的方法
2017/03/14 PHP
PHP 多任务秒级定时器的实现方法
2018/05/13 PHP
javascript 可以拖动的DIV(二)
2009/06/26 Javascript
Javascript学习笔记1 数据类型
2010/01/11 Javascript
菜鸟学习JavaScript小实验之函数引用
2010/11/17 Javascript
JavaScript中的Math.LN2属性用法详解
2015/06/12 Javascript
JavaScript模拟实现键盘打字效果
2015/06/29 Javascript
JavaScript实现删除,移动和复制文件的方法
2015/08/05 Javascript
Bootstrap基本组件学习笔记之导航(10)
2016/12/07 Javascript
angularjs中的$eval方法详解
2017/04/24 Javascript
Vue shopCart 组件开发详解
2018/01/26 Javascript
详解vue组件开发脚手架
2018/06/15 Javascript
webpack4 处理SCSS的方法示例
2018/09/03 Javascript
浅析Vue中拆分视图层代码的5点建议
2019/08/15 Javascript
详解Howler.js Web音频播放终极解决方案
2020/08/23 Javascript
手把手教你实现 Promise的使用方法
2020/09/02 Javascript
python服务器端收发请求的实现代码
2014/09/29 Python
分享Python切分字符串的一个不错方法
2018/12/14 Python
详解用Python实现自动化监控远程服务器
2019/05/18 Python
利用python list完成最简单的DB连接池方法
2019/08/09 Python
python深copy和浅copy区别对比解析
2019/12/26 Python
TensorFlow绘制loss/accuracy曲线的实例
2020/01/21 Python
Python实现在Windows平台修改文件属性
2020/03/05 Python
2020最新pycharm汉化安装(python工程狮亲测有效)
2020/04/26 Python
CSS类名支持中文命名的示例
2014/04/04 HTML / CSS
《九寨沟》教学反思
2014/04/08 职场文书
环保倡议书400字
2014/05/15 职场文书
党员批评与自我批评
2014/10/15 职场文书
外贸英文求职信范文
2015/03/19 职场文书
社区低保工作总结2015
2015/07/23 职场文书
2015年公司中秋节致辞
2015/07/31 职场文书
2019运动会广播加油稿汇总
2019/08/21 职场文书
详解Python为什么不用设计模式
2021/06/24 Python
永中文档在线转换预览基于nginx配置部署方案
2022/06/10 Servers