How to Scrape Tmall Product Details and Transaction Records with Python


Posted in Python on February 23, 2018

This article shares working example code for scraping Tmall product details and transaction records with Python, for your reference. The details are as follows.

1. Setting up the Python environment

This post uses Python 2.7.
Modules involved: spynner, scrapy, bs4, pymssql
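
All four are third-party packages. Before running the crawler, a guarded import check is a quick sanity test; a minimal sketch (the import names are taken from the code below; verify the installed versions in your own environment):

#coding:utf-8
# Check that the crawler's dependencies are importable (Python 2.7)
for name in ['spynner', 'scrapy', 'bs4', 'pymssql']:
  try:
    __import__(name)
    print name, 'is available'
  except ImportError as e:
    print name, 'is missing:', e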

2. The Tmall data to be collected

3. Data-scraping workflow

4. Source code

#coding:utf-8
import spynner
from scrapy.selector import Selector
from bs4 import BeautifulSoup
import random
import pymssql


#------------------------Connect to the database-----------------------------#
server="localhost"
user="sa"
password = "123456"
conn=pymssql.connect(server,user,password,"TmallData")
# Note: pymssql.connect raises an exception on failure rather than
# returning None, so the else branch below is mostly defensive
if conn:
  print "Database connected successfully!"
else:
  print "Database connection error!"
cursor=conn.cursor()
#----------------------Page-manipulation helper functions--------------------------#
def py_click_element(browser,pos):
  # Click an element on the page, identified by a CSS selector
  # pos example: 'a[href="#description"]'
  browser.click(pos)
  # Wait a random 3-10 seconds so requests are not fired too quickly
  browser.wait(random.randint(3,10))
  return browser

def py_click_xpath(browser,xpath):
  # Resolve the link's href via XPath, then click it by CSS selector
  xpath=xpath+'/@href'
  inner_href=Selector(text=browser.html).xpath(xpath).extract()
  pos='a[href="'+str(inner_href[0])+'"]'
  browser=py_click_element(browser, pos)
  return browser

def py_webpage_load(browser,url):
  # Load a page and give its JavaScript time to finish rendering
  browser.load(url,load_timeout=60)
  browser.wait(10)
  return browser

def py_check_element(browser,xpath):
  # Return True if an element matching the xpath exists, otherwise False
  if Selector(text=browser.html).xpath(xpath).extract()!=[]:
    return True
  else:
    return False

def py_extract_xpath(browser,xpath):
  # Return the first match for xpath, or the string "none" if absent
  if py_check_element(browser, xpath):
    return Selector(text=browser.html).xpath(xpath).extract()[0]
  else:
    return "none"

def py_extract_xpaths(browser,xpaths):
  # Extract several pieces of page content in one pass
  length=len(xpaths)
  results=[0]*length
  for i in range(length):
    results[i]=py_extract_xpath(browser, xpaths[i])
  return results
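
# A usage sketch (hypothetical selectors): py_extract_xpaths(browser,
#   ['//h1/text()', '//*[@id="J_CollectCount"]/text()'])
# returns a list of the same length as xpaths, with the string "none"
# in place of any xpath that matched nothing.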

#-----------------------------Database helper functions---------------------------#


#-----------------------------Data-extraction functions----------------------------#
def py_getDealReord(doc):
  # Parse one page of the deal-record table into rows of
  # [buyer, style, quantity, date, time]
  soup=BeautifulSoup(doc,'lxml')
  tr=soup.find_all('tr')
  total_dealRecord=[([0]*5) for i in range(len(tr))]
  i=-1
  for this_tr in tr:
    i=i+1
    td_user=this_tr.find_all('td',attrs={'class':"cell-align-l buyer"})
    for this_td in td_user:
      total_dealRecord[i][0]=this_td.getText().strip(' ')
    td_style=this_tr.find_all('td',attrs={'class':"cell-align-l style"})
    for this_td in td_style:
      total_dealRecord[i][1]=this_td.getText(',').strip(' ')
    td_quantity=this_tr.find_all('td',attrs={'class':"quantity"})
    for this_td in td_quantity:
      total_dealRecord[i][2]=this_td.getText().strip(' ')
    td_dealtime=this_tr.find_all('td',attrs={'class':"dealtime"})
    for this_td in td_dealtime:
      total_dealRecord[i][3]=this_td.find('p',attrs={'class':"date"}).getText()
      total_dealRecord[i][4]=this_td.find('p',attrs={'class':"time"}).getText()
  return total_dealRecord
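
# Worked example with a made-up row matching the td classes above:
#   <tr><td class="cell-align-l buyer">j***n</td>
#       <td class="cell-align-l style">Colour: blue</td>
#       <td class="quantity">1</td>
#       <td class="dealtime"><p class="date">2018-02-20</p>
#                            <p class="time">12:30:05</p></td></tr>
# py_getDealReord on a <tbody> holding that row returns
#   [['j***n', 'Colour: blue', '1', '2018-02-20', '12:30:05']]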
#--------------------Collect all product links to be scraped-----------------------#
cursor.execute("""
select * from ProductURLs where BrandName='NB'
""")


file=open("H:\\Eclipse\\TmallCrawling\\HTMLParse\\errLog.txt",'a') # append mode, so failed URLs can be logged below
InProductInfo=cursor.fetchall()
browser=spynner.Browser()
for temp_InProductInfo in InProductInfo:

  url='https:'+temp_InProductInfo[2]

  BrandName=temp_InProductInfo[0]
  ProductType=temp_InProductInfo[1]
  print BrandName,'\t',ProductType,'\t',url
  #url= 'https://detail.tmall.com/item.htm?id=524425656711&rn=77636d6db8dea5e30060976fdaf9768d&abbucket=19' 

  try:
    browser=py_webpage_load(browser, url)
  except Exception:
    print "Loading webpage failed."
    file.write(url)
    file.write('\n')
    continue

  xpaths=['//*[@id="J_PromoPrice"]/dd/div/span/text()',\
    '//*[@id="J_StrPriceModBox"]/dd/span/text()',\
    '//*[@id="J_DetailMeta"]/div[1]/div[1]/div/div[1]/h1/text()',\
    '//*[@id="J_PostageToggleCont"]/p/span/text()',\
    '//*[@id="J_EmStock"]/text()',\
    '//*[@id="J_CollectCount"]/text()',\
    '//*[@id="J_ItemRates"]/div/span[2]/text()',\
    '//*[@id="J_DetailMeta"]/div[1]/div[1]/div/ul/li[1]/div/span[2]/text()']
  out_ProductInfo=py_extract_xpaths(browser,xpaths)
  browser=py_click_element(browser,'a[href="#description" rel="external nofollow" rel="external nofollow" ]')
  ProductProperty=py_extract_xpath(browser, '//*[@id="J_AttrUL"]')
  # Flatten the property <li> items into one backslash-separated string
  soup=BeautifulSoup(ProductProperty,'lxml')
  li=soup.find_all('li')
  prop=''
  for this_li in li:
    prop=prop+this_li.getText()+'\\'
  prop=prop[:-1]  # drop the trailing backslash
  out_ProductProperty=prop
  print out_ProductProperty
  cursor.execute("""
  Insert into py_ProductInfo values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
  """,(BrandName,ProductType,url,\
     out_ProductInfo[2],out_ProductInfo[1],\
     out_ProductInfo[0],out_ProductInfo[7],\
     out_ProductInfo[1],out_ProductInfo[3],\
     out_ProductInfo[4],out_ProductInfo[5],\
     out_ProductProperty))
  conn.commit()
  Deal_PageCount=0
  browser=py_click_element(browser, 'a[href="#J_DealRecord" rel="external nofollow" ]')
  #browser.browse(True)
  DealRecord=py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
  out_DealRecord=py_getDealReord(DealRecord)
  for temp_DealRecord in out_DealRecord:
    if str(temp_DealRecord[4])=='0':
      continue
    cursor.execute("""
    Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
    """,(url,temp_DealRecord[0],temp_DealRecord[1],\
       temp_DealRecord[2],temp_DealRecord[3],\
       temp_DealRecord[4]))
    conn.commit()
  Deal_PageCount=Deal_PageCount+1
  print "Page ",Deal_PageCount
  # Walk the numbered pager links under the deal-record list; indices
  # 0 and 2 are skipped (presumably non-page links)
  for i in range(6):
    if (i==0) or (i==2):
      continue
    xpath='//*[@id="J_showBuyerList"]/div/div/a['+str(i)+']'
    if py_check_element(browser,xpath):
      browser=py_click_xpath(browser, xpath)
      DealRecord=py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
      out_DealRecord=py_getDealReord(DealRecord)
      for temp_DealRecord in out_DealRecord:
        if str(temp_DealRecord[4])=='0':
          continue
        cursor.execute("""
        Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
        """,(url,temp_DealRecord[0],temp_DealRecord[1],\
           temp_DealRecord[2],temp_DealRecord[3],\
           temp_DealRecord[4]))
        conn.commit()
      Deal_PageCount=Deal_PageCount+1
      print "Page ",Deal_PageCount
  # Keep clicking the last pager link (a[6], presumably "next page")
  # while one exists
  while py_check_element(browser, '//*[@id="J_showBuyerList"]/div/div/a[6]'):
    browser=py_click_xpath(browser, '//*[@id="J_showBuyerList"]/div/div/a[6]')
    DealRecord=py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
    out_DealRecord=py_getDealReord(DealRecord)
    for temp_DealRecord in out_DealRecord:
      if str(temp_DealRecord[4])=='0':
        continue
      cursor.execute("""
      Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
      """,(url,temp_DealRecord[0],temp_DealRecord[1],\
         temp_DealRecord[2],temp_DealRecord[3],\
         temp_DealRecord[4]))
      conn.commit()
    Deal_PageCount=Deal_PageCount+1
    print "Page ",Deal_PageCount

That's all for this article. I hope it helps with your learning, and thank you for supporting 三水点靠木.
