How to Scrape Tmall Product Details and Transaction Records with Python


Posted in Python on February 23, 2018

This article shares working example code for scraping Tmall product details and transaction records with Python, for your reference. The details are as follows.

1. Setting up the Python environment

This post uses Python 2.7.
Modules involved: spynner, scrapy, bs4, pymssql
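
All four are third-party packages. Before running the crawler, a guarded import check is a quick sanity test; a minimal sketch (the import names are taken from the code below; verify the installed versions in your own environment):

#coding:utf-8
# Check that the crawler's dependencies are importable (Python 2.7)
for name in ['spynner', 'scrapy', 'bs4', 'pymssql']:
  try:
    __import__(name)
    print name, 'is available'
  except ImportError as e:
    print name, 'is missing:', e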

2. The Tmall data to be collected

3. Data-scraping workflow

4. Source code

#coding:utf-8
import spynner
from scrapy.selector import Selector
from bs4 import BeautifulSoup
import random
import pymssql


#------------------------Connect to the database-----------------------------#
server="localhost"
user="sa"
password = "123456"
conn=pymssql.connect(server,user,password,"TmallData")
# Note: pymssql.connect raises an exception on failure rather than
# returning None, so the else branch below is mostly defensive
if conn:
  print "Database connected successfully!"
else:
  print "Database connection error!"
cursor=conn.cursor()
#----------------------Page-manipulation helper functions--------------------------#
def py_click_element(browser,pos):
  # Click an element on the page, identified by a CSS selector
  # pos example: 'a[href="#description"]'
  browser.click(pos)
  # Wait a random 3-10 seconds so requests are not fired too quickly
  browser.wait(random.randint(3,10))
  return browser

def py_click_xpath(browser,xpath):
  # Resolve the link's href via XPath, then click it by CSS selector
  xpath=xpath+'/@href'
  inner_href=Selector(text=browser.html).xpath(xpath).extract()
  pos='a[href="'+str(inner_href[0])+'"]'
  browser=py_click_element(browser, pos)
  return browser

def py_webpage_load(browser,url):
  # Load a page and give its JavaScript time to finish rendering
  browser.load(url,load_timeout=60)
  browser.wait(10)
  return browser

def py_check_element(browser,xpath):
  # Return True if an element matching the xpath exists, otherwise False
  if Selector(text=browser.html).xpath(xpath).extract()!=[]:
    return True
  else:
    return False

def py_extract_xpath(browser,xpath):
  # Return the first match for xpath, or the string "none" if absent
  if py_check_element(browser, xpath):
    return Selector(text=browser.html).xpath(xpath).extract()[0]
  else:
    return "none"

def py_extract_xpaths(browser,xpaths):
  # Extract several pieces of page content in one pass
  length=len(xpaths)
  results=[0]*length
  for i in range(length):
    results[i]=py_extract_xpath(browser, xpaths[i])
  return results
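
# A usage sketch (hypothetical selectors): py_extract_xpaths(browser,
#   ['//h1/text()', '//*[@id="J_CollectCount"]/text()'])
# returns a list of the same length as xpaths, with the string "none"
# in place of any xpath that matched nothing.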

#-----------------------------Database helper functions---------------------------#


#-----------------------------Data-extraction functions----------------------------#
def py_getDealReord(doc):
  # Parse one page of the deal-record table into rows of
  # [buyer, style, quantity, date, time]
  soup=BeautifulSoup(doc,'lxml')
  tr=soup.find_all('tr')
  total_dealRecord=[([0]*5) for i in range(len(tr))]
  i=-1
  for this_tr in tr:
    i=i+1
    td_user=this_tr.find_all('td',attrs={'class':"cell-align-l buyer"})
    for this_td in td_user:
      total_dealRecord[i][0]=this_td.getText().strip(' ')
    td_style=this_tr.find_all('td',attrs={'class':"cell-align-l style"})
    for this_td in td_style:
      total_dealRecord[i][1]=this_td.getText(',').strip(' ')
    td_quantity=this_tr.find_all('td',attrs={'class':"quantity"})
    for this_td in td_quantity:
      total_dealRecord[i][2]=this_td.getText().strip(' ')
    td_dealtime=this_tr.find_all('td',attrs={'class':"dealtime"})
    for this_td in td_dealtime:
      total_dealRecord[i][3]=this_td.find('p',attrs={'class':"date"}).getText()
      total_dealRecord[i][4]=this_td.find('p',attrs={'class':"time"}).getText()
  return total_dealRecord
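
# Worked example with a made-up row matching the td classes above:
#   <tr><td class="cell-align-l buyer">j***n</td>
#       <td class="cell-align-l style">Colour: blue</td>
#       <td class="quantity">1</td>
#       <td class="dealtime"><p class="date">2018-02-20</p>
#                            <p class="time">12:30:05</p></td></tr>
# py_getDealReord on a <tbody> holding that row returns
#   [['j***n', 'Colour: blue', '1', '2018-02-20', '12:30:05']]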
#--------------------Collect all product links to be scraped-----------------------#
cursor.execute("""
select * from ProductURLs where BrandName='NB'
""")


file=open("H:\\Eclipse\\TmallCrawling\\HTMLParse\\errLog.txt",'a') # append mode, so failed URLs can be logged below
InProductInfo=cursor.fetchall()
browser=spynner.Browser()
for temp_InProductInfo in InProductInfo:

  url='https:'+temp_InProductInfo[2]

  BrandName=temp_InProductInfo[0]
  ProductType=temp_InProductInfo[1]
  print BrandName,'\t',ProductType,'\t',url
  #url= 'https://detail.tmall.com/item.htm?id=524425656711&rn=77636d6db8dea5e30060976fdaf9768d&abbucket=19' 

  try:
    browser=py_webpage_load(browser, url)
  except Exception:
    print "Loading webpage failed."
    file.write(url)
    file.write('\n')
    continue

  xpaths=['//*[@id="J_PromoPrice"]/dd/div/span/text()',\
    '//*[@id="J_StrPriceModBox"]/dd/span/text()',\
    '//*[@id="J_DetailMeta"]/div[1]/div[1]/div/div[1]/h1/text()',\
    '//*[@id="J_PostageToggleCont"]/p/span/text()',\
    '//*[@id="J_EmStock"]/text()',\
    '//*[@id="J_CollectCount"]/text()',\
    '//*[@id="J_ItemRates"]/div/span[2]/text()',\
    '//*[@id="J_DetailMeta"]/div[1]/div[1]/div/ul/li[1]/div/span[2]/text()']
  out_ProductInfo=py_extract_xpaths(browser,xpaths)
  browser=py_click_element(browser,'a[href="#description" rel="external nofollow" rel="external nofollow" ]')
  ProductProperty=py_extract_xpath(browser, '//*[@id="J_AttrUL"]')
  # Flatten the property <li> items into one backslash-separated string
  soup=BeautifulSoup(ProductProperty,'lxml')
  li=soup.find_all('li')
  prop=''
  for this_li in li:
    prop=prop+this_li.getText()+'\\'
  prop=prop[:-1]  # drop the trailing backslash
  out_ProductProperty=prop
  print out_ProductProperty
  cursor.execute("""
  Insert into py_ProductInfo values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
  """,(BrandName,ProductType,url,\
     out_ProductInfo[2],out_ProductInfo[1],\
     out_ProductInfo[0],out_ProductInfo[7],\
     out_ProductInfo[1],out_ProductInfo[3],\
     out_ProductInfo[4],out_ProductInfo[5],\
     out_ProductProperty))
  conn.commit()
  Deal_PageCount=0
  browser=py_click_element(browser, 'a[href="#J_DealRecord" rel="external nofollow" ]')
  #browser.browse(True)
  DealRecord=py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
  out_DealRecord=py_getDealReord(DealRecord)
  for temp_DealRecord in out_DealRecord:
    if str(temp_DealRecord[4])=='0':
      continue
    cursor.execute("""
    Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
    """,(url,temp_DealRecord[0],temp_DealRecord[1],\
       temp_DealRecord[2],temp_DealRecord[3],\
       temp_DealRecord[4]))
    conn.commit()
  Deal_PageCount=Deal_PageCount+1
  print "Page ",Deal_PageCount
  # Walk the numbered pager links under the deal-record list; indices
  # 0 and 2 are skipped (presumably non-page links)
  for i in range(6):
    if (i==0) or (i==2):
      continue
    xpath='//*[@id="J_showBuyerList"]/div/div/a['+str(i)+']'
    if py_check_element(browser,xpath):
      browser=py_click_xpath(browser, xpath)
      DealRecord=py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
      out_DealRecord=py_getDealReord(DealRecord)
      for temp_DealRecord in out_DealRecord:
        if str(temp_DealRecord[4])=='0':
          continue
        cursor.execute("""
        Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
        """,(url,temp_DealRecord[0],temp_DealRecord[1],\
           temp_DealRecord[2],temp_DealRecord[3],\
           temp_DealRecord[4]))
        conn.commit()
      Deal_PageCount=Deal_PageCount+1
      print "Page ",Deal_PageCount
  # Keep clicking the last pager link (a[6], presumably "next page")
  # while one exists
  while py_check_element(browser, '//*[@id="J_showBuyerList"]/div/div/a[6]'):
    browser=py_click_xpath(browser, '//*[@id="J_showBuyerList"]/div/div/a[6]')
    DealRecord=py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
    out_DealRecord=py_getDealReord(DealRecord)
    for temp_DealRecord in out_DealRecord:
      if str(temp_DealRecord[4])=='0':
        continue
      cursor.execute("""
      Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
      """,(url,temp_DealRecord[0],temp_DealRecord[1],\
         temp_DealRecord[2],temp_DealRecord[3],\
         temp_DealRecord[4]))
      conn.commit()
    Deal_PageCount=Deal_PageCount+1
    print "Page ",Deal_PageCount

That's all for this article. I hope it helps with your learning, and thank you for supporting 三水点靠木.
