python爬取本站电子书信息并入库的实现代码


Posted in Python on January 20, 2020

入门级爬虫:只抓取书籍名称,信息及下载地址并存储到数据库

数据库工具类:DBUtil.py

import pymysql

class DBUtils(object):
  """Stateless helper around a pymysql connection/cursor pair.

  Every method receives the connection and/or cursor explicitly; the
  instance itself stores nothing.
  """

  def connDB(self,host='192.168.251.114',port=3306,user='root',passwd='b6f3g2',db='yangsj',charset='utf8'):
    """Open a MySQL connection and return ``(conn, cur)``.

    The defaults are the values that used to be hard-coded, so a bare
    ``connDB()`` call behaves exactly as before; pass keyword arguments to
    target a different server or schema.
    """
    # NOTE(review): credentials live in source code here; prefer loading
    # them from configuration or the environment before real use.
    conn = pymysql.connect(host=host,port=port,user=user,passwd=passwd,db=db,charset=charset)
    cur = conn.cursor()
    return (conn,cur)

  def exeUpdate(self,conn,cur,sql,params=None):
    """Run an INSERT/UPDATE statement, commit, and return ``execute``'s result.

    ``params`` (optional) is handed to the driver for placeholder binding,
    so values never have to be formatted into ``sql`` by hand.
    """
    # Keep the one-argument call when no params are given so that the
    # statement text is passed through untouched, as before.
    sta = cur.execute(sql) if params is None else cur.execute(sql,params)
    conn.commit()
    return sta

  def exeDelete(self,conn,cur,IDs):
    """Delete rows of ``students`` by id (not used by the demo).

    ``IDs`` is a whitespace-separated string of integer ids. Returns the
    sum of the per-statement results; an empty string deletes nothing.
    Raises ValueError if a token is not an integer.
    """
    sta = 0
    # split() without an argument ignores repeated/leading/trailing
    # whitespace, which previously produced '' tokens and crashed int().
    for eachID in IDs.split():
      sta += cur.execute("delete from students where Id=%s",(int(eachID),))
    conn.commit()
    return sta

  def exeQuery(self,cur,sql,params=None):
    """Run a SELECT and return ``(execute_result, cur)``.

    The caller fetches rows from the returned cursor. ``params`` (optional)
    is bound by the driver, as in exeUpdate.
    """
    effect_row = cur.execute(sql) if params is None else cur.execute(sql,params)
    return (effect_row,cur)

  def connClose(self,conn,cur):
    """Close the cursor and then the connection to release resources."""
    cur.close()
    conn.close()

if __name__ == '__main__':
  # Connectivity smoke test: open a connection, then release it again so
  # running this file directly does not leave the connection open.
  dbUtil = DBUtils()
  conn,cur = dbUtil.connDB()
  dbUtil.connClose(conn,cur)

书籍操作文件 bookOpe.py

from DBUtil import DBUtils
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging
# Configure the root logger with level INFO.
logging.basicConfig(level=logging.INFO)
class BookOperator(object):
  """Stores a book and its download links in MySQL through DBUtils.

  Each step opens its own connection via ``DBUtils.connDB()`` and always
  closes it, even when a statement raises.
  """

  def __addBook(self,book):
    """Insert ``book`` into table ``book`` and return the new row id."""
    logging.info("add book:%s",book.bookName)
    dbUtil = DBUtils()
    conn,cur = dbUtil.connDB()
    try:
      # Values are bound by the driver instead of being formatted into the
      # statement: the text comes from scraped pages and may contain quotes.
      cur.execute(
        "insert into book (bookName,bookUrl,bookInfo) values (%s,%s,%s)",
        (book.bookName,book.downLoadUrl,book.mainInfo))
      conn.commit()
      # Id generated by this very insert, as reported by the cursor.
      return cur.lastrowid
    finally:
      dbUtil.connClose(conn,cur)

  def __selectLastBookId(self):
    """Return the largest ``id`` in ``book``, or None if the table is empty.

    addBookInfo does not rely on this any more: the highest id is only the
    id of the row just inserted when nobody else writes to the table.
    """
    logging.info("selectLastBookId ")
    dbUtil = DBUtils()
    conn,cur = dbUtil.connDB()
    try:
      cur.execute("select id from book order by id desc limit 1")
      row = cur.fetchone()
      return row[0] if row else None
    finally:
      dbUtil.connClose(conn,cur)

  def __addBookDownLoadInfos(self,downLoadInfos,bookId):
    """Insert one ``book_down_url`` row per entry of ``downLoadInfos``.

    Every row is linked to ``bookId``. Nothing is done (and no connection
    is opened) when the list is empty.
    """
    logging.info("add bookId:%s",bookId)
    if not downLoadInfos:
      return
    rows = [(bookId,info.downName,info.downUrl) for info in downLoadInfos]
    dbUtil = DBUtils()
    conn,cur = dbUtil.connDB()
    try:
      cur.executemany(
        "insert into book_down_url (bookId,downName,downUrl) values (%s,%s,%s)",
        rows)
      # Single commit: the links of one book are stored together.
      conn.commit()
    finally:
      dbUtil.connClose(conn,cur)

  def addBookInfo(self,book):
    """Persist ``book`` and then all of its download links."""
    logging.info("add bookInfo:%s",book.bookName)
    bookId = self.__addBook(book)
    self.__addBookDownLoadInfos(book.downLoadInfos,bookId)
if __name__ == '__main__':
  # Manual check: store one dummy book that carries a single download link.
  demoBook = Book("aaa","yang","cccc")
  demoLink = DownLoadInfo("aaa.html","书籍")
  demoBook.addDownLoadUrl(demoLink)
  BookOperator().addBookInfo(demoBook)

书籍信息文件 bookInfo.py

import sys
# NOTE(review): the standard `sys` module defines no `encoding` attribute, so
# this assignment only attaches a new attribute to the module object; nothing
# visible in this file reads it back.
sys.encoding = "utf8"
class Book(object):
  """Book information: main info text, page URL, name and download entries."""

  def __init__(self,mainInfo,downLoadUrl,bookName):
    self.mainInfo,self.downLoadUrl,self.bookName = mainInfo,downLoadUrl,bookName
    # Starts empty; entries are added through addDownLoadUrl().
    self.downLoadInfos = list()

  def addDownLoadUrl(self,downloadInfo):
    """Append one download entry to this book's list."""
    entries = self.downLoadInfos
    entries.append(downloadInfo)

  def print_book_info(self):
    """Print the book name to standard output."""
    line = "bookName :%s" % self.bookName
    print(line)
class DownLoadInfo(object):
  """Download information: a URL together with its display name."""

  def __init__(self,downUrl,downName):
    self.downUrl,self.downName = downUrl,downName

  def print_down_info(self):
    """Print the URL and the name to standard output."""
    pair = (self.downUrl,self.downName)
    print("downLoad %s - %s" % pair)

页面解析文件 FiveOneJobFetch.py(文件名虽含 51job,实际解析的是电子书列表页和详情页)

import requests
from bs4 import BeautifulSoup
import sys
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging
# NOTE(review): the standard `sys` module defines no `encoding` attribute, so
# this assignment only attaches a new attribute to the module object; nothing
# visible in this file reads it back.
sys.encoding = "utf8"
class PageFetch(object):
  """Downloads and parses the site's book listing and book detail pages.

  An instance is bound to one listing page name (e.g. "list152_1.html");
  the helpers that take a ``url`` argument are static and work on any URL.
  """

  # requests rejects scheme-less URLs such as "//3water.com/...", so the
  # scheme is spelled out. NOTE(review): https is assumed; switch to http if
  # the site is only served without TLS.
  host = "https://3water.com/"  # site root
  category = "books/"  # section of the site that holds the listing pages
  timeout = 10  # seconds, given to requests so a stalled server cannot hang the crawl

  def __init__(self,pageUrl):
    self.pageUrl = pageUrl  # listing page file name
    self.url = PageFetch.host+PageFetch.category+pageUrl  # full URL of that page

  def __getPageContent(self):
    """Return the text of ``self.url`` ("" on a non-200 response)."""
    return PageFetch.getPageContent(self.url)

  @staticmethod
  def getPageContent(url):
    """GET ``url`` and return its body decoded as gb2312.

    Returns "" when the status code is not 200. Errors raised by requests
    (connection failure, timeout) are not caught here.
    """
    req = requests.get(url,timeout=PageFetch.timeout)
    if req.status_code != 200:
      return ""
    req.encoding = "gb2312"
    return req.text

  def __getMaxPageNumAndUrl(self):
    """Walk the pager and return ``(maxPageNum, maxLink)``.

    Pager URLs look like list45_2.html, where 2 is the page number. Starting
    at ``self.pageUrl``, the last link of each ``.plist`` pager is followed
    until that link's href is "#"; at that point the text of the pager's
    first <strong> is taken as the page count and the second link's href as
    ``maxLink``. Returns ``(0, "")`` if no such pager state is ever reached.
    """
    fetchUrl = self.pageUrl
    maxPageNum = 0
    maxLink = ""
    visited = set()
    # Stop when a page would be requested a second time: that happens when a
    # page has no ".plist" pager (e.g. an empty response) or the pager does
    # not advance, and would otherwise loop forever.
    while maxLink == "" and fetchUrl not in visited:
      visited.add(fetchUrl)
      url = PageFetch.host+PageFetch.category+fetchUrl
      reqContent = PageFetch.getPageContent(url)
      soup = BeautifulSoup(reqContent,"html.parser")
      for ul in soup.select(".plist"):
        print ("数据")
        print (ul)
        maxPageNum = ul.select("strong")[0].text
        alink = ul.select("a")
        if alink[-1]['href'] == "#":
          maxLink = alink[1]['href']
        else:
          fetchUrl = alink[-1]['href']
    return maxPageNum,maxLink

  def __formatPage(self,pageNum):
    """Return ``self.pageUrl`` with its page number replaced by ``pageNum+1``.

    Example: pageUrl "list45_2.html" and pageNum 0 give "list45_1.html".
    Raises ValueError if pageUrl contains no "_" or no ".".
    """
    lineBeginSite = self.pageUrl.index("_")+1
    docBeginSite = self.pageUrl.index(".")
    return self.pageUrl[:lineBeginSite]+str(pageNum+1)+self.pageUrl[docBeginSite:]

  def getBookPageList(self):
    """Return the full URLs of every listing page, from page 1 to the last."""
    maxPageNum,_ = self.__getMaxPageNumAndUrl()
    return [self.host+self.category+self.__formatPage(i) for i in range(int(maxPageNum))]

  @staticmethod
  def getDownloadPage(url):
    """Return the detail-page URLs linked from the listing page at ``url``.

    Each href matched by ".cur-cat-list .btn-dl" is prefixed with ``host``.
    """
    reqContent = PageFetch.getPageContent(url)
    soup = BeautifulSoup(reqContent,"html.parser")
    return [PageFetch.host+a['href'] for a in soup.select(".cur-cat-list .btn-dl")]

  @staticmethod
  def getBookInfo(url):
    """Parse the detail page at ``url`` into a Book with its download links.

    Raises IndexError if the page lacks "#soft-intro" or the "dl dt h1"
    title, which is also what happens when the page could not be fetched.
    """
    logging.info("获取书籍信息url:%s",url)
    reqContent = PageFetch.getPageContent(url)
    soup = BeautifulSoup(reqContent,"html.parser")
    # Apostrophes are removed from both texts, presumably so they can be
    # embedded in string-built SQL by the storing code.
    mainInfo = soup.select("#soft-intro")[0].text.replace("截图:","").replace("'","")
    title = soup.select("dl dt h1")[0].text.replace("'","")
    book = Book(mainInfo,url,title)
    for ul in soup.select(".ul_Address"):
      for li in ul.select("li"):
        firstLink = li.select("a")[0]
        book.addDownLoadUrl(DownLoadInfo(firstLink['href'],firstLink.text))
    return book
if __name__ == '__main__':
  # Manual run: crawl one category and print each book with its download links.
  fetcher = PageFetch("list152_1.html")
  listingPages = fetcher.getBookPageList()
  detailPages = []
  for listingUrl in listingPages:
    detailPages.extend(PageFetch.getDownloadPage(listingUrl))
  print ("================汇总如下===============================")
  for detailUrl in detailPages:
    book = PageFetch.getBookInfo(detailUrl)
    print (book.bookName+":%s" % book.downLoadUrl)
    for link in book.downLoadInfos:
      print ("%s - %s" % (link.downUrl,link.downName))

执行文件 51Job.py:将以上文件复制到同一文件夹下,执行此文件即可

from FiveOneJobFetch import PageFetch
from bookInfo import Book
from bookInfo import DownLoadInfo
from bookOpe import BookOperator

def main(url):
  """Crawl the category given by ``url`` and store every book it lists.

  ``url`` is a listing page file name such as "list152_35.html". All
  listing pages are scanned for detail-page links first; each detail page
  is then parsed and handed to BookOperator.addBookInfo.
  """
  listingPages = PageFetch(url).getBookPageList()
  bookOperator = BookOperator()
  detailPages = []
  for listingUrl in listingPages:
    detailPages.extend(PageFetch.getDownloadPage(listingUrl))
  for detailUrl in detailPages:
    bookOperator.addBookInfo(PageFetch.getBookInfo(detailUrl))
  print ("数据抓取成功:"+url)

if __name__ == '__main__':
  # Listing pages of the categories to crawl, handled one after another.
  urls = [
    "list152_35.html",
    "list300_2.html",
    "list476_6.html",
    "list977_2.html",
    "list572_5.html",
    "list509_2.html",
    "list481_1.html",
    "list576_1.html",
    "list482_1.html",
    "list483_1.html",
    "list484_1.html",
  ]
  for url in urls:
    main(url)

数据库表:书籍信息表和下载地址表

-- Book table: one row per book stored by the crawler.
--   bookName  receives the book's name
--   bookUrl   receives the book's page URL
--   bookInfo  receives the book's main info text
CREATE TABLE `book` (
`id` INT(11) NOT NULL AUTO_INCREMENT,
`bookName` VARCHAR(200) NULL DEFAULT NULL,
`bookUrl` VARCHAR(500) NULL DEFAULT NULL,
`bookInfo` TEXT NULL,
PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
-- Next generated id starts at 2936 (presumably carried over from an existing table).
AUTO_INCREMENT=2936;
-- Download link table: one row per download link of a book.
--   bookId    receives the id of the owning row in `book` (no foreign key is declared)
--   downName  receives the link's display name
--   downUrl   receives the link's URL
CREATE TABLE `book_down_url` (
`id` INT(11) NOT NULL AUTO_INCREMENT,
`bookId` INT(11) NOT NULL DEFAULT '0',
`downName` VARCHAR(200) NOT NULL DEFAULT '0',
`downUrl` VARCHAR(2000) NOT NULL DEFAULT '0',
PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
-- Next generated id starts at 44441 (presumably carried over from an existing table).
AUTO_INCREMENT=44441;

git地址:https://git.oschina.net/yangsj/BookFetch/tree/master

Python 相关文章推荐
Python 自动安装 Rising 杀毒软件
Apr 24 Python
Python的SQLAlchemy框架使用入门
Apr 29 Python
Python3的urllib.parse常用函数小结(urlencode,quote,quote_plus,unquote,unquote_plus等)
Sep 18 Python
解决pyqt中ui编译成窗体.py中文乱码的问题
Dec 23 Python
python的构建工具setup.py的方法使用示例
Oct 23 Python
Python实现PS滤镜Fish lens图像扭曲效果示例
Jan 29 Python
PyQt5每天必学之组合框
Apr 20 Python
Django中的ajax请求
Oct 19 Python
Python实现常见的回文字符串算法
Nov 14 Python
python3学生名片管理v2.0版
Nov 29 Python
深入了解Python枚举类型的相关知识
Jul 09 Python
python继承threading.Thread实现有返回值的子类实例
May 02 Python
浅谈Tensorflow 动态双向RNN的输出问题
Jan 20 #Python
关于tf.nn.dynamic_rnn返回值详解
Jan 20 #Python
双向RNN:bidirectional_dynamic_rnn()函数的使用详解
Jan 20 #Python
关于tf.reverse_sequence()简述
Jan 20 #Python
tensorflow使用range_input_producer多线程读取数据实例
Jan 20 #Python
浅谈tensorflow中Dataset图片的批量读取及维度的操作详解
Jan 20 #Python
使用tensorflow DataSet实现高效加载变长文本输入
Jan 20 #Python
You might like
社区(php&&mysql)一
2006/10/09 PHP
解析CI的AJAX分页 另类实现方法
2013/06/27 PHP
PHP往XML中添加节点的方法
2015/03/12 PHP
PHP的Yii框架中Model模型的学习教程
2016/03/29 PHP
php面向对象值单例模式
2016/05/03 PHP
CheckBox 如何实现全选?
2006/06/23 Javascript
Javascript中的异步编程规范Promises/A详细介绍
2014/06/06 Javascript
基于jquery实现的文字向上跑动类似跑马灯的效果
2014/06/17 Javascript
原生javascript实现分享到朋友圈功能 支持ios和android
2016/05/11 Javascript
vue实现可增删查改的成绩单
2016/10/27 Javascript
JS中使用正则表达式g模式和非g模式的区别
2017/04/01 Javascript
node文件上传功能简易实现代码
2017/06/16 Javascript
使用jquery的jsonp如何发起跨域请求及其原理详解
2017/08/17 jQuery
vue实现商品加减计算总价的实例代码
2018/08/12 Javascript
vue template中slot-scope/scope的使用方法
2018/09/06 Javascript
JavaScript数据结构与算法之二叉树遍历算法详解【先序、中序、后序】
2019/02/21 Javascript
javascript-hashchange事件和历史状态管理实例分析
2020/04/18 Javascript
Vue性能优化的方法
2020/07/30 Javascript
[40:03]Liquid vs Optic 2018国际邀请赛淘汰赛BO3 第一场 8.21
2018/08/22 DOTA
Python简单进程锁代码实例
2015/04/27 Python
详解Python 数据库 (sqlite3)应用
2016/12/07 Python
Python爬取数据并写入MySQL数据库的实例
2018/06/21 Python
Python定时任务随机时间执行的实现方法
2019/08/14 Python
python 监测内存和cpu的使用率实例
2019/11/28 Python
pytorch自定义二值化网络层方式
2020/01/07 Python
python判断变量是否为int、字符串、列表、元组、字典的方法详解
2020/02/13 Python
使用SQLAlchemy操作数据库表过程解析
2020/06/10 Python
PyCharm 解决找不到新打开项目的窗口问题
2021/01/15 Python
北京-环亚运商测试题.net程序员初步测试题
2013/05/28 面试题
农业大学毕业生的个人自我评价
2013/10/11 职场文书
酒店开业策划方案
2014/06/02 职场文书
助人为乐模范事迹材料
2014/06/02 职场文书
师德师风建设整改措施思想汇报
2014/10/11 职场文书
2016新年慰问信范文
2015/03/25 职场文书
2015年电教工作总结
2015/05/26 职场文书
南京大屠杀观后感
2015/06/02 职场文书