编程 Python

Python爬取读者并制作成PDF

Posted in Python on March 10, 2015

学了下 BeautifulSoup 后，做了个网络爬虫，爬取《读者》杂志并用 reportlab 制作成 PDF。

crawler.py

#!/usr/bin/env python

#coding=utf-8

"""

    Author:         Anemone

    Filename:       crawler.py

    Last modified:  2015-02-19 16:47

    E-mail:         anemone@82flex.com

"""

import urllib2

from bs4 import BeautifulSoup

import re

import sys

reload(sys)

sys.setdefaultencoding('utf-8')

def getEachArticle(url):
    """Download one article page and pull out its metadata and body text.

    Example url: http://www.52duzhe.com/2015_01/duzh20150104.html

    Args:
        url: Address of the article page to fetch.

    Returns:
        A dict with four keys:
            "title"   -- the string of the first <h1> element,
            "writer"  -- stripped string of the element with id "pub_date",
            "from"    -- stripped string of the element with id "media_name",
            "context" -- the page text that follows the first match of the
                         "BAIDU_CLB.*;" pattern (up to the next match, if any).

    Raises:
        IndexError: if the page text contains no "BAIDU_CLB...;" marker.
        AttributeError: if one of the expected elements is missing from the
            page (the lookup then yields None).
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()

    soup = BeautifulSoup(html)

    title = soup.find("h1").string
    # NOTE(review): the author line is read from the element whose id is
    # "pub_date" -- presumably that is how the site labels it; confirm.
    writer = soup.find(id="pub_date").string.strip()
    _from = soup.find(id="media_name").string.strip()

    # Everything before the marker is page chrome; the article body is the
    # piece that comes right after it.
    text = soup.get_text()
    main = re.split("BAIDU_CLB.*;", text)

    result = {"title": title, "writer": writer, "from": _from, "context": main[1]}
    return result

    #new=open("new.txt","w")

    #new.write(result["title"]+"\n\n")

    #new.write(result["writer"]+"  "+result["from"])

    #new.write(result["context"])

    #new.close()

def _fetchSoup(pageUrl):
    """Fetch pageUrl and return it parsed as a BeautifulSoup document."""
    response = urllib2.urlopen(pageUrl)
    try:
        html = response.read()
    finally:
        # Always release the connection, even if the read fails.
        response.close()
    return BeautifulSoup(html)


def getCatalog(issue, max_retries=3):
    """Crawl every article of one issue, grouped by column heading.

    Args:
        issue: Issue code such as "201501". Its first four and last two
            characters form the directory name "<first4>_<last2>" on the site.
        max_retries: How many times each article is attempted before it is
            given up on. Articles that still fail are skipped, so a single
            broken page can no longer hang the crawl forever.

    Returns:
        A dict mapping the string of each <h2> heading on the contents page
        to the list of article dicts (see getEachArticle) found under it.
    """
    url = "http://www.52duzhe.com/" + issue[:4] + "_" + issue[-2:] + "/"
    duzhe = dict()

    # index.html is only a stepping stone: the first link inside its first
    # table points at the page that carries the actual table of contents.
    soup = _fetchSoup(url + "index.html")
    firstUrl = url + soup.table.a.get("href")
    soup = _fetchSoup(firstUrl)

    for heading in soup.find_all("h2"):
        column = heading.string
        print(column)
        duzhe[column] = list()

        # All links that share the heading's parent belong to this column.
        for link in heading.parent.find_all("a"):
            href = url + link.get("href")
            print(href)

            article = None
            for attempt in range(max_retries):
                try:
                    article = getEachArticle(href)
                    break
                except Exception as err:
                    # Catch Exception rather than everything so that Ctrl-C
                    # (KeyboardInterrupt) still stops the crawl.
                    print("Attempt %d/%d failed for %s: %s"
                          % (attempt + 1, max_retries, href, err))

            if article is None:
                print("Skipping %s" % href)
                continue
            duzhe[column].append(article)

    return duzhe

def readDuZhe(duzhe):
    """Print the title of every article in a crawled catalog.

    Args:
        duzhe: Mapping of column name to a list of article dicts, each of
            which has a "title" entry.
    """
    for column in duzhe:
        articles = duzhe[column]
        for article in articles:
            print(article["title"])

if __name__ == "__main__":
    # Quick manual check: crawl one fixed issue and list its article titles.
    # To prompt for the issue instead, use: issue=raw_input("issue(201501):")
    catalog = getCatalog("201424")
    readDuZhe(catalog)

getpdf.py

#!/usr/bin/env python

#coding=utf-8

"""

    Author:         Anemone

    Filename:       writetopdf.py

    Last modified:  2015-02-20 19:19

    E-mail:         anemone@82flex.com

"""

#coding=utf-8

import reportlab.rl_config

from reportlab.pdfbase import pdfmetrics

from reportlab.pdfbase.ttfonts import TTFont

from reportlab.lib import fonts

import copy

from reportlab.platypus import Paragraph, SimpleDocTemplate,flowables

from reportlab.lib.styles import getSampleStyleSheet

import crawler

def writePDF(issue,duzhe):
    """Render a crawled issue into the file "duzhe<issue>.pdf".

    The document starts with a contents page (issue title, then each column
    name followed by its article titles). After a page break every article
    is laid out as title, author/source line and body paragraphs, with a
    page break after each article.

    Args:
        issue: Issue code; used in the heading and in the output file name.
        duzhe: Mapping of column name to a list of article dicts carrying
            the keys "title", "writer", "from" and "context".

    Note:
        The fonts are loaded from "simsun.ttc" and "msyh.ttc"; these files
        must be somewhere reportlab can find them.
    """
    # Imported here so the function needs no extra file-level import.
    from xml.sax.saxutils import escape

    def markup(value):
        # Paragraph parses its text as XML-like markup, so a raw "&" or "<"
        # coming from a scraped page would break the build. Escape them.
        if not value:
            return ""
        return escape(value)

    reportlab.rl_config.warnOnMissingFontGlyphs = 0
    pdfmetrics.registerFont(TTFont('song',"simsun.ttc"))
    pdfmetrics.registerFont(TTFont('hei',"msyh.ttc"))
    # Within the 'song' family, regular/italic use 'song' and the bold
    # variants are drawn with 'hei', so <b>...</b> switches typeface.
    fonts.addMapping('song', 0, 0, 'song')
    fonts.addMapping('song', 0, 1, 'song')
    fonts.addMapping('song', 1, 0, 'hei')
    fonts.addMapping('song', 1, 1, 'hei')

    stylesheet = getSampleStyleSheet()

    def makeStyle(fontSize, leading, firstLineIndent=None):
        # Every style is a private copy of 'Normal' set in the 'song' font.
        style = copy.deepcopy(stylesheet['Normal'])
        style.fontName = 'song'
        style.fontSize = fontSize
        style.leading = leading
        if firstLineIndent is not None:
            style.firstLineIndent = firstLineIndent
        return style

    normalStyle = makeStyle(11, 11, 20)
    titleStyle = makeStyle(15, 20)
    firstTitleStyle = makeStyle(20, 20, 50)
    smallStyle = makeStyle(8, 8)

    story = []

    # --- contents page ---
    story.append(Paragraph("<b>读者{0}期</b>".format(issue), firstTitleStyle))
    for eachColumn in duzhe:
        story.append(Paragraph('__'*28, titleStyle))
        story.append(Paragraph('<b>{0}</b>'.format(markup(eachColumn)), titleStyle))
        for eachArticle in duzhe[eachColumn]:
            story.append(Paragraph(markup(eachArticle["title"]), normalStyle))
    story.append(flowables.PageBreak())

    # --- articles ---
    for eachColumn in duzhe:
        for eachArticle in duzhe[eachColumn]:
            story.append(Paragraph("<b>{0}</b>".format(markup(eachArticle["title"])), titleStyle))
            story.append(Paragraph(" {0}  {1}".format(markup(eachArticle["writer"]), markup(eachArticle["from"])), smallStyle))
            # One Paragraph per line of the article text. Splitting on an
            # empty separator raises ValueError, so split on newlines.
            for eachPara in eachArticle["context"].split("\n"):
                if not eachPara.strip():
                    # Blank lines carry no text; skip them.
                    continue
                story.append(Paragraph(markup(eachPara), normalStyle))
            story.append(flowables.PageBreak())

    doc = SimpleDocTemplate("duzhe"+issue+".pdf")
    print("Writing PDF...")
    doc.build(story)

def main(issue):
    """Crawl the given issue and write it out as a PDF.

    Args:
        issue: Issue code, passed unchanged to both the crawler and the
            PDF writer.
    """
    writePDF(issue, crawler.getCatalog(issue))

if __name__ == "__main__":
    # Ask for the issue code on the console, then crawl it and build the PDF.
    issue = raw_input("Enter issue(201501):")
    main(issue)

以上就是本文的全部内容了，希望大家能够喜欢。

Python爬取读者并制作成PDF

- Author -

hebedich

声明：登载此文出于传递更多信息之目的，并不意味着赞同其观点或证实其描述。

Python 相关文章推荐

python实用代码片段收集贴

Jun 03 Python

使用Django的模版来配合字符串翻译工作

Jul 27 Python

用Python将IP地址在整型和字符串之间轻松转换

Mar 22 Python

django开发之settings.py中变量的全局引用详解

Mar 29 Python

修复 Django migration 时遇到的问题解决

Jun 14 Python

python对list中的每个元素进行某种操作的方法

Jun 29 Python

对Python 数组的切片操作详解

Jul 02 Python

详解python中递归函数

Apr 16 Python

python使用time、datetime返回工作日列表实例代码

May 09 Python

使用 PyTorch 实现 MLP 并在 MNIST 数据集上验证方式

Jan 08 Python

Python字典取键、值对的方法步骤

Sep 30 Python

python3排序的实例方法

Oct 20 Python

Python生成随机MAC地址

Mar 10 #Python

Python中实现结构相似的函数调用方法

Mar 10 #Python

Python实现CET查分的方法

Mar 10 #Python

Python实现的批量下载RFC文档

Mar 10 #Python

Python制作CSDN免积分下载器

Mar 10 #Python

Python Tkinter GUI编程入门介绍

Mar 10 #Python

Python格式化css文件的方法

Mar 10 #Python

You might like

php 采集书并合成txt格式的实现代码

2009/03/01 PHP

PHP压缩html网页代码(清除空格,换行符,制表符,注释标记)

2012/04/02 PHP

浅析php header 跳转

2013/06/17 PHP

php判断输入是否是纯数字，英文，汉字的方法

2015/03/05 PHP

PHP基于简单递归函数求一个数阶乘的方法示例

2017/04/26 PHP

Google韩国首页图标动画效果

2007/08/26 Javascript

jquery next nextAll nextUntil siblings的区别介绍

2013/10/05 Javascript

Area 区域实现post提交数据的js写法

2014/04/22 Javascript

js实现的二分查找算法实例

2016/01/21 Javascript

微信小程序 wx:for的使用实例详解

2017/04/27 Javascript

JavaScript 程序错误Cannot use 'in' operator to search的解决方法

2017/07/10 Javascript

vue2.0 自定义饼状图 (Echarts)组件的方法

2018/03/02 Javascript

webpack打包非模块化js的方法

2018/10/24 Javascript

浅谈React碰到v-if

2018/11/04 Javascript

javascript设计模式 – 职责链模式原理与用法实例分析

2020/04/16 Javascript

基于js实现判断浏览器类型代码实例

2020/07/17 Javascript

python人人网登录应用实例

2014/09/26 Python

Python while、for、生成器、列表推导等语句的执行效率测试

2015/06/03 Python

初步认识Python中的列表与位运算符

2015/10/12 Python

Python 网络爬虫--关于简单的模拟登录实例讲解

2018/06/01 Python

Linux系统（CentOS）下python2.7.10安装

2018/09/26 Python

在python中只选取列表中某一纵列的方法

2018/11/28 Python

Python3.7安装keras和TensorFlow的教程图解

2020/06/18 Python

Python OpenCV去除字母后面的杂线操作

2020/07/05 Python

python基于爬虫+django，打造个性化API接口

2021/01/21 Python

使用CSS3实现一个3D相册效果实例

2016/12/03 HTML / CSS

欧洲最大的婴幼儿服装及内衣公司：Petit Bateau（小帆船）

2016/08/16 全球购物

Android interview questions

2016/12/25 面试题

高级护理专业毕业生推荐信

2013/12/25 职场文书

县级文明单位申报材料

2014/05/23 职场文书

商铺消防安全责任书

2014/07/29 职场文书

故意杀人罪辩护词

2015/05/21 职场文书

2016廉政教育学习心得体会

2016/01/25 职场文书

好段摘抄大全（48句）

2019/08/08 职场文书

python 模块重载的五种方法

2021/04/24 Python

Oracle11g r2 卸载干净重装的详细教程(亲测有效已重装过)

2021/06/04 Oracle