编程 Python

python读取html中指定元素生成excel文件示例

Posted in Python on April 03, 2014

使用 Python 2.7 编写的示例：读取 html 中的指定元素，并生成 excel 文件

#coding=gbk
import string
import codecs
import os,time
import xlwt
import xlrd
from bs4 import BeautifulSoup 
from xlrd import open_workbook
class LogMsg:
    """Small wrapper that logs to a file through the root logger.

    Creating an instance attaches a ``logging.FileHandler`` for *logfile* to
    the root logger and sets the root logger's level.  ``output`` then emits
    a message at the logger's current effective level, and ``close`` detaches
    the handler and closes the underlying file.

    Every method prints a short message and terminates the process with
    exit status 1 if the logging call raises.
    """

    # Numeric values of logging.DEBUG, INFO, WARNING, ERROR and CRITICAL.
    _KNOWN_LEVELS = (10, 20, 30, 40, 50)

    def __init__(self, logfile, Level=0):
        """Attach a file handler for *logfile* and set the logger level.

        Args:
            logfile: Path of the log file handed to ``logging.FileHandler``.
            Level: One of 10, 20, 30, 40 or 50; any other value selects
                ``logging.NOTSET``.
        """
        try:
            # Imported here because the module does not import logging at
            # file level.
            import logging
            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            self.hdlr.setFormatter(
                logging.Formatter("[%(asctime)s]: %(message)s",
                                  "%Y%m%d %H:%M:%S"))
            self.logger.addHandler(self.hdlr)
            chosen = next(
                (known for known in self._KNOWN_LEVELS if Level == known),
                logging.NOTSET)
            self.logger.setLevel(chosen)
        except Exception:
            # Exception (not a bare except) so that SystemExit and
            # KeyboardInterrupt are not swallowed.
            print("log init error!")
            raise SystemExit(1)

    def output(self, logInfo):
        """Log *logInfo* at the logger's effective level.

        When the effective level is not one of the five standard levels the
        message is logged with ``info``.
        """
        Level = self.logger.getEffectiveLevel()
        try:
            if Level in self._KNOWN_LEVELS:
                # Same as calling debug/info/warning/error/critical for the
                # matching level.
                self.logger.log(Level, logInfo)
            else:
                self.logger.info(logInfo)
        except Exception:
            print("log output error!")
            raise SystemExit(1)

    def close(self):
        """Detach the file handler from the logger and close its file."""
        try:
            self.logger.removeHandler(self.hdlr)
            # removeHandler alone leaves the log file open; close it so the
            # file descriptor is released.
            self.hdlr.close()
        except Exception:
            print("log closed error!")
            raise SystemExit(1)
# Start-up timestamps: second resolution for the workbook name, day
# resolution for the log file name.  time.strftime formats the current
# local time when no time tuple is passed.
Logtime = time.strftime("%Y%m%d%H%M%S")
logFileTime = time.strftime("%Y%m%d")

# One log file per day; messages are logged at level 20 (INFO).
Logfile = '/data/pyExample/logs/htmlparser_%s.log' % logFileTime
log = LogMsg(Logfile, Level=20)

# Directory that is scanned for .html files and receives the workbook.
DATAPATH = '/data/pyExample/' 
# Workbook file name, unique per run thanks to the second-resolution stamp.
XLSname = ''.join(('dangjian_', Logtime, '.xls'))

# Column titles of the import template, written to row 0 in this order.
_HEADER = (
    '内容类型 ',
    '栏目名称',
    '栏目编号',
    '内容名称',
    '时长',
    '关键字',
    '看点',
    '作者',
    '来源',
    '子内容1',
    '子内容2',
)

# _build_row reads cell indices up to 36, so at least 37 cells are required.
_MIN_CELLS = 37


def _read_td_cells(path):
    """Return the GBK-encoded text of every <td> element in the file at *path*.

    The file is decoded as GBK and parsed line by line.
    """
    # The with-block closes the file even if reading or decoding raises.
    with codecs.open(path, 'r', 'gbk') as htmlFile:
        lines = htmlFile.readlines()
    if not lines:
        log.output ('not line')
    cells = []
    for line in lines:
        # strip() also removes the newline, so a blank line is an empty
        # string here; comparing against '\n' could never match.
        if not line.strip():
            log.output('该处是空行')
            continue
        # NOTE(review): removing every space also turns '<td class=...>' into
        # '<tdclass=...>'; kept as-is because the fixed cell indices used in
        # _build_row were presumably chosen with this behaviour - confirm.
        soup = BeautifulSoup(line.replace(' ',''))
        for tdd in soup.findAll('td'):
            cells.append(tdd.text.encode("gbk"))
    return cells


def _build_row(cells):
    """Map the extracted cells onto one worksheet row (a list of 11 values).

    Returns None when fewer than 37 cells are available or when cell 16
    contains no digit, because no row can be built in those cases.
    """
    if len(cells) < _MIN_CELLS:
        return None
    digits = ''.join(ch for ch in cells[16] if ch.isdigit())
    if not digits:
        return None
    # The digits of cell 16 are multiplied by 60 before being written.
    str_duration = "%i" % (int(digits) * 60)
    folderName = cells[6]
    contentName = cells[4]
    keyWord = cells[6]
    desciption = cells[36]
    videoName_1 = cells[10]
    return ['', folderName, '', contentName, str_duration, keyWord,
            desciption, '管理员', '华数编辑', videoName_1, '']


if __name__ == '__main__':

    wbk = xlwt.Workbook(encoding = 'gbk')
    sheet = wbk.add_sheet('基本内容导入模板')
    for col, title in enumerate(_HEADER):
        sheet.write(0, col, title)

    k = 0  # number of data rows written so far
    for f in os.listdir(DATAPATH):
        if os.path.splitext(f)[1] != '.html':
            continue
        log.output('当前文件：'+f)
        content = _read_td_cells(DATAPATH+f)

        # enumerate gives the real position of each cell; list.index would
        # report the first occurrence for repeated texts and rescan the list.
        for idx, cell in enumerate(content):
            print('%d , %s' % (idx, cell))
            log.output(cell)
            log.output(idx)
        print('----------------------------------------')

        row = _build_row(content)
        if row is None:
            # Skip this file instead of aborting the run, so the rows already
            # collected are still saved.
            log.output('skip ' + f + ': expected at least 37 <td> cells '
                       'and digits in cell 16')
            continue

        # Folder, content name, duration, keyword, description, video name.
        for value in (row[1], row[3], row[4], row[5], row[6], row[9]):
            print(value)
        log.output('输出xls数据：' + ','.join(row) + ',')
        print(k)
        for col, value in enumerate(row):
            sheet.write(k+1, col, value)
        k += 1

    wbk.save(DATAPATH + XLSname)
    print('=========================================')

声明：登载此文出于传递更多信息之目的，并不意味着赞同其观点或证实其描述。

Python 相关文章推荐

python复制文件的方法实例详解

May 22 Python

在Django的视图(View)外使用Session的方法

Jul 23 Python

Python爬取三国演义的实现方法

Sep 12 Python

Python实现求两个csv文件交集的方法

Sep 06 Python

Python操作Oracle数据库的简单方法和封装类实例

May 07 Python

python 实现登录网页的操作方法

May 11 Python

Python生态圈图像格式转换问题(推荐)

Dec 02 Python

python range实例用法分享

Feb 06 Python

简单介绍一下pyinstaller打包以及安全性的实现

Jun 02 Python

Python创建文件夹与文件的快捷方法

Dec 08 Python

python中str内置函数用法总结

Dec 27 Python

利于python脚本编写可视化nmap和masscan的方法

Dec 29 Python

python实现zencart产品数据导入到magento(python导入数据)

Apr 03 #Python

python模拟登陆阿里妈妈生成商品推广链接

Apr 03 #Python

python多线程抓取天涯帖子内容示例

Apr 03 #Python

python局域网ip扫描示例分享

Apr 03 #Python

python实现数通设备tftp备份配置文件示例

Apr 02 #Python

python实现巡检系统(solaris)示例

Apr 02 #Python

python实现apache网站日志分析示例

Apr 02 #Python