python实现爬虫下载漫画示例


Posted in Python on February 16, 2014
#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
# Settings shared by the functions below. Positional command-line arguments
# override the defaults:
#   downloadmanhua weburl floder [chapterbegin] [threadnum]
# A ``global`` statement at module level has no effect, so none is used here;
# manhuaweb, mutex and mutex2 are assigned in the __main__ section.
weburl=''            # address of the comic's index page (argv[1])
floder=''            # path prefix under which chapters are stored (argv[2])
chapterbegin=0       # index of the first chapter to download (argv[3])
currentthreadnum=0   # number of download threads currently reserved
threadcount=6        # maximum number of simultaneous download threads (argv[4])

if len(sys.argv)<3:
    # sys.exit() with a string writes it to stderr and exits with status 1,
    # so a usage error is no longer reported to the shell as success.
    sys.exit("usage: downloadmanhua weburl floder chapterbegin=0 threadnum=6")
weburl=sys.argv[1]
floder=sys.argv[2]
if len(sys.argv)>=4:
    chapterbegin=int(sys.argv[3])
if len(sys.argv)>=5:
    threadcount=int(sys.argv[4])
if threadcount<1:
    # downloadchapter waits while currentthreadnum >= threadcount, which
    # would never end with a limit below one.
    sys.exit("threadnum must be at least 1")
 
def jin(i,jinzhi):
    """Return the integer ``i`` written in base ``jinzhi`` as a string.

    Digit values above 9 are rendered as lowercase letters starting at
    ``'a'``, so base 36 uses the alphabet ``0-9a-z``.

    Args:
        i: Integer to convert. A negative value yields a leading ``'-'``.
        jinzhi: Target base; must be at least 2.

    Returns:
        The textual representation; ``'0'`` for zero.

    Raises:
        ValueError: If ``jinzhi`` is smaller than 2.
    """
    if jinzhi < 2:
        raise ValueError("base must be at least 2, got %r" % (jinzhi,))
    if i < 0:
        return '-' + jin(-i, jinzhi)
    digits = []
    while True:
        # divmod stays in exact integer arithmetic; true division (i/jinzhi)
        # goes through float and corrupts values above 2**53.
        i, remainder = divmod(i, jinzhi)
        if remainder > 9:
            digits.append(chr(ord('a') + (remainder - 10)))
        else:
            digits.append(str(remainder))
        if i == 0:
            break
    # Digits were produced least-significant first.
    return "".join(reversed(digits))
def urlparse(p,a,c,k):
        d={}
        e=lambda c:     jin(c,36)
        if 1:
                while c:
                        c=c-1
                        if not k[c]:
                                d[jin(c,36)]=jin(c,36)
                        else:
                                d[jin(c,36)]=k[c]
                k=[lambda e:d[e]]
                e=lambda c:'\\w+'
                c=1
        newstr=""
        while c:
                c=c-1
                if k[c]:
                        for i in range(0,len(p)):
                                tempi=p[i]
                                tempi=ord(tempi)
                                if tempi>=ord('a') and tempi<=ord('f'):
                                        newstr+=d[chr(tempi)]
                                elif tempi>=ord('0') and tempi<=ord('9'):
                                        newstr+=d[chr(tempi)]
                                else:
                                        newstr+=chr(tempi)
        return newstr
def meispower(s):
    """Extract the ``imgpath`` value from a packed ``eval(...)`` script.

    Steps, all dictated by the fixed layout this code expects:
      1. keep the text from the first ``}(`` to the end of that line;
      2. drop its last 19 characters;
      3. split the rest on commas: field 0 is the packed text, fields 1 and
         2 are integers, field 3 (minus its first character) is a
         ``|``-separated word list;
      4. expand the packed text with ``urlparse``;
      5. take the first ``imgpath=...`` run (up to the next ``;``) and return
         it without its first 10 and last 2 characters.

    Args:
        s: Script source containing the packed call.

    Returns:
        The decoded image path fragment.

    Raises:
        IndexError: If ``s`` has no ``}(`` section, fewer than four
            comma-separated fields, or no ``imgpath=`` after decoding.
        ValueError: If field 1 or 2 is not an integer.
    """
    tail_pattern = re.compile(r"(?=\}\().*", re.IGNORECASE)
    packed_call = tail_pattern.findall(s)[0]
    # Explicit end index on purpose: for inputs shorter than 19 characters
    # this is not the same as slicing with [:-19].
    packed_call = packed_call[0:(len(packed_call) - 19)]
    fields = packed_call.split(',')
    words = fields[3][1:].split('|')
    decoded = urlparse(fields[0], int(fields[1]), int(fields[2]), words)
    assignment = re.findall('imgpath=[^;]*', decoded)[0]
    return assignment[10:(len(assignment) - 2)]
def pictofile(weburl,filename,loop=100):
    """Download ``weburl`` into ``filename``, retrying on failure.

    An attempt counts as failed when the request raises or when fewer than
    2048 bytes come back.  Nothing is downloaded when ``filename`` already
    exists.  Retries run in a loop rather than by recursing from inside the
    exception handlers, so the call stack no longer grows with every failed
    attempt, and the HTTP response and the output file are closed even when
    reading or writing raises.

    Args:
        weburl: Address of the picture.
        filename: Path of the file to create.
        loop: Retry budget; at most ``loop + 1`` attempts are made, and a
            negative value gives up immediately.

    Returns:
        None.  Failures are reported with ``print`` and never raised.
    """
    remaining = loop
    while True:
        if remaining < 0:
            print('can\'t download the picture %s'%weburl)
            return
        remaining -= 1
        if os.path.exists(filename):
            return
        try:
            with urllib.request.urlopen(weburl) as response:
                data = response.read()
            if len(data) < 2048:
                # Too small to be a real picture; try again.
                continue
            print('download from %s name is %s\n'%(weburl,filename))
            with open('%s'%filename,'wb') as myfile:
                myfile.write(data)
            return
        except socket.timeout:
            print('timeout')
        except Exception as e:
            # Best-effort downloader: report the problem and retry.
            print('error',e)
def downloadpic(url,loadpicdir,num):
    """Download one picture into ``loadpicdir`` and release a thread slot.

    Meant to run as a worker-thread target: whatever happens, the global
    ``currentthreadnum`` counter is decremented under ``mutex`` before the
    function exits, so the caller's slot accounting cannot leak.

    The target directory is created (including missing parents) under
    ``mutex2``.  The working directory of the process is deliberately left
    alone: changing it affects every thread and breaks relative
    ``loadpicdir`` values.

    Args:
        url: Address of the picture.
        loadpicdir: Directory that receives the file.
        num: Number used as the ``<num>-`` prefix of the stored file name.

    Raises:
        SystemExit: If the directory cannot be created.
    """
    global currentthreadnum,mutex,mutex2
    try:
        with mutex2:
            if not os.path.isdir(loadpicdir):
                print("can't open the floder %s will be create"%loadpicdir)
                try:
                    os.makedirs(loadpicdir, exist_ok=True)
                except OSError:
                    print("can't create floder %s"%loadpicdir)
                    # Raised directly so the function does not depend on the
                    # site-provided quit() helper.
                    raise SystemExit(0)
                print('create floder succeed')
        # Trailing run of [0-9a-z.] characters of the URL, e.g. "3.jpg".
        name=re.findall(r'[0-9a-z.]*\Z',url)
        filename='manhua'+name[0]
        pictofile(url,os.path.join(loadpicdir,str(num)+'-'+filename))
    finally:
        with mutex:
            currentthreadnum=currentthreadnum-1
def downloadchapter(url,loadpicdir,num,begin=0):
    """Start download threads for the pictures of one chapter.

    The chapter page at ``manhuaweb + url`` is fetched, its title (the text
    after ``<title>`` up to the first ``_``) becomes the sub-directory name,
    and the packed script on the page is decoded with ``meispower`` to get
    the picture base address.  One ``downloadpic`` thread is started for each
    picture index in ``range(begin, num)``, never running more than
    ``threadcount`` at a time.

    Args:
        url: Chapter path, appended to the global ``manhuaweb``.
        loadpicdir: Path prefix; the chapter title is appended to it.
        num: Number of pictures; also passed on to ``downloadpic``.
        begin: Index of the first picture to fetch.

    Raises:
        IndexError: If the page has no ``<title>`` or no ``eval`` script.
        urllib.error.URLError: If the chapter page cannot be fetched.
    """
    global manhuaweb,threadcount,currentthreadnum,mutex
    print(manhuaweb+url)
    # The context manager closes the HTTP response even if read() raises.
    with urllib.request.urlopen(manhuaweb+url) as response:
        webdata=response.read().decode('UTF-8')
    chaptername=re.findall(r'<title>[^_]*',webdata)[0]
    chaptername=chaptername[7:]     # drop the leading "<title>"
    webscrip=re.findall(r'eval.*[^<>]',webdata)
    chapterurl='http://mhimg.ali213.net'+meispower(webscrip[0])

    # A while loop (not for/range) so that a failed launch can really be
    # retried: rebinding the loop variable of a for loop has no effect.
    i=begin
    while i<num:
        slot_reserved=False
        try:
            # Wait for a free worker slot.
            while currentthreadnum>=threadcount:
                time.sleep(0.5)
            with mutex:
                currentthreadnum=currentthreadnum+1
            slot_reserved=True
            threading.Thread(target=downloadpic,args=(r'%s%d.jpg'%(chapterurl,i),loadpicdir+chaptername,num)).start()
        except socket.error:
            # Give the slot back and retry the same picture.
            if slot_reserved:
                with mutex:
                    currentthreadnum=currentthreadnum-1
            continue
        except Exception as error:
            # The worker never started, so nobody else will free its slot.
            if slot_reserved:
                with mutex:
                    currentthreadnum=currentthreadnum-1
            print(error,'break')
            print('download chapter %d of picture make a error'%i)
            break
        i=i+1
if __name__=='__main__':
    # Globals read by downloadchapter / downloadpic.
    manhuaweb=r'http://manhua.ali213.net'
    socket.setdefaulttimeout(60.0)
    mutex=threading.Lock()    # guards currentthreadnum
    mutex2=threading.Lock()   # guards directory creation

    # Fetch the comic's index page; the context manager closes the HTTP
    # response even if read() raises.
    with urllib.request.urlopen(weburl) as webfile:
        webdata=webfile.read().decode('UTF-8')

    # Chapter list section of the page.
    meshmode=re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
    meshdata=meshmode.findall(webdata)[0]
    # Page-count labels ("<digits>页") and chapter links found in that section.
    indexmode=re.compile(r'([0-9]*页)')
    indexdata=indexmode.findall(meshdata)
    picurlmode=re.compile(r'/comic/[0-9/]*.html')
    picurldata=picurlmode.findall(meshdata)

    chapterlength=len(picurldata)
    nummode=re.compile(r'[\d]+')
    # Chapter i is taken from the END of the lists, i.e. chapter 0 is the
    # last link on the page (presumably the oldest chapter -- confirm
    # against the site).
    for i in range(chapterbegin,chapterlength):
        position=chapterlength-i-1
        manhuachapter=picurldata[position]
        pagecount=int(nummode.findall(indexdata[position])[0])
        downloadchapter(manhuachapter,floder,pagecount)
Python 相关文章推荐
Python中字符编码简介、方法及使用建议
Jan 08 Python
Python求解平方根的方法
Mar 11 Python
python通过socket查询whois的方法
Jul 18 Python
python中使用序列的方法
Aug 03 Python
利用Python循环(包括while&for)各种打印九九乘法表的实例
Nov 06 Python
python 筛选数据集中列中value长度大于20的数据集方法
Jun 14 Python
python获取中文字符串长度的方法
Nov 14 Python
python学生管理系统
Jan 30 Python
python实现简单俄罗斯方块
Mar 13 Python
浅谈python3打包与拆包在函数的应用详解
May 02 Python
Tensorflow与Keras自适应使用显存方式
Jun 22 Python
Python实现仓库管理系统
May 30 Python
python发送邮件示例(支持中文邮件标题)
Feb 16 #Python
python定时器使用示例分享
Feb 16 #Python
python求素数示例分享
Feb 16 #Python
python检测服务器是否正常
Feb 16 #Python
java直接调用python脚本的例子
Feb 16 #Python
python根据距离和时长计算配速示例
Feb 16 #Python
python根据经纬度计算距离示例
Feb 16 #Python
You might like
傻瓜化配置PHP环境――Appserv
2006/12/13 PHP
PHP实现在线阅读PDF文件的方法
2015/06/17 PHP
小议Function.apply()之二------利用Apply的参数数组化来提高 JavaScript程序性能
2006/11/30 Javascript
Javascript 构造函数,公有,私有特权和静态成员定义方法
2009/11/30 Javascript
javascript中RegExp保留小数点后几位数的方法分享
2013/08/13 Javascript
jquery事件重复绑定的快速解决方法
2014/01/03 Javascript
jQuery实现响应鼠标滚动的动感菜单效果
2015/09/21 Javascript
JS文件/图片从电脑里面拖拽到浏览器上传文件/图片
2017/03/08 Javascript
微信小程序 中wx.chooseAddress(OBJECT)实例详解
2017/03/31 Javascript
JS基于正则实现数字千分位用逗号分隔的方法
2017/06/16 Javascript
JS实现多张图片预览同步上传功能
2017/06/23 Javascript
webpack多页面开发实践
2017/12/18 Javascript
vue-cli webpack2项目打包优化分享
2018/02/07 Javascript
详解Vue-cli中的静态资源管理(src/assets和static/的区别)
2018/06/19 Javascript
Node.js在图片模板上生成二维码图片并附带底部文字说明实现详解
2019/08/07 Javascript
vue父子组件的通信方法(实例详解)
2019/11/10 Javascript
原生js实现随机点名
2020/07/05 Javascript
谈谈node.js中的模块系统
2020/09/01 Javascript
[01:14]2014DOTA2展望TI 剑指西雅图newbee战队专访
2014/06/30 DOTA
解析Python中的eval()、exec()及其相关函数
2017/12/20 Python
Python3 XML 获取雅虎天气的实现方法
2018/02/01 Python
取numpy数组的某几行某几列方法
2018/04/03 Python
Ubuntu下使用python读取doc和docx文档的内容方法
2018/05/08 Python
django2+uwsgi+nginx上线部署到服务器Ubuntu16.04
2018/06/26 Python
python3发送邮件需要经过代理服务器的示例代码
2019/07/25 Python
python中字典按键或键值排序的实现代码
2019/08/27 Python
Python字典添加,删除,查询等相关操作方法详解
2020/02/07 Python
印度在线内衣和时尚目的地:Zivame
2017/09/28 全球购物
小学运动会入场式解说词
2014/02/18 职场文书
母婴店促销方案
2014/03/05 职场文书
总经理工作职责范文
2014/03/14 职场文书
2015年九一八事变纪念日演讲稿
2015/03/19 职场文书
材料员岗位职责范本
2015/04/11 职场文书
学困生转化工作总结
2015/08/13 职场文书
医院中层管理人员培训心得体会
2016/01/11 职场文书
正确的理解和使用Django信号(Signals)
2021/04/14 Python