python实现爬虫下载漫画示例


Posted in Python on February 16, 2014
#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
# Command-line configuration shared by the functions below.
# usage: downloadmanhua weburl floder [chapterbegin] [threadnum]
weburl = ''            # page that lists the chapters (argv[1])
floder = ''            # prefix prepended to each chapter name to form its folder (argv[2])
chapterbegin = 0       # first chapter to fetch, counted from the end of the page's list (argv[3])
currentthreadnum = 0   # number of download threads currently holding a slot
threadcount = 6        # maximum number of simultaneous download threads (argv[4])

if len(sys.argv) < 3:
    # sys.exit(<str>) writes the text to stderr and exits with status 1,
    # so a bad invocation is reported as a failure to the calling shell.
    sys.exit("usage: downloadmanhua weburl floder chapterbegin=0 threadnum=6")
weburl = sys.argv[1]
floder = sys.argv[2]
try:
    if len(sys.argv) >= 4:
        chapterbegin = int(sys.argv[3])
    if len(sys.argv) >= 5:
        threadcount = int(sys.argv[4])
except ValueError:
    sys.exit("chapterbegin and threadnum must be integers")
if threadcount < 1:
    # downloadchapter waits while currentthreadnum >= threadcount; with a
    # limit below 1 that wait would never end.
    sys.exit("threadnum must be at least 1")
 
def jin(i,jinzhi):
    """Return the non-negative integer ``i`` written in base ``jinzhi``.

    Digit values 0-9 are written as '0'-'9'; larger digit values are written
    as consecutive characters starting at 'a' (so base 36 uses 0-9 and a-z).

    Args:
        i: non-negative integer to convert.
        jinzhi: the base, an integer >= 2.

    Returns:
        The digits as a string, most significant first ('0' for zero).

    Raises:
        ValueError: if ``i`` is negative or ``jinzhi`` is smaller than 2.
    """
    if jinzhi < 2:
        raise ValueError("base must be at least 2, got %r" % (jinzhi,))
    if i < 0:
        raise ValueError("cannot convert a negative number: %r" % (i,))
    digits = []
    while True:
        # divmod keeps the arithmetic in integers; true division (i / jinzhi)
        # goes through float and loses precision for large values.
        i, remainder = divmod(i, jinzhi)
        if remainder > 9:
            digits.append(chr(ord('a') + (remainder - 10)))
        else:
            digits.append(str(remainder))
        if i == 0:
            break
    # Digits were produced least significant first.
    return ''.join(reversed(digits))
def urlparse(p,a,c,k):
    """Substitute single-character tokens in ``p`` using the word list ``k``.

    A lookup table is built for the indexes ``c-1`` down to ``0``: the key is
    the index written in base 36 (via ``jin``) and the value is ``k[index]``,
    or the key itself when ``k[index]`` is empty. Every character of ``p``
    in '0'-'9' or 'a'-'f' is then replaced by its table entry; all other
    characters are copied unchanged.

    NOTE(review): the (p, a, c, k) signature looks like the argument list of
    a JavaScript "packer" payload -- confirm. Only one-character tokens in
    0-9/a-f are substituted here.

    Args:
        p: text to decode.
        a: unused.
        c: number of entries of ``k`` to load into the table.
        k: sequence of replacement words, indexed by token value.

    Returns:
        The decoded string.

    Raises:
        KeyError: if ``p`` contains a 0-9/a-f character with no table entry.
    """
    table = {}
    while c:
        c = c - 1
        word = k[c]
        token = jin(c, 36)
        # An empty word means the token stands for itself.
        table[token] = word if word else token

    pieces = []
    for ch in p:
        if 'a' <= ch <= 'f' or '0' <= ch <= '9':
            pieces.append(table[ch])
        else:
            pieces.append(ch)
    return ''.join(pieces)
def meispower(s):
    """Extract the image path from the packed script text ``s``.

    Steps, all driven by fixed offsets:
      1. take everything from the first "}(" to the end of that line;
      2. drop the last 19 characters and split the rest on ',';
      3. field 0 is the packed text, fields 1 and 2 are integers, and
         field 3 (minus its first character) is a '|'-separated word list;
      4. decode with ``urlparse`` and locate "imgpath=..." up to the next ';';
      5. return that match without its first 10 and last 2 characters.

    NOTE(review): the offsets 19, 10 and 2 presumably strip the trailing
    call syntax and the quoting around the value on the target site --
    confirm against a real page.

    Raises:
        IndexError: if "}(" or "imgpath=" is not found, or fewer than four
            comma-separated fields are present.
        ValueError: if field 1 or 2 is not an integer.
    """
    tail_pattern = re.compile(r"(?=\}\().*", re.IGNORECASE)
    packed = tail_pattern.findall(s)[0]
    packed = packed[0:(len(packed) - 19)]

    fields = packed.split(',')
    words = fields[3][1:].split('|')
    decoded = urlparse(fields[0], int(fields[1]), int(fields[2]), words)

    assignment = re.findall('imgpath=[^;]*', decoded)[0]
    return assignment[10:(len(assignment) - 2)]
def pictofile(weburl,filename,loop=100):
    """Download ``weburl`` and save it as ``filename``, retrying on failure.

    Nothing is downloaded when ``filename`` already exists. A response
    shorter than 2048 bytes is treated as a failed attempt and retried.
    Timeouts and any other exception are printed and retried as well.

    Args:
        weburl: URL of the picture.
        filename: path of the file to create.
        loop: number of retries after the first attempt (``loop + 1``
            attempts in total); a negative value gives up immediately.

    Returns:
        None. Success and failure are only reported through printed messages.
    """
    if loop < 0:
        print('can\'t download the picture %s' % weburl)
        return
    # A loop (instead of self-recursion from inside the except handlers)
    # keeps the stack flat however many retries are needed.
    for _ in range(loop + 1):
        # Re-checked before every attempt, so a file that appeared in the
        # meantime stops the retries.
        if os.path.exists(filename):
            return
        try:
            # The with-blocks close the response and the file even when
            # read() or write() raises.
            with urllib.request.urlopen(weburl) as response:
                data = response.read()
            if len(data) < 2048:
                continue
            print('download from %s name is %s\n' % (weburl, filename))
            with open(filename, 'wb') as picture:
                picture.write(data)
            return
        except socket.timeout:
            print('timeout')
        except Exception as e:
            # Deliberately broad: any failure just costs one attempt.
            print('error', e)
    print('can\'t download the picture %s' % weburl)
def downloadpic(url,loadpicdir,num):
    """Thread worker: download one picture into ``loadpicdir``.

    The folder is created if it does not exist yet. The picture is stored as
    "<num>-manhua<tail>", where <tail> is the trailing run of characters from
    0-9, a-z and '.' at the end of ``url``. The thread slot counted in
    ``currentthreadnum`` is always released, whatever happens.

    Args:
        url: URL of the picture.
        loadpicdir: folder the picture is written to.
        num: value used as the file-name prefix.
    """
    global currentthreadnum,mutex,mutex2
    try:
        # mutex2 serialises folder creation between worker threads. The
        # folder is created in place rather than entered with os.chdir: the
        # working directory is shared by every thread, and changing it makes
        # a relative ``loadpicdir`` resolve inside itself afterwards.
        with mutex2:
            if not os.path.isdir(loadpicdir):
                print("can't open the floder %s will be create"%loadpicdir)
                try:
                    os.makedirs(loadpicdir, exist_ok=True)
                except OSError:
                    print("can't create floder %s"%loadpicdir)
                    return
                print('create floder succeed')
        name = re.findall(r'[0-9a-z.]*\Z', url)
        filename = 'manhua' + name[0]
        pictofile(url, os.path.join(loadpicdir, str(num) + '-' + filename))
    finally:
        # Release the slot on every exit path; otherwise the loop that
        # starts workers would wait forever for a free slot.
        with mutex:
            currentthreadnum = currentthreadnum - 1
def downloadchapter(url,loadpicdir,num,begin=0):
    """Download the pictures ``begin`` .. ``num - 1`` of one chapter.

    The chapter page at ``manhuaweb + url`` is fetched, the chapter name is
    read from its <title> (text up to the first '_'), and the picture path is
    decoded from the first "eval..." script with ``meispower``. One
    ``downloadpic`` thread is started per picture, never more than
    ``threadcount`` at a time.

    Args:
        url: chapter path, appended to ``manhuaweb``.
        loadpicdir: folder prefix; the chapter name is appended to it.
        num: one past the last picture index; also handed to ``downloadpic``
            as its file-name prefix.
        begin: first picture index.

    Raises:
        IndexError: if the page has no <title> or no "eval" script.
    """
    global manhuaweb,threadcount,currentthreadnum,mutex
    print(manhuaweb+url)
    # The with-block closes the HTTP response as soon as it has been read.
    with urllib.request.urlopen(manhuaweb+url) as response:
        webdata = response.read().decode('UTF-8')
    # [7:] drops the leading "<title>" (7 characters).
    chaptername = re.findall(r'<title>[^_]*',webdata)[0][7:]
    webscrip = re.findall(r'eval.*[^<>]',webdata)
    chapterurl = 'http://mhimg.ali213.net' + meispower(webscrip[0])
    for i in range(begin,num):
        slot_taken = False
        try:
            # Wait until a worker finishes and frees a slot.
            while currentthreadnum >= threadcount:
                time.sleep(0.5)
            with mutex:
                currentthreadnum = currentthreadnum + 1
            slot_taken = True
            threading.Thread(target=downloadpic,args=(r'%s%d.jpg'%(chapterurl,i),loadpicdir+chaptername,num)).start()
        except socket.error:
            # This picture is skipped (a for loop cannot repeat an index);
            # hand back the slot the worker would have released.
            if slot_taken:
                with mutex:
                    currentthreadnum = currentthreadnum - 1
        except Exception as error:
            # The worker never started, so nobody else will free its slot.
            if slot_taken:
                with mutex:
                    currentthreadnum = currentthreadnum - 1
            print(error,'break')
            print('download chapter %d of picture make a error'%i)
            break
if __name__=='__main__':
    # Entry point: read the chapter list from ``weburl`` and download every
    # chapter from ``chapterbegin`` on.
    manhuaweb=r'http://manhua.ali213.net'
    socket.setdefaulttimeout(60.0)
    mutex=threading.Lock()    # guards currentthreadnum
    mutex2=threading.Lock()   # serialises folder handling in downloadpic

    # The with-block closes the HTTP response as soon as it has been read.
    with urllib.request.urlopen(weburl) as webfile:
        webdata = webfile.read().decode('UTF-8')
    meshmatches = re.findall(r'<div class="detail_body_right_sec_con">.*</div>', webdata)
    if not meshmatches:
        sys.exit("can't find the chapter list in %s" % weburl)
    meshdata = meshmatches[0]
    # One "<digits>页" label and one /comic/... link are expected per
    # chapter, in the same order.
    indexdata = re.findall(r'([0-9]*页)', meshdata)
    picurldata = re.findall(r'/comic/[0-9/]*.html', meshdata)

    chapterlength = len(picurldata)
    nummode = re.compile(r'[\d]+')
    for i in range(chapterbegin, chapterlength):
        # Walk the list from its end, so chapter 0 is the last link on the page.
        position = chapterlength - i - 1
        manhuachapter = picurldata[position]
        downloadchapter(manhuachapter, floder, int(nummode.findall(indexdata[position])[0]))
Python 相关文章推荐
python实现将html表格转换成CSV文件的方法
Jun 28 Python
Python写入CSV文件的方法
Jul 08 Python
Python爬虫辅助利器PyQuery模块的安装使用攻略
Apr 24 Python
python实现的二叉树定义与遍历算法实例
Jun 30 Python
详细分析python3的reduce函数
Dec 05 Python
Python get获取页面cookie代码实例
Sep 12 Python
python处理自动化任务之同时批量修改word里面的内容的方法
Aug 23 Python
使用virtualenv创建Python环境及PyQT5环境配置的方法
Sep 10 Python
python numpy之np.random的随机数函数使用介绍
Oct 06 Python
Django+uni-app实现数据通信中的请求跨域的示例代码
Oct 12 Python
pytorch 图像中的数据预处理和批标准化实例
Jan 15 Python
Python 窗体(tkinter)下拉列表框(Combobox)实例
Mar 04 Python
python发送邮件示例(支持中文邮件标题)
Feb 16 #Python
python定时器使用示例分享
Feb 16 #Python
python求素数示例分享
Feb 16 #Python
python检测服务器是否正常
Feb 16 #Python
java直接调用python脚本的例子
Feb 16 #Python
python根据距离和时长计算配速示例
Feb 16 #Python
python根据经纬度计算距离示例
Feb 16 #Python
You might like
特转载一高手总结PHP学习资源和链接.
2006/12/05 PHP
php读取csv文件并输出的方法
2015/03/14 PHP
Mac环境下php操作mysql数据库的方法分享
2015/05/11 PHP
laravel 5.4中实现无限级分类的方法示例
2017/07/27 PHP
PhpStorm的使用教程(本地运行PHP+远程开发+快捷键)
2020/03/26 PHP
捕获关闭窗口的脚本
2009/01/10 Javascript
javascript 运算数的求值顺序
2011/08/23 Javascript
使用jquery实现IE下按backspace相当于返回操作
2014/03/18 Javascript
jQuery动画出现连续触发、滞后反复执行的解决方法
2015/01/28 Javascript
js实现创建删除html元素小结
2015/09/30 Javascript
简单谈谈node.js 版本控制 nvm和 n
2015/10/15 Javascript
解决node.js安装包失败的几种方法
2016/09/02 Javascript
jquery checkbox的相关操作总结
2016/10/17 Javascript
jquery广告无缝轮播实例
2017/01/05 Javascript
bootstrap fileinput 上传插件的基础使用
2017/02/17 Javascript
你应该知道的几类npm依赖包管理详解
2017/10/06 Javascript
javascript json对象小技巧之键名作为变量用法分析
2019/11/11 Javascript
Electron实现应用打包、自动升级过程解析
2020/07/07 Javascript
Python中实现的RC4算法
2015/02/14 Python
Windows系统配置python脚本开机启动的3种方法分享
2015/03/10 Python
Python中的getopt函数使用详解
2015/07/28 Python
python实现可以断点续传和并发的ftp程序
2016/09/13 Python
利用Python将图片中扭曲矩形的复原
2020/09/07 Python
Pycharm 解决自动格式化冲突的设置操作
2021/01/15 Python
python matplotlib工具栏源码探析一之禁用工具栏、默认工具栏和工具栏管理器三种模式的差异
2021/02/25 Python
html5 迷宫游戏(碰撞检测)实例一
2013/07/25 HTML / CSS
老公给老婆的检讨书(精华篇)
2014/10/18 职场文书
暑假安全保证书
2015/02/28 职场文书
高考1977观后感
2015/06/04 职场文书
会计做账心得体会
2016/01/22 职场文书
农村房屋租赁合同(范本)
2019/07/23 职场文书
Python - 10行代码集2000张美女图
2021/05/23 Python
Golang 实现WebSockets
2022/04/24 Golang
Java设计模式中的命令模式
2022/04/28 Java/Android
python神经网络Xception模型
2022/05/06 Python
一文解答什么是MySQL的回表
2022/08/05 MySQL