python实现爬虫下载漫画示例


Posted in Python on February 16, 2014
#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
# Module-level configuration shared by the functions below.
# weburl         - address of the comic index page (argv[1])
# floder         - target folder prefix for downloaded chapters (argv[2])
# chapterbegin   - index of the first chapter to download (argv[3], default 0)
# currentthreadnum - number of download threads currently counted as running
# threadcount    - maximum number of concurrent download threads (argv[4], default 6)
weburl=''
floder=''
chapterbegin=0
currentthreadnum=0
threadcount=6

if len(sys.argv)<3:
    print("usage: downloadmanhua weburl floder chapterbegin=0 threadnum=6")
    # A missing mandatory argument is a usage error, so report failure.
    sys.exit(1)
weburl=sys.argv[1]
floder=sys.argv[2]
if len(sys.argv)>=4:
    chapterbegin=int(sys.argv[3])
if len(sys.argv)>=5:
    threadcount=int(sys.argv[4])
if threadcount<1:
    # downloadchapter waits while currentthreadnum>=threadcount, so a limit
    # below 1 would block forever before the first download starts.
    print("threadnum must be at least 1")
    sys.exit(1)
 
def jin(i,jinzhi):
    """Return the non-negative integer ``i`` written in base ``jinzhi``.

    Digit values 0-9 are rendered with ``str``; values above 9 are rendered
    as lowercase letters counting up from 'a' (10 -> 'a', 35 -> 'z').
    ``jin(0, base)`` returns ``'0'``.

    Raises:
        ValueError: if ``i`` is negative or ``jinzhi`` is smaller than 2.
    """
    if jinzhi<2:
        raise ValueError('base must be at least 2, got %r'%(jinzhi,))
    if i<0:
        raise ValueError('cannot convert negative value %r'%(i,))
    digits=[]
    while True:
        # divmod stays in exact integer arithmetic; the previous
        # int(i/jinzhi) went through float and lost precision above 2**53.
        i,remainder=divmod(i,jinzhi)
        if remainder>9:
            digits.append(chr(ord('a')+(remainder-10)))
        else:
            digits.append(str(remainder))
        if i==0:
            break
    # Digits were produced least-significant first.
    return ''.join(reversed(digits))
def urlparse(p,a,c,k):
    """Expand ``p`` by replacing single-character tokens with words from ``k``.

    A lookup table is built for the indices ``c-1`` down to ``0``: the key is
    the index written in base 36 (via ``jin``) and the value is ``k[index]``,
    or the key itself when ``k[index]`` is empty.  Every character of ``p``
    in the ranges 'a'-'f' or '0'-'9' is then replaced by its table entry; all
    other characters are copied through unchanged.

    ``a`` is accepted but never used.

    NOTE(review): the signature looks like the p,a,c,k argument list of a
    packed JavaScript snippet - confirm against the pages being scraped.

    Raises:
        KeyError: if ``p`` contains a character in 'a'-'f' or '0'-'9' that
            has no table entry.
        IndexError: if ``c`` exceeds the length of ``k``.
    """
    lookup={}
    remaining=c
    while remaining:
        remaining-=1
        word=k[remaining]
        token=jin(remaining,36)
        # An empty word means the token stands for itself.
        lookup[token]=word if word else token

    pieces=[]
    for ch in p:
        if 'a'<=ch<='f' or '0'<=ch<='9':
            pieces.append(lookup[ch])
        else:
            pieces.append(ch)
    return ''.join(pieces)
def meispower(s):
    """Extract the image path prefix from a packed script string ``s``.

    Steps, in order:
      1. take everything from the first ``}(`` to the end of that line;
      2. drop the last 19 characters of that text;
      3. split the rest on commas: field 0 is the packed body, fields 1 and
         2 are integers, field 3 (minus its first character) is a
         ``|``-separated word list;
      4. decode the body with ``urlparse`` and pull out the text following
         ``imgpath=`` up to the next ``;``, trimming two characters at each
         end of the value.

    NOTE(review): the 19-character cut presumably removes a trailing
    ``'.split('|'),0,{}))`` - confirm against a real page.

    Raises:
        IndexError: if ``}(`` or ``imgpath=`` is not found, or fewer than
            four comma-separated fields are present.
    """
    tail=re.compile(r"(?=\}\().*",re.IGNORECASE).findall(s)[0]
    tail=tail[0:len(tail)-19]
    fields=tail.split(',')
    # Skip the first character of the word-list field before splitting.
    words=fields[3][1:].split('|')
    decoded=urlparse(fields[0],int(fields[1]),int(fields[2]),words)
    assignment=re.findall('imgpath=[^;]*',decoded)[0]
    return assignment[10:len(assignment)-2]
def pictofile(weburl,filename,loop=100):
    """Download ``weburl`` into ``filename``, retrying on failure.

    Nothing is downloaded when ``filename`` already exists.  A response
    shorter than 2048 bytes is treated as a failed attempt and retried.
    Timeouts and any other exception are printed and retried as well.

    Args:
        weburl: address of the picture to fetch.
        filename: path of the file to write (binary mode).
        loop: retry budget; ``loop + 1`` attempts are made in total, and a
            negative value makes no attempt at all.

    Returns:
        None.  When the budget is exhausted a message is printed instead of
        raising.
    """
    attempts_left=loop
    # A loop replaces the former self-recursion, which nested one stack
    # frame (and one active try block) per retry.
    while attempts_left>=0:
        attempts_left=attempts_left-1
        if os.path.exists(filename):
            return
        try:
            # The context manager closes the response even when read() fails.
            with urllib.request.urlopen(weburl) as response:
                data=response.read()
            if len(data)<2048:
                continue
            print('download from %s name is %s\n'%(weburl,filename))
            with open('%s'%filename,'wb') as myfile:
                myfile.write(data)
            return
        except socket.timeout:
            print('timeout')
        except Exception as e:
            print('error',e)
    print('can\'t download the picture %s'%weburl)
def downloadpic(url,loadpicdir,num):
    """Thread worker: download one picture ``url`` into ``loadpicdir``.

    The folder is created when missing.  The file is named
    ``<num>-manhua<tail>``, where ``<tail>`` is the trailing run of
    lowercase letters, digits and dots of ``url``.  The global
    ``currentthreadnum`` is decremented under ``mutex`` when the worker
    finishes, whether or not the download succeeded.

    Args:
        url: address of the picture.
        loadpicdir: folder that receives the file.
        num: value used as the file-name prefix.
    """
    global currentthreadnum,mutex,mutex2
    try:
        try:
            # mutex2 serialises folder creation between workers.  The lock is
            # released on every path; it used to stay locked when mkdir failed.
            with mutex2:
                if not os.path.isdir(loadpicdir):
                    print("can't open the floder %s will be create"%loadpicdir)
                    os.makedirs(loadpicdir,exist_ok=True)
                    print('create floder succeed')
        except OSError:
            print("can't create floder %s"%loadpicdir)
            return
        name=re.findall(r'[0-9a-z.]*\Z',url)
        filename='manhua'+name[0]
        # The target is addressed by its full path; the working directory is
        # left alone because os.chdir affects every thread in the process.
        pictofile(url,os.path.join(loadpicdir,str(num)+'-'+filename))
    finally:
        # Always give the slot back so the scheduling loop cannot stall.
        with mutex:
            currentthreadnum=currentthreadnum-1
def downloadchapter(url,loadpicdir,num,begin=0):
    """Start download threads for pages ``begin`` .. ``num-1`` of a chapter.

    The chapter page at ``manhuaweb + url`` is fetched, its title (text
    after ``<title>`` up to the first underscore) becomes the folder name,
    and the first ``eval...`` script is decoded with ``meispower`` to get
    the picture path prefix.  One ``downloadpic`` thread is started per page,
    with at most ``threadcount`` counted as running at a time.  The function
    returns once all threads are started; it does not wait for them.

    Args:
        url: chapter path appended to the global ``manhuaweb``.
        loadpicdir: folder prefix; it is concatenated directly with the
            chapter title, so it should end with a path separator.
        num: number of pages; also passed to ``downloadpic`` as the file
            prefix.
        begin: first page index to download.
    """
    global manhuaweb,threadcount,currentthreadnum,mutex
    print(manhuaweb+url)
    # Close the HTTP response as soon as the page has been read.
    with urllib.request.urlopen(manhuaweb+url) as response:
        webdata=response.read().decode('UTF-8')
    # '<title>' is 7 characters long; keep what follows it.
    chaptername=re.findall(r'<title>[^_]*',webdata)[0][7:]
    webscrip=re.findall(r'eval.*[^<>]',webdata)
    chapterurl='http://mhimg.ali213.net'+meispower(webscrip[0])
    i=begin
    # A while loop is required so that a failed start can retry the same
    # page; decrementing the variable of a for-range loop had no effect.
    while i<num:
        while currentthreadnum>=threadcount:
            time.sleep(0.5)
        with mutex:
            currentthreadnum=currentthreadnum+1
        try:
            threading.Thread(target=downloadpic,args=(r'%s%d.jpg'%(chapterurl,i),loadpicdir+chaptername,num)).start()
        except socket.error:
            # The worker never ran, so hand its slot back and retry page i.
            with mutex:
                currentthreadnum=currentthreadnum-1
            time.sleep(0.5)
            continue
        except Exception as error:
            # Same here: the slot was reserved but no worker will release it.
            with mutex:
                currentthreadnum=currentthreadnum-1
            print(error,'break')
            print('download chapter %d of picture make a error'%i)
            break
        i=i+1
if __name__=='__main__':
    # Shared state read by the download functions: the site root, the lock
    # guarding currentthreadnum (mutex) and the lock used around folder
    # setup in downloadpic (mutex2).
    manhuaweb=r'http://manhua.ali213.net'
    socket.setdefaulttimeout(60.0)
    mutex=threading.Lock()
    mutex2=threading.Lock()

    # Fetch the comic index page and close the response right after reading.
    with urllib.request.urlopen(weburl) as webfile:
        webdata=webfile.read().decode('UTF-8')

    # Restrict the search to the chapter-list container.
    meshmode=re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
    meshdata=meshmode.findall(webdata)[0]
    # Page-count labels and chapter links, collected in document order.
    indexdata=re.findall(r'([0-9]*页)',meshdata)
    # The dot before "html" is escaped so that it only matches a literal '.'.
    picurldata=re.findall(r'/comic/[0-9/]*\.html',meshdata)

    chapterlength=len(picurldata)
    nummode=re.compile(r'[\d]+')
    # Chapters are taken from the end of the list backwards, starting at
    # offset chapterbegin.
    # NOTE(review): presumably the page lists the newest chapter first - confirm.
    for i in range(chapterbegin,chapterlength):
        position=chapterlength-i-1
        pagecount=int(nummode.findall(indexdata[position])[0])
        downloadchapter(picurldata[position],floder,pagecount)
Python 相关文章推荐
python控制台显示时钟的示例
Feb 24 Python
Python中单例模式总结
Feb 20 Python
Python实现简单的文本相似度分析操作详解
Jun 16 Python
python实现三次样条插值
Dec 17 Python
python读取图片任意范围区域
Jan 23 Python
用Anaconda安装本地python包的方法及路径问题(图文)
Jul 16 Python
python opencv调用笔记本摄像头
Aug 28 Python
Python实现大数据收集至excel的思路详解
Jan 03 Python
python 对任意数据和曲线进行拟合并求出函数表达式的三种解决方案
Feb 18 Python
python如何支持并发方法详解
Jul 25 Python
在pycharm中使用pipenv创建虚拟环境和安装django的详细教程
Nov 30 Python
python图像处理 PIL Image操作实例
Apr 09 Python
python发送邮件示例(支持中文邮件标题)
Feb 16 #Python
python定时器使用示例分享
Feb 16 #Python
python求素数示例分享
Feb 16 #Python
python检测服务器是否正常
Feb 16 #Python
java直接调用python脚本的例子
Feb 16 #Python
python根据距离和时长计算配速示例
Feb 16 #Python
python根据经纬度计算距离示例
Feb 16 #Python
You might like
ThinkPHP 404页面的设置方法
2015/01/14 PHP
PHP中file_get_contents函数抓取https地址出错的解决方法(两种方法)
2015/09/22 PHP
PHP实现登录注册之BootStrap表单功能
2017/09/03 PHP
php 实现简单的登录功能示例【基于thinkPHP框架】
2019/12/02 PHP
在一个form用一个SUBMIT(或button)分别提交到两个处理表单页面的代码
2007/02/15 Javascript
Javascript 解疑
2009/11/11 Javascript
Javascript继承机制的设计思想分享
2011/08/28 Javascript
Javascript中的this绑定介绍
2011/09/22 Javascript
javascript打印大全(打印页面设置/打印预览代码)
2013/03/29 Javascript
jquery弹出框的用法示例(2)
2013/08/26 Javascript
javascript判断office版本示例
2014/04/11 Javascript
jquery中EasyUI使用技巧小结
2015/02/10 Javascript
PassWord输入框代码分享
2016/06/07 Javascript
详解使用Vue.Js结合Jquery Ajax加载数据的两种方式
2017/01/10 Javascript
Easyui使用Dialog行内按钮布局的实例
2017/07/27 Javascript
Nodejs+angularjs结合multiparty实现多图片上传的示例代码
2017/09/29 NodeJs
Bootstrap 模态框多次显示后台提交多次BUG的解决方法
2017/12/26 Javascript
vue2.0 elementUI制作面包屑导航栏
2018/02/22 Javascript
还不懂递归?读完这篇文章保证你会懂
2018/07/29 Javascript
解决vue组件没显示,没起作用,没报错,但该显示的组件没显示问题
2020/09/02 Javascript
python多进程操作实例
2014/11/21 Python
解密Python中的描述符(descriptor)
2015/06/03 Python
简单谈谈python中的多进程
2016/11/06 Python
django js实现部分页面刷新的示例代码
2018/05/28 Python
使用pandas实现连续数据的离散化处理方式(分箱操作)
2019/11/22 Python
python/golang 删除链表中的元素
2020/09/14 Python
Pycharm添加虚拟解释器报错问题解决方案
2020/10/13 Python
护理自我鉴定范文
2013/10/06 职场文书
酒后驾驶检讨书
2014/01/27 职场文书
篮球友谊赛通讯稿
2014/10/10 职场文书
离婚财产分配协议书
2014/10/21 职场文书
2014幼儿园中班工作总结
2014/11/10 职场文书
2016大学生社会实践单位评语
2015/12/01 职场文书
python 使用Tensorflow训练BP神经网络实现鸢尾花分类
2021/05/12 Python
Django Paginator分页器的使用示例
2021/06/23 Python
Python实现Matplotlib,Seaborn动态数据图
2022/05/06 Python