python支持断点续传的多线程下载示例


Posted in Python on January 16, 2014
#! /usr/bin/env python
#coding=utf-8
from __future__ import unicode_literals
from multiprocessing.dummy import Pool as ThreadPool
import threading
import os
import sys
import cPickle
from collections import namedtuple
import urllib2
from urlparse import urlsplit
import time

# One module-wide lock, taken by the download workers around every file
# write / block-offset update and by the monitor while it reads the offsets.
lock = threading.Lock()

# Values used when the caller does not override a download parameter.
defaults = {
    'thread_count': 10,          # size of the worker thread pool
    'buffer_size': 10 * 1024,    # bytes asked for per read() of a response
    'block_size': 1000 * 1024,   # bytes of the file assigned to one block
}

def progress(percent, width=50):
    """Draw a one-line text progress bar on standard output.

    Every call ends the line with a carriage return so that the next call
    overwrites it; once ``percent`` reaches 100 a newline is written as well
    so later output starts on a fresh line.

    Args:
        percent: Completion percentage; the run of ``=`` is scaled from it.
        width: Width, in characters, of the bar area.
    """
    # Floor division keeps the repeat count an integer on Python 2 and 3.
    filled = int(width * percent // 100)
    bar = ('%%-%ds' % width) % ('=' * filled)
    # Writing to sys.stdout directly emits exactly these characters (no
    # implicit newline) and is valid syntax on both Python 2 and Python 3.
    sys.stdout.write("%s %d%%\r" % (bar, percent))
    if percent >= 100:
        sys.stdout.write("\n")
    # Flush on every update, not just the final one: with a buffered stdout
    # the bar would otherwise stay invisible until the download is over.
    sys.stdout.flush()

def write_data(filepath, data):
    """Pickle ``data`` into the file at ``filepath``, replacing its contents.

    Args:
        filepath: Path of the file to (over)write.
        data: Any picklable object.
    """
    with open(filepath, 'wb') as stream:
        pickler = cPickle.Pickler(stream)
        pickler.dump(data)

def read_data(filepath):
    """Return the object stored as a pickle in the file at ``filepath``.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    this program wrote itself.

    Args:
        filepath: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    with open(filepath, 'rb') as stream:
        restored = cPickle.Unpickler(stream).load()
    return restored

# Snapshot of the remote file's identity. It is saved together with the block
# list so that a later run can tell whether a partial download still matches.
FileInfo = namedtuple('FileInfo', 'url name size lastmodified')

def get_file_info(url):
    """Send a HEAD request to ``url`` and describe the remote file.

    Args:
        url: Address of the file to inspect.

    Returns:
        FileInfo: ``url`` as given; ``name`` from the ``Content-Disposition``
        filename when one is present, otherwise the last path segment of the
        URL; ``size`` from ``Content-Length`` (0 when the header is missing);
        ``lastmodified`` from ``Last-Modified`` ('' when missing).

    Raises:
        urllib2.URLError: If the request fails.
        ValueError: If ``Content-Length`` is not an integer.
    """
    class HeadRequest(urllib2.Request):
        def get_method(self):
            return "HEAD"

    res = urllib2.urlopen(HeadRequest(url))
    try:
        res.read()
        # Look headers up on the message object itself: its lookup is
        # case-insensitive, which a plain dict copy does not guarantee.
        headers = res.headers
        size = int(headers.get('content-length', 0))
        lastmodified = headers.get('last-modified', '')
        disposition = headers.get('content-disposition', '') or ''
    finally:
        # Always release the connection, even when a header is malformed.
        res.close()

    name = ''
    marker = 'filename='
    if marker in disposition:
        value = disposition.split(marker, 1)[1].strip()
        if value[:1] in ('"', "'"):
            # Quoted form: take everything up to the matching closing quote,
            # so parameters that follow the filename are not swallowed.
            closing = value.find(value[0], 1)
            name = value[1:closing] if closing != -1 else value[1:]
        else:
            # Bare token: it ends at the next parameter separator.
            name = value.split(';', 1)[0].strip()
        # The header is server-controlled; keep only the final path component
        # so it cannot direct the download outside the target directory.
        name = os.path.basename(name.replace('\\', '/'))
    if not name:
        # No usable filename in the headers: derive one from the URL path.
        name = os.path.basename(urlsplit(url)[2])
    return FileInfo(url, name, size, lastmodified)

def _split_blocks(size, block_size):
    """Return ``[start, offset, end]`` blocks (``end`` inclusive) covering ``size`` bytes."""
    if block_size > size:
        return [[0, 0, size - 1]]
    block_count, remain = divmod(size, block_size)
    blocks = [[i * block_size, i * block_size, (i + 1) * block_size - 1]
              for i in range(block_count)]
    # The last block also takes the bytes left over by the division.
    blocks[-1][-1] += remain
    return blocks


def _block_pending(block, size):
    """Tell whether ``block`` still has bytes to fetch for a file of ``size`` bytes."""
    # Clamp to the last valid byte so that a block whose end was recorded as
    # ``size`` (one past the last byte) is judged correctly too.
    return block[1] <= min(block[2], size - 1)


def download(url, output,
        thread_count = defaults['thread_count'],
        buffer_size = defaults['buffer_size'],
        block_size = defaults['block_size']):
    """Download ``url`` with several threads, resuming a previous attempt if possible.

    The file is split into blocks ``[start, offset, end]``; each worker thread
    fetches one block with an HTTP Range request and advances the block's
    ``offset``. Data is written to ``<output>.ing`` and the block list is
    kept in ``<output>.inf``; both are used to resume an interrupted run and
    are removed/renamed once the download is complete.

    Args:
        url: Address of the file to download.
        output: Destination path, or None to use the name reported by the
            server.
        thread_count: Maximum number of worker threads.
        buffer_size: Bytes requested per read from a response.
        block_size: Bytes of the file assigned to one block.

    Raises:
        ValueError: If the server does not report a positive file size.
        IOError: If some block is still incomplete after all workers ended;
            the partial data and resume info are kept for the next run.
    """
    # get latest file info
    file_info = get_file_info(url)
    if file_info.size <= 0:
        # Without a size there is nothing to split into ranges.
        raise ValueError('cannot download %s: the server reported no file size' % url)
    # init path
    if output is None:
        output = file_info.name
    workpath = '%s.ing' % output
    infopath = '%s.inf' % output

    blocks = []
    # Resume only when both the saved block list and the partial data exist
    # and the remote file is still the one the block list was made for.
    if os.path.exists(infopath) and os.path.exists(workpath):
        saved_info, blocks = read_data(infopath)
        if (saved_info.url != url or
                saved_info.name != file_info.name or
                saved_info.size != file_info.size or
                saved_info.lastmodified != file_info.lastmodified):
            blocks = []
    if not blocks:
        blocks = _split_blocks(file_info.size, block_size)
        # create new blank workpath
        open(workpath, 'wb').close()
    print('Downloading %s' % url)

    # start monitor; as a daemon it cannot keep the process alive forever
    # when a worker fails and 100% is never reached
    monitor = threading.Thread(target=_monitor, args=(infopath, file_info, blocks))
    monitor.daemon = True
    monitor.start()

    # start downloading
    try:
        with open(workpath, 'rb+') as fobj:
            tasks = [(url, block, fobj, buffer_size)
                     for block in blocks if _block_pending(block, file_info.size)]
            if tasks:
                pool = ThreadPool(max(1, min(thread_count, len(tasks))))
                try:
                    pool.map(_worker, tasks)
                finally:
                    # Wait for every worker before the file is closed.
                    pool.close()
                    pool.join()
    finally:
        unfinished = any(_block_pending(block, file_info.size) for block in blocks)
        if unfinished:
            # Save the exact offsets reached so the next run resumes from
            # here instead of from the monitor's last periodic snapshot.
            with lock:
                write_data(infopath, (file_info, blocks))
    if unfinished:
        # Checked before renaming so an incomplete file is never published.
        raise IOError('download of %s is incomplete; run again to resume' % url)

    # Give the monitor time to draw the final 100% line; it polls every
    # couple of seconds, so this wait is short and bounded.
    monitor.join(5)
    # rename workpath to output
    if os.path.exists(output):
        os.remove(output)
    os.rename(workpath, output)
    # delete infopath
    if os.path.exists(infopath):
        os.remove(infopath)

def _worker(task):
    """Fetch the unfinished part of one block and write it into the output file.

    Args:
        task: Tuple ``(url, block, fobj, buffer_size)`` where ``block`` is the
            mutable list ``[start, offset, end]`` (its ``offset`` is advanced
            as data arrives), ``fobj`` is the output file object shared by
            all workers, and ``buffer_size`` is the number of bytes requested
            per read.

    Raises:
        IOError: If the server did not answer with a partial response for a
            block that does not start at byte 0.
    """
    url, block, fobj, buffer_size = task
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%s-%s' % (block[1], block[2])
    res = urllib2.urlopen(req)
    try:
        # A server that ignores Range sends the file from its first byte;
        # such data is only usable for a block that starts at offset 0.
        if res.getcode() != 206 and block[1] != 0:
            raise IOError('%s does not support range requests' % url)
        # Never write past the end of this block, whatever the server sends.
        remaining = block[2] - block[1] + 1
        while remaining > 0:
            chunk = res.read(min(buffer_size, remaining))
            if not chunk:
                break
            # The file position is shared by all workers, so seek + write +
            # offset update must happen as one step under the lock.
            with lock:
                fobj.seek(block[1])
                fobj.write(chunk)
                block[1] += len(chunk)
            remaining -= len(chunk)
    finally:
        res.close()

def _monitor(infopath, file_info, blocks):
    """Report progress and save the resume state until the download is done.

    Every two seconds this computes the percentage of bytes fetched so far,
    redraws the progress bar and, while below 100%, stores
    ``(file_info, blocks)`` in ``infopath``.

    Args:
        infopath: Path of the file that receives the resume state.
        file_info: Description of the remote file; its ``size`` is the total
            byte count.
        blocks: List of ``[start, offset, end]`` lists that the workers
            update while downloading.
    """
    while 1:
        # Hold the lock so the offsets cannot change while they are summed
        # and pickled.
        with lock:
            fetched = sum(block[1] - block[0] for block in blocks)
            if file_info.size > 0:
                # Floor division gives an integer percentage on Python 2 and 3.
                percent = fetched * 100 // file_info.size
            else:
                # Nothing to fetch for an empty file; avoids dividing by zero.
                percent = 100
            progress(percent)
            if percent >= 100:
                break
            write_data(infopath, (file_info, blocks))
        time.sleep(2)

if __name__ == '__main__':
    import argparse

    # Command-line front end: one positional URL plus optional overrides for
    # the output path and the tunable download parameters.
    parser = argparse.ArgumentParser(description='Download file by multi-threads.')
    parser.add_argument('url', type=str, help='url of the download file')
    parser.add_argument('-o', type=str, default=None, dest="output", help='output file')
    for flag, dest, text in (
            ('-t', 'thread_count', 'thread counts to downloading'),
            ('-b', 'buffer_size', 'buffer size'),
            ('-s', 'block_size', 'block size')):
        parser.add_argument(flag, type=int, default=defaults[dest], dest=dest, help=text)

    # When started without any argument, fall back to a built-in sample URL.
    argv = sys.argv[1:] or ['https://eyes.nasa.gov/eyesproduct/EYES/os/win']
    args = parser.parse_args(argv)

    start_time = time.time()
    download(args.url, args.output,
             thread_count=args.thread_count,
             buffer_size=args.buffer_size,
             block_size=args.block_size)
    elapsed = int(time.time() - start_time)
    print('times: %ds' % elapsed)
Python 相关文章推荐
python字符串加密解密的三种方法分享(base64 win32com)
Jan 19 Python
Python解析最简单的验证码
Jan 07 Python
Python守护进程和脚本单例运行详解
Jan 06 Python
对python读取zip压缩文件里面的csv数据实例详解
Feb 08 Python
Python函数中参数是传递值还是引用详解
Jul 02 Python
python多线程与多进程及其区别详解
Aug 08 Python
Python进程,多进程,获取进程id,给子进程传递参数操作示例
Oct 11 Python
Python 闭包,函数分隔作用域,nonlocal声明非局部变量操作示例
Oct 14 Python
python绘制玫瑰的实现代码
Mar 02 Python
解决python中显示图片的plt.imshow plt.show()内存泄漏问题
Apr 24 Python
pytorch显存一直变大的解决方案
Apr 08 Python
pytorch中的torch.nn.Conv2d()函数图文详解
Feb 28 Python
python获得图片base64编码示例
Jan 16 #Python
python练习程序批量修改文件名
Jan 16 #Python
python使用urllib模块开发的多线程豆瓣小站mp3下载器
Jan 16 #Python
python使用urllib模块和pyquery实现阿里巴巴排名查询
Jan 16 #Python
python3.3教程之模拟百度登陆代码分享
Jan 16 #Python
python解析发往本机的数据包示例 (解析数据包)
Jan 16 #Python
python多线程扫描端口示例
Jan 16 #Python
You might like
GD输出汉字的函数的分析
2006/10/09 PHP
支持数组的ADDSLASHES的php函数
2010/02/16 PHP
利用中国天气预报接口实现简单天气预报
2014/01/20 PHP
PHP实现模仿socket请求返回页面的方法
2014/11/04 PHP
PHP将HTML转换成文本的实现代码
2015/01/21 PHP
全面解读PHP的Yii框架中的日志功能
2016/03/17 PHP
多广告投放代码 推荐
2006/11/13 Javascript
fancybox1.3.1 基于Jquery的插件在IE中图片显示问题
2010/10/01 Javascript
理解Javascript_08_函数对象
2010/10/15 Javascript
jquery表单验证框架提供的身份证验证方法(示例代码)
2013/12/27 Javascript
基于jQuery实现的图片切换焦点图整理
2014/12/07 Javascript
不想让浏览器运行javascript脚本的方法
2015/11/20 Javascript
jQuery 3.0十大新特性最终版发布
2016/07/14 Javascript
BootStrap Table 分页后重新搜索问题的解决办法
2016/08/08 Javascript
vue组件实例解析
2017/01/10 Javascript
jquery表单插件form使用方法详解
2017/01/20 Javascript
基于JavaScript实现一个简单的Vue
2018/09/26 Javascript
webpack4+react多页面架构的实现
2018/10/25 Javascript
Vue 事件处理操作实例详解
2019/03/05 Javascript
vuex + keep-alive实现tab标签页面缓存功能
2019/10/17 Javascript
IDEA配置jQuery, $符号不再显示黄色波浪线的问题
2020/10/09 jQuery
Python实现提取文章摘要的方法
2015/04/21 Python
在Django的session中使用User对象的方法
2015/07/23 Python
python traceback捕获并打印异常的方法
2018/08/31 Python
python实现名片管理系统
2018/11/29 Python
pandas中read_csv的缺失值处理方式
2019/12/19 Python
pytorch torch.nn.AdaptiveAvgPool2d()自适应平均池化函数详解
2020/01/03 Python
python+selenium 脚本实现每天自动登记的思路详解
2020/03/11 Python
Python socket连接中的粘包、精确传输问题实例分析
2020/03/24 Python
Python基于jieba, wordcloud库生成中文词云
2020/05/13 Python
纯css3无js实现的Android Logo(有简单动画)
2013/01/21 HTML / CSS
欧洲顶级的童装奢侈品购物网站:Bambini Fashion(面向全球)
2018/04/24 全球购物
采购部主管岗位职责
2014/01/01 职场文书
师德个人剖析材料
2014/02/02 职场文书
2015年成本会计工作总结
2015/10/14 职场文书
python文本处理的方案(结巴分词并去除符号)
2021/05/26 Python