编程 Python

python支持断点续传的多线程下载示例

Posted in Python on January 16, 2014

#! /usr/bin/env python
#coding=utf-8
from __future__ import unicode_literals
from multiprocessing.dummy import Pool as ThreadPool
import threading
import os
import sys
import cPickle
from collections import namedtuple
import urllib2
from urlparse import urlsplit
import time

# Module-wide mutex: the download workers take it around every seek/write on
# the shared output file, and the monitor takes it while reading block offsets.
lock = threading.Lock()

# Settings used when the caller (or the command line) does not override them.
defaults = dict(
    thread_count=10,          # number of worker threads
    buffer_size=10 * 1024,    # bytes requested per read() call
    block_size=1000 * 1024,   # bytes covered by one download block
)

def progress(percent, width=50):
    """Redraw a single-line text progress bar on standard output.

    The line consists of a bar of ``=`` characters left-justified in a field
    of ``width`` columns, a space, the integer percentage and a carriage
    return, so the next call overwrites it.  Once ``percent`` reaches 100 a
    newline is emitted so later output starts on a fresh line.

    :param percent: completion percentage; ints and floats are accepted.
    :param width: width of the bar field in characters.
    """
    # int() keeps the repeat count valid even when percent is a float.
    bar = '=' * int(width * percent // 100)
    sys.stdout.write('%-*s %d%%\r' % (width, bar, percent))
    if percent >= 100:
        sys.stdout.write('\n')
    # Flush on every call: the line ends in '\r', not '\n', so a line-buffered
    # stdout would otherwise hold intermediate updates back until completion.
    sys.stdout.flush()

def write_data(filepath, data):
    """Pickle ``data`` into ``filepath``, replacing whatever the file held.

    :param filepath: path of the file to (re)create; opened in binary mode.
    :param data: any picklable object.
    """
    with open(filepath, 'wb') as sink:
        cPickle.Pickler(sink).dump(data)

def read_data(filepath):
    """Return the object pickled in ``filepath``.

    NOTE: unpickling can execute arbitrary code, so only point this at state
    files this program wrote itself.

    :param filepath: path of a file containing one pickled object.
    :returns: the unpickled object.
    """
    with open(filepath, 'rb') as source:
        loaded = cPickle.Unpickler(source).load()
    return loaded

# Metadata of the remote file: the requested url, the local file name to use,
# the size in bytes (0 when the server sends no Content-Length) and the raw
# Last-Modified header value ('' when absent).
FileInfo = namedtuple('FileInfo', 'url name size lastmodified')

def _filename_from_disposition(disposition):
    """Return the bare file name from a Content-Disposition value, or ''."""
    marker = 'filename='
    start = disposition.lower().find(marker)
    if start < 0:
        # No plain filename parameter (e.g. "inline" or only "filename*=").
        return ''
    value = disposition[start + len(marker):].strip()
    if value[:1] in ('"', "'"):
        # Quoted form: take everything up to the matching closing quote.
        closing = value.find(value[0], 1)
        value = value[1:closing] if closing > 0 else value[1:]
    else:
        # Token form: stop at the next parameter separator.
        value = value.split(';', 1)[0].strip()
    # The header is server-controlled; drop any directory part so it cannot
    # steer the output outside the current directory.
    return os.path.basename(value.replace('\\', '/'))

def get_file_info(url):
    """Issue a HEAD request for ``url`` and describe the remote file.

    The name comes from the ``filename`` parameter of the Content-Disposition
    header when one is present and non-empty, otherwise from the last path
    segment of ``url`` (which may be empty for URLs ending in ``/``).

    :param url: address of the file to inspect.
    :returns: a ``FileInfo(url, name, size, lastmodified)`` tuple.
    :raises: whatever ``urllib2.urlopen`` raises for an unreachable URL, and
        ``ValueError`` if Content-Length is not an integer.
    """
    class HeadRequest(urllib2.Request):
        def get_method(self):
            return "HEAD"
    res = urllib2.urlopen(HeadRequest(url))
    try:
        # Header lookups on the message object are case-insensitive.
        headers = res.headers
        size = int(headers.get('content-length', 0))
        lastmodified = headers.get('last-modified', '')
        disposition = headers.get('content-disposition', '')
    finally:
        res.close()
    name = _filename_from_disposition(disposition) if disposition else ''
    if not name:
        name = os.path.basename(urlsplit(url)[2])
    return FileInfo(url, name, size, lastmodified)

def _split_blocks(size, block_size):
    """Cut ``size`` bytes into ``[start, offset, end]`` blocks, ``end`` inclusive."""
    if block_size <= 0 or block_size >= size:
        return [[0, 0, size - 1]]
    count = size // block_size
    blocks = [[i * block_size, i * block_size, (i + 1) * block_size - 1]
              for i in range(count)]
    # The last block absorbs the remainder so the blocks tile the whole file.
    blocks[-1][2] = size - 1
    return blocks

def _load_blocks(infopath, workpath, file_info):
    """Return the saved block table if it still matches ``file_info``, else []."""
    if not (os.path.exists(infopath) and os.path.exists(workpath)):
        return []
    try:
        saved_info, blocks = read_data(infopath)
    except Exception:
        # A truncated or corrupt state file (e.g. the process was killed
        # mid-write) must not block every later run; start over instead.
        return []
    # Compares url, name, size and lastmodified in one go.
    if tuple(saved_info) != tuple(file_info):
        return []
    # Older state files stored an exclusive end for single-block downloads;
    # clamp so every block uses the inclusive convention.
    last_byte = file_info.size - 1
    for block in blocks:
        block[2] = min(block[2], last_byte)
    return blocks

def download(url, output,
        thread_count = defaults['thread_count'],
        buffer_size = defaults['buffer_size'],
        block_size = defaults['block_size']):
    """Download ``url`` with several threads, resuming an earlier attempt if possible.

    Data is written to ``<output>.ing`` and the per-block progress is
    checkpointed in ``<output>.inf``.  When every block is complete the work
    file is renamed to ``output`` and the state file is removed.

    :param url: address of the file to fetch.
    :param output: destination path; ``None`` means the name reported by
        ``get_file_info``.
    :param thread_count: upper bound on the number of worker threads.
    :param buffer_size: bytes requested per read by each worker.
    :param block_size: bytes per block; the last block also takes the remainder.
    :raises ValueError: if the server reports no (or a zero) content length.
    :raises IOError: if some block is still incomplete after the workers finish;
        the work and state files are kept so a later call can resume.
    """
    # get latest file info
    file_info = get_file_info(url)
    if file_info.size <= 0:
        # Ranged requests need the total length up front; without it the block
        # table is empty and the pool and monitor would fail obscurely.
        raise ValueError('cannot download %s: no content length reported' % url)
    # init path
    if output is None:
        output = file_info.name
    workpath = '%s.ing' % output
    infopath = '%s.inf' % output
    # Each block is a list [start, offset, end] with an inclusive end; a worker
    # thread downloads the part from offset to end and advances the offset.
    blocks = _load_blocks(infopath, workpath, file_info)
    if not blocks:
        blocks = _split_blocks(file_info.size, block_size)
        # create new blank workpath
        with open(workpath, 'wb'):
            pass
    print('Downloading %s' % url)
    # Daemon thread: if a worker fails, the monitor must not keep the process
    # alive forever waiting for 100%.
    monitor = threading.Thread(target=_monitor, args=(infopath, file_info, blocks))
    monitor.daemon = True
    monitor.start()
    # start downloading
    with open(workpath, 'rb+') as fobj:
        pending = [(url, block, fobj, buffer_size)
                   for block in blocks if block[1] <= block[2]]
        if pending:
            pool = ThreadPool(max(1, min(thread_count, len(pending))))
            try:
                pool.map(_worker, pending)
            finally:
                # Wait for in-flight workers before fobj is closed, even when
                # map() re-raises the first worker error.
                pool.close()
                pool.join()

    if not all(block[1] > block[2] for block in blocks):
        # Checkpoint the final offsets so the next run resumes from here.
        with lock:
            write_data(infopath, (file_info, blocks))
        raise IOError('download of %s is incomplete; run again to resume' % url)
    # Every block is done, so the monitor reports 100% and exits on its next
    # wake-up; wait for it so it cannot touch the state file after cleanup.
    monitor.join()
    # rename workpath to output
    if os.path.exists(output):
        os.remove(output)
    os.rename(workpath, output)
    # delete infopath
    if os.path.exists(infopath):
        os.remove(infopath)

def _worker(task):
    """Download one block and write it into the shared output file.

    :param task: a ``(url, block, fobj, buffer_size)`` tuple, passed as a
        single argument because ``Pool.map`` hands each item over whole.
        ``block`` is a mutable ``[start, offset, end]`` list whose offset is
        advanced as data arrives; ``fobj`` is the output file shared by all
        workers.
    :raises IOError: if the server answers a mid-file Range request with a
        full (200) response.
    """
    # Unpack explicitly: tuple parameters in a def are Python 2 only syntax.
    url, block, fobj, buffer_size = task
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%s-%s' % (block[1], block[2])
    res = urllib2.urlopen(req)
    try:
        if block[1] > 0 and res.getcode() == 200:
            # A 200 body starts at byte 0; writing it at this block's offset
            # would corrupt the file.
            raise IOError('server ignored the Range request for %s' % url)
        while 1:
            # Never write past the block's end, even if the server sends more
            # than was asked for.
            remaining = block[2] - block[1] + 1
            if remaining <= 0:
                break
            chunk = res.read(min(buffer_size, remaining))
            if not chunk:
                break
            # The file position is shared, so seek+write+offset update must
            # happen as one step with respect to the other threads.
            with lock:
                fobj.seek(block[1])
                fobj.write(chunk)
                block[1] += len(chunk)
    finally:
        res.close()

def _monitor(infopath, file_info, blocks):
    """Report progress and checkpoint the block table every two seconds.

    Runs until the downloaded byte count reaches ``file_info.size``.  While
    the download is unfinished, each pass pickles ``(file_info, blocks)`` to
    ``infopath``.

    :param infopath: path of the state file to write.
    :param file_info: ``FileInfo`` of the file being downloaded.
    :param blocks: shared list of ``[start, offset, end]`` blocks that the
        workers update in place.
    """
    total = file_info.size
    if total <= 0:
        # Size 0 is what get_file_info reports when Content-Length is missing;
        # no percentage can be computed, so there is nothing to monitor.
        return
    while 1:
        # The state file is written under the lock, in the same critical
        # section as the percentage check, so a checkpoint can never land
        # after the last worker has written its final chunk.
        with lock:
            done = sum(block[1] - block[0] for block in blocks)
            percent = done * 100 // total
            progress(percent)
            if percent >= 100:
                break
            write_data(infopath, (file_info, blocks))
        time.sleep(2)

if __name__ == '__main__':
    import argparse

    # Command-line front end: one positional URL plus optional overrides for
    # the output path and the three tuning values in ``defaults``.
    cli = argparse.ArgumentParser(description='Download file by multi-threads.')
    cli.add_argument('url', type=str, help='url of the download file')
    cli.add_argument('-o', dest="output", type=str, default=None,
                     help='output file')
    cli.add_argument('-t', dest="thread_count", type=int,
                     default=defaults['thread_count'],
                     help='thread counts to downloading')
    cli.add_argument('-b', dest="buffer_size", type=int,
                     default=defaults['buffer_size'], help='buffer size')
    cli.add_argument('-s', dest="block_size", type=int,
                     default=defaults['block_size'], help='block size')

    # Invoked with no arguments at all: fall back to a built-in sample URL.
    cli_argv = sys.argv[1:] or ['https://eyes.nasa.gov/eyesproduct/EYES/os/win']
    options = cli.parse_args(cli_argv)

    began = time.time()
    download(options.url, options.output, options.thread_count,
             options.buffer_size, options.block_size)
    # Report the wall-clock duration in whole seconds.
    print('times: %ds' % int(time.time() - began))

声明：登载此文出于传递更多信息之目的，并不意味着赞同其观点或证实其描述。

Python 相关文章推荐

python字符串加密解密的三种方法分享(base64 win32com)

Jan 19 Python

Python解析最简单的验证码

Jan 07 Python

Python守护进程和脚本单例运行详解

Jan 06 Python

对python读取zip压缩文件里面的csv数据实例详解

Feb 08 Python

Python函数中参数是传递值还是引用详解

Jul 02 Python

python多线程与多进程及其区别详解

Aug 08 Python

Python进程，多进程，获取进程id，给子进程传递参数操作示例

Oct 11 Python

Python 闭包，函数分隔作用域，nonlocal声明非局部变量操作示例

Oct 14 Python

python绘制玫瑰的实现代码

Mar 02 Python

解决python中显示图片的plt.imshow plt.show()内存泄漏问题

Apr 24 Python

pytorch显存一直变大的解决方案

Apr 08 Python

pytorch中的torch.nn.Conv2d()函数图文详解

Feb 28 Python

python获得图片base64编码示例

Jan 16 #Python

python练习程序批量修改文件名

Jan 16 #Python

python使用urllib模块开发的多线程豆瓣小站mp3下载器

Jan 16 #Python

python使用urllib模块和pyquery实现阿里巴巴排名查询

Jan 16 #Python

python3.3教程之模拟百度登陆代码分享

Jan 16 #Python

python解析发往本机的数据包示例 (解析数据包)

Jan 16 #Python

python多线程扫描端口示例

Jan 16 #Python

You might like

GD输出汉字的函数的分析

2006/10/09 PHP

支持数组的ADDSLASHES的php函数

2010/02/16 PHP

利用中国天气预报接口实现简单天气预报

2014/01/20 PHP

PHP实现模仿socket请求返回页面的方法

2014/11/04 PHP

PHP将HTML转换成文本的实现代码

2015/01/21 PHP

全面解读PHP的Yii框架中的日志功能

2016/03/17 PHP

多广告投放代码推荐

2006/11/13 Javascript

fancybox1.3.1 基于Jquery的插件在IE中图片显示问题

2010/10/01 Javascript

理解Javascript_08_函数对象

2010/10/15 Javascript

jquery表单验证框架提供的身份证验证方法(示例代码)

2013/12/27 Javascript

基于jQuery实现的图片切换焦点图整理

2014/12/07 Javascript

不想让浏览器运行javascript脚本的方法

2015/11/20 Javascript

jQuery 3.0十大新特性最终版发布

2016/07/14 Javascript

BootStrap Table 分页后重新搜索问题的解决办法

2016/08/08 Javascript

vue组件实例解析

2017/01/10 Javascript

jquery表单插件form使用方法详解

2017/01/20 Javascript

基于JavaScript实现一个简单的Vue

2018/09/26 Javascript

webpack4+react多页面架构的实现

2018/10/25 Javascript

Vue 事件处理操作实例详解

2019/03/05 Javascript

vuex + keep-alive实现tab标签页面缓存功能

2019/10/17 Javascript

IDEA配置jQuery, $符号不再显示黄色波浪线的问题

2020/10/09 jQuery

Python实现提取文章摘要的方法

2015/04/21 Python

在Django的session中使用User对象的方法

2015/07/23 Python

python traceback捕获并打印异常的方法

2018/08/31 Python

python实现名片管理系统

2018/11/29 Python

pandas中read_csv的缺失值处理方式

2019/12/19 Python

pytorch torch.nn.AdaptiveAvgPool2d()自适应平均池化函数详解

2020/01/03 Python

python+selenium 脚本实现每天自动登记的思路详解

2020/03/11 Python

Python socket连接中的粘包、精确传输问题实例分析

2020/03/24 Python

Python基于jieba, wordcloud库生成中文词云

2020/05/13 Python

纯css3无js实现的Android Logo(有简单动画)

2013/01/21 HTML / CSS

欧洲顶级的童装奢侈品购物网站：Bambini Fashion（面向全球）

2018/04/24 全球购物

采购部主管岗位职责

2014/01/01 职场文书

师德个人剖析材料

2014/02/02 职场文书

2015年成本会计工作总结

2015/10/14 职场文书

python文本处理的方案(结巴分词并去除符号)

2021/05/26 Python