python支持断点续传的多线程下载示例


Posted in Python on January 16, 2014
#! /usr/bin/env python
#coding=utf-8
from __future__ import unicode_literals
from multiprocessing.dummy import Pool as ThreadPool
import threading
import os
import sys
import cPickle
from collections import namedtuple
import urllib2
from urlparse import urlsplit
import time

# Module-wide mutex: the download workers and the progress monitor both
# take it before touching the shared block list / output file.
lock = threading.Lock()

# Values used for any tuning option the caller does not supply.
# buffer_size and block_size are byte counts.
defaults = dict(
    thread_count=10,
    buffer_size=10 * 1024,
    block_size=1000 * 1024,
)

def progress(percent, width=50):
    """Draw a one-line text progress bar on stdout.

    Each call ends with a carriage return so the next call overwrites the
    same line; once ``percent`` reaches 100 a newline is emitted so later
    output starts on a fresh line.

    Args:
        percent: Completion percentage. Values above 100 are printed as-is
            in the numeric field, but the bar never grows past ``width``.
        width: Number of character cells reserved for the bar.
    """
    # Clamp so an overshooting percentage cannot push the bar past its field.
    filled = max(0, min(width, int(width * percent // 100)))
    sys.stdout.write('%s %d%%\r' % (('=' * filled).ljust(width), percent))
    if percent >= 100:
        sys.stdout.write('\n')
    # Flush on every call: the line carries no newline, so a line-buffered
    # terminal would otherwise show nothing until the download completes.
    sys.stdout.flush()

def write_data(filepath, data):
    """Pickle ``data`` into ``filepath``, replacing any existing content.

    Args:
        filepath: Path of the file to (re)create.
        data: Any picklable object.
    """
    with open(filepath, 'wb') as sink:
        sink.write(cPickle.dumps(data))

def read_data(filepath):
    """Return the object that was pickled into ``filepath``.

    NOTE: unpickling can execute arbitrary code, so this must only be
    pointed at files this program wrote itself.

    Args:
        filepath: Path of a file containing a single pickle.

    Returns:
        The unpickled object.
    """
    with open(filepath, 'rb') as source:
        return cPickle.loads(source.read())

# Metadata describing the remote file, taken from the server's response.
FileInfo = namedtuple('FileInfo', 'url name size lastmodified')


def _filename_from_disposition(disposition):
    """Return the ``filename`` value of a Content-Disposition header, or None."""
    marker = 'filename='
    start = disposition.lower().find(marker)
    if start < 0:
        return None
    value = disposition[start + len(marker):].strip()
    if value[:1] in ('"', "'"):
        # Quoted form: keep everything up to the matching closing quote.
        closing = value.find(value[0], 1)
        value = value[1:closing] if closing > 0 else value[1:]
    else:
        # Bare token: it ends at the next parameter separator.
        value = value.split(';', 1)[0].strip()
    # The header is server-controlled; drop any directory part so it cannot
    # steer the download outside the working directory.
    return os.path.basename(value) or None


def get_file_info(url):
    """Fetch name, size and modification stamp of ``url`` via a HEAD request.

    The file name comes from the ``Content-Disposition`` header when it
    carries a usable ``filename`` parameter, otherwise from the last path
    segment of ``url`` (which is empty when the path ends with ``/``).

    Args:
        url: Address of the resource to inspect.

    Returns:
        FileInfo: ``url`` as given, the derived ``name``, ``size`` from
        ``Content-Length`` (0 when the header is absent) and
        ``lastmodified`` from ``Last-Modified`` ('' when absent).

    Raises:
        ValueError: if ``Content-Length`` is present but not an integer.
        Whatever ``urllib2.urlopen`` raises for an unreachable or failing URL.
    """
    class HeadRequest(urllib2.Request):
        """Request variant that asks for headers only."""
        def get_method(self):
            return "HEAD"

    res = urllib2.urlopen(HeadRequest(url))
    try:
        # Header lookup on the message object is case-insensitive.
        headers = res.headers
        size = int(headers.get('content-length', 0))
        lastmodified = headers.get('last-modified', '')
        disposition = headers.get('content-disposition')
    finally:
        res.close()
    name = _filename_from_disposition(disposition) if disposition else None
    if not name:
        name = os.path.basename(urlsplit(url)[2])
    return FileInfo(url, name, size, lastmodified)

def _split_blocks(size, block_size):
    """Cut ``size`` bytes into ``[start, offset, end]`` blocks, ``end`` inclusive."""
    count = max(1, size // block_size)
    blocks = [[i * block_size, i * block_size, (i + 1) * block_size - 1]
              for i in range(count)]
    # The last block absorbs the remainder so the blocks cover 0..size-1.
    blocks[-1][2] = size - 1
    return blocks


def download(url, output,
        thread_count = defaults['thread_count'],
        buffer_size = defaults['buffer_size'],
        block_size = defaults['block_size']):
    """Download ``url`` to ``output`` with several threads, resuming if possible.

    Data is written to ``<output>.ing`` while per-block progress is kept in
    ``<output>.inf``. When both files exist and the saved file info still
    matches the server's, only the unfinished blocks are fetched. On success
    the work file is renamed to ``output`` and the info file is removed; on
    failure both are left in place so a later call can resume.

    Args:
        url: Address of the file to fetch.
        output: Destination path; ``None`` means the server-reported name.
        thread_count: Upper bound on concurrent download threads.
        buffer_size: Bytes requested per read by each worker.
        block_size: Target size in bytes of each independently fetched block.

    Raises:
        ValueError: if ``block_size`` is not positive or the server reports
            no (or zero) content length.
        IOError: if some block is still unfinished after all workers stop.
        Any exception raised by a worker is propagated.
    """
    if block_size <= 0:
        raise ValueError('block_size must be a positive number of bytes')
    # get latest file info
    file_info = get_file_info(url)
    if file_info.size <= 0:
        # Without a size the file cannot be cut into ranges.
        raise ValueError('server did not report a usable size for %s' % url)
    # init path
    if output is None:
        output = file_info.name
    workpath = '%s.ing' % output
    infopath = '%s.inf' % output

    # Every block is a list [start, offset, end]: a worker fetches bytes
    # offset..end (inclusive) and advances offset as data arrives.
    blocks = []
    if os.path.exists(infopath) and os.path.exists(workpath):
        try:
            saved_info, saved_blocks = read_data(infopath)
        except Exception:
            # A truncated or otherwise unreadable state file only means the
            # previous progress is lost; start over instead of failing.
            saved_info, saved_blocks = None, []
        if saved_info is not None and tuple(saved_info) == tuple(file_info):
            blocks = saved_blocks
            for block in blocks:
                # State written by older runs used size (not size - 1) as the
                # end of a lone block; normalise it to the inclusive form.
                block[2] = min(block[2], file_info.size - 1)
    if not blocks:
        blocks = _split_blocks(file_info.size, block_size)
        # create new blank workpath
        with open(workpath, 'wb'):
            pass
    print('Downloading %s' % url)

    # The monitor is a daemon so a failed download cannot keep the process
    # alive; it has no stop signal and ends only once progress reaches 100%.
    monitor = threading.Thread(target=_monitor, args=(infopath, file_info, blocks))
    monitor.daemon = True
    monitor.start()

    with open(workpath, 'rb+') as fobj:
        pending = [(url, block, fobj, buffer_size)
                   for block in blocks if block[1] <= block[2]]
        # Nothing is pending when a previous run finished every block but
        # stopped before the final rename.
        if pending:
            pool = ThreadPool(max(1, min(thread_count, len(pending))))
            try:
                pool.map(_worker, pending)
            finally:
                # Let running workers finish before the file is closed, then
                # persist the final offsets so a failed run can be resumed.
                pool.close()
                pool.join()
                with lock:
                    write_data(infopath, (file_info, blocks))

    unfinished = [block for block in blocks if block[1] <= block[2]]
    if unfinished:
        raise IOError('download incomplete: %d block(s) unfinished, '
                      'run again to resume' % len(unfinished))
    # Every block is complete, so the monitor reports 100% and exits.
    monitor.join()

    # rename workpath to output
    if os.path.exists(output):
        os.remove(output)
    os.rename(workpath, output)
    # delete infopath
    if os.path.exists(infopath):
        os.remove(infopath)

def _worker(args):
    """Fetch one block of the remote file and write it into the shared file.

    Args:
        args: A single tuple ``(url, block, fobj, buffer_size)`` as handed
            over by ``pool.map``. ``block`` is the mutable list
            ``[start, offset, end]`` whose ``offset`` is advanced in place as
            bytes are written; ``fobj`` is the shared output file opened for
            binary update; ``buffer_size`` is the number of bytes per read.

    Raises:
        IOError: if the server does not answer with a partial response (206)
            for a block that starts past the beginning of the file.
    """
    url, block, fobj, buffer_size = args
    req = urllib2.Request(url)
    req.add_header('Range', 'bytes=%s-%s' % (block[1], block[2]))
    res = urllib2.urlopen(req)
    try:
        # A server that ignores Range sends the file from byte 0; writing
        # that at a non-zero offset would silently corrupt the output.
        if res.getcode() != 206 and block[1] != 0:
            raise IOError('server ignored range request %s-%s for %s'
                          % (block[1], block[2], url))
        while block[1] <= block[2]:
            chunk = res.read(buffer_size)
            if not chunk:
                break
            # Never write past this block's (inclusive) end.
            chunk = chunk[:block[2] - block[1] + 1]
            # seek + write must be atomic with respect to the other workers
            # sharing fobj, hence the global lock.
            with lock:
                fobj.seek(block[1])
                fobj.write(chunk)
                block[1] += len(chunk)
    finally:
        res.close()

def _monitor(infopath, file_info, blocks):
    """Report progress and persist block state every two seconds.

    Loops until the downloaded share reaches 100%. While below 100% the
    current ``(file_info, blocks)`` pair is written to ``infopath`` on each
    pass.

    Args:
        infopath: Path of the state file to rewrite on each pass.
        file_info: FileInfo of the download; ``size`` is the expected total.
        blocks: Shared list of ``[start, offset, end]`` lists that the
            workers update while this function runs.
    """
    total = file_info.size
    while True:
        with lock:
            done = sum(block[1] - block[0] for block in blocks)
            # A size of 0 leaves nothing to wait for; report completion
            # instead of dividing by zero.
            percent = 100 if total <= 0 else done * 100 // total
            if percent < 100:
                write_data(infopath, (file_info, blocks))
        # Console output happens outside the lock so it never stalls workers.
        progress(percent)
        if percent >= 100:
            break
        time.sleep(2)

if __name__ == '__main__':
    import argparse

    # Arguments used when the script is started without any of its own.
    fallback_argv = ['https://eyes.nasa.gov/eyesproduct/EYES/os/win']

    cli = argparse.ArgumentParser(description='Download file by multi-threads.')
    cli.add_argument('url', type=str, help='url of the download file')
    # (flag, type, default, destination attribute, help text)
    option_specs = (
        ('-o', str, None, "output", 'output file'),
        ('-t', int, defaults['thread_count'], "thread_count", 'thread counts to downloading'),
        ('-b', int, defaults['buffer_size'], "buffer_size", 'buffer size'),
        ('-s', int, defaults['block_size'], "block_size", 'block size'),
    )
    for flag, kind, fallback, dest, text in option_specs:
        cli.add_argument(flag, type=kind, default=fallback, dest=dest, help=text)

    options = cli.parse_args(sys.argv[1:] or fallback_argv)
    began = time.time()
    download(options.url, options.output, options.thread_count,
             options.buffer_size, options.block_size)
    # Report the elapsed wall-clock time in whole seconds.
    print('times: %ds' % int(time.time() - began))
Python 相关文章推荐
跟老齐学Python之有容乃大的list(3)
Sep 15 Python
python人人网登录应用实例
Sep 26 Python
Python for Informatics 第11章之正则表达式(四)
Apr 21 Python
python利用正则表达式提取字符串
Dec 08 Python
Python 中开发pattern的string模板(template) 实例详解
Apr 01 Python
Python基于回溯法子集树模板解决取物搭配问题实例
Sep 02 Python
python命令行工具Click快速掌握
Jul 04 Python
Django stark组件使用及原理详解
Aug 22 Python
浅谈Django2.0 加xadmin踩的坑
Nov 15 Python
python 实现视频 图像帧提取
Dec 10 Python
Python如何优雅删除字符列表空字符及None元素
Jun 25 Python
手把手教你使用TensorFlow2实现RNN
Jul 15 Python
python获得图片base64编码示例
Jan 16 #Python
python练习程序批量修改文件名
Jan 16 #Python
python使用urllib模块开发的多线程豆瓣小站mp3下载器
Jan 16 #Python
python使用urllib模块和pyquery实现阿里巴巴排名查询
Jan 16 #Python
python3.3教程之模拟百度登陆代码分享
Jan 16 #Python
python解析发往本机的数据包示例 (解析数据包)
Jan 16 #Python
python多线程扫描端口示例
Jan 16 #Python
You might like
PHP Switch 语句之学习笔记
2013/09/21 PHP
Yii2增加验证码步骤详解
2016/04/25 PHP
老生常谈PHP面向对象之命令模式(必看篇)
2017/05/24 PHP
PHP设计模式之单例模式原理与实现方法分析
2018/04/25 PHP
Laravel框架定时任务2种实现方式示例
2018/12/08 PHP
教您去掉ie网页加载进度条的方法
2010/12/09 Javascript
IE6下CSS图片缓存问题解决方法
2010/12/09 Javascript
javascript创建数组之联合数组的使用方法示例
2013/12/26 Javascript
js实现的星星评分功能函数
2015/12/09 Javascript
Ionic实现仿通讯录点击滑动及$ionicscrolldelegate使用分析
2016/01/18 Javascript
AngularJs学习第五篇从Controller控制器谈谈$scope作用域
2016/06/08 Javascript
NodeJS配置HTTPS服务实例分享
2017/02/19 NodeJs
Javascript中 toFixed四舍六入方法
2017/08/21 Javascript
基于vue开发的在线付费课程应用过程
2018/01/25 Javascript
JS逻辑运算符短路操作实例分析
2018/07/09 Javascript
微信小程序 slot踩坑的解决
2019/04/01 Javascript
微信小程序实现日期格式化和倒计时
2020/11/01 Javascript
vue视频播放暂停代码
2019/11/08 Javascript
Python编程实现删除VC临时文件及Debug目录的方法
2017/03/22 Python
Python3利用SMTP协议发送E-mail电子邮件的方法
2017/09/30 Python
神经网络理论基础及Python实现详解
2017/12/15 Python
Python分割指定页数的pdf文件方法
2018/10/26 Python
pytorch:实现简单的GAN示例(MNIST数据集)
2020/01/10 Python
python实现替换word中的关键文字(使用通配符)
2020/02/13 Python
pytorch查看通道数 维数 尺寸大小方式
2020/05/26 Python
python代码中怎么换行
2020/06/17 Python
Prototype如何更新局部页面
2013/03/03 面试题
文职个人求职信范文
2013/09/23 职场文书
自荐书格式
2013/12/01 职场文书
主持人婚宴答谢词
2014/01/28 职场文书
2014年教师业务学习材料
2014/05/12 职场文书
十佳好少年事迹材料
2014/08/21 职场文书
运动会800米赞词
2015/07/22 职场文书
安全教育主题班会总结
2015/08/14 职场文书
Python insert() / append() 用法 Leetcode实战演示
2021/03/31 Python
Go 中的空白标识符下划线
2022/03/25 Golang