Python开发实例分享bt种子爬虫程序和种子解析


Posted in Python onMay 21, 2014

看到网上也有开源的代码,这不,我拿来进行了二次重写,呵呵,上代码:

    #encoding: utf-8  
    import socket  
    from hashlib import sha1  
    from random import randint  
    from struct import unpack, pack  
    from socket import inet_aton, inet_ntoa  
    from bisect import bisect_left  
    from threading import Timer  
    from time import sleep  
    import MySQLdb  
    from datetime import *  
    import time  
    from bencode import bencode, bdecode  
    BOOTSTRAP_NODES = [  
        ("router.bittorrent.com", 6881),  
        ("dht.transmissionbt.com", 6881),  
        ("router.utorrent.com", 6881)  
    ]   
    TID_LENGTH = 4  
    KRPC_TIMEOUT = 10  
    REBORN_TIME = 5 * 60  
    K = 8  
    def entropy(bytes):  
        s = ""  
        for i in range(bytes):  
            s += chr(randint(0, 255))  
        return s  
    def random_id():  
        hash = sha1()  
        hash.update( entropy(20) )  
        return hash.digest()  
    def decode_nodes(nodes):  
        n = []  
        length = len(nodes)  
        if (length % 26) != 0:   
            return n  
        for i in range(0, length, 26):  
            nid = nodes[i:i+20]  
            ip = inet_ntoa(nodes[i+20:i+24])  
            port = unpack("!H", nodes[i+24:i+26])[0]  
            n.append( (nid, ip, port) )  
        return n  
    def encode_nodes(nodes):  
        strings = []  
        for node in nodes:  
            s = "%s%s%s" % (node.nid, inet_aton(node.ip), pack("!H", node.port))  
            strings.append(s)  
        return "".join(strings)  
    def intify(hstr):  
        return long(hstr.encode('hex'), 16)      
    def timer(t, f):  
        Timer(t, f).start()  
    class BucketFull(Exception):  
        pass  
    class KRPC(object):  
        def __init__(self):  
            self.types = {  
                "r": self.response_received,  
                "q": self.query_received  
            }  
            self.actions = {  
                "ping": self.ping_received,  
                "find_node": self.find_node_received,  
                "get_peers": self.get_peers_received,  
                "announce_peer": self.announce_peer_received,  
            }  
            self.socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)  
            self.socket.bind(("0.0.0.0", self.port))  
        def response_received(self, msg, address):  
            self.find_node_handler(msg)  
        def query_received(self, msg, address):  
            try:  
                self.actions[msg["q"]](msg, address)  
            except KeyError:  
                pass  
        def send_krpc(self, msg, address):  
            try:  
                self.socket.sendto(bencode(msg), address)  
            except:  
                pass  
    class Client(KRPC):  
        def __init__(self, table):  
            self.table = table  
            timer(KRPC_TIMEOUT, self.timeout)  
            timer(REBORN_TIME, self.reborn)  
            KRPC.__init__(self)  
        def find_node(self, address, nid=None):  
            nid = self.get_neighbor(nid) if nid else self.table.nid  
            tid = entropy(TID_LENGTH)  
            msg = {  
                "t": tid,  
                "y": "q",  
                "q": "find_node",  
                "a": {"id": nid, "target": random_id()}  
            }  
            self.send_krpc(msg, address)  
        def find_node_handler(self, msg):  
            try:  
                nodes = decode_nodes(msg["r"]["nodes"])  
                for node in nodes:  
                    (nid, ip, port) = node  
                    if len(nid) != 20: continue  
                    if nid == self.table.nid: continue  
                    self.find_node( (ip, port), nid )  
            except KeyError:  
                pass  
        def joinDHT(self):  
            for address in BOOTSTRAP_NODES:   
                self.find_node(address)  
        def timeout(self):  
            if len( self.table.buckets ) < 2:  
                self.joinDHT()  
            timer(KRPC_TIMEOUT, self.timeout)  
        def reborn(self):  
            self.table.nid = random_id()  
            self.table.buckets = [ KBucket(0, 2**160) ]  
            timer(REBORN_TIME, self.reborn)  
        def start(self):  
            self.joinDHT()  
            while True:  
                try:  
                    (data, address) = self.socket.recvfrom(65536)  
                    msg = bdecode(data)  
                    self.types[msg["y"]](msg, address)  
                except Exception:  
                    pass  
        def get_neighbor(self, target):  
            return target[:10]+random_id()[10:]  
    class Server(Client):  
        def __init__(self, master, table, port):  
            self.table = table  
            self.master = master  
            self.port = port  
            Client.__init__(self, table)  
        def ping_received(self, msg, address):  
            try:  
                nid = msg["a"]["id"]  
                msg = {  
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {"id": self.get_neighbor(nid)}  
                }  
                self.send_krpc(msg, address)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
        def find_node_received(self, msg, address):  
            try:  
                target = msg["a"]["target"]  
                neighbors = self.table.get_neighbors(target)  
                nid = msg["a"]["id"]  
                msg = {  
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {  
                        "id": self.get_neighbor(target),   
                        "nodes": encode_nodes(neighbors)  
                    }  
                }  
                self.table.append(KNode(nid, *address))  
                self.send_krpc(msg, address)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
        def get_peers_received(self, msg, address):  
            try:  
                infohash = msg["a"]["info_hash"]  
                neighbors = self.table.get_neighbors(infohash)  
                nid = msg["a"]["id"]  
                msg = {  
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {  
                        "id": self.get_neighbor(infohash),   
                        "nodes": encode_nodes(neighbors)  
                    }  
                }  
                self.table.append(KNode(nid, *address))  
                self.send_krpc(msg, address)  
                self.master.log(infohash)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
        def announce_peer_received(self, msg, address):  
            try:  
                infohash = msg["a"]["info_hash"]  
                nid = msg["a"]["id"]  
                msg = {   
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {"id": self.get_neighbor(infohash)}  
                }  
                self.table.append(KNode(nid, *address))  
                self.send_krpc(msg, address)  
                self.master.log(infohash)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
    class KTable(object):  
        def __init__(self, nid):  
            self.nid = nid  
            self.buckets = [ KBucket(0, 2**160) ]  
        def append(self, node):  
            index = self.bucket_index(node.nid)  
            try:  
                bucket = self.buckets[index]  
                bucket.append(node)  
            except IndexError:  
                return  
            except BucketFull:  
                if not bucket.in_range(self.nid): return  
                self.split_bucket(index)  
                self.append(node)  
        def get_neighbors(self, target):  
            nodes = []  
            if len(self.buckets) == 0: return nodes  
            if len(target) != 20 : return nodes  
            index = self.bucket_index(target)  
            try:  
                nodes = self.buckets[index].nodes  
                min = index - 1  
                max = index + 1  
                while len(nodes) < K and ((min >= 0) or (max < len(self.buckets))):  
                    if min >= 0:  
                        nodes.extend(self.buckets[min].nodes)  
                    if max < len(self.buckets):  
                        nodes.extend(self.buckets[max].nodes)  
                    min -= 1  
                    max += 1  
                num = intify(target)  
                nodes.sort(lambda a, b, num=num: cmp(num^intify(a.nid), num^intify(b.nid)))  
                return nodes[:K]  
            except IndexError:  
                return nodes  
        def bucket_index(self, target):  
            return bisect_left(self.buckets, intify(target))  
        def split_bucket(self, index):  
            old = self.buckets[index]  
            point = old.max - (old.max - old.min)/2  
            new = KBucket(point, old.max)  
            old.max = point  
            self.buckets.insert(index + 1, new)  
            for node in old.nodes[:]:  
                if new.in_range(node.nid):  
                    new.append(node)  
                    old.remove(node)  
        def __iter__(self):  
            for bucket in self.buckets:  
                yield bucket  
    class KBucket(object):  
        __slots__ = ("min", "max", "nodes")  
        def __init__(self, min, max):  
            self.min = min  
            self.max = max  
            self.nodes = []  
        def append(self, node):  
            if node in self:  
                self.remove(node)  
                self.nodes.append(node)  
            else:  
                if len(self) < K:  
                    self.nodes.append(node)  
                else:  
                    raise BucketFull  
        def remove(self, node):  
            self.nodes.remove(node)  
        def in_range(self, target):  
            return self.min <= intify(target) < self.max  
        def __len__(self):  
            return len(self.nodes)  
        def __contains__(self, node):  
            return node in self.nodes  
        def __iter__(self):  
            for node in self.nodes:  
                yield node  
        def __lt__(self, target):  
            return self.max <= target  
    class KNode(object):  
        __slots__ = ("nid", "ip", "port")  
        def __init__(self, nid, ip, port):  
            self.nid = nid  
            self.ip = ip  
            self.port = port  
        def __eq__(self, other):  
            return self.nid == other.nid  
    #using example  
    class Master(object):  
        def __init__(self, f):  
            self.f = f  
            try:  
                self.conn=MySQLdb.connect(host='localhost',user='root',passwd='',db='bt',port=3306)  
                self.cur=self.conn.cursor()  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
        def log(self, infohash):  
            try:  
                sql = "insert into bt_main_new(hash,name,length,date) values(%s,%s,%s,%s)"  
                date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  
                re=self.cur.execute(sql,(infohash,'','',date))  
                self.conn.commit()  
                self.cur.close()  
                self.conn.close()  
                #print re  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
            self.f.write(infohash.encode("hex")+"\n")  
            self.f.flush()  
    try:  
        d = date.today()  
        f = open("%s.log" % d, "a")  
        m = Master(f)  
        s = Server(Master(f), KTable(random_id()), 8006)  
        s.start()       
    except KeyboardInterrupt:  
        s.socket.close()  
        f.close() 

本爬虫程序,会自动爬取得网络上分享的bt种子,写入文件盒数据库,爬取的只是个种子的hash码,还需要到网络上下载种子进行分析

下载种子,相信大家都知道国外有几个免费分享种子的网站,大家可以根据hash码去下载,分析,下面呈上我写的一个分析种子的程序:

#! /usr/bin/python  
# -*- coding: utf-8 -*-  
import MySQLdb  
from datetime import *  
import time  
import re  
from time import sleep  
import bencode  
import urllib2  
import base64  
try:  
    conn=MySQLdb.connect(host='localhost',user='root',passwd='',db='bt',port=3306)  
    cur=conn.cursor()  
    sql = "select * from bt_main where name = '' order by id desc"  
    count = cur.execute(sql)  
    rows = cur.fetchall()  
    for row in rows:  
        if row[2].strip() != '':  
            continue  
        id = row[0]  
        hash = row[1]  
        url = "http://haofuli.duapp.com/go/info.php?hash=%s" % hash  
        file = urllib2.urlopen(url).read()  
        if "error!" == file:  
            try:  
                sql = "update bt_main set isTrue = 0 where id = %s "  
                re = cur.execute(sql,(id))  
                conn.commit()  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
        else:  
            #decode  
            try:  
                fileEncode = bencode.bdecode(file)  
            except Exception,e:pass  
            if 'name.utf-8' in fileEncode['info']:  
                filename=fileEncode['info']['name.utf-8']  
            else:  
                filename = fileEncode['info']['name']  
            ##length  
            if "length" in fileEncode['info']:  
                length = fileEncode['info']['length']  
            else:  
                length = 0  
            try:  
                sql = "update bt_main set name = %s , length = %s , isTrue = 1 where id = %s"  
                re = cur.execute(sql,(base64.b64encode(filename),length,id))  
                conn.commit()  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
except MySQLdb.Error,e:  
    print "Mysql Error %d: %s" % (e.args[0], e.args[1])

上面的只是简单的分析,对于多文件的,还没有处理。我最近在解析种子的时候,总是出现莫名的填充文件的问题,可能是版本过低吧,最近仍旧在解决。

Python 相关文章推荐
Python实现新浪博客备份的方法
Apr 27 Python
Python科学计算之Pandas详解
Jan 15 Python
Python贪心算法实例小结
Apr 22 Python
python 按照固定长度分割字符串的方法小结
Apr 30 Python
Python GUI布局尺寸适配方法
Oct 11 Python
DES加密解密算法之python实现版(图文并茂)
Dec 06 Python
python binascii 进制转换实例
Jun 12 Python
在python中画正态分布图像的实例
Jul 08 Python
修改Pandas的行或列的名字(重命名)
Dec 18 Python
Python如何基于Tesseract实现识别文字功能
Jun 05 Python
终于搞懂了Keras中multiloss的对应关系介绍
Jun 22 Python
pip已经安装好第三方库但pycharm中import时还是标红的解决方案
Oct 09 Python
从零学Python之引用和类属性的初步理解
May 15 #Python
python中xrange和range的区别
May 13 #Python
Python中os和shutil模块实用方法集锦
May 13 #Python
Python中的jquery PyQuery库使用小结
May 13 #Python
Python getopt模块处理命令行选项实例
May 13 #Python
Python random模块(获取随机数)常用方法和使用例子
May 13 #Python
Python自动化测试工具Splinter简介和使用实例
May 13 #Python
You might like
PHP中empty和isset对于参数结构的判断及empty()和isset()的区别
2015/11/15 PHP
解决php-fpm.service not found问题的办法
2017/06/06 PHP
PHP使用栈解决约瑟夫环问题算法示例
2017/08/27 PHP
PHP中关于php.ini参数优化详解
2020/02/28 PHP
JS效率个人经验谈(8-15更新),加入range技巧
2007/01/09 Javascript
javascript document.execCommand() 常用解析
2009/12/14 Javascript
用js做一个小游戏平台 (一)
2009/12/29 Javascript
在javaScript中关于submit和button的区别介绍
2013/10/20 Javascript
javascript 10进制和62进制的相互转换
2014/07/31 Javascript
Bootstrap 折叠(Collapse)插件用法实例详解
2016/06/01 Javascript
jQuery插件Easyui设置datagrid的pageNumber导致两次请求问题的解决方法
2016/08/06 Javascript
基于jQuery实现选项卡效果
2017/01/04 Javascript
Angular开发者指南之入门介绍
2017/03/05 Javascript
Angular.Js之Scope作用域的学习教程
2017/04/27 Javascript
JavaScript通过filereader接口读取文件
2017/05/10 Javascript
微信小程序下拉刷新PullDownRefresh的使用方法
2018/11/29 Javascript
RxJS的入门指引和初步应用
2019/06/15 Javascript
python opencv检测目标颜色的实例讲解
2018/04/02 Python
对pandas进行数据预处理的实例讲解
2018/04/20 Python
python 两个数据库postgresql对比
2019/10/21 Python
Pyecharts绘制全球流向图的示例代码
2020/01/08 Python
一款简洁的纯css3代码实现的动画导航
2014/10/31 HTML / CSS
DJI大疆无人机官方商城:全球领先的无人飞行器研发和生产商
2016/12/21 全球购物
番木瓜健康和保健产品第一大制造商:Herbal Papaya
2017/04/25 全球购物
幼师专业求职推荐信
2013/11/08 职场文书
联谊活动策划书
2014/01/26 职场文书
《灯光》教学反思
2014/02/08 职场文书
简历中自我评价怎么写
2014/02/12 职场文书
海洋科学专业求职信
2014/08/10 职场文书
技术经济专业求职信
2014/09/03 职场文书
信用卡结清证明怎么写
2014/09/13 职场文书
新生入学欢迎词
2015/01/26 职场文书
大学文艺委员竞选稿
2015/11/19 职场文书
用Python爬取各大高校并可视化帮弟弟选大学,弟弟直呼牛X
2021/06/11 Python
什么是SOLID
2022/03/24 Javascript
Python  lambda匿名函数和三元运算符
2022/04/19 Python