Python开发实例分享bt种子爬虫程序和种子解析


Posted in Python onMay 21, 2014

看到网上也有开源的代码,这不,我拿来进行了二次重写,呵呵,上代码:

    #encoding: utf-8  
    import socket  
    from hashlib import sha1  
    from random import randint  
    from struct import unpack, pack  
    from socket import inet_aton, inet_ntoa  
    from bisect import bisect_left  
    from threading import Timer  
    from time import sleep  
    import MySQLdb  
    from datetime import *  
    import time  
    from bencode import bencode, bdecode  
    BOOTSTRAP_NODES = [  
        ("router.bittorrent.com", 6881),  
        ("dht.transmissionbt.com", 6881),  
        ("router.utorrent.com", 6881)  
    ]   
    TID_LENGTH = 4  
    KRPC_TIMEOUT = 10  
    REBORN_TIME = 5 * 60  
    K = 8  
    def entropy(bytes):  
        s = ""  
        for i in range(bytes):  
            s += chr(randint(0, 255))  
        return s  
    def random_id():  
        hash = sha1()  
        hash.update( entropy(20) )  
        return hash.digest()  
    def decode_nodes(nodes):  
        n = []  
        length = len(nodes)  
        if (length % 26) != 0:   
            return n  
        for i in range(0, length, 26):  
            nid = nodes[i:i+20]  
            ip = inet_ntoa(nodes[i+20:i+24])  
            port = unpack("!H", nodes[i+24:i+26])[0]  
            n.append( (nid, ip, port) )  
        return n  
    def encode_nodes(nodes):  
        strings = []  
        for node in nodes:  
            s = "%s%s%s" % (node.nid, inet_aton(node.ip), pack("!H", node.port))  
            strings.append(s)  
        return "".join(strings)  
    def intify(hstr):  
        return long(hstr.encode('hex'), 16)      
    def timer(t, f):  
        Timer(t, f).start()  
    class BucketFull(Exception):  
        pass  
    class KRPC(object):  
        def __init__(self):  
            self.types = {  
                "r": self.response_received,  
                "q": self.query_received  
            }  
            self.actions = {  
                "ping": self.ping_received,  
                "find_node": self.find_node_received,  
                "get_peers": self.get_peers_received,  
                "announce_peer": self.announce_peer_received,  
            }  
            self.socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)  
            self.socket.bind(("0.0.0.0", self.port))  
        def response_received(self, msg, address):  
            self.find_node_handler(msg)  
        def query_received(self, msg, address):  
            try:  
                self.actions[msg["q"]](msg, address)  
            except KeyError:  
                pass  
        def send_krpc(self, msg, address):  
            try:  
                self.socket.sendto(bencode(msg), address)  
            except:  
                pass  
    class Client(KRPC):  
        def __init__(self, table):  
            self.table = table  
            timer(KRPC_TIMEOUT, self.timeout)  
            timer(REBORN_TIME, self.reborn)  
            KRPC.__init__(self)  
        def find_node(self, address, nid=None):  
            nid = self.get_neighbor(nid) if nid else self.table.nid  
            tid = entropy(TID_LENGTH)  
            msg = {  
                "t": tid,  
                "y": "q",  
                "q": "find_node",  
                "a": {"id": nid, "target": random_id()}  
            }  
            self.send_krpc(msg, address)  
        def find_node_handler(self, msg):  
            try:  
                nodes = decode_nodes(msg["r"]["nodes"])  
                for node in nodes:  
                    (nid, ip, port) = node  
                    if len(nid) != 20: continue  
                    if nid == self.table.nid: continue  
                    self.find_node( (ip, port), nid )  
            except KeyError:  
                pass  
        def joinDHT(self):  
            for address in BOOTSTRAP_NODES:   
                self.find_node(address)  
        def timeout(self):  
            if len( self.table.buckets ) < 2:  
                self.joinDHT()  
            timer(KRPC_TIMEOUT, self.timeout)  
        def reborn(self):  
            self.table.nid = random_id()  
            self.table.buckets = [ KBucket(0, 2**160) ]  
            timer(REBORN_TIME, self.reborn)  
        def start(self):  
            self.joinDHT()  
            while True:  
                try:  
                    (data, address) = self.socket.recvfrom(65536)  
                    msg = bdecode(data)  
                    self.types[msg["y"]](msg, address)  
                except Exception:  
                    pass  
        def get_neighbor(self, target):  
            return target[:10]+random_id()[10:]  
    class Server(Client):  
        def __init__(self, master, table, port):  
            self.table = table  
            self.master = master  
            self.port = port  
            Client.__init__(self, table)  
        def ping_received(self, msg, address):  
            try:  
                nid = msg["a"]["id"]  
                msg = {  
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {"id": self.get_neighbor(nid)}  
                }  
                self.send_krpc(msg, address)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
        def find_node_received(self, msg, address):  
            try:  
                target = msg["a"]["target"]  
                neighbors = self.table.get_neighbors(target)  
                nid = msg["a"]["id"]  
                msg = {  
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {  
                        "id": self.get_neighbor(target),   
                        "nodes": encode_nodes(neighbors)  
                    }  
                }  
                self.table.append(KNode(nid, *address))  
                self.send_krpc(msg, address)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
        def get_peers_received(self, msg, address):  
            try:  
                infohash = msg["a"]["info_hash"]  
                neighbors = self.table.get_neighbors(infohash)  
                nid = msg["a"]["id"]  
                msg = {  
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {  
                        "id": self.get_neighbor(infohash),   
                        "nodes": encode_nodes(neighbors)  
                    }  
                }  
                self.table.append(KNode(nid, *address))  
                self.send_krpc(msg, address)  
                self.master.log(infohash)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
        def announce_peer_received(self, msg, address):  
            try:  
                infohash = msg["a"]["info_hash"]  
                nid = msg["a"]["id"]  
                msg = {   
                    "t": msg["t"],  
                    "y": "r",  
                    "r": {"id": self.get_neighbor(infohash)}  
                }  
                self.table.append(KNode(nid, *address))  
                self.send_krpc(msg, address)  
                self.master.log(infohash)  
                self.find_node(address, nid)  
            except KeyError:  
                pass  
    class KTable(object):  
        def __init__(self, nid):  
            self.nid = nid  
            self.buckets = [ KBucket(0, 2**160) ]  
        def append(self, node):  
            index = self.bucket_index(node.nid)  
            try:  
                bucket = self.buckets[index]  
                bucket.append(node)  
            except IndexError:  
                return  
            except BucketFull:  
                if not bucket.in_range(self.nid): return  
                self.split_bucket(index)  
                self.append(node)  
        def get_neighbors(self, target):  
            nodes = []  
            if len(self.buckets) == 0: return nodes  
            if len(target) != 20 : return nodes  
            index = self.bucket_index(target)  
            try:  
                nodes = self.buckets[index].nodes  
                min = index - 1  
                max = index + 1  
                while len(nodes) < K and ((min >= 0) or (max < len(self.buckets))):  
                    if min >= 0:  
                        nodes.extend(self.buckets[min].nodes)  
                    if max < len(self.buckets):  
                        nodes.extend(self.buckets[max].nodes)  
                    min -= 1  
                    max += 1  
                num = intify(target)  
                nodes.sort(lambda a, b, num=num: cmp(num^intify(a.nid), num^intify(b.nid)))  
                return nodes[:K]  
            except IndexError:  
                return nodes  
        def bucket_index(self, target):  
            return bisect_left(self.buckets, intify(target))  
        def split_bucket(self, index):  
            old = self.buckets[index]  
            point = old.max - (old.max - old.min)/2  
            new = KBucket(point, old.max)  
            old.max = point  
            self.buckets.insert(index + 1, new)  
            for node in old.nodes[:]:  
                if new.in_range(node.nid):  
                    new.append(node)  
                    old.remove(node)  
        def __iter__(self):  
            for bucket in self.buckets:  
                yield bucket  
    class KBucket(object):  
        __slots__ = ("min", "max", "nodes")  
        def __init__(self, min, max):  
            self.min = min  
            self.max = max  
            self.nodes = []  
        def append(self, node):  
            if node in self:  
                self.remove(node)  
                self.nodes.append(node)  
            else:  
                if len(self) < K:  
                    self.nodes.append(node)  
                else:  
                    raise BucketFull  
        def remove(self, node):  
            self.nodes.remove(node)  
        def in_range(self, target):  
            return self.min <= intify(target) < self.max  
        def __len__(self):  
            return len(self.nodes)  
        def __contains__(self, node):  
            return node in self.nodes  
        def __iter__(self):  
            for node in self.nodes:  
                yield node  
        def __lt__(self, target):  
            return self.max <= target  
    class KNode(object):  
        __slots__ = ("nid", "ip", "port")  
        def __init__(self, nid, ip, port):  
            self.nid = nid  
            self.ip = ip  
            self.port = port  
        def __eq__(self, other):  
            return self.nid == other.nid  
    #using example  
    class Master(object):  
        def __init__(self, f):  
            self.f = f  
            try:  
                self.conn=MySQLdb.connect(host='localhost',user='root',passwd='',db='bt',port=3306)  
                self.cur=self.conn.cursor()  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
        def log(self, infohash):  
            try:  
                sql = "insert into bt_main_new(hash,name,length,date) values(%s,%s,%s,%s)"  
                date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  
                re=self.cur.execute(sql,(infohash,'','',date))  
                self.conn.commit()  
                self.cur.close()  
                self.conn.close()  
                #print re  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
            self.f.write(infohash.encode("hex")+"\n")  
            self.f.flush()  
    try:  
        d = date.today()  
        f = open("%s.log" % d, "a")  
        m = Master(f)  
        s = Server(Master(f), KTable(random_id()), 8006)  
        s.start()       
    except KeyboardInterrupt:  
        s.socket.close()  
        f.close() 

本爬虫程序,会自动爬取得网络上分享的bt种子,写入文件盒数据库,爬取的只是个种子的hash码,还需要到网络上下载种子进行分析

下载种子,相信大家都知道国外有几个免费分享种子的网站,大家可以根据hash码去下载,分析,下面呈上我写的一个分析种子的程序:

#! /usr/bin/python  
# -*- coding: utf-8 -*-  
import MySQLdb  
from datetime import *  
import time  
import re  
from time import sleep  
import bencode  
import urllib2  
import base64  
try:  
    conn=MySQLdb.connect(host='localhost',user='root',passwd='',db='bt',port=3306)  
    cur=conn.cursor()  
    sql = "select * from bt_main where name = '' order by id desc"  
    count = cur.execute(sql)  
    rows = cur.fetchall()  
    for row in rows:  
        if row[2].strip() != '':  
            continue  
        id = row[0]  
        hash = row[1]  
        url = "http://haofuli.duapp.com/go/info.php?hash=%s" % hash  
        file = urllib2.urlopen(url).read()  
        if "error!" == file:  
            try:  
                sql = "update bt_main set isTrue = 0 where id = %s "  
                re = cur.execute(sql,(id))  
                conn.commit()  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
        else:  
            #decode  
            try:  
                fileEncode = bencode.bdecode(file)  
            except Exception,e:pass  
            if 'name.utf-8' in fileEncode['info']:  
                filename=fileEncode['info']['name.utf-8']  
            else:  
                filename = fileEncode['info']['name']  
            ##length  
            if "length" in fileEncode['info']:  
                length = fileEncode['info']['length']  
            else:  
                length = 0  
            try:  
                sql = "update bt_main set name = %s , length = %s , isTrue = 1 where id = %s"  
                re = cur.execute(sql,(base64.b64encode(filename),length,id))  
                conn.commit()  
            except MySQLdb.Error,e:  
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])  
except MySQLdb.Error,e:  
    print "Mysql Error %d: %s" % (e.args[0], e.args[1])

上面的只是简单的分析,对于多文件的,还没有处理。我最近在解析种子的时候,总是出现莫名的填充文件的问题,可能是版本过低吧,最近仍旧在解决。

Python 相关文章推荐
python用装饰器自动注册Tornado路由详解
Feb 14 Python
windows下Virtualenvwrapper安装教程
Dec 13 Python
Python3.遍历某文件夹提取特定文件名的实例
Apr 26 Python
python smtplib模块实现发送邮件带附件sendmail
May 22 Python
flask中过滤器的使用详解
Aug 01 Python
python实现排序算法解析
Sep 08 Python
对python中的six.moves模块的下载函数urlretrieve详解
Dec 19 Python
解决Tensorflow占用GPU显存问题
Feb 03 Python
使用Keras实现简单线性回归模型操作
Jun 12 Python
django restframework serializer 增加自定义字段操作
Jul 15 Python
如何在 Matplotlib 中更改绘图背景的实现
Nov 26 Python
python如何在word中存储本地图片
Apr 07 Python
从零学Python之引用和类属性的初步理解
May 15 #Python
python中xrange和range的区别
May 13 #Python
Python中os和shutil模块实用方法集锦
May 13 #Python
Python中的jquery PyQuery库使用小结
May 13 #Python
Python getopt模块处理命令行选项实例
May 13 #Python
Python random模块(获取随机数)常用方法和使用例子
May 13 #Python
Python自动化测试工具Splinter简介和使用实例
May 13 #Python
You might like
php操作csv文件代码实例汇总
2014/09/22 PHP
php的XML文件解释类应用实例
2014/09/22 PHP
PHP开发框架laravel安装与配置教程
2015/03/13 PHP
PHP实现的函数重载功能示例
2018/08/03 PHP
PHP正则表达式函数preg_replace用法实例分析
2020/06/04 PHP
JS input 数字验证代码
2009/07/30 Javascript
基于jQuery的日期选择控件
2009/10/27 Javascript
自定义jQuery选项卡插件实例
2013/03/27 Javascript
jquery必须知道的一些常用特效方法及使用示例(整理)
2013/06/24 Javascript
JavaScript 学习笔记之数据类型
2015/01/14 Javascript
jQuery验证插件validation使用指南
2015/04/21 Javascript
详解Document.Cookie
2015/12/25 Javascript
vue项目总结之文件夹结构配置详解
2017/12/13 Javascript
jquery radio 动态控制选中失效问题的解决方法
2018/02/28 jQuery
通过 JS 判断页面是否有滚动条的实现方法
2018/04/05 Javascript
turn.js异步加载实现翻书效果
2019/07/25 Javascript
vue使用混入定义全局变量、函数、筛选器的实例代码
2019/07/29 Javascript
js实现百度淘宝搜索功能
2020/02/17 Javascript
jQuery实现移动端笔触canvas电子签名
2020/05/21 jQuery
微信小程序自定义联系人弹窗
2020/05/26 Javascript
jQuery使用jsonp实现百度搜索的示例代码
2020/07/08 jQuery
Python实现的数据结构与算法之队列详解
2015/04/22 Python
Python中import机制详解
2017/11/14 Python
Python中实现单例模式的n种方式和原理
2018/11/14 Python
python判断一个对象是否可迭代的例子
2019/07/22 Python
HTML5 画布canvas使用方法
2016/03/18 HTML / CSS
体育教育专业毕业生自荐信
2013/11/15 职场文书
《锄禾》教学反思
2014/04/08 职场文书
我的大学四年规划书范文2014
2014/09/26 职场文书
教师先进个人材料
2014/12/17 职场文书
五好家庭申报材料
2014/12/20 职场文书
nginx 防盗链防爬虫配置详解
2021/03/31 Servers
golang switch语句的灵活写法介绍
2021/05/06 Golang
python实现自定义日志的具体方法
2021/05/28 Python
2007年老电脑安装win11会怎么样? 网友实测win11在老电脑运行良好
2021/11/21 数码科技
《火纹风花雪月无双》预告“神秘雇佣兵” 紫发剑客
2022/04/13 其他游戏