Python实现大文件排序的方法


Posted in Python onJuly 10, 2015

本文实例讲述了Python实现大文件排序的方法。分享给大家供大家参考。具体实现方法如下:

import gzip
import os
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
from datetime import datetime
def sort_worker(input,output):
 while True:
  lines = input.get().splitlines()
  element_set = {}
  for line in lines:
    if line.strip() == 'STOP':
      return
    try:
      element = line.split(' ')[0]
      if not element_set.get(element): element_set[element] = ''
    except:
      pass
  sorted_element = sorted(element_set)
  #print sorted_element
  output.put('\n'.join(sorted_element))
def write_worker(input, pre):
  os.system('mkdir %s'%pre)
  i = 0
  while True:
    content = input.get()
    if content.strip() == 'STOP':
      return
    write_sorted_bulk(content, '%s/%s'%(pre, i))
    i += 1
def write_sorted_bulk(content, filename):
  f = file(filename, 'w')
  f.write(content)
  f.close()
def split_sort_file(filename, num_sort = 3, buf_size = 65536*64*4):
  t = datetime.now()
  pre, ext = os.path.splitext(filename)
  if ext == '.gz':
    file_file = gzip.open(filename, 'rb')
  else:
    file_file = open(filename)
  bulk_queue = Queue(10)
  sorted_queue = Queue(10)
  NUM_SORT = num_sort
  sort_worker_pool = []
  for i in range(NUM_SORT):
    sort_worker_pool.append( Process(target=sort_worker, args=(bulk_queue, sorted_queue)) )
    sort_worker_pool[i].start()
  NUM_WRITE = 1
  write_worker_pool = []
  for i in range(NUM_WRITE):
    write_worker_pool.append( Process(target=write_worker, args=(sorted_queue, pre)) )
    write_worker_pool[i].start()
  buf = file_file.read(buf_size)
  sorted_count = 0
  while len(buf):
    end_line = buf.rfind('\n')
    #print buf[:end_line+1]
    bulk_queue.put(buf[:end_line+1])
    sorted_count += 1
    if end_line != -1:
      buf = buf[end_line+1:] + file_file.read(buf_size)
    else:
      buf = file_file.read(buf_size)
  for i in range(NUM_SORT):
    bulk_queue.put('STOP')
  for i in range(NUM_SORT):
    sort_worker_pool[i].join()
   
  for i in range(NUM_WRITE):
    sorted_queue.put('STOP')
  for i in range(NUM_WRITE):
    write_worker_pool[i].join()
  print 'elasped ', datetime.now() - t
  return sorted_count
from heapq import heappush, heappop
from datetime import datetime
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
import os
class file_heap:
  def __init__(self, dir, idx = 0, count = 1):
    files = os.listdir(dir)
    self.heap = []
    self.files = {}
    self.bulks = {}
    self.pre_element = None
    for i in range(len(files)):
      file = files[i]
      if hash(file) % count != idx: continue
      input = open(os.path.join(dir, file))
      self.files[i] = input
      self.bulks[i] = ''
      heappush(self.heap, (self.get_next_element_buffered(i), i))
  def get_next_element_buffered(self, i):
    if len(self.bulks[i]) < 256:
      if self.files[i] is not None:
        buf = self.files[i].read(65536)
        if buf:
          self.bulks[i] += buf
        else:
          self.files[i].close()
          self.files[i] = None
    end_line = self.bulks[i].find('\n')
    if end_line == -1:
      end_line = len(self.bulks[i])
    element = self.bulks[i][:end_line]
    self.bulks[i] = self.bulks[i][end_line+1:]
    return element
  def poppush_uniq(self):
    while True:
      element = self.poppush()
      if element is None:
        return None
      if element != self.pre_element:
        self.pre_element = element
        return element
  def poppush(self):
    try:
      element, index = heappop(self.heap)
    except IndexError:
      return None
    new_element = self.get_next_element_buffered(index)
    if new_element:
      heappush(self.heap, (new_element, index))
    return element
def heappoppush(dir, queue, idx = 0, count = 1):
  heap = file_heap(dir, idx, count)
  while True:
    d = heap.poppush_uniq()
    queue.put(d)
    if d is None: return
def heappoppush2(dir, queue, count = 1):
  heap = []
  procs = []
  queues = []
  pre_element = None
  for i in range(count):
    q = Queue(1024)
    q_buf = queue_buffer(q)
    queues.append(q_buf)
    p = Process(target=heappoppush, args=(dir, q_buf, i, count))
    procs.append(p)
    p.start()
  queues = tuple(queues)
  for i in range(count):
    heappush(heap, (queues[i].get(), i))
  while True:
    try:
      d, i= heappop(heap)
    except IndexError:
      queue.put(None)
      for p in procs:
        p.join()
      return
    else:
      if d is not None:
        heappush(heap,(queues[i].get(), i))
        if d != pre_element:
          pre_element = d
          queue.put(d)
def merge_file(dir):
  heap = file_heap( dir )
  os.system('rm -f '+dir+'.merge')
  fmerge = open(dir+'.merge', 'a')
  element = heap.poppush_uniq()
  fmerge.write(element+'\n')
  while element is not None:
    element = heap.poppush_uniq()
    fmerge.write(element+'\n')
class queue_buffer:
  def __init__(self, queue):
    self.q = queue
    self.rbuf = []
    self.wbuf = []
  def get(self):
    if len(self.rbuf) == 0:
      self.rbuf = self.q.get()
    r = self.rbuf[0]
    del self.rbuf[0]
    return r
  def put(self, d):
    self.wbuf.append(d)
    if d is None or len(self.wbuf) > 1024:
      self.q.put(self.wbuf)
      self.wbuf = []
def diff_file(file_old, file_new, file_diff, buf = 268435456):
  print 'buffer size', buf
  from file_split import split_sort_file
  os.system('rm -rf '+ os.path.splitext(file_old)[0] )
  os.system('rm -rf '+ os.path.splitext(file_new)[0] )
  t = datetime.now()
  split_sort_file(file_old,5,buf)
  split_sort_file(file_new,5,buf)
  print 'split elasped ', datetime.now() - t
  os.system('cat %s/* | wc -l'%os.path.splitext(file_old)[0])
  os.system('cat %s/* | wc -l'%os.path.splitext(file_new)[0])
  os.system('rm -f '+file_diff)
  t = datetime.now()
  zdiff = open(file_diff, 'a')
  old_q = Queue(1024)
  new_q = Queue(1024)
  old_queue = queue_buffer(old_q)
  new_queue = queue_buffer(new_q)
  h1 = Process(target=heappoppush2, args=(os.path.splitext(file_old)[0], old_queue, 3))
  h2 = Process(target=heappoppush2, args=(os.path.splitext(file_new)[0], new_queue, 3))
  h1.start(), h2.start()
  old = old_queue.get()
  new = new_queue.get()
  old_count, new_count = 0, 0
  while old is not None or new is not None:
    if old > new or old is None:
      zdiff.write('< '+new+'\n')
      new = new_queue.get()
      new_count +=1
    elif old < new or new is None:
      zdiff.write('> '+old+'\n')
      old = old_queue.get()
      old_count +=1
    else:
      old = old_queue.get()
      new = new_queue.get()
  print 'new_count:', new_count
  print 'old_count:', old_count
  print 'diff elasped ', datetime.now() - t
  h1.join(), h2.join()

希望本文所述对大家的Python程序设计有所帮助。

Python 相关文章推荐
python实现删除文件与目录的方法
Nov 10 Python
Python实现列表转换成字典数据结构的方法
Mar 11 Python
使用Python写一个贪吃蛇游戏实例代码
Aug 21 Python
Python微信公众号开发平台
Jan 25 Python
centos 安装python3.6环境并配置虚拟环境的详细教程
Feb 22 Python
Django中数据库的数据关系:一对一,一对多,多对多
Oct 21 Python
Python3.5面向对象编程图文与实例详解
Apr 24 Python
Python 使用threading+Queue实现线程池示例
Dec 21 Python
python3 动态模块导入与全局变量使用实例
Dec 22 Python
python实现爱奇艺登陆密码RSA加密的方法示例详解
May 27 Python
Python API 操作Hadoop hdfs详解
Jun 06 Python
python 制作一个gui界面的翻译工具
May 14 Python
Python实现telnet服务器的方法
Jul 10 #Python
Python读写unicode文件的方法
Jul 10 #Python
Python实现提取谷歌音乐搜索结果的方法
Jul 10 #Python
python和bash统计CPU利用率的方法
Jul 10 #Python
Python多线程下载文件的方法
Jul 10 #Python
Python爬取国外天气预报网站的方法
Jul 10 #Python
Python实现比较两个文件夹中代码变化的方法
Jul 10 #Python
You might like
php date()日期时间函数详解
2010/05/16 PHP
thinkphp分页集成实例
2017/07/24 PHP
JAVASCRIPT对象及属性
2007/02/13 Javascript
Javascript String对象扩展HTML编码和解码的方法
2009/06/02 Javascript
用JS写的一个TableView控件代码
2010/01/23 Javascript
jquery中ajax学习笔记一
2011/10/16 Javascript
IE与FireFox中的childNodes区别
2011/10/20 Javascript
Textarea根据内容自适应高度
2013/10/28 Javascript
jquery(hide方法)隐藏指定元素实例
2013/11/11 Javascript
String.prototype实现的一些javascript函数介绍
2013/11/22 Javascript
Ubuntu中搭建Nodejs开发环境过程分享
2014/06/01 NodeJs
深入理解jQuery事件绑定
2016/06/02 Javascript
jQuery多个版本和其他js库冲突的解决方法
2016/08/11 Javascript
浅谈JavaScript正则表达式-非捕获性分组
2017/03/08 Javascript
详解JavaScript对象的深浅复制
2017/03/30 Javascript
微信小程序 action-sheet 反馈上拉菜单简单实例
2017/05/11 Javascript
使用travis-ci如何持续部署node.js应用详解
2017/07/30 Javascript
jQuery中each方法的使用详解
2018/03/18 jQuery
vue单页面应用打开新窗口显示跳转页面的实例
2018/09/21 Javascript
Object.keys() 和 Object.getOwnPropertyNames() 的区别详解
2020/05/21 Javascript
从源码角度来回答keep-alive组件的缓存原理
2021/01/18 Javascript
[14:36]2014 DOTA2国际邀请赛中国区预选赛5.21 Orenda VS NE
2014/05/22 DOTA
[55:16]Mski vs VGJ.S Supermajor小组赛C组 BO3 第二场 6.3
2018/06/04 DOTA
Python实现的排列组合计算操作示例
2017/10/13 Python
Python机器学习之K-Means聚类实现详解
2018/02/22 Python
python批量识别图片指定区域文字内容
2019/04/30 Python
Pytorch如何切换 cpu和gpu的使用详解
2021/03/01 Python
英国标志性生活方式品牌:Skinnydip London
2019/12/15 全球购物
什么是属性访问器
2015/10/26 面试题
高中生毕业自我鉴定范文
2013/12/22 职场文书
竞争性谈判邀请书
2014/02/06 职场文书
教师职位说明书
2014/07/29 职场文书
基层党组织整改方案
2014/10/25 职场文书
2015社区六五普法工作总结
2015/04/21 职场文书
2019毕业论文致谢词
2019/06/24 职场文书
spring boot项目application.properties文件存放及使用介绍
2021/06/30 Java/Android