python多线程计算文件MD5
python新手,写了一个计算文件md5的程序,能跑通。然后尝试改造成多线程时遇到疑惑。两核心的机器,开双线程时计算消耗的时间和不使用多线程一致,不知是因为什么问题导致。测试环境下一共44个文件,每个文件200MB-400MB不等,总大小12.1GB,两段代码的运行时间都是42秒左右。
后面又尝试了使用多进程的方式,处理时间依然没有改善。
未使用多线程代码:
#!/usr/bin/python3
import os, hashlib, binascii, pymysql, time, json, datetime
def listFiles(dir):
    """Return the path of every file that ``os.walk`` reports beneath ``dir``.

    Args:
        dir: Root directory to walk recursively.

    Returns:
        A list of paths, each built by joining the directory that contains
        the file with the file name. The list is empty when the walk yields
        no files.
    """
    paths = []
    for root, _dirs, names in os.walk(dir):
        # One extend per directory instead of one append per file.
        paths.extend(os.path.join(root, name) for name in names)
    return paths
def calcMD5(filePath, block_size=2**20):
    """Return the hexadecimal MD5 digest of the file at ``filePath``.

    The file is read in chunks of ``block_size`` bytes, so the whole file is
    never held in memory at once.

    Args:
        filePath: Path of the file to hash; it is opened in binary mode.
        block_size: Number of bytes requested per read (2**20 by default).

    Returns:
        The digest as a string of hexadecimal digits.
    """
    md5 = hashlib.md5()
    # `with` closes the handle even if a read fails part-way through.
    with open(filePath, 'rb') as f:
        # read() returns b'' at end of file; that is the sentinel ending iter().
        for data in iter(lambda: f.read(block_size), b''):
            md5.update(data)
    return md5.hexdigest()
# Serial baseline: hash every file under the data directory one after another
# and report the elapsed wall-clock time.
files = listFiles('/data/S01')
result = []
startTime = datetime.datetime.now()
result.extend(calcMD5(path) for path in files)
print(result)
endTime = datetime.datetime.now()
timeDiff = endTime - startTime
# total_seconds() covers the whole interval; .seconds alone ignores the
# days component of the timedelta.
timeDiffSeconds = int(timeDiff.total_seconds())
minutes, seconds = divmod(timeDiffSeconds, 60)
print('总费时{0}分钟{1}秒'.format(minutes, seconds))
使用多线程代码:
#!/usr/bin/python3
import os, hashlib, binascii, pymysql, time, json, datetime, threading, queue
def listFiles(dir):
    """Return the path of every file that ``os.walk`` reports beneath ``dir``.

    Args:
        dir: Root directory to walk recursively.

    Returns:
        A list of paths, each built by joining the directory that contains
        the file with the file name. The list is empty when the walk yields
        no files.
    """
    paths = []
    for root, _dirs, names in os.walk(dir):
        # One extend per directory instead of one append per file.
        paths.extend(os.path.join(root, name) for name in names)
    return paths
class threadMD5(threading.Thread):
    """Worker thread that hashes file paths pulled from a shared queue.

    The thread keeps taking paths without blocking until the queue is empty,
    then prints ``thread end`` and exits. Each digest is appended to the
    module-level ``result`` list, mirroring what the serial version of this
    script does with its own ``result`` list.
    """

    def __init__(self, queue):
        """Store the queue of pending file paths.

        Args:
            queue: A ``queue.Queue`` pre-filled with the paths to hash.
        """
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Drain the queue, hashing one file per item."""
        # Local import so the exception class is resolved from the module
        # itself, independent of whatever the global name `queue` is bound to
        # at run time.
        from queue import Empty

        while True:
            try:
                filePath = self.queue.get(block=False)
            except Empty:
                # Only an empty queue ends the worker; other errors propagate.
                print('thread end')
                break
            try:
                # Keep the digest; previously it was computed and dropped.
                result.append(calcMD5(filePath))
            finally:
                # Mark the item done even when hashing raised.
                self.queue.task_done()
def calcMD5(filePath, block_size=2**20):
    """Return the hexadecimal MD5 digest of the file at ``filePath``.

    The file is read in chunks of ``block_size`` bytes, so the whole file is
    never held in memory at once.

    Args:
        filePath: Path of the file to hash; it is opened in binary mode.
        block_size: Number of bytes requested per read (2**20 by default).

    Returns:
        The digest as a string of hexadecimal digits.
    """
    md5 = hashlib.md5()
    # `with` closes the handle even if a read fails part-way through.
    with open(filePath, 'rb') as f:
        # read() returns b'' at end of file; that is the sentinel ending iter().
        for data in iter(lambda: f.read(block_size), b''):
            md5.update(data)
    return md5.hexdigest()
startTime = datetime.datetime.now()
files = listFiles('/data/S01')
result = []
# Multithreading: load every path into a queue, then let two workers drain it.
# The queue is named `tasks` so the imported `queue` module is not rebound.
tasks = queue.Queue()
for path in files:
    tasks.put(path, block=False)
threads = []
for _ in range(2):
    t = threadMD5(tasks)
    # Attribute form; Thread.setDaemon() is deprecated since Python 3.10.
    t.daemon = True
    t.start()
    threads.append(t)
# Wait for both workers before reading the results and stopping the clock.
for t in threads:
    t.join()
print(result)
endTime = datetime.datetime.now()
timeDiff = endTime - startTime
# total_seconds() covers the whole interval; .seconds alone ignores the
# days component of the timedelta.
timeDiffSeconds = int(timeDiff.total_seconds())
minutes, seconds = divmod(timeDiffSeconds, 60)
print('总费时{0}分钟{1}秒'.format(minutes, seconds))
多进程代码:
import os, hashlib, time, datetime
import multiprocessing as mp
results = []
def listFiles(dir):
    """Return the path of every file that ``os.walk`` reports beneath ``dir``.

    Args:
        dir: Root directory to walk recursively.

    Returns:
        A list of paths, each built by joining the directory that contains
        the file with the file name. The list is empty when the walk yields
        no files.
    """
    paths = []
    for root, _dirs, names in os.walk(dir):
        # One extend per directory instead of one append per file.
        paths.extend(os.path.join(root, name) for name in names)
    return paths
def calcMD5(filePath, block_size=2**20):
    """Return the hexadecimal MD5 digest of the file at ``filePath``.

    The file is read in chunks of ``block_size`` bytes, so the whole file is
    never held in memory at once.

    Args:
        filePath: Path of the file to hash; it is opened in binary mode.
        block_size: Number of bytes requested per read (2**20 by default).

    Returns:
        The digest as a string of hexadecimal digits.
    """
    md5 = hashlib.md5()
    # `with` closes the handle even if a read fails part-way through.
    with open(filePath, 'rb') as f:
        # read() returns b'' at end of file; that is the sentinel ending iter().
        for data in iter(lambda: f.read(block_size), b''):
            md5.update(data)
    return md5.hexdigest()
def collect_results(result):
    """Pool callback: record the digest returned by one finished task.

    Args:
        result: The value returned by the worker function for one file,
            i.e. a single hex digest string.
    """
    # append, not extend: `result` is one string, and extend() would add it
    # to the list one character at a time.
    results.append(result)
if __name__ == "__main__":
    # Hash every file with a two-process pool and report the elapsed time.
    p = mp.Pool(processes=2)
    files = listFiles('/data/S01')
    startTime = datetime.datetime.now()
    # Keep each AsyncResult so worker failures can be surfaced below.
    pending = [
        p.apply_async(calcMD5, args=(f, ), callback=collect_results)
        for f in files
    ]
    p.close()
    p.join()
    for job in pending:
        # get() re-raises an exception from the worker; without it a failed
        # file would simply be missing from the output.
        job.get()
    print(results)
    endTime = datetime.datetime.now()
    timeDiff = endTime - startTime
    # total_seconds() covers the whole interval; .seconds alone ignores the
    # days component of the timedelta.
    timeDiffSeconds = int(timeDiff.total_seconds())
    minutes, seconds = divmod(timeDiffSeconds, 60)
    print('总费时{0}分钟{1}秒'.format(minutes, seconds))
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(2)
貌似是被 Hyper-V 坑了,换成物理机以及 VMware 的虚拟机后就正常了。
百度搜索python GIL