-
Notifications
You must be signed in to change notification settings - Fork 0
/
rm_same_file.py
93 lines (79 loc) · 2.17 KB
/
rm_same_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import os, sys
import hashlib
import threading, multiprocessing
from os.path import getsize
from tqdm import tqdm
class DiskWalk(object):
    """Collect the paths of all files located beneath a root directory."""

    def __init__(self, path):
        # Root directory that paths() will traverse.
        self.path = path

    def paths(self):
        """Return a list with the joined path of every file under ``self.path``.

        The tree is traversed recursively with ``os.walk``; directories
        themselves are not included in the result, only the files they hold.
        """
        found = []
        for parent, _subdirs, names in os.walk(self.path):
            found.extend(os.path.join(parent, name) for name in names)
        return found
def create_checksum(path):
    """Compute the MD5 digest of a file.

    Args:
        path: Path of the file to hash.

    Returns:
        The raw 16-byte MD5 digest (``bytes``) of the file's content.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    checksum = hashlib.md5()
    # The context manager guarantees the handle is closed even when a read
    # fails part-way through the file.
    with open(path, 'rb') as fp:
        while True:
            # Read in fixed-size chunks so large files are never fully loaded.
            buffer = fp.read(8192)
            if not buffer:
                break
            checksum.update(buffer)
    return checksum.digest()
def find_dupes(path):
    """Build a dictionary of duplicate files found under ``path``.

    Two files are treated as identical when both their size and their MD5
    digest match.

    Returns:
        A dict whose keys are the paths of duplicate files and whose values
        are the path of the first file seen with the same size and digest.
    """
    first_seen = {}
    duplicates = {}
    for candidate in DiskWalk(path).paths():
        fingerprint = (getsize(candidate), create_checksum(candidate))
        if fingerprint not in first_seen:
            # First file with this content: remember it as the reference copy.
            first_seen[fingerprint] = candidate
            continue
        duplicates[candidate] = first_seen[fingerprint]
    return duplicates
def deal(record, rt, folders):
    """Delete files whose MD5 digest appears in ``record``.

    Args:
        record: Collection of digests, as produced by ``create_checksum``,
            identifying the file contents that must be removed.
        rt: Root directory that contains the folders to scan.
        folders: Names of the sub-directories of ``rt`` to scan. Only the
            direct entries of each folder are examined.
    """
    # Build the lookup once: membership tests on a list are linear, and one
    # test is performed for every scanned file.
    unwanted = frozenset(record)
    for folder in folders:
        folder_path = os.path.join(rt, folder)
        for name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, name)
            if create_checksum(file_path) in unwanted:
                os.remove(file_path)
def rm_none(dataset_dir='./char_datasets', labels_dir='./labels', n_threads=16):
    """Delete dataset files whose content matches a file in the labels directory.

    Every entry of ``labels_dir`` is hashed with ``create_checksum``. The
    entries of ``dataset_dir`` are then split into batches, and each batch is
    handed to a worker thread running ``deal``, which removes the files whose
    digest matches one of the collected digests.

    Args:
        dataset_dir: Directory whose immediate sub-folders are scanned.
        labels_dir: Directory whose files supply the digests to remove.
        n_threads: Upper bound on the number of worker threads.

    Raises:
        ValueError: If ``n_threads`` is less than 1.
    """
    if n_threads < 1:
        raise ValueError('n_threads must be at least 1')

    folders = os.listdir(dataset_dir)
    # A set gives constant-time membership tests inside the workers.
    record = {
        create_checksum(os.path.join(labels_dir, name))
        for name in os.listdir(labels_dir)
    }

    # Keep the historical batch size of 26 folders per thread, but grow it
    # when there are more folders than n_threads * 26 so that no folder is
    # silently skipped.
    per_worker = max(26, -(-len(folders) // n_threads))

    workers = []
    for start in range(0, len(folders), per_worker):
        batch = folders[start:start + per_worker]
        worker = threading.Thread(target=deal,
                                  args=(record, dataset_dir, batch))
        worker.start()
        workers.append(worker)

    # Wait for every worker so the cleanup is complete when this returns.
    for worker in workers:
        worker.join()
# Run the cleanup only when this file is executed as a script, so importing
# the module does not delete anything.
if __name__ == '__main__':
    rm_none()