-
Notifications
You must be signed in to change notification settings - Fork 0
/
freq_dict.py
114 lines (94 loc) · 3.42 KB
/
freq_dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import os
import sys
import string
import json
import operator
# Command-line arguments: argv[1] is the directory whose immediate
# subdirectories are scanned, argv[2] is the directory under which the
# JSON output is written.
if len(sys.argv) < 3:
    # Exit with a usage hint instead of a bare IndexError traceback.
    sys.exit('usage: %s <walk_dir> <output_dir>' % sys.argv[0])
walk_dir = sys.argv[1]
output_dir = sys.argv[2]
def get_immediate_subdirectories(a_dir):
    """Return the names of the directories located directly inside a_dir.

    Only direct children are inspected; plain files are skipped. Names (not
    full paths) are returned, sorted, so callers process the subdirectories
    in a reproducible order -- os.listdir() alone yields entries in
    arbitrary order.

    Args:
        a_dir: path of the directory to inspect.

    Returns:
        Sorted list of subdirectory names; empty if there are none.

    Raises:
        OSError: propagated from os.listdir() when a_dir cannot be listed.
    """
    return sorted(
        name for name in os.listdir(a_dir)
        if os.path.isdir(os.path.join(a_dir, name))
    )
# Echo the directory exactly as it was given on the command line.
print('walk_dir = ' + walk_dir)
# If the current working directory may change while the script runs, it is
# recommended to convert program arguments to absolute paths immediately, so
# that the `root` values produced by os.walk further down are absolute as
# well. Example:
# walk_dir = os.path.abspath(walk_dir)
# NOTE: only the printed value below is absolute; walk_dir itself is not
# rebound here.
print('walk_dir (absolute) = ' + os.path.abspath(walk_dir))
# Create a dict<> mapping each keyword to its document list
# [
# {"heather": {
# "doc": {
# "/home/alok/Projects/security/key_simple/1.": 2
# },
# "freq": 2
# }
# },
# {"ani": {
# "doc": {
# "/home/alok/Projects/security/key_simple/1.": 1
# },
# "freq": 1
# }
# },
# {"aani": {
# "doc": {
# "/home/alok/Projects/security/key_simple/1.": 1
# },
# "freq": 1
# }
# }
# ]
def write_to_file(dict_obj, output_dir):
    """Dump dict_obj as JSON into the next numbered file under output_dir.

    The target path is built as
    ``output_dir + op_file_name + str(func_call_counter) + '.json'``, where
    op_file_name and func_call_counter are module-level globals that must be
    set before the first call. The counter is incremented after the file has
    been written, so consecutive calls produce consecutively numbered files.

    Args:
        dict_obj: JSON-serialisable object handed to json.dump().
        output_dir: path prefix of the output file; missing directories in
            the resulting path are created.

    Raises:
        OSError: if the target directory cannot be created or the file
            cannot be opened for writing.
    """
    global op_file_name
    global func_call_counter
    print("dumping.....")
    output_file_path = output_dir + op_file_name + str(func_call_counter) + '.json'
    target_dir = os.path.dirname(output_file_path)
    if target_dir:
        # Create first and inspect the failure afterwards: this avoids the
        # race between an existence check and makedirs() when the directory
        # appears in between.
        try:
            os.makedirs(target_dir)
        except OSError:
            if not os.path.isdir(target_dir):
                raise
    with open(output_file_path, 'w') as f:
        json.dump(dict_obj, f)
    func_call_counter += 1
# File-name prefix read by write_to_file(); the running counter and '.json'
# are appended to it.
op_file_name = '/freq_dict-'

# Every immediate subdirectory of walk_dir is handled on its own and gets a
# matching subdirectory under output_dir.
# NOTE(review): the nesting below was reconstructed from the statement order;
# confirm against the original layout.
subdirs = get_immediate_subdirectories(walk_dir)
for walk_subdir in subdirs:
    absolute_path = os.path.join(walk_dir, walk_subdir)
    print(absolute_path)
    count = 0  # files seen so far; drives the progress message
    dict_obj = {}  # only referenced by the disabled indexing code below
    file_iterator = 0  # running file index; ends up as the total file count
    # The child-directory list yielded by os.walk is bound to `_dirs` rather
    # than `subdirs`, so the list driving the outer loop is never rebound.
    for root, _dirs, files in os.walk(absolute_path):
        # print("changing folder"+root)
        for filename in files:
            count += 1
            if count % 50000 == 0:
                print("Done %d objects" % (count))
            # Keyword indexing is currently disabled; kept for reference.
            # file_path = os.path.join(root, filename)
            # with open(file_path, 'rb') as f:
            #     f_content = f.readlines()
            #     for keyword in f_content :
            #         keyword = keyword.strip()
            #         if keyword in dict_obj :
            #             dict_obj[keyword]['tf'] += 1
            #             if file_iterator in dict_obj[keyword]['did'] :
            #                 dict_obj[keyword]['did'][file_iterator] += 1
            #             else :
            #                 dict_obj[keyword]['did'][file_iterator] = 1
            #                 dict_obj[keyword]['df'] += 1
            #         else :
            #             dict_obj[keyword] = {}
            #             dict_obj[keyword]['df'] = 1
            #             dict_obj[keyword]['tf'] = 1
            #             dict_obj[keyword]['did'] = {}
            #             dict_obj[keyword]['did'][file_iterator] = 1
            file_iterator += 1
    # for i in dict_obj:
    #     dict_obj[i].pop('did', None)
    print("sorting.....")
    # dict_obj = list(dict_obj.iteritems())
    # dict_obj.sort(key=lambda x:x[1]["df"], reverse=True)
    # dict_obj = [{x:y} for x,y in dict_obj]
    # for i in range(len(dict_obj)/1000 +1):
    output_usr_dir = os.path.join(output_dir, walk_subdir)
    #     write_to_file(dict_obj[0:1000], output_usr_dir)
    # Restart the output file numbering for each subdirectory.
    func_call_counter = 1
    write_to_file([{'total_number_of_files': file_iterator}], output_usr_dir)