-
Notifications
You must be signed in to change notification settings - Fork 26
/
statistics.py
109 lines (105 loc) · 5.58 KB
/
statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import json
import argparse
import os
# Training-split ratios to evaluate: 0.1, 0.2, ..., 0.9 (as raw float products,
# so some entries carry float rounding noise such as 0.30000000000000004).
percent_list = [step * 0.1 for step in range(1, 10)]
def preprocess_log(ipath, opath):
    """Run the preprocessing script on a raw log file.

    Invokes ``python code/preprocessing.py -rawlog <ipath> -o <opath>``
    relative to the current working directory. The script itself is not part
    of this file; presumably it writes the processed log to ``opath``.

    Args:
        ipath: Path of the raw log file handed to the script via ``-rawlog``.
        opath: Path handed to the script via ``-o``.

    Returns:
        ``opath``, unchanged, so callers can use it as the processed-log path.

    Raises:
        FileNotFoundError: If no ``python`` executable can be found.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    import subprocess

    processed_log = opath
    # Pass the arguments as a list (no shell) so that paths containing spaces
    # or shell metacharacters reach the script intact instead of being split
    # or interpreted by a shell.
    command_for_preprocessing = [
        "python", "code/preprocessing.py",
        "-rawlog", ipath,
        "-o", processed_log,
    ]
    # The exit status is deliberately not checked, matching the previous
    # best-effort behaviour.
    subprocess.run(command_for_preprocessing, check=False)
    return processed_log
def get_num_of_line(ipath):
    """Return the number of lines in the text file at ``ipath``.

    Every line yielded by iterating the file is counted, including blank
    lines and a final line without a trailing newline.

    Args:
        ipath: Path of the file to read.

    Returns:
        The line count as an int (0 for an empty file).
    """
    # The context manager guarantees the handle is closed even if reading
    # raises part-way through (e.g. on a decoding error).
    with open(ipath, 'r') as ifile:
        return sum(1 for _ in ifile)
def get_statistics_for_oov(datasets, ipath_list, opath):
    """Compute out-of-vocabulary (OOV) statistics per dataset and train ratio.

    For every dataset and every ratio in ``percent_list`` the log file is
    split chronologically: the first ``round(ratio * total_lines)`` processed
    lines are the training part, the rest the test part. A test word is OOV
    when it never occurs in the training part. Words are whitespace-separated
    tokens (``str.split()``).

    The result is written to ``opath`` as JSON shaped like
    ``{dataset: {ratio: {...}}}`` where the ratio is rounded to one decimal
    (``json`` serialises these float keys as strings) and the inner dict holds
    ``train_num_of_logs``, ``train_num_of_words``, ``test_num_of_logs``,
    ``test_num_of_words``, ``test_oov_num_of_logs``, ``test_oov_num_of_words``,
    ``test_oov_num_of_wordKind``, ``test_oov_log_percent`` and
    ``test_oov_word_percent``.

    Args:
        datasets: Sequence of dataset names, used as top-level result keys.
        ipath_list: Sequence of log-file paths, aligned by index with
            ``datasets``.
        opath: Path of the JSON file to write.

    Returns:
        None. Progress is printed to stdout.

    Raises:
        IndexError: If ``ipath_list`` has fewer entries than ``datasets``.
    """
    final_result = {}
    for index, dataset in enumerate(datasets):
        path = ipath_list[index]
        final_result[dataset] = {}
        # The total line count does not depend on the ratio, so read it once
        # per dataset rather than once per ratio.
        num_of_logs = get_num_of_line(path)
        for train_percent in percent_list:
            train_line = round(train_percent * num_of_logs)
            test_line = num_of_logs - train_line
            train_words = {}
            test_words = {}
            test_oov_line = 0
            count = 1
            with open(path, 'r') as file:
                for log in file:
                    # NOTE(review): blank lines are skipped here without
                    # advancing `count`, but get_num_of_line() includes them
                    # in the total, so with blank lines present the effective
                    # split differs from train_line/test_line. Kept as-is to
                    # preserve existing results.
                    if log == '' or log == '\n':
                        continue
                    words = log.split()
                    if count <= train_line:
                        for word in words:
                            train_words[word] = train_words.get(word, 0) + 1
                    else:
                        log_with_oov = False
                        for word in words:
                            test_words[word] = test_words.get(word, 0) + 1
                            # Count each test line at most once, on its first
                            # OOV word.
                            if not log_with_oov and word not in train_words:
                                log_with_oov = True
                                test_oov_line += 1
                    count += 1

            train_word_total = sum(train_words.values())
            test_word_total = sum(test_words.values())

            # Distinct OOV words and their total number of occurrences.
            test_oov_kind_num = 0
            test_oov_all_num = 0
            for word, occurrences in test_words.items():
                if word not in train_words:
                    test_oov_kind_num += 1
                    test_oov_all_num += occurrences

            # Guard the ratios: a very short or empty file can leave the test
            # split with no lines or no words, which used to raise
            # ZeroDivisionError.
            test_oov_log_percent = test_oov_line / test_line if test_line else 0.0
            test_oov_word_percent = (
                test_oov_all_num / test_word_total if test_word_total else 0.0)

            final_result[dataset][round(train_percent, 1)] = {
                'train_num_of_logs': train_line,
                'train_num_of_words': train_word_total,
                'test_num_of_logs': test_line,
                'test_num_of_words': test_word_total,
                'test_oov_num_of_logs': test_oov_line,
                'test_oov_num_of_words': test_oov_all_num,
                'test_oov_num_of_wordKind': test_oov_kind_num,
                'test_oov_log_percent': test_oov_log_percent,
                'test_oov_word_percent': test_oov_word_percent,
            }
            print(dataset, " ", train_percent, " finished", "count:", count)
        print(dataset, " finished")
        print('--------------------------')
    with open(opath, 'w') as ofile:
        ofile.write(json.dumps(final_result))
if __name__ == '__main__':
    # Command-line entry point: -i and -t take comma-separated lists that are
    # matched by position (i-th input file belongs to i-th log type).
    parser = argparse.ArgumentParser()
    # The three path/type options are required: without them the script
    # previously crashed later with an AttributeError/TypeError on None.
    parser.add_argument('-i', required=True, help='input file')
    parser.add_argument('-t', required=True, help='log type')
    parser.add_argument('-o', required=True, help='output file')
    parser.add_argument('-preprocess', help='whether to preprocess or not', default=False, type=int)
    args = parser.parse_args()
    input_list = args.i.split(',')
    log_type = args.t.split(',')
    # Every log type needs an input file; report this as a usage error
    # instead of failing later with an IndexError.
    if len(input_list) < len(log_type):
        parser.error('-i must provide at least one input file per log type given in -t')

    temp_list = []
    try:
        if args.preprocess:
            # Preprocessed copies go to middle/<log type>.log.
            os.makedirs('middle', exist_ok=True)
            for logt, raw_path in zip(log_type, input_list):
                temp_list.append('middle/' + logt + '.log')
                preprocess_log(raw_path, temp_list[-1])
            input_list = temp_list
        get_statistics_for_oov(log_type, input_list, args.o)
    finally:
        # Remove the intermediate files even when statistics fail; skip any
        # file the preprocessing step never produced.
        for middle_file in temp_list:
            if os.path.exists(middle_file):
                os.remove(middle_file)