-
Notifications
You must be signed in to change notification settings - Fork 0
/
FileDAO.py
92 lines (68 loc) · 3.08 KB
/
FileDAO.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import pandas as pd
import math
from itertools import islice
class FileDAO:
    """CSV-backed data holder built on pandas.

    Loads a dataset, optionally discretizes its numeric attributes into
    'left'/'right' labels, splits the working dataframe into k consecutive
    folds, and exports results as CSV.

    Attributes:
        file_path: path of the CSV file most recently loaded.
        raw_dataframe: unconverted data read by ``load_raw_dataframe``
            (``None`` until that method is called).
        dataframe: working data used for fold generation and export.
        folds_d: cache mapping ``k`` to the list of ``k`` fold dataframes.
    """

    def __init__(self):
        self.file_path = ""
        self.raw_dataframe = None
        self.dataframe = pd.DataFrame()
        # k -> list of k dataframes, each up to ceil(len(dataframe) / k) rows
        self.folds_d = {}

    def load_dataframe(self, filepath, types):
        """Read ``filepath`` into the working dataframe.

        Args:
            filepath: path of the CSV file to read.
            types: forwarded to ``pandas.read_csv`` as ``dtype``.
        """
        self.file_path = filepath
        self.dataframe = pd.read_csv(self.file_path, dtype=types)
        # Folds cached for the previous dataframe no longer describe the data.
        self.folds_d = {}

    def load_raw_dataframe(self, filepath):
        """Read ``filepath`` into ``raw_dataframe`` with pandas' default dtypes."""
        self.file_path = filepath
        self.raw_dataframe = pd.read_csv(self.file_path)

    def get_dataframe(self):
        """Return the working dataframe."""
        return self.dataframe

    def get_folds(self, k):
        """Return the list of ``k`` folds, generating and caching it on first use."""
        if k not in self.folds_d:
            self.generate_list_of_folds(k)
        return self.folds_d[k]

    @staticmethod
    def _cutting_value(frame, attribute, target):
        """Return the split point used to discretize ``attribute``.

        Rows are sorted by ``attribute``; for every pair of adjacent rows whose
        ``target`` values differ, the midpoint of their attribute values is
        collected. The result is the mean of those midpoints.

        When no adjacent pair differs (for example a single-class dataset)
        there is no midpoint to average, so the column mean is used instead of
        dividing by zero.
        """
        ordered = frame.sort_values(by=attribute)
        values = ordered[attribute].tolist()
        classes = ordered[target].tolist()
        midpoints = [
            float(current + previous) / 2.0
            for previous, current, previous_class, current_class
            in zip(values, values[1:], classes, classes[1:])
            if previous_class != current_class
        ]
        if not midpoints:
            return float(ordered[attribute].mean())
        return sum(midpoints) / float(len(midpoints))

    def assemble_numeric_columns(self, numerics, target):
        """Discretize the numeric attributes of ``raw_dataframe``.

        Each column listed in ``numerics`` is replaced by 'left' where the value
        is <= its cutting value and 'right' otherwise (missing values compare
        false and therefore become 'right'). The ``target`` column and any
        column not listed in ``numerics`` are copied unchanged.

        The result replaces ``self.dataframe`` (with a fresh 0..n-1 index) and
        clears the fold cache. ``load_raw_dataframe`` must have been called.

        Args:
            numerics: iterable of column names to discretize.
            target: name of the class column.

        Returns:
            The converted dataframe (also stored in ``self.dataframe``).
        """
        raw = self.raw_dataframe
        cutting_value = {}
        for attribute in numerics:
            cutting_value[attribute] = self._cutting_value(raw, attribute, target)

        side_labels = {True: 'left', False: 'right'}
        converted = {}
        for attribute in raw.columns:
            if attribute != target and attribute in cutting_value:
                is_left = raw[attribute].le(cutting_value[attribute])
                converted[attribute] = is_left.map(side_labels)
            else:
                converted[attribute] = raw[attribute]

        # Build the frame in one step; passing ``columns`` keeps the original
        # column order regardless of dict ordering.
        self.dataframe = pd.DataFrame(
            converted, columns=raw.columns
        ).reset_index(drop=True)
        self.folds_d = {}
        return self.dataframe

    def generate_list_of_folds(self, k):
        """Split the working dataframe into ``k`` consecutive folds and cache them.

        Each fold holds up to ``ceil(len(dataframe) / k)`` rows taken in their
        current order (no shuffling). The last folds may be shorter, or empty
        when the fold size times ``k`` overshoots the number of rows.
        """
        length = len(self.dataframe)
        size = int(math.ceil(length / float(k)))
        folds = []
        for i in range(k):
            start = i * size
            end = min(start + size, length)
            # .iloc makes the slice positional regardless of the index type.
            folds.append(self.dataframe.iloc[start:end])
        self.folds_d[k] = folds

    def save_converted_dataframe(self, path_and_name):
        """Write the working dataframe to ``path_and_name`` as CSV without the index.

        Returns:
            Always 0.
        """
        self.dataframe.to_csv(path_and_name, index=False)
        return 0

    @staticmethod
    def save_dictionary(prefix, sufix, k, n_tree, info_as_dictionary):
        """Write ``info_as_dictionary`` as a CSV file under ``collected_data/``.

        The file is named ``<prefix>_k_<k>_ntree_<n_tree><sufix>.csv``. The
        ``collected_data`` directory is not created here.
        """
        filename = prefix + "_k_" + str(k) + "_ntree_" + str(n_tree) + sufix
        path = "collected_data/" + filename + ".csv"
        pd.DataFrame(info_as_dictionary).to_csv(path, index=False)