-
Notifications
You must be signed in to change notification settings - Fork 73
/
generic_preprocessing.py
287 lines (262 loc) · 10.4 KB
/
generic_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
"""
@author: Aayush Agrawal
@Purpose - Re-usable code in Python 3 for general data exploration and pre-processing in modeling process
"""
## Importing required libraries
import pandas as pd ## For DataFrame operation
import numpy as np ## Numerical python for matrix operations
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler ## Preprocessing function
import pandas_profiling ## For easy profiling of pandas DataFrame
import missingno as msno ## Missing value co-occurance analysis
####### Data Exploration ############
def print_dim(df):
    '''
    Print the dimensions (rows x columns) of a pandas DataFrame.
    Required Input -
        - df = Pandas DataFrame
    Expected Output -
        - Prints the data size to stdout
    '''
    rows, cols = df.shape
    print("Data size: Rows-{0} Columns-{1}".format(rows, cols))
def print_dataunique(df):
    '''
    Print per-column summary information for a pandas DataFrame.
    Required Input -
        - df = Pandas DataFrame
    Expected Output -
        - For each column prints: positional index, column name,
          Python type of the first value, number of unique values,
          and up to 5 sample unique values
    '''
    for counter, col in enumerate(df.columns):
        uniques = df.loc[:, col].unique()
        # Use positional access for the sample value: the original
        # df.loc[0, col] raised KeyError whenever the index had no label 0.
        sample_type = type(df[col].iloc[0]) if len(df) else None
        print(counter, col, sample_type, len(uniques), uniques[0:5])
def do_data_profiling(df, filename):
    '''
    Generate an HTML data-profiling report for a DataFrame.
    Required Input -
        - df = Pandas DataFrame
        - filename = output path, must carry a .html extension
    Expected Output -
        - HTML file with the data profiling summary written to filename
    '''
    report = pandas_profiling.ProfileReport(df)
    report.to_file(output_file = filename)
    print("Data profiling done")
def missing_value_analysis(df):
    '''
    Visualize the missing-data structure of a DataFrame with missingno.
    Required Input -
        - df = Pandas DataFrame
    Expected Output -
        - Chart of missing value co-occurrence (nullity matrix)
        - Chart of missing value correlation heatmap
    '''
    # Render both missingno views of the same frame.
    for plot_fn in (msno.matrix, msno.heatmap):
        plot_fn(df)
####### Basic helper function ############
def join_df(left, right, left_on, right_on=None, method='left'):
    '''
    Merge two pandas DataFrames on the given key columns.
    Required Input -
        - left = Pandas DataFrame 1
        - right = Pandas DataFrame 2
        - left_on = column(s) in left to merge on
        - right_on = column(s) in right to merge on; defaults to left_on
        - method = join type ('left', 'right', 'inner', 'outer')
    Expected Output -
        - Merged Pandas DataFrame; clashing right-hand columns get a "_y" suffix
    '''
    # Same key names on both sides unless caller says otherwise.
    right_keys = left_on if right_on is None else right_on
    merged = left.merge(right,
                        how=method,
                        left_on=left_on,
                        right_on=right_keys,
                        suffixes=("", "_y"))
    return merged
####### Pre-processing ############
def drop_allsame(df):
    '''
    Drop every column that holds a single constant value.
    Required Input -
        - df = Pandas DataFrame
    Expected Output -
        - Pandas DataFrame without the zero-variation columns
    '''
    # unique().size (not nunique()) so an all-NaN column counts as constant,
    # matching a manual len(unique()) == 1 check.
    constant_cols = [col for col in df.columns
                     if df.loc[:, col].unique().size == 1]
    return df.drop(constant_cols, axis=1)
def treat_missing_numeric(df, columns, how='mean'):
    '''
    Impute missing values in numeric columns (mutates df in place).
    Required Input -
        - df = Pandas DataFrame
        - columns = list of column names to be imputed
        - how = valid values are 'mean', 'mode', 'median', 'ffill',
          or a numeric constant
    Expected Output -
        - Pandas DataFrame with imputed missing values in the given columns
    '''
    # NOTE: .ix was removed from pandas (1.0); .loc is the replacement used here.
    if how == 'mean':
        for i in columns:
            print("Filling missing values with mean for columns - {0}".format(i))
            df.loc[:, i] = df.loc[:, i].fillna(df.loc[:, i].mean())
    elif how == 'mode':
        for i in columns:
            print("Filling missing values with mode for columns - {0}".format(i))
            # .mode() returns a Series; passing it whole to fillna aligns on
            # index and silently leaves most rows unfilled. Use the first
            # mode value; skip columns whose mode is empty (all-NaN).
            modes = df.loc[:, i].mode()
            if not modes.empty:
                df.loc[:, i] = df.loc[:, i].fillna(modes[0])
    elif how == 'median':
        for i in columns:
            print("Filling missing values with median for columns - {0}".format(i))
            df.loc[:, i] = df.loc[:, i].fillna(df.loc[:, i].median())
    elif how == 'ffill':
        for i in columns:
            print("Filling missing values with forward fill for columns - {0}".format(i))
            # fillna(method='ffill') is deprecated; .ffill() is equivalent.
            df.loc[:, i] = df.loc[:, i].ffill()
    elif type(how) in (int, float):
        for i in columns:
            print("Filling missing values with {0} for columns - {1}".format(how, i))
            df.loc[:, i] = df.loc[:, i].fillna(how)
    else:
        print("Missing value fill cannot be completed")
    return df
def treat_missing_categorical(df, columns, how='mode'):
    '''
    Impute missing values in categorical columns (mutates df in place).
    Required Input -
        - df = Pandas DataFrame
        - columns = list of column names to be imputed
        - how = valid values are 'mode', any string, or a numeric value
          (numerics are stored as their string form)
    Expected Output -
        - Pandas DataFrame with imputed missing values in the given columns
    '''
    # NOTE: .ix was removed from pandas (1.0); .loc is the replacement used here.
    if how == 'mode':
        for i in columns:
            print("Filling missing values with mode for columns - {0}".format(i))
            # Guard against an empty mode (all-NaN column), which would
            # raise a KeyError on [0].
            modes = df.loc[:, i].mode()
            if not modes.empty:
                df.loc[:, i] = df.loc[:, i].fillna(modes[0])
    elif type(how) == str:
        for i in columns:
            print("Filling missing values with {0} for columns - {1}".format(how, i))
            df.loc[:, i] = df.loc[:, i].fillna(how)
    elif type(how) in (int, float):
        for i in columns:
            print("Filling missing values with {0} for columns - {1}".format(how, i))
            # Numeric fill values are coerced to strings to keep the
            # column categorical.
            df.loc[:, i] = df.loc[:, i].fillna(str(how))
    else:
        print("Missing value fill cannot be completed")
    return df
def min_max_scaler(df, columns):
    '''
    Min-Max scale the selected columns of a DataFrame.
    Required Input -
        - df = Pandas DataFrame
        - columns = list of column names to be min-max scaled
    Expected Output -
        - data = Pandas DataFrame of scaled columns (original index kept)
        - scaler = fitted MinMaxScaler holding the scaling rules
    '''
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(df.loc[:, columns])
    # Rebuild a frame carrying the caller's index and column names.
    data = pd.DataFrame(scaled_values, index=df.index, columns=columns)
    return data, scaler
def z_scaler(df, columns):
    '''
    Standardize the selected columns (zero mean, unit variance).
    Required Input -
        - df = Pandas DataFrame
        - columns = list of column names to be standardized
    Expected Output -
        - data = Pandas DataFrame of standardized columns (original index kept)
        - scaler = fitted StandardScaler holding the scaling rules
    '''
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df.loc[:, columns])
    # Rebuild a frame carrying the caller's index and column names.
    data = pd.DataFrame(scaled_values, index=df.index, columns=columns)
    return data, scaler
def label_encoder(df, columns):
    '''
    Label-encode the selected columns (mutates df in place).
    Required Input -
        - df = Pandas DataFrame
        - columns = list of column names to be label encoded
    Expected Output -
        - df = Pandas DataFrame with label-encoded columns
        - le_dict = dict mapping column name -> fitted LabelEncoder
    '''
    le_dict = {}
    for col in columns:
        print("Label encoding column - {0}".format(col))
        encoder = LabelEncoder()
        # Values are stringified once so fit and transform see identical data.
        string_values = list(df[col].values.astype('str'))
        encoder.fit(string_values)
        df[col] = encoder.transform(string_values)
        le_dict[col] = encoder
    return df, le_dict
def one_hot_encoder(df, columns):
    '''
    One-hot encode the selected columns.
    Required Input -
        - df = Pandas DataFrame
        - columns = list of column names to be one-hot encoded
    Expected Output -
        - Pandas DataFrame with dummy columns appended and the original
          source columns dropped
    '''
    encoded = df
    for col in columns:
        print("One-Hot encoding column - {0}".format(col))
        dummy_frame = pd.get_dummies(encoded[col], prefix=col, drop_first=False)
        encoded = pd.concat([encoded, dummy_frame], axis=1)
    # Drop the now-redundant source columns in one pass.
    return encoded.drop(columns, axis=1)
####### Feature Engineering ############
def create_date_features(df,column, date_format = None, more_features = False, time_features = False):
'''
Function to extract date features
Required Input -
- df = Pandas DataFrame
- date_format = Date parsing format
- columns = Columns name containing date field
- more_features = To get more feature extracted
- time_features = To extract hour from datetime field
Expected Output -
- df = Pandas DataFrame with additional extracted date features
'''
if date_format is None:
df.loc[:,column] = pd.to_datetime(df.loc[:,column])
else:
df.loc[:,column] = pd.to_datetime(df.loc[:,column],format = date_format)
df.loc[:,column+'_Year'] = df.loc[:,column].dt.year
df.loc[:,column+'_Month'] = df.loc[:,column].dt.month.astype('uint8')
df.loc[:,column+'_Week'] = df.loc[:,column].dt.week.astype('uint8')
df.loc[:,column+'_Day'] = df.loc[:,column].dt.day.astype('uint8')
if more_features:
df.loc[:,column+'_Quarter'] = df.loc[:,column].dt.quarter.astype('uint8')
df.loc[:,column+'_DayOfWeek'] = df.loc[:,column].dt.dayofweek.astype('uint8')
df.loc[:,column+'_DayOfYear'] = df.loc[:,column].dt.dayofyear
if time_features:
df.loc[:,column+'_Hour'] = df.loc[:,column].dt.hour.astype('uint8')
return df
def target_encoder(train_df, col_name, target_name, test_df=None, how='mean'):
    '''
    Target-encode a column by aggregating the target per category.
    Required Input -
        - train_df = training Pandas DataFrame
        - col_name = name of the source (categorical) column
        - target_name = name of the target column
        - test_df = optional testing Pandas DataFrame
        - how = aggregation, 'mean' by default (can also be 'count')
    Expected Output -
        - train_df with the encoded feature added; if test_df is given,
          a (train_df, test_df) tuple, both with the feature added
    '''
    encoded_name = col_name + '_' + target_name + '_' + how
    aggregate_data = (train_df.groupby(col_name)[target_name]
                      .agg([how])
                      .reset_index()
                      .rename(columns={how: encoded_name}))
    encoded_train = join_df(train_df, aggregate_data, left_on=col_name)
    if test_df is None:
        return encoded_train
    return encoded_train, join_df(test_df, aggregate_data, left_on=col_name)