# DS6_black.py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
#importing the dataset
# Load the training data.
df_train = pd.read_csv('train.csv')
# print(df_train.head())
# Load the test data.
df_test = pd.read_csv('test.csv')
# print(df_test.head())
# Merge train and test into one frame; the preprocessing steps below all
# operate on this merged `df`.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so use
# pd.concat, which is what append delegated to. As with append, each frame
# keeps its own row labels (ignore_index is left at its default of False).
df = pd.concat([df_train, df_test])
# print(df.head(10))
## Basic inspection
# print(df.info())
# print(df.describe())
# Remove the User_ID column from the merged frame (in place).
df.drop(columns=['User_ID'], inplace=True)
# print(df.head())
'''
#not effective-. use map
df['Gender']=pd.get_dummies(df['Gender'],drop_first=1)
'''
## Encode the categorical feature Gender: 'F' -> 0, 'M' -> 1.
# Series.map turns any value that is not a key of the mapping into NaN.
GENDER_CODES = {'F': 0, 'M': 1}
df['Gender'] = df['Gender'].map(GENDER_CODES)
# print(df.head())

## Encode the categorical feature Age.
# print(df['Age'].unique())
# The brackets are listed in ascending order and numbered from 1, so the
# integer codes increase with the bracket.
AGE_BRACKETS = ('0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+')
AGE_CODES = {bracket: code for code, bracket in enumerate(AGE_BRACKETS, start=1)}
df['Age'] = df['Age'].map(AGE_CODES)
# print(df.head())
'''
##second technqiue
from sklearn import preprocessing
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
# Encode labels in column 'species'.
df['Age'] = label_encoder.fit_transform(df['Age'])
df['Age'].unique()
'''
## Encode the categorical feature City_Category as indicator columns.
# drop_first=True omits the indicator column of the first category level.
# The dummies are computed once (computing them a second time would only
# repeat the same work and overwrite df_city with an identical frame).
df_city = pd.get_dummies(df['City_Category'], drop_first=True)
# print(df_city.head())
# Attach the indicator columns to the right of the existing columns ...
df = pd.concat([df, df_city], axis=1)
# print(df.head())
# ... and remove the source column they replace.
df.drop(columns=['City_Category'], inplace=True)
# print(df.head())
## Missing values
# print(df.isnull().sum())
# Replace the missing entries of Product_Category_2 and Product_Category_3
# with the most frequent value (mode) of the respective column.
# Inspection helpers that apply to either column:
#   print(df[column].unique())
#   print(df[column].value_counts())
#   print(df[column].mode()[0])
for column in ('Product_Category_2', 'Product_Category_3'):
    # mode() returns a Series; its first entry is used as the fill value.
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)
    # print(df[column].isnull().sum())
# print(df.head())
# print(df['Stay_In_Current_City_Years'].unique())
# Remove the literal '+' character so the column can be cast to int.
# regex=False is passed explicitly: the default of `regex` differs between
# pandas versions, and a bare '+' is not a valid regular expression, so
# relying on the default is version-dependent.
df['Stay_In_Current_City_Years'] = (
    df['Stay_In_Current_City_Years'].str.replace('+', '', regex=False)
)
# print(df.head())
# print(df.info())
# Cast the cleaned column and the 'B' and 'C' columns to int in one call.
# NOTE(review): 'B' and 'C' are presumably the City_Category indicator
# columns created earlier in the script -- confirm.
df = df.astype({'Stay_In_Current_City_Years': int, 'B': int, 'C': int})
# print(df.info())
## Feature importance and regression model comparison
# NOTE(review): X (feature frame) and Y (target) are not defined anywhere in
# the visible script; they have to be built from `df` before this section
# can run -- confirm how they were meant to be derived.

# Fit an extra-trees regressor on all features to obtain importance scores.
# feature_imp is stored but not read again in the visible code.
selector = ExtraTreesRegressor()
selector.fit(X, Y)
feature_imp = selector.feature_importances_

# Drop three features before training the models below.
# NOTE(review): `df` no longer has a City_Category column after the encoding
# step above, so if X is derived from `df` this drop raises KeyError --
# confirm the intended column list.
X.drop(['Gender', 'City_Category', 'Marital_Status'], axis=1, inplace=True)

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)


def _fit_and_report(label, model):
    """Fit *model* on the training split and print its test-set MSE.

    Reads the module-level X_train, Y_train, X_test and Y_test.

    Args:
        label: Text printed in front of the error value.
        model: Estimator exposing ``fit`` and ``predict``.

    Returns:
        The model's predictions for X_test.
    """
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    print(label, mean_squared_error(Y_test, predictions))
    return predictions


lin_reg = LinearRegression()
knn = KNeighborsRegressor()
dec_tree = DecisionTreeRegressor()
ran_for = RandomForestRegressor()
# Classifier instance; it is not fitted or used in the visible code.
dtc = DecisionTreeClassifier()

print("MEAN SQUARED ERRORS")
Y_pred_lin_reg = _fit_and_report("Linear Regression: ", lin_reg)
Y_pred_knn = _fit_and_report("KNN regression: ", knn)
Y_pred_dec = _fit_and_report("Decision tree regression: ", dec_tree)
Y_pred_ran_for = _fit_and_report("Random forest regression: ", ran_for)
'''
MEAN SQUARED ERRORS
Linear Regression: 22044840.101023477
KNN regression: 10469708.932551857
Decision tree regression: 9843766.502839973
Random forest regression: 9058790.950768305
'''