-
Notifications
You must be signed in to change notification settings - Fork 0
/
12223737_proj_2_2.py
85 lines (64 loc) · 2.64 KB
/
12223737_proj_2_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# -*- coding: utf-8 -*-
"""12223737_Proj#2-2.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1tueSiF7huFFWtFr-AUKwEU3rWT34PZUD
"""
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
def sort_dataset(dataset_df):
    """Return the rows of ``dataset_df`` ordered by ascending ``year``.

    Args:
        dataset_df: DataFrame that contains a ``year`` column.

    Returns:
        A new DataFrame with the same rows, sorted by ``year``; each row
        keeps its original index label.
    """
    # Ascending order is the default for sort_values.
    return dataset_df.sort_values(by='year')
def split_dataset(dataset_df, train_size=1718):
    """Rescale ``salary`` and split the frame positionally into train/test.

    The ``salary`` column is multiplied by 0.001 on a copy of the data, so
    the caller's DataFrame is not modified. The first ``train_size`` rows
    (by position, not by index label) form the training part and the
    remaining rows form the test part.

    Args:
        dataset_df: DataFrame that contains a ``salary`` column.
        train_size: Number of leading rows used for training. Defaults to
            1718, the split position this script has always used.

    Returns:
        A tuple ``(X_train, X_test, Y_train, Y_test)`` where the ``X`` parts
        are DataFrames without the ``salary`` column and the ``Y`` parts are
        the rescaled ``salary`` Series.
    """
    # assign() builds a new frame; writing the column back into dataset_df
    # would silently change the caller's data (and rescale it again on a
    # second call).
    scaled_df = dataset_df.assign(salary=dataset_df['salary'] * 0.001)

    train_df = scaled_df.iloc[:train_size]
    test_df = scaled_df.iloc[train_size:]

    X_train = train_df.drop('salary', axis=1)
    Y_train = train_df['salary']
    X_test = test_df.drop('salary', axis=1)
    Y_test = test_df['salary']
    return X_train, X_test, Y_train, Y_test
def extract_numerical_cols(dataset_df):
    """Select the fixed set of numeric feature columns from ``dataset_df``.

    Args:
        dataset_df: DataFrame that contains every column named below.

    Returns:
        A DataFrame holding only those columns, always in the order listed
        here.

    Raises:
        KeyError: If any of the listed columns is missing from
            ``dataset_df`` (raised by pandas).
    """
    # A list, not a set: pandas >= 2.0 rejects a set as an indexer, and a
    # set would also make the column order vary between interpreter runs.
    numerical_cols = [
        'age', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI',
        'SB', 'CS', 'BB', 'HBP', 'SO', 'GDP', 'fly', 'war',
    ]
    numerical_df = dataset_df[numerical_cols]
    return numerical_df
def train_predict_decision_tree(X_train, Y_train, X_test):
    """Fit a default ``DecisionTreeRegressor`` and predict on ``X_test``.

    Args:
        X_train: Training features.
        Y_train: Training targets.
        X_test: Features to predict for.

    Returns:
        The predictions the fitted tree produces for ``X_test``.
    """
    # fit() returns the estimator itself, so the calls can be chained.
    return DecisionTreeRegressor().fit(X_train, Y_train).predict(X_test)
def train_predict_random_forest(X_train, Y_train, X_test):
    """Fit a default ``RandomForestRegressor`` and predict on ``X_test``.

    Args:
        X_train: Training features.
        Y_train: Training targets.
        X_test: Features to predict for.

    Returns:
        The predictions the fitted forest produces for ``X_test``.
    """
    # fit() returns the estimator itself, so the calls can be chained.
    return RandomForestRegressor().fit(X_train, Y_train).predict(X_test)
def train_predict_svm(X_train, Y_train, X_test):
    """Fit a standardise-then-SVR pipeline and predict on ``X_test``.

    The features pass through ``StandardScaler`` before reaching a default
    ``SVR``; the same fitted scaler is applied to ``X_test`` at prediction
    time because both steps live in one pipeline.

    Args:
        X_train: Training features.
        Y_train: Training targets.
        X_test: Features to predict for.

    Returns:
        The predictions the fitted pipeline produces for ``X_test``.
    """
    scaler = StandardScaler()
    regressor = SVR()
    model = make_pipeline(scaler, regressor)
    # Pipeline.fit() returns the pipeline, so predict can be chained on it.
    return model.fit(X_train, Y_train).predict(X_test)
def calculate_RMSE(labels, predictions):
    """Return the root-mean-square error between labels and predictions.

    Both inputs are converted to NumPy float arrays first, so values are
    compared position by position. This lets plain lists be passed and
    prevents two pandas Series with different index labels from being
    aligned by label (which would produce NaN or mismatched pairs).

    Args:
        labels: Array-like of true target values.
        predictions: Array-like of predicted values, in the same order as
            ``labels``.

    Returns:
        The RMSE as a NumPy float.
    """
    label_values = np.asarray(labels, dtype=float)
    predicted_values = np.asarray(predictions, dtype=float)
    squared_errors = (predicted_values - label_values) ** 2
    RMSE = np.sqrt(np.mean(squared_errors))
    return RMSE
if __name__ == '__main__':
    # Script entry point: load the CSV, build a year-ordered train/test
    # split, train the three regressors and report each one's test RMSE.
    # NOTE(review): the path looks like a Colab-mounted Google Drive
    # location; adjust it when running elsewhere.
    data_df = pd.read_csv('/content/drive/MyDrive/OSS_Project2/2019_kbo_for_kaggle_v2.csv')

    # Order by year before splitting, because the split is positional.
    sorted_df = sort_dataset(data_df)
    X_train, X_test, Y_train, Y_test = split_dataset(sorted_df)

    # Keep only the numeric feature columns, train part first, then test.
    X_train, X_test = (
        extract_numerical_cols(features) for features in (X_train, X_test)
    )

    # Train every model before printing anything.
    dt_predictions = train_predict_decision_tree(X_train, Y_train, X_test)
    rf_predictions = train_predict_random_forest(X_train, Y_train, X_test)
    svm_predictions = train_predict_svm(X_train, Y_train, X_test)

    print("Decision Tree Test RMSE: ", calculate_RMSE(Y_test, dt_predictions))
    print("Random Forest Test RMSE: ", calculate_RMSE(Y_test, rf_predictions))
    print("SVM Test RMSE: ", calculate_RMSE(Y_test, svm_predictions))