#!/usr/bin/env python
__author__ = "Sreenivas Bhattiprolu"
__license__ = "Feel free to copy, I appreciate if you acknowledge Python for Microscopists"
# https://www.youtube.com/watch?v=2Bgma44OAF4
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
df = pd.read_csv('other_files/cells.csv')
print(df)
#plt.xlabel('time')
#plt.ylabel('cells')
#plt.scatter(df.time, df.cells,color='red',marker='+')
#For linear regression, Y = the value we want to predict and
#X = all independent variables upon which Y depends.
#Three steps for linear regression:
#Step 1: Create an instance of the model.
#Step 2: .fit() to train the model (fit a linear model).
#Step 3: .predict() to predict Y for given X values.
#Now let us define our x and y values for the model.
#x will be the time column, so we can define it by dropping the cells column.
#x can also be multiple independent variables, which we will discuss in a different tutorial;
#this is why it is better to drop the unwanted columns rather than to pick out the wanted ones.
#y will be the cells column, the dependent variable that we are trying to predict.
x_df = df.drop('cells', axis='columns')
#Or you can pick the column manually. Remember the double brackets:
#a single bracket returns a pandas Series, whereas double brackets return a DataFrame,
#which is what the model expects.
#x_df = df[['time']]
print(x_df.dtypes) #For a DataFrame (from drop or [[]]) this lists each column's dtype;
#the trailing "dtype: object" describes the listing itself, not the time column.
#With single brackets you get a 1-D Series (dtype float64/int64), which fit() will not
#accept for X; the 2-D shape is what matters here, not the dtype.
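#A quick illustrative check of the difference (assuming the CSV has 'time' and 'cells' columns):
#print(df[['time']].shape)   #2-D, e.g. (n, 1) -- what fit() expects for X
#print(df['time'].shape)     #1-D, e.g. (n,)  -- would need reshaping before fit()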
y_df = df.cells
#print(x_df)
#print(y_df)
#To create a model instance
reg = linear_model.LinearRegression() #Create an instance of the model.
reg.fit(x_df, y_df) #Train the model, i.e. fit a linear model to the data.
print(reg.score(x_df,y_df)) #Prints the R^2 value, a measure of how well
#observed values are replicated by the model.
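#For reference, R^2 = 1 - (sum of squared residuals)/(total sum of squares):
#1.0 is a perfect fit, values near 0 mean the model explains little of the variance.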
#Test the model by predicting cells for a given time value with reg.predict()
print("Predicted # cells...", reg.predict([[2.3]]))
# Y = m * X + b (m is coefficient and b is intercept)
#Get the intercept and coefficient values
b = reg.intercept_
m = reg.coef_
#Manually verify the above calculation
print("From maual calculation, cells = ", (m*2.3 + b))
#Now predict cells for a list of times by reading time values from a csv file
cells_predict_df = pd.read_csv("other_files/cells_predict.csv")
print(cells_predict_df.head())
predicted_cells = reg.predict(cells_predict_df)
print(predicted_cells)
#Add the new predicted cells values as a new column to cells_predict_df dataframe
cells_predict_df['cells']=predicted_cells
print(cells_predict_df)
cells_predict_df.to_csv("other_files/predicted_cells.csv")
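#Optional: pass index=False if you do not want the DataFrame index written as an extra column:
#cells_predict_df.to_csv("other_files/predicted_cells.csv", index=False)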
#####################################################
#Using Seaborn for plotting and linregress from scipy stats library
import pandas as pd
df = pd.read_csv('other_files/cells.csv')
import seaborn as sns
sns.set(style='darkgrid')
sns.lmplot(x='time', y='cells', data=df, order=1)
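#When running this as a plain script (outside an interactive console or notebook),
#you may need matplotlib's show() to display the seaborn plot:
#import matplotlib.pyplot as plt
#plt.show()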
#Seaborn cannot display the regression equation on the plot, but you can get the
#slope and intercept the usual way with the scipy stats module.
from scipy import stats
slope, intercept, r_value, p_value, std_err = stats.linregress(df['time'],df['cells'])
print(slope, intercept)
#Compare the slope and intercept reported with m and b values from above.
#Should be the same.
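#The squared correlation from linregress should likewise match reg.score() from above:
#print("R^2 from linregress:", r_value**2)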