forked from bnsreenu/python_for_microscopists
-
Notifications
You must be signed in to change notification settings - Fork 1
/
042_data_analysis_using_Seaborn_Plotting.py
160 lines (103 loc) · 4.63 KB
/
042_data_analysis_using_Seaborn_Plotting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python
__author__ = "Sreenivas Bhattiprolu"
__license__ = "Feel free to copy, I appreciate if you acknowledge Python for Microscopists"
# https://www.youtube.com/watch?v=ze7HGAf729k
####################################
#
#For better control over plotting you may as well use Matplotlib or Seaborn
#For Seaborn look here
##########################################
#Seaborn builds on top of matplotlib to provide a richer out of the box environment.
# https://seaborn.pydata.org/
#https://seaborn.pydata.org/examples/index.html #Checkout for more examples
import pandas as pd
df = pd.read_csv('manual_vs_auto.csv')
import seaborn as sns
##############
# Single variable (distribution histogram plot).
# The 'Manual' column has a few missing values; the plot is only drawn after
# they have been filled with a value of 100.
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['Manual']: the chained in-place form is deprecated in pandas and, with
# Copy-on-Write enabled, no longer updates df at all.
df['Manual'] = df['Manual'].fillna(100)
# distplot is deprecated in seaborn; histplot with kde=True draws the same kind
# of figure: a histogram with a KDE (kernel density estimation) curve overlaid.
# stat='density' normalises the bars so they share a scale with the KDE curve.
sns.histplot(df['Manual'], kde=True, stat='density')
#KDE plots. Kernel density estimation.
#KDE is a way to estimate the probability density function of a continuous random variable.
import pandas as pd
df = pd.read_csv('manual_vs_auto.csv')
# Fill missing 'Manual' values with 100. Assign back rather than using the
# chained fillna(inplace=True), which is deprecated and does not update df
# under pandas Copy-on-Write.
df['Manual'] = df['Manual'].fillna(100)
import seaborn as sns
# Overlay one filled KDE curve per column on the same axes.
# `fill=True` replaces the deprecated `shade=True` keyword.
for column in ('Manual', 'Auto_th_2', 'Auto_th_3', 'Auto_th_4'):
    sns.kdeplot(df[column], fill=True)
###################
#Basic line plot
import pandas as pd
df = pd.read_csv('manual_vs_auto.csv')
# Fill missing 'Manual' values with 100. Assign back rather than using the
# chained fillna(inplace=True), which is deprecated and does not update df
# under pandas Copy-on-Write.
df['Manual'] = df['Manual'].fillna(100)
import seaborn as sns
sns.set(style='darkgrid')  # Adds a grid
# Simple line plot of 'Manual' against 'Image'.
# hue tells seaborn how to colour the subcategories; here that is the
# 'Unnamed: 0' column, which holds the image set in this example.
sns.lineplot(x='Image', y='Manual', data=df, hue='Unnamed: 0')
##############################
#Scatter plots
import pandas as pd
df = pd.read_csv('manual_vs_auto.csv')
# Fill missing 'Manual' values with 100. Assign back rather than using the
# chained fillna(inplace=True), which is deprecated and does not update df
# under pandas Copy-on-Write.
df['Manual'] = df['Manual'].fillna(100)
import seaborn as sns
# Basic scatter plot of 'Auto_th_2' against 'Manual' with marginal distributions.
sns.jointplot(x="Manual", y="Auto_th_2", data=df)
# Same pair of variables drawn as a KDE (kernel density estimation) plot.
sns.jointplot(x="Manual", y="Auto_th_2", data=df, kind="kde")
#Relationship between each feature and another selected feature can be easily plotted
#using the pairplot function in Seaborn
import pandas as pd
import seaborn as sns
df = pd.read_csv('manual_vs_auto.csv')
# Fill missing 'Manual' values with 100. Assign back rather than using the
# chained fillna(inplace=True), which is deprecated and does not update df
# under pandas Copy-on-Write.
df['Manual'] = df['Manual'].fillna(100)
print(df.columns)
# With default sizing the panels come out too small:
#sns.pairplot(df, x_vars=["Auto_th_2", "Auto_th_3", "Auto_th_4"], y_vars="Manual")
# so enlarge each panel. The keyword is `height`; `size` is its deprecated
# former name (renamed in seaborn 0.9).
sns.pairplot(df, x_vars=["Auto_th_2", "Auto_th_3", "Auto_th_4"], y_vars="Manual", height=6, aspect=0.75)
#Scatterplot with linear regression
import pandas as pd
df = pd.read_csv('manual_vs_auto.csv')
# Fill missing 'Manual' values with 100. Assign back rather than using the
# chained fillna(inplace=True), which is deprecated and does not update df
# under pandas Copy-on-Write.
df['Manual'] = df['Manual'].fillna(100)
# Change the 'Unnamed: 0' column name to 'Image_set'
df = df.rename(columns={'Unnamed: 0': 'Image_set'})
import seaborn as sns
# Scatterplot with a linear (order=1) regression fit and 95% confidence interval.
# Because hue is set, a separate fit is drawn for each Image_set value.
sns.lmplot(x='Manual', y='Auto_th_2', data=df, hue='Image_set', order=1)
# Seaborn cannot display the fitted equation, but the coefficients can be
# obtained the regular way using the scipy stats module.
# NOTE: this fit uses all rows pooled together, so it is not the same as the
# per-Image_set lines drawn by lmplot above.
from scipy import stats
slope, intercept, r_value, p_value, std_err = stats.linregress(df['Manual'], df['Auto_th_2'])
print(slope, intercept)
# NOTE(review): the commented-out lines below reference columns (FileName, Area,
# MeanIntensity, orientation) that are not used anywhere else in this script;
# presumably they are left over from a different dataset - confirm before reuse.
#filtered = df[df['FileName'] != 'images/grains\grains1.jpg']
#filtered = df['FileName']
#sns.lmplot(x="Area", y="MeanIntensity", data=df, hue="orientation", fit_reg=False, col='FileName', col_wrap=2)
#Swarm plots
#Let's use manual_vs_auto2 file that we generated earlier
import pandas as pd
df = pd.read_csv('manual_vs_auto2.csv')
# Fill missing 'Manual' values with 100. Assign back rather than using the
# chained fillna(inplace=True), which is deprecated and does not update df
# under pandas Copy-on-Write.
df['Manual'] = df['Manual'].fillna(100)
print(df.head())
import seaborn as sns
# Without dodge, the hue levels are drawn on top of each other within a category:
#sns.swarmplot(x = "Image_set", y="Manual", data = df, hue="cell_count_index")
# Split each category: dodge=True separates the hue levels along the x axis.
sns.swarmplot(x="Image_set", y="Manual", data=df, hue="cell_count_index", dodge=True)
##################
"""
We can use the pandas DataFrame.corr() method to find the correlation between
each pair of variables and plot the resulting matrix using Seaborn's heatmap function,
specifying the labels and the heatmap colour range.
"""
import pandas as pd
df = pd.read_csv('manual_vs_auto.csv')
print(df.dtypes)
# Fill missing 'Manual' values with 100. Assign back rather than using the
# chained fillna(inplace=True), which is deprecated and does not update df
# under pandas Copy-on-Write.
df['Manual'] = df['Manual'].fillna(100)
# Change the 'Unnamed: 0' column name to 'Image_set'
df = df.rename(columns={'Unnamed: 0': 'Image_set'})
import seaborn as sns
# Correlate only the int64 columns.
# NOTE(review): a column read with missing values is presumably float64, so
# 'Manual' would be left out of this matrix - confirm that is intended.
corr = df.loc[:, df.dtypes == 'int64'].corr()
# Label both axes with the column names and use a diverging colour map.
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
)
##########################