-
Notifications
You must be signed in to change notification settings - Fork 0
/
chr1234ESCecc.R
230 lines (162 loc) · 6.78 KB
/
chr1234ESCecc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
library(ggplot2)
rawESCdata <- read.csv("~/Dropbox/DS/wTrush/rawESCdata.csv", header=TRUE, stringsAsFactors = FALSE)
chr1 <- rawESCdata[2:length(rawESCdata[,2]), 2:4]
chr1$X <- as.numeric(chr1$X)
chr1$X.1 <- as.numeric(chr1$X.1)
chr1$X.2 <- as.numeric(chr1$X.2)
names(chr1) = c("startTAD", "endTAD", "interaction")
chr1 <- chr1[1:sum(!is.na(chr1$startTAD)), ]
chr1$dist = 1/chr1$interaction
n1 = max(chr1$endTAD, chr1$startTAD) - (min(chr1$endTAD, chr1$startTAD) - 1)
chr1metric <- matrix(, n1, n1)
for (i in 1:length(chr1$startTAD)) {
chr1metric[chr1$startTAD[i], chr1$endTAD[i]] = chr1$dist[i]
}
# we can compute the number of non-NA values in our distance metric. Is should be length(chr1$startTAD)
max(chr1$startTAD, chr1$endTAD, na.rm = TRUE)**2 - sum(is.na(chr1metric)) == length(chr1$startTAD)
# 1st the lower triangular part
for (i in 1:n1) {
for (j in 1:n1){
chr1metric[j,i] = chr1metric[i,j]
}
}
# we don't need to do the diagonal since each diagonal value is 0 and affects the number of entries
chr1ecc <- matrix(, 394,2)
for (i in 1:394){
chr1ecc[i,1] = i
chr1ecc[i,2] = sum(chr1metric[,i], na.rm=TRUE)/sum(!is.na(chr1metric[,i]))
}
chr1ecc <- data.frame(chr1ecc)
names(chr1ecc) = c("TAD", "eccentricity")
head(chr1ecc)
# see if any TADs lack a value
sum(is.na(chr1ecc$eccentricity))
par(mfrow = c(1,2))
g <- ggplot(chr1ecc, aes(TAD, log(eccentricity)))
p <- g + geom_point(alpha = 1/2, size = 2) + geom_smooth(method = "lm") + ggtitle("chr1 eccentricity")
print(p)
########################################################
############## chr2 ################
########################################################
# start by isolating the relevant data for chr2. note the code below is more adaptable/flexible
chr2col <- which(names(rawESCdata)=="chr2")
chr2 <- rawESCdata[2:length(rawESCdata[,chr2col]), (chr2col+1):(chr2col+3)]
chr2[,1] <- as.numeric(chr2[,1])
chr2[,2] <- as.numeric(chr2[,2])
chr2[,3] <- as.numeric(chr2[,3])
names(chr2) <- c("startTAD", "endTAD", "interaction")
# because we have less data for chr2, there are lots of NA values at the end of the columns for chr2.
tail(chr2)
# we want to remove these NA values. So let's find out how many values we have
sum(!is.na(chr2$startTAD))
# so there are only 372 TAD interaction measurements. let's isolate those
chr2 <- chr2[1:sum(!is.na(chr2$startTAD)), ]
chr2$dist = 1/chr2$interaction
n2 = max(chr2$endTAD, chr2$startTAD) - min(chr2$endTAD, chr2$startTAD) + 1
chr2metric <- matrix(, n2, n2)
for (i in 1:n2) {
chr2metric[chr2$startTAD[i], chr2$endTAD[i]] = chr2$dist[i]
}
# check how many values we do have. the computed value below should be length(chr2$startTAD)
max(chr2$startTAD, chr2$endTAD)**2 - sum(is.na(chr2metric)) == length(chr2$startTAD)
# now fill in the diagonal and lower triangular part
# 1st the lower triangular part
for (i in 1:n2) {
for (j in 1:n2){
chr2metric[j,i] = chr2metric[i,j]
}
}
chr2ecc <- matrix(, n2,2)
for (i in 1:n2){
chr2ecc[i,1] = i
chr2ecc[i,2] = sum(chr2metric[,i], na.rm=TRUE)/sum(!is.na(chr2metric[,i]))
}
chr2ecc <- data.frame(chr2ecc)
names(chr2ecc) = c("TAD", "eccentricity")
head(chr2ecc)
# see if any TADs lack a value
sum(is.na(chr2ecc$eccentricity))
par(mfrow = c(1,2))
g <- ggplot(chr2ecc, aes(TAD, log(eccentricity)))
p <- g + geom_point(alpha = 1/2, size = 2) + geom_smooth(method = "lm") + ggtitle("chr2 eccentricity")
print(p)
########################################################
############## chr3 ################
########################################################
# start by isolating the relevant data for chr2. note the code below is more adaptable/flexible
chr3col <- which(names(rawESCdata)=="chr3")
chr3 <- rawESCdata[2:length(rawESCdata[,chr3col]), (chr3col+1):(chr3col+3)]
chr3[,1] <- as.numeric(chr3[,1])
chr3[,2] <- as.numeric(chr3[,2])
chr3[,3] <- as.numeric(chr3[,3])
names(chr3) <- c("startTAD", "endTAD", "interaction")
chr3 <- chr3[1:sum(!is.na(chr3$startTAD)), ]
chr3$dist = 1/chr3$interaction
n3 = max(chr3$endTAD, chr3$startTAD) - min(chr3$endTAD, chr3$startTAD) + 1
chr3metric <- matrix(, n3, n3)
for (i in 1:n3) {
chr3metric[chr3$startTAD[i], chr3$endTAD[i]] = chr3$dist[i]
}
# check how many values we do have. the computed value below should be length(chr2$startTAD)
max(chr3$startTAD, chr3$endTAD)**2 - sum(is.na(chr3metric)) == length(chr3$startTAD)
# now fill in the diagonal and lower triangular part
# 1st the lower triangular part
for (i in 1:n3) {
for (j in 1:n3){
chr3metric[j,i] = chr3metric[i,j]
}
}
chr3ecc <- matrix(, n3,2)
for (i in 1:n3){
chr3ecc[i,1] = i
chr3ecc[i,2] = sum(chr3metric[,i], na.rm=TRUE)/sum(!is.na(chr3metric[,i]))
}
chr3ecc <- data.frame(chr3ecc)
names(chr3ecc) = c("TAD", "eccentricity")
head(chr3ecc)
# see if any TADs lack a value
sum(is.na(chr3ecc$eccentricity))
par(mfrow = c(1,2))
g <- ggplot(chr3ecc, aes(TAD, log(eccentricity)))
p <- g + geom_point(alpha = 1/2, size = 2) + geom_smooth(method = "lm") + ggtitle("chr3 eccentricity")
print(p)
########################################################
############## chr4 ################
########################################################
# start by isolating the relevant data for chr2. note the code below is more adaptable/flexible
chr4col <- which(names(rawESCdata)=="chr4")
chr4 <- rawESCdata[2:length(rawESCdata[,chr4col]), (chr4col+1):(chr4col+3)]
chr4[,1] <- as.numeric(chr4[,1])
chr4[,2] <- as.numeric(chr4[,2])
chr4[,3] <- as.numeric(chr4[,3])
names(chr4) <- c("startTAD", "endTAD", "interaction")
chr4 <- chr4[1:sum(!is.na(chr4$startTAD)), ]
chr4$dist = 1/chr4$interaction
n4 = max(chr4$endTAD, chr4$startTAD) - min(chr4$endTAD, chr4$startTAD) + 1
chr4metric <- matrix(, n4, n4)
for (i in 1:n4) {
chr4metric[chr4$startTAD[i], chr4$endTAD[i]] = chr4$dist[i]
}
# check how many values we do have. the computed value below should be length(chr2$startTAD)
max(chr4$startTAD, chr4$endTAD)**2 - sum(is.na(chr4metric)) == length(chr4$startTAD)
# now fill in the diagonal and lower triangular part
# 1st the lower triangular part
for (i in 1:n4) {
for (j in 1:n4){
chr4metric[j,i] = chr4metric[i,j]
}
}
chr4ecc <- matrix(, n4,2)
for (i in 1:n4){
chr4ecc[i,1] = i
chr4ecc[i,2] = sum(chr4metric[,i], na.rm=TRUE)/sum(!is.na(chr4metric[,i]))
}
chr4ecc <- data.frame(chr4ecc)
names(chr4ecc) = c("TAD", "eccentricity")
head(chr4ecc)
# see if any TADs lack a value
sum(is.na(chr4ecc$eccentricity))
par(mfrow = c(1,2))
g <- ggplot(chr4ecc, aes(TAD, log(eccentricity)))
p <- g + geom_point(alpha = 1/2, size = 2) + geom_smooth(method = "lm") + ggtitle("chr 4 eccentricity")
print(p)