-
Notifications
You must be signed in to change notification settings - Fork 0
/
DA_differential_abundance_manuscript_cleaned.Rmd
252 lines (197 loc) · 9.66 KB
/
DA_differential_abundance_manuscript_cleaned.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
---
title: "Dental Arcade: Differential abundance"
author: "Zandra Fagernaes"
output: html_notebook
---
This notebook goes through the differential abundance analysis for the Dental Arcade project.
```{r}
library(janitor)
library(tidyverse)
library(ggpubr)
```
The software Songbird is used for differential abundance studies.Input data is a species level OTU table from a MALT run against the custom RefSeq database. Contaminants (blanks and bones) are cleaned out, and only taxa present in all four individuals are included. Blanks, bones, and occlusal samples are excluded. Another dataset is made where occlusal samples are included, to study the effect of disease on differential abundance.
```{r}
# Import raw data
data <- read.delim("<path_to>/DA_MALT_cRefSeq_species_summarized_20200616.txt")
colnames(data)[1] <- "Species"
# Metadata
meta <- read.delim("<path_to>/da_metadata_new.txt")
# Fix sample names
colnames(data) <- gsub(".SG1.1.2_S0_L001_R1_001.fastq.combined.prefixed.fastq.extractunmapped.bam",
"", colnames(data))
colnames(data) <- gsub(".SG1.1_S0_L001_R1_001.fastq.combined.prefixed.fastq.extractunmapped.bam",
"", colnames(data))
# Remove bones, occlusal samples and blanks
data <- data %>%
select(-c("EXB037.A2101","EXB037.A2201","EXB037.A2301","EXB037.A2401","EXB037.A2501", "EXB037.A3301",
"LIB030.A0107","LIB030.A0108","LIB030.A0109","LIB030.A0110","LIB030.A0111", "LIB030.A0115",
"CDM045.Z0101", "CDM047.Z0101", "CDM053.J0101", "CDM054.A0101",
#"CDM048.E0101", "CDM048.O0101", "CDM049.E0101", "CDM049.I0101"
))
# Import contaminants
cont <- read.delim("<path_to>/DA_decontam_cRefSeq_species_20210408.txt")
# Remove contaminants
data_clean <- anti_join(data, cont)
# Remove genera with no counts
data_clean$sum <- rowSums(data_clean[2:88])
data_clean <- data_clean %>%
filter(!sum==0) %>%
select(-sum)
# Transpose
data_clean_t <- data_clean %>%
gather(sample, count, 2:88) %>%
spread(Species, count)
# Add column with individual
ind <- meta %>%
select(LibraryID, Individual)
data_clean_t_ind <- left_join(data_clean_t, ind, by=c("sample"="LibraryID"))
# Turn into long format and remove zero counts
data_long <- data_clean_t_ind %>%
gather(species, count, 2:275) %>%
filter(count > 0)
# Count how many samples each species is found in per individual
data_prevalence <- data_long %>%
select(species, Individual, sample) %>%
group_by(species, Individual) %>%
summarize(prevalence = n())
# Filter for species present in >3 samples per individual
data_prevalence <- data_prevalence %>%
filter(!prevalence<3)
# Filter for species present in all four individuals
data_list <- data_prevalence %>%
select(species, Individual) %>%
group_by(species) %>%
summarize(prevalence=n()) %>%
filter(prevalence==4) %>%
select(species)
# Create final OTU table
data_final <- left_join(data_list, data_clean, by=c("species"="Species"))
colnames(data_final)[1] <- "# OTU ID"
# Save OTU table
write.table(data_final, "<path_to>/DA_MALT_cRefSeq_species_cleaned_occlusal_3sample_4ind_20210609.txt", sep="\t", row.names = FALSE, quote=FALSE)
```
# Make plot for Songbird results
The data was run through Songbird with a model including all explanatory variables (also individual), as well as a null model. Different models were tried to find ne with good fit.
```{r}
# Import result dataset
songbird <- read_tsv("<path_to>/all_species_cleaned_3sample_4ind_20210409_bestfit.tsv") %>%
clean_names()
# Filter out only total weight (scaled), top and bottom ten
weight <- songbird %>%
dplyr::select("featureid", "total_weight_scaled") %>%
arrange(total_weight_scaled) %>%
filter(row_number() %in% c(1:10, 137:146))
# Filter out only jawbone, top and bottom ten
jawbone <- songbird %>%
dplyr::select("featureid", "jawbone_t_maxilla") %>%
arrange(jawbone_t_maxilla) %>%
filter(row_number() %in% c(1:10, 137:146))
# Filter out only toothside (i), top and bottom ten
toothside_i <- songbird %>%
dplyr::select("featureid", "tooth_side_t_i") %>%
arrange(tooth_side_t_i) %>%
filter(row_number() %in% c(1:10, 137:146))
# Filter out only toothside (l), top and bottom ten
toothside_l <- songbird %>%
dplyr::select("featureid", "tooth_side_t_l") %>%
arrange(tooth_side_t_l) %>%
filter(row_number() %in% c(1:10, 137:146))
# Filter out only tooth position, top and bottom ten
toothposition <- songbird %>%
dplyr::select("featureid", "tooth_position_t_posterior") %>%
arrange(tooth_position_t_posterior) %>%
filter(row_number() %in% c(1:10, 137:146))
# For dataset with occlusal, filter out only toothside (o), top and bottom ten
toothside_o <- songbird %>%
dplyr::select("featureid", "tooth_side_t_o") %>%
arrange(tooth_side_t_o) %>%
filter(row_number() %in% c(1:10, 137:146))
```
And then we will create a divergent bar chart where all OTUs are labelled.
```{r}
# Weight
ws <- ggplot(weight, aes(x = reorder(featureid, total_weight_scaled), y = total_weight_scaled))+
geom_bar(stat = "identity") +
coord_flip() +
labs(x = "", y = "") +
ggtitle("Total weight (scaled)") +
theme_classic()
# Jawbone
jaw <- ggplot(jawbone, aes(x = reorder(featureid, jawbone_t_maxilla), y = jawbone_t_maxilla))+
geom_bar(stat = "identity") +
coord_flip() +
labs(x = "", y = "") +
ggtitle("Jawbone (maxilla)") +
theme_classic()
# Toothside i
tsi <- ggplot(toothside_i, aes(x = reorder(featureid, tooth_side_t_i), y = tooth_side_t_i))+
geom_bar(stat = "identity") +
coord_flip() +
labs(x = "", y = "") +
ggtitle("Tooth surface (interproximal)") +
theme_classic()
# Toothside o
tsl <- ggplot(toothside_o, aes(x = reorder(featureid, tooth_side_t_o), y = tooth_side_t_o))+
geom_bar(stat = "identity") +
coord_flip() +
labs(x = "", y = "") +
ggtitle("Tooth surface (occlusal)") +
theme_classic()
# Tooth position
tp <- ggplot(toothposition, aes(x = reorder(featureid, tooth_position_t_posterior), y = tooth_position_t_posterior))+
geom_bar(stat = "identity") +
coord_flip() +
labs(x = "", y = "") +
ggtitle("Tooth position (posterior)") +
theme_classic()
ggarrange(ws, jaw, tsi, tsl, tp, nrow=3, ncol=2)
```
# Dataset for 3D mapping
For 3D mapping, we want to summarize the 10 most divergent taxa for each variable we present in the paper, and then average these values across individual.
```{r}
# Normalize counts
otu_norm <- as.data.frame(scale(raw_data[2:84], center=FALSE,
scale=colSums(raw_data[2:84])))
otu_norm$otu <- raw_data$Species
## Below, repeat for each variable
# Select subset of genera
otu_var <- left_join(weight, otu_norm, by=c("featureid"="otu")) %>%
select(-total_weight_scaled)
# Transpose
otu_var_t <- otu_var %>%
gather(sample, count, 2:84) %>%
spread(featureid, count)
# Fix names
otu_var_t <- otu_var_t %>%
clean_names()
# Sum anterior taxa per individual - tooth position
otu_toothposition_sum <- otu_toothposition_t %>%
rowwise() %>%
mutate(ind_anterior = sum(c(methanobrevibacter_olleyae, actinomyces_radicidentis, actinomyces_urogenitalis, lachnoanaerobaculum_sp_obrc5_5, actinomyces_sp_oral_taxon_170, eubacterium_sulci, streptococcus_sanguinis, eubacterium_brachy, peptostreptococcus_anaerobius, actinomyces_slackii))) %>%
mutate(ind_posterior = sum(c(olsenella_sp_hmsc062g07, olsenella_uli, methanobrevibacter_smithii, olsenella_umbonata, olsenella_phocaeensis, bacteroides_pyogenes, dialister_invisus, atopobium_sp_oral_taxon_810, olsenella_sp_kh2p3, methanobrevibacter_wolinii))) %>%
select(sample, ind_anterior, ind_posterior)
# Sum anterior taxa per individual - interproximal
otu_var_sum <- otu_var_t %>%
rowwise() %>%
mutate(ind_other = sum(c(actinobaculum_sp_oral_taxon_183, fusobacterium_sp_oral_taxon_203, treponema_sp_omz_838, treponema_medium, treponema_lecithinolyticum, johnsonella_ignava, bacteroidetes_oral_taxon_274, centipeda_periodontii, actinomyces_gerencseriae, eubacterium_saphenum))) %>%
mutate(ind_ip = sum(c(actinomyces_sp_oral_taxon_170, olsenella_umbonata, methanobrevibacter_sp_ye315, bacteroides_pyogenes, methanobrevibacter_millerae, olsenella_sp_kh2p3, methanobrevibacter_oralis, methanobrevibacter_smithii, methanobrevibacter_wolinii, methanobrevibacter_olleyae))) %>%
select(sample, ind_other, ind_ip)
# Sum anterior taxa per individual - weight
otu_var_sum <- otu_var_t %>%
rowwise() %>%
mutate(ind_low = sum(c(fusobacterium_sp_oral_taxon_203, actinobaculum_sp_oral_taxon_183, schaalia_meyeri, treponema_lecithinolyticum, actinomyces_gerencseriae, actinomyces_sp_hmsc062g12, eubacterium_saphenum, desulfomicrobium_orale, schaalia_georgiae, filifactor_alocis))) %>%
mutate(ind_high = sum(c(gordonibacter_pamelaeae, capnocytophaga_ochracea, atopobium_sp_oral_taxon_810, capnocytophaga_sp_oral_taxon_332, capnocytophaga_sp_oral_taxon_863, capnocytophaga_sp_oral_taxon_326, capnocytophaga_sp_ch_dc_os43, lautropia_mirabilis, eikenella_corrodens, eikenella_sp_hmsc061c02))) %>%
select(sample, ind_low, ind_high)
# Add metadata
otu_var_sum_meta <- left_join(meta, otu_var_sum, by=c("LibraryID"="sample")) %>%
filter(Type == "Calculus") %>%
filter(!ToothSide == "O")
# Summarize per location
otu_var_mapping <- otu_var_sum_meta %>%
select(ToothNr, ToothSide, ToothClass, ind_low, ind_high) %>%
group_by(ToothNr, ToothSide, ToothClass) %>%
summarize(low_mass_taxa = mean(ind_low),
high_mass_taxa = mean(ind_high))
# Save
write.table(otu_var_mapping ,"<path_to>/mass_species_20210610.txt", row.names = FALSE, quote=FALSE, sep="\t")
```