-
Notifications
You must be signed in to change notification settings - Fork 0
/
0_dada2.R
107 lines (73 loc) · 3.39 KB
/
0_dada2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Heli Juottonen 2021 [email protected]
# R scripts for PacBio sequence analysis of the woodtiger moth frass transplant experiment
# https://github.com/helijuottonen/mothtransplant
# For the article: xxx
# Juottonen H 1, Moghadam NN 1, Murphy L 1, Mappes J 1,2, Galarza JA 1,2
# 1 Department of Biological and Environmental Sciences, University of Jyväskylä, Finland
# 2 Organismal and Evolutionary Biology Research Program, University of Helsinki, Finland
# 0: Analysis of PacBio 16S rRNA gene sequence data with dada2:
# from fastq files to sequence and taxonomy table
# Disclaimer: I'm a self-taught R user. I try my very best to make sure that the scripts work, but I do not
# guarantee that they are correct, make sense or follow any kind of best practices. Use at your own risk!
# from: https://benjjneb.github.io/LRASManuscript/LRASms_fecal.html
# load dada2 package
library(dada2); packageVersion("dada2")
# v. 1.16.0
# setting up file paths etc.
path <- "data/fastq"
list.files(path)
path.out <- "results/Figures"
path.rds <- "results/RDS"
fns1 <- list.files(path, pattern=".fastq", full.names=TRUE)
# primer sequences:
F27 <- "AGAGTTTGATCMTGGCTCAG"
R1492 <- "CCTTGTTACGACTTCACCCCAG"
rc <- dada2:::rc
# removing primers
nops <- file.path(path, "noprimers", basename(fns1))
prim <- removePrimers(fns1, nops, primer.fwd=F27, primer.rev=dada2:::rc(R1492), orient=TRUE)
saveRDS(nops, file.path(path.rds, "nops.rds"))
nops <- readRDS(file.path(path.rds, "nops.rds"))
# checking the read length histogram after primer removal
lens.fn <- lapply(nops, function(fn) nchar(getSequences(fn)))
lens <- do.call(c, lens.fn)
hist(lens, 100)
#filtering based on quality settings etc.
filts <- file.path(path, "noprimers", "filtered", basename(fns1))
track <- filterAndTrim(nops, filts2, minQ=3, minLen=1300, maxLen=1600, maxN=0, rm.phix=FALSE, maxEE=2)
#check how many sequences left
track
# de-replicating reads
drp <- derepFastq(filts, verbose=TRUE)
# learning and inspecting errors
err <- learnErrors(drp, errorEstimationFunction=PacBioErrfun, BAND_SIZE=32, multithread=TRUE)
saveRDS(err, file.path(path.rds, "moth_transpl.rds"))
plotErrors(err)
# looks ok based on the dada2 tutorials
# denoising
dd2 <- dada(drp, err=err, BAND_SIZE=32, multithread=TRUE)
saveRDS(dd2, file.path(path.rds, "moth_transpl_dd2.rds"))
cbind(ccs=prim[,1], primers=prim[,2], filtered=track[,2], denoised=sapply(dd2, function(x) sum(x$denoised)))
# making sequence table
st <- makeSequenceTable(dd2); dim(st)
# saving sequence table
saveRDS(st, file.path(path.rds, "moth_transpl_st.rds"))
# reading in the previously saved sequence table
st <- readRDS(file.path(path.rds, "moth_transpl_st.rds"))
# removing chimeras
# checking how many chimeras are detected
# minFoldParentOverAbundance setting from dada2 PacBio example
# default 2, doesn't make a big difference
bimst <- isBimeraDenovo(st, minFoldParentOverAbundance=3.5)
table(bimst)
#removing chimeras
st.nochim <- removeBimeraDenovo(st, method="consensus", minFoldParentOverAbundance=3.5, verbose=TRUE)
dim(st.nochim)
# checking what was removed
sum(st.nochim)/sum(st)
# saving sequence table without chimeras
saveRDS(st.nochim, file.path(path.rds, "moth_transpl_st_nochim.rds"))
# assign taxonomy against Silva 138.1:
tax <- assignTaxonomy(st.nochim, "data/silva_nr99_v138.1_train_set.fa.gz")
# saving taxonomy assignments
saveRDS(tax, file.path(path.rds, "moth_transpl_tax_138.1.rds"))