-
Notifications
You must be signed in to change notification settings - Fork 0
/
example_kmers.R
68 lines (54 loc) · 2.64 KB
/
example_kmers.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
library(data.table)
library(SummarizedExperiment)
library(GenomicRanges)
peaks <- diffloop::bedToGRanges("defined_peaks.bed")
# function to get counts
getCountsFromFrags <- function(frag_gz_file,
peaks_gr,
barcodes){
# Make GRanges of fragments that are solid for the cells that we care about
frags_valid <- data.table::fread(paste0("zcat < ", frag_gz_file)) %>%
data.frame() %>% filter(V4 %in% barcodes) %>% # filter for barcodes in our search set
GenomicRanges::makeGRangesFromDataFrame(seqnames.field = "V1", start.field = "V2", end.field = "V3", keep.extra.columns = TRUE)
# Get a denominator, per cell
denom <- table(GenomicRanges::mcols(frags_valid)$V4)
barcodes_found <- names(denom)
# Get the overlaps with peaks
ovPEAK <- GenomicRanges::findOverlaps(peaks_gr, frags_valid)
# Establish a numeric index for the barcodes for sparse matrix purposes
id <- factor(as.character(GenomicRanges::mcols(frags_valid)$V4), levels = barcodes_found)
# Make sparse matrix with counts with peaks by unique barcode
countdf <- data.frame(peaks = S4Vectors::queryHits(ovPEAK),
sample = as.numeric(id)[S4Vectors::subjectHits(ovPEAK)]) %>%
dplyr::group_by(peaks,sample) %>% dplyr::summarise(count = n()) %>% data.matrix()
m <- Matrix::sparseMatrix(i = c(countdf[,1], length(peaks_gr)),
j = c(countdf[,2], length(barcodes_found)),
x = c(countdf[,3],0))
colnames(m) <- barcodes_found
# Make a polished colData
colData <- data.frame(
sample = barcodes_found,
depth = as.numeric(denom),
FRIP = Matrix::colSums(m)/as.numeric(denom)
)
# Make sure that the SE can be correctly constructed
stopifnot(all(colData$sample == colnames(m)))
# Make summarized Experiment
SE <- SummarizedExperiment::SummarizedExperiment(
rowRanges = peaks_gr,
assays = list(counts = m),
colData = colData
)
return(SE)
}
# Import PBMC to a summarized experiment
bc_pub_pbmc <- as.character(read.table("public_pbmc/pbmc10k/filtered_peak_bc_matrix/barcodes.tsv")[,1])
pbmc_SE <- getCountsFromFrags("public_pbmc/pbmc10k/atac_v1_pbmc_10k_fragments.tsv.gz", peaks, bc_pub_pbmc)
# Now compatible with chromVAR... run the pipeline
# filterPeaks; etc.
SE <- addGCBias(se, genome = BSgenome.Hsapiens.UCSC.hg19)
# Match and compute kmers
KmerMatch <- matchKmers(7, se, BSgenome.Hsapiens.UCSC.hg19)
kmerdev <- computeDeviations(se, KmerMatch)
kmerdevTable <- assays(kmerdev)[["deviations"]]
write.table(kmerdevTable, file = "kmer_Zscores.txt", sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)