From db8318a9d9358af4480c2bb8121e85f1f53f51bb Mon Sep 17 00:00:00 2001 From: wong-nw Date: Tue, 3 Mar 2020 14:02:50 -0500 Subject: [PATCH 1/3] scRNASeq changes Feb 2020 --- Results-template/Scripts/integrateBatches.R | 241 ++++++++++---------- Results-template/Scripts/scrnaQC.R | 210 +++++++++-------- Rules/scrnaQC.snakefile | 12 +- cluster.json | 8 +- gui/scrnaseq.py | 88 ++++++- pipeliner2.py | 2 +- 6 files changed, 319 insertions(+), 242 deletions(-) diff --git a/Results-template/Scripts/integrateBatches.R b/Results-template/Scripts/integrateBatches.R index 3ed6a17..b07f3d9 100644 --- a/Results-template/Scripts/integrateBatches.R +++ b/Results-template/Scripts/integrateBatches.R @@ -21,39 +21,39 @@ matrix <- as.character(args[1]) outDirSeurat = as.character(args[2]) outDirMerge = as.character(args[3]) specie = as.character(args[4]) -resolution = as.numeric(args[5]) +resolution = as.character(args[5]) clusterAlg = as.numeric(args[6]) annotDB = as.character(args[7]) nAnchors = as.numeric(args[8]) -groups = as.character(args[9]) -contrasts = as.character(args[10]) - +citeseq = as.character(args[9]) +groups = as.character(args[10]) +contrasts = as.character(args[11]) +resolution = as.numeric(strsplit(gsub(",+",",",resolution),split=",")[[1]]) #remove excess commas, split into numeric vector file.names <- dir(path = matrix,pattern ="rds") -if (groups == "YES") { - -groupFile = read.delim("groups.tab",header=F,stringsAsFactors = F) -groupFile=groupFile[groupFile$V2 %in% stringr::str_split_fixed(contrasts,pattern = "-",n = Inf)[1,],] -#groupFile = groupFile[groupFile$V2 == strsplit(contrasts,"-")[[1]][1] | groupFile$V2 == strsplit(contrasts,"-")[[1]][2] ,] - -splitFiles = gsub(".rds","",file.names)#str_split_fixed(file.names,pattern = "[.rd]",n = 2) -file.names=file.names[match(groupFile$V1,splitFiles,nomatch = F)] -print(groupFile$V1) -print(splitFiles) -print(file.names) +if (groups == "YES") { + groupFile = read.delim("groups.tab",header=F,stringsAsFactors = F) + 
groupFile=groupFile[groupFile$V2 %in% stringr::str_split_fixed(contrasts,pattern = "-",n = Inf)[1,],] + #groupFile = groupFile[groupFile$V2 == strsplit(contrasts,"-")[[1]][1] | groupFile$V2 == strsplit(contrasts,"-")[[1]][2] ,] + + splitFiles = gsub(".rds","",file.names)#str_split_fixed(file.names,pattern = "[.rd]",n = 2) + file.names=file.names[match(groupFile$V1,splitFiles,nomatch = F)] + print(groupFile$V1) + print(splitFiles) + print(file.names) } readObj = list() for (obj in file.names) { - Name=strsplit(obj,".rds")[[1]][1] - assign(paste0("S_",Name),readRDS(paste0(matrix,"/",obj))) - readObj = append(readObj,paste0("S_",Name)) + Name=strsplit(obj,".rds")[[1]][1] + assign(paste0("S_",Name),readRDS(paste0(matrix,"/",obj))) + readObj = append(readObj,paste0("S_",Name)) } #for (obj in readObj) { - # print(obj) + # print(obj) #sample = CreateSeuratObject(GetAssayData(object = eval(parse(text =obj)),slot = "counts")[,colnames(x = eval(parse(text =obj)))]) #sample@assays$RNA@data = GetAssayData(object = eval(parse(text =obj)),slot = "data")[,colnames(x = eval(parse(text =obj)))] #sample@meta.data =eval(parse(text =obj))@meta.data @@ -65,8 +65,8 @@ for (obj in file.names) { combinedObj.list=list() i=1 for (p in readObj){ - combinedObj.list[[p]] <- eval(parse(text = readObj[[i]])) - i <- i + 1 + combinedObj.list[[p]] <- eval(parse(text = readObj[[i]])) + i <- i + 1 } @@ -74,8 +74,7 @@ reference.list <- combinedObj.list[unlist(readObj)] print(reference.list) for (i in 1:length(x = reference.list)) { # reference.list[[i]] <- NormalizeData(object = reference.list[[i]], verbose = FALSE) - reference.list[[i]] <- FindVariableFeatures(object = reference.list[[i]], - selection.method = "vst", nfeatures = nAnchors, verbose = FALSE) + reference.list[[i]] <- FindVariableFeatures(object = reference.list[[i]], selection.method = "vst", nfeatures = nAnchors, verbose = FALSE) } print(length(reference.list)) @@ -83,114 +82,114 @@ print(reference.list) combinedObj.anchors <- 
FindIntegrationAnchors(object.list = reference.list, dims = 1:30,anchor.features = nAnchors) combinedObj.integrated <- IntegrateData(anchorset = combinedObj.anchors, dims = 1:30) - DefaultAssay(object = combinedObj.integrated) <- "integrated" combinedObj.integrated <- ScaleData(object = combinedObj.integrated, verbose = FALSE) - - combinedObj.integratedRNA = combinedObj.integrated DefaultAssay(object = combinedObj.integratedRNA) <- "SCT" combinedObj.integratedRNA = FindVariableFeatures(combinedObj.integratedRNA,mean.cutoff = c(0.0125, 3),dispersion.cutoff = c(0.5, Inf),selection.method = "vst") combinedObj.integratedRNA <- ScaleData(object = combinedObj.integratedRNA, verbose = FALSE) - -mat1 <- t(as.matrix(FetchData(object = combinedObj.integratedRNA,slot = "counts",vars = rownames(combinedObj.integratedRNA)))) -urdObj <- createURD(count.data = mat1, min.cells=3, min.counts=3) -varGenes_batch = VariableFeatures(combinedObj.integrated)[VariableFeatures(combinedObj.integrated) %in% rownames(urdObj@logupx.data)] -varGenes_merge = VariableFeatures(combinedObj.integratedRNA)[VariableFeatures(combinedObj.integratedRNA) %in% rownames(urdObj@logupx.data)] - -pcs_batch <- calcPCA(urdObj, pcs.store = 50,genes.use=varGenes_batch,mp.factor = 1.2) -npcs_batch = sum(pcs_batch@pca.sig) -print(npcs_batch) - -pcs_merge <- calcPCA(urdObj, pcs.store = 50,genes.use=varGenes_merge,mp.factor = 1.2) -npcs_merge = sum(pcs_merge@pca.sig) -print(npcs_merge) - - -runInt = function(obj,res,npcs){ -obj <- RunPCA(object = obj, npcs = 50, verbose = FALSE) -obj <- FindNeighbors(obj,dims = 1:npcs) -obj <- FindClusters(obj, reduction.type = "pca", dims.use = 1:npcs, save.SNN = T,resolution = res,algorithm = clusterAlg) - -obj <- RunUMAP(object = obj, reduction = "pca", - dims = 1:npcs,n.components = 3) - -obj$groups = groupFile$V2[match(obj$Sample, groupFile$V1,nomatch = F)] - - - -runSingleR = function(obj,refFile,fineORmain){ -avg = AverageExpression(obj,assays = "SCT") -avg = as.data.frame(avg) 
-ref = refFile -s = SingleR(test = as.matrix(avg),ref = ref,labels = ref[[fineORmain]]) - -clustAnnot = s$labels -names(clustAnnot) = colnames(avg) -names(clustAnnot) = gsub("SCT.","",names(clustAnnot)) - -obj$clustAnnot = clustAnnot[match(obj$seurat_clusters,names(clustAnnot))] -return(obj$clustAnnot) +if(ncol(combinedObj.integratedRNA)<50000){ + mat1 <- t(as.matrix(FetchData(object = combinedObj.integratedRNA,slot = "counts",vars = rownames(combinedObj.integratedRNA)))) + urdObj <- createURD(count.data = mat1, min.cells=3, min.counts=3) + varGenes_batch = VariableFeatures(combinedObj.integrated)[VariableFeatures(combinedObj.integrated) %in% rownames(urdObj@logupx.data)] + varGenes_merge = VariableFeatures(combinedObj.integratedRNA)[VariableFeatures(combinedObj.integratedRNA) %in% rownames(urdObj@logupx.data)] + + pcs_batch <- calcPCA(urdObj, pcs.store = 50,genes.use=varGenes_batch,mp.factor = 1.2) + npcs_batch = sum(pcs_batch@pca.sig) + print(npcs_batch) + + pcs_merge <- calcPCA(urdObj, pcs.store = 50,genes.use=varGenes_merge,mp.factor = 1.2) + npcs_merge = sum(pcs_merge@pca.sig) + print(npcs_merge) +}else{ + npcs_batch=50 + npcs_merge=50 + print(npcs_batch) + print(npcs_merge) } -if(annotDB == "HPCA"){ -obj$clustAnnot <- runSingleR(obj,HumanPrimaryCellAtlasData(),"label.main") -obj$clustAnnotDetail <- runSingleR(obj,HumanPrimaryCellAtlasData(),"label.fine") -} - -if(annotDB == "BP_encode"){ -obj$clustAnnot <- runSingleR(obj,BlueprintEncodeData(),"label.main") -obj$clustAnnotDetail <- runSingleR(obj,BlueprintEncodeData(),"label.fine") -} -if(annotDB == "monaco"){ -obj$clustAnnot <- runSingleR(obj,MonacoImmuneData(),"label.main") -obj$clustAnnotDetail <- runSingleR(obj,MonacoImmuneData(),"label.fine") -} -if(annotDB == "immu_cell_exp"){ -obj$clustAnnot <- runSingleR(obj,DatabaseImmuneCellExpressionData(),"label.main") -obj$clustAnnotDetail <- runSingleR(obj,DatabaseImmuneCellExpressionData(),"label.fine") -} - -if(annotDB == "immgen"){ -obj$clustAnnot <- 
runSingleR(obj,ImmGenData(),"label.main") -obj$clustAnnotDetail <- runSingleR(obj,ImmGenData(),"label.fine") -} -if(annotDB == "mouseRNAseq"){ -obj$clustAnnot <- runSingleR(obj,MouseRNAseqData(),"label.main") -obj$clustAnnotDetail <- runSingleR(obj,MouseRNAseqData(),"label.fine") -} -return(obj) +runInt = function(obj,res,npcs){ + if (citeseq=="Yes"){ + obj = NormalizeData(obj,assay="CITESeq",normalization.method="CLR") + obj = ScaleData(obj,assay="CITESeq") + } + obj <- RunPCA(object = obj, npcs = 50, verbose = FALSE) + obj <- FindNeighbors(obj,dims = 1:npcs) + for (i in 1:length(res)){ + obj <- FindClusters(obj, reduction.type = "pca", dims.use = 1:npcs, save.SNN = T,resolution = res[i],algorithm = clusterAlg) + } + obj <- RunUMAP(object = obj, reduction = "pca",dims = 1:npcs,n.components = 3) + obj$groups = groupFile$V2[match(obj$Sample, groupFile$V1,nomatch = F)] + + runSingleR = function(obj,refFile,fineORmain){ #SingleR function call as implemented below + avg = AverageExpression(obj,assays = "SCT") + avg = as.data.frame(avg) + ref = refFile + s = SingleR(test = as.matrix(avg),ref = ref,labels = ref[[fineORmain]]) + + clustAnnot = s$labels + names(clustAnnot) = colnames(avg) + names(clustAnnot) = gsub("SCT.","",names(clustAnnot)) + + obj$clustAnnot = clustAnnot[match(obj$seurat_clusters,names(clustAnnot))] + return(obj$clustAnnot) + } + + if(annotDB == "HPCA"){ + obj$clustAnnot <- runSingleR(obj,HumanPrimaryCellAtlasData(),"label.main") + obj$clustAnnotDetail <- runSingleR(obj,HumanPrimaryCellAtlasData(),"label.fine") + } + if(annotDB == "BP_encode"){ + obj$clustAnnot <- runSingleR(obj,BlueprintEncodeData(),"label.main") + obj$clustAnnotDetail <- runSingleR(obj,BlueprintEncodeData(),"label.fine") + } + if(annotDB == "monaco"){ + obj$clustAnnot <- runSingleR(obj,MonacoImmuneData(),"label.main") + obj$clustAnnotDetail <- runSingleR(obj,MonacoImmuneData(),"label.fine") + } + if(annotDB == "immu_cell_exp"){ + obj$clustAnnot <- 
runSingleR(obj,DatabaseImmuneCellExpressionData(),"label.main") + obj$clustAnnotDetail <- runSingleR(obj,DatabaseImmuneCellExpressionData(),"label.fine") + } + + if(annotDB == "immgen"){ + obj$clustAnnot <- runSingleR(obj,ImmGenData(),"label.main") + obj$clustAnnotDetail <- runSingleR(obj,ImmGenData(),"label.fine") + } + if(annotDB == "mouseRNAseq"){ + obj$clustAnnot <- runSingleR(obj,MouseRNAseqData(),"label.main") + obj$clustAnnotDetail <- runSingleR(obj,MouseRNAseqData(),"label.fine") + } + return(obj) } -combinedObj.integrated = runInt(combinedObj.integrated,0.3,npcs_batch) -saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.3.rds")) -combinedObj.integrated = runInt(combinedObj.integrated,0.6,npcs_batch) -saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.6.rds")) -combinedObj.integrated = runInt(combinedObj.integrated,0.8,npcs_batch) -saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.8.rds")) -combinedObj.integrated = runInt(combinedObj.integrated,1.0,npcs_batch) -saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_1.0.rds")) -combinedObj.integrated = runInt(combinedObj.integrated,1.2,npcs_batch) -saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_1.2.rds")) - - - -combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.3,npcs_merge) -saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.3.rds")) - -combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.6,npcs_merge) -saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.6.rds")) - -combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.8,npcs_merge) -saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.8.rds")) - -combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,1.0,npcs_merge) -saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_1.0.rds")) - -combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,1.2,npcs_merge) -saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_1.2.rds")) - - +#combinedObj.integrated = 
runInt(combinedObj.integrated,0.3,npcs_batch) +#saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.3.rds")) +#combinedObj.integrated = runInt(combinedObj.integrated,0.6,npcs_batch) +#saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.6.rds")) +#combinedObj.integrated = runInt(combinedObj.integrated,0.8,npcs_batch) +#saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.8.rds")) +#combinedObj.integrated = runInt(combinedObj.integrated,1.0,npcs_batch) +#saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_1.0.rds")) +#combinedObj.integrated = runInt(combinedObj.integrated,1.2,npcs_batch) +#saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_1.2.rds")) + +combinedObj.integrated=runInt(combinedObj.integrated,resolution,npcs_batch) +saveRDS(combinedObj.integrated,outDirSeurat) + +#combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.3,npcs_merge) +#saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.3.rds")) +#combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.6,npcs_merge) +#saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.6.rds")) +#combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.8,npcs_merge) +#saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.8.rds")) +#combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,1.0,npcs_merge) +#saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_1.0.rds")) +#combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,1.2,npcs_merge) +#saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_1.2.rds")) + +combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,resolution,npcs_merge) +saveRDS(combinedObj.integratedRNA,outDirMerge) diff --git a/Results-template/Scripts/scrnaQC.R b/Results-template/Scripts/scrnaQC.R index 2640968..e914285 100644 --- a/Results-template/Scripts/scrnaQC.R +++ b/Results-template/Scripts/scrnaQC.R @@ -9,13 +9,16 @@ sample = basename(file) outRDS = as.character(args[2]) outRdata = as.character(args[3]) species = 
as.character(args[4]) -resolution = as.numeric(args[5]) +resolution = as.character(args[5]) clusterAlg = as.numeric(args[6]) annotDB = as.character(args[7]) +citeseq = as.character(args[8]) #matrix = paste0(file,"/outs/filtered_feature_bc_matrix.h5") matrix=file +resolution = as.numeric(strsplit(gsub(",+",",",resolution),split=",")[[1]]) #remove excess commas, split into numeric vector + #library(future) #plan("multiprocess", workers = 8) @@ -23,6 +26,7 @@ matrix=file #library(DropletUtils,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") #library(scran,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") #library(scater,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library(BiocGenerics) library(Biobase,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") library(farver,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") library(S4Vectors,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") @@ -48,77 +52,73 @@ library("modes",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") ###Run Seurat Clustering seuratClustering = function(so){ - - -tail(strsplit(file,"/")[[1]],1) -so$Sample=tail(strsplit(file,"/")[[1]],1) -so = SCTransform(so) - -so <- RunPCA(object = so, features = VariableFeatures(object = so), do.print = TRUE, pcs.print = 1:5,genes.print = 0,verbose=F,npcs = 30) - -if(ncol(so)<50000) { -mat1 <- t(as.matrix(FetchData(object = so,slot = "counts",vars = rownames(so)))) -urdObj <- createURD(count.data = mat1, min.cells=3, min.counts=3) -varGenes = so@assays$RNA@var.features[so@assays$RNA@var.features %in% rownames(urdObj@logupx.data)] -pcs <- calcPCA(urdObj, mp.factor = 1,pcs.store = 30,genes.use=varGenes) -npcs = sum(pcs@pca.sig) -} - -else { -npcs = 30 - -} - -so <- FindNeighbors(so,dims = 1:npcs) -so <- FindClusters(so,dims = 1:npcs, print.output = 0, resolution = resolution,algorithm = clusterAlg) -so <- RunUMAP(so,dims = 1:npcs) - - -return(so) + + tail(strsplit(file,"/")[[1]],1) + 
so$Sample=gsub(".h5","",tail(strsplit(file,"/")[[1]],1)) + so = SCTransform(so) + + so <- RunPCA(object = so, features = VariableFeatures(object = so), do.print = TRUE, pcs.print = 1:5,genes.print = 0,verbose=F,npcs = 30) + + if(ncol(so)<50000) { + mat1 <- t(as.matrix(FetchData(object = so,slot = "counts",vars = rownames(so)))) + urdObj <- createURD(count.data = mat1, min.cells=3, min.counts=3) + varGenes = so@assays$RNA@var.features[so@assays$RNA@var.features %in% rownames(urdObj@logupx.data)] + pcs <- calcPCA(urdObj, mp.factor = 1,pcs.store = 30,genes.use=varGenes) + npcs = sum(pcs@pca.sig) + } + else { + npcs = 30 + } + + so <- FindNeighbors(so,dims = 1:npcs) + for (i in 1:length(resolution)){ + so <- FindClusters(so,dims = 1:npcs, print.output = 0, resolution = resolution[i],algorithm = clusterAlg) + } + so <- RunUMAP(so,dims = 1:npcs) + + return(so) } doublets <-function(dfso){ - dfso <- SCTransform(dfso) - dfso <- RunPCA(dfso, pc.genes = dfso@var.genes, pcs.print = 0,verbose = F,npcs =30) - npcs = 10 - dfso <- RunUMAP(dfso, verbose=TRUE,dims = 1:npcs) - - - sweep.res.list_kidney <- paramSweep_v3(dfso,PCs = 1:10, sct = T) - sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = FALSE) - print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") - bcmvn_kidney <- find.pK(sweep.stats_kidney) - ## pK Identification (ground-truth) ------------------------------------------------------------------------------------------ -# sweep.res.list_kidney <- paramSweep_v3(dfso, PCs = 1:10, sct = FALSE) -# gt.calls <- dfso@meta.data[rownames(sweep.res.list_kidney[[1]]), "GT"] -# sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = TRUE, GT.calls = gt.calls) -# bcmvn_kidney <- find.pK(sweep.stats_kidney) - - - ## Homotypic Doublet Proportion Estimate ------------------------------------------------------------------------------------- - homotypic.prop <- modelHomotypic(dfso$annot) - perc = 0.008 * (length(colnames(dfso))/1000) - nExp_poi <- 
round(perc*length(colnames(dfso)))#dfso@cell.names - nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop)) - print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx") - ## Run DoubletFinder with varying classification stringencies ---------------------------------------------------------------- - dfso <- doubletFinder_v3(dfso, pN = 0.25, pK = 0.09, nExp = nExp_poi, reuse.pANN = FALSE,PCs = 1:10,sct = T) - pAAN=tail(names(dfso@meta.data),2)[1] - dfso <- doubletFinder_v3(dfso, pN = 0.25, pK = 0.09, nExp = nExp_poi.adj, reuse.pANN = pAAN,PCs = 1:10,sct = T) + dfso <- SCTransform(dfso) + dfso <- RunPCA(dfso, pc.genes = dfso@var.genes, pcs.print = 0,verbose = F,npcs =30) + npcs = 10 + dfso <- RunUMAP(dfso, verbose=TRUE,dims = 1:npcs) + + + sweep.res.list_kidney <- paramSweep_v3(dfso,PCs = 1:10, sct = T) + sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = FALSE) + #print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") + bcmvn_kidney <- find.pK(sweep.stats_kidney) + ## pK Identification (ground-truth) ------------------------------------------------------------------------------------------ + #sweep.res.list_kidney <- paramSweep_v3(dfso, PCs = 1:10, sct = FALSE) + #gt.calls <- dfso@meta.data[rownames(sweep.res.list_kidney[[1]]), "GT"] + #sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = TRUE, GT.calls = gt.calls) + #bcmvn_kidney <- find.pK(sweep.stats_kidney) + + ## Homotypic Doublet Proportion Estimate ------------------------------------------------------------------------------------- + homotypic.prop <- modelHomotypic(dfso$annot) + perc = 0.008 * (length(colnames(dfso))/1000) + nExp_poi <- round(perc*length(colnames(dfso)))#dfso@cell.names + nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop)) + print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx") + ## Run DoubletFinder with varying classification stringencies ---------------------------------------------------------------- + dfso <- doubletFinder_v3(dfso, pN = 0.25, pK = 0.09, nExp = nExp_poi, reuse.pANN = FALSE,PCs = 1:10,sct = T) 
+ pAAN=tail(names(dfso@meta.data),2)[1] + dfso <- doubletFinder_v3(dfso, pN = 0.25, pK = 0.09, nExp = nExp_poi.adj, reuse.pANN = pAAN,PCs = 1:10,sct = T) ## Plot results -------------------------------------------------------------------------------------------------------------- - #DF1=tail(names(dfso@meta.data),2)[1] - # DF2=tail(names(dfso@meta.data),2)[2] -# dfso@meta.data[,"DF_hi.lo"] <- dfso[[DF2]] -# dfso@meta.data$DF_hi.lo[which(dfso@meta.data$DF_hi.lo == "Doublet" & dfso@meta.data[[DF2]] == "Singlet")] <- "Doublet_lo" - # dfso@meta.data$DF_hi.lo[which(dfso@meta.data$DF_hi.lo == "Doublet")] <- "Doublet_hi" - return(dfso) + #DF1=tail(names(dfso@meta.data),2)[1] + #DF2=tail(names(dfso@meta.data),2)[2] + #dfso@meta.data[,"DF_hi.lo"] <- dfso[[DF2]] + #dfso@meta.data$DF_hi.lo[which(dfso@meta.data$DF_hi.lo == "Doublet" & dfso@meta.data[[DF2]] == "Singlet")] <- "Doublet_lo" + #dfso@meta.data$DF_hi.lo[which(dfso@meta.data$DF_hi.lo == "Doublet")] <- "Doublet_hi" + return(dfso) } convertHumanGeneList <- function(x){ - require("biomaRt") human = useMart("ensembl", dataset = "hsapiens_gene_ensembl") mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl") @@ -130,83 +130,93 @@ convertHumanGeneList <- function(x){ return(humanx) } -so_BC = Read10X_h5(matrix) -so_BC = CreateSeuratObject(so_BC) +if (citeseq=="Yes"){ + fileInput = Read10X_h5(matrix) + so_BC = CreateSeuratObject(fileInput[[1]]) + so_BC[['CITESeq']] = CreateAssayObject(counts=fileInput[[2]]) + so_BC = NormalizeData(so_BC,assay="CITESeq",normalization.method="CLR") + so_BC = ScaleData(so_BC,assay="CITESeq") +}else{ + so_BC = Read10X_h5(matrix) + so_BC = CreateSeuratObject(so_BC) +} if(species=="human"){so_BC[["percent.mt"]] <- PercentageFeatureSet(so_BC, pattern = "^MT-")} if(species=="mouse"){so_BC[["percent.mt"]] <- PercentageFeatureSet(so_BC, pattern = "^mt-")} - - nCount_out = outliers_mad(so_BC$nCount_RNA,threshold = 3)$LL_CI_MAD nFeature_out = outliers_mad(so_BC$nFeature_RNA,threshold = 
3)$LL_CI_MAD mt_out = outliers_mad(so_BC$percent.mt,threshold = 3)$UL_CI_MAD so_filt <- subset(so_BC, subset = nFeature_RNA > nFeature_out & nCount_RNA > nFeature_out & percent.mt < mt_out) - load("/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/immgenMod.RData") #rownames(sce_norm) = gsub("_","-",rownames(sce_norm)) #rownames(sce_norm) = make.names(sce_norm@rowRanges@elementMetadata$Symbol,unique = T) - - so = seuratClustering(so_filt) if(species=="human"){ -s.genes <- cc.genes$s.genes -g2m.genes <- cc.genes$g2m.genes + s.genes <- cc.genes$s.genes + g2m.genes <- cc.genes$g2m.genes } if(species=="mouse"){ -s.genes <- convertHumanGeneList(cc.genes$s.genes) -g2m.genes <- convertHumanGeneList(cc.genes$g2m.genes) + s.genes <- convertHumanGeneList(cc.genes$s.genes) + g2m.genes <- convertHumanGeneList(cc.genes$g2m.genes) } so = CellCycleScoring(so,s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE) runSingleR = function(obj,refFile,fineORmain){ -sce = as.SingleCellExperiment(obj,assay = "SCT") -ref = refFile -s = SingleR(test = sce, ref = ref,labels = ref[[fineORmain]]) -return(s$pruned.labels) -print(head(s$pruned.labels)) + sce = as.SingleCellExperiment(obj,assay = "SCT") + ref = refFile + s = SingleR(test = sce, ref = ref,labels = ref[[fineORmain]]) + #return(s$pruned.labels) + return(s) + print(head(s$pruned.labels)) } if(species == "human"){ -so$HPCA_main <- runSingleR(so,HumanPrimaryCellAtlasData(),"label.main") -so$HPCA <- runSingleR(so,HumanPrimaryCellAtlasData(),"label.fine") -so$BP_encode_main <- runSingleR(so,BlueprintEncodeData(),"label.main") -so$BP_encode <- runSingleR(so,BlueprintEncodeData(),"label.fine") -so$monaco_main <- runSingleR(so,MonacoImmuneData(),"label.main") -so$monaco <- runSingleR(so,MonacoImmuneData(),"label.fine") -so$immu_cell_exp_main <- runSingleR(so,DatabaseImmuneCellExpressionData(),"label.main") -so$immu_cell_exp <- runSingleR(so,DatabaseImmuneCellExpressionData(),"label.fine") - + so@misc$SingleR$HPCA_main <- 
runSingleR(so,HumanPrimaryCellAtlasData(),"label.main") + so$HPCA_main = so@misc$SingleR$HPCA_main$pruned.labels + so@misc$SingleR$HPCA <- runSingleR(so,HumanPrimaryCellAtlasData(),"label.fine") + so$HPCA = so@misc$SingleR$HPCA$pruned.labels + so@misc$SingleR$BP_encode_main <- runSingleR(so,BlueprintEncodeData(),"label.main") + so$BP_encode_main=so@misc$SingleR$BP_encode_main$pruned.labels + so@misc$SingleR$BP_encode <- runSingleR(so,BlueprintEncodeData(),"label.fine") + so$BP_encode=so@misc$SingleR$BP_encode$pruned.labels + so@misc$SingleR$monaco_main <- runSingleR(so,MonacoImmuneData(),"label.main") + so$monaco_main=so@misc$SingleR$monaco_main$pruned.labels + so@misc$SingleR$monaco <- runSingleR(so,MonacoImmuneData(),"label.fine") + so$monaco = so@misc$SingleR$monaco$pruned.labels + so@misc$SingleR$dice_main <- runSingleR(so,DatabaseImmuneCellExpressionData(),"label.main") + so$dice_main = so@misc$SingleR$dice_main$pruned.labels + so@misc$SingleR$dice <- runSingleR(so,DatabaseImmuneCellExpressionData(),"label.fine") + so$dice = so@misc$SingleR$dice$pruned.labels + so@misc$SingleR$Novershtern_main <- runSingleR(so,NovershternHematopoieticData(),"label.main") + so$Novershtern_main = so@misc$SingleR$Novershtern_main$pruned.labels + so@misc$SingleR$Novershtern <- runSingleR(so,NovershternHematopoieticData(),"label.fine") + so$Novershtern = so@misc$SingleR$Novershtern$pruned.labels } if(species == "mouse"){ - -so$immgen_main <- runSingleR(so,ImmGenData(),"label.main") -so$immgen <- runSingleR(so,ImmGenData(),"label.fine") -so$mouseRNAseq_main <- runSingleR(so,MouseRNAseqData(),"label.main") -so$mouseRNAseq <- runSingleR(so,MouseRNAseqData(),"label.fine") - + so@misc$SingleR$immgen_main <- runSingleR(so,ImmGenData(),"label.main") + so$immgen_main = so@misc$SingleR$immgen_main$pruned.labels + so@misc$SingleR$immgen <- runSingleR(so,ImmGenData(),"label.fine") + so$immgen = so@misc$SingleR$immgen$pruned.labels + so@misc$SingleR$mouseRNAseq_main <- 
runSingleR(so,MouseRNAseqData(),"label.main") + so$mouseRNAseq_main = so@misc$SingleR$mouseRNAseq_main$pruned.labels + so@misc$SingleR$mouseRNAseq <- runSingleR(so,MouseRNAseqData(),"label.fine") + so$mouseRNAseq = so@misc$SingleR$mouseRNAseq$pruned.labels } - - - so$annot = so[[paste0(annotDB,"_main")]] so = doublets(so) so$DF_hi.lo = so[[tail(names(so@meta.data),1)]] so_noDoublet=subset(so,cells=names(so$DF_hi.lo)[so$DF_hi.lo =="Singlet"]) - saveRDS(so_noDoublet,outRDS) save.image(outRdata) - - - diff --git a/Rules/scrnaQC.snakefile b/Rules/scrnaQC.snakefile index e59bf6c..154bacc 100644 --- a/Rules/scrnaQC.snakefile +++ b/Rules/scrnaQC.snakefile @@ -14,7 +14,8 @@ workpath=config['project']['workpath'] specie = config['project']['SPECIES'] #human resolution = config['project']['CLUSTRESOLUTION']#"0.8" clustAlg = config['project']['CLUSTALG'] #1-3 -annotDB = config['project']['ANNOTDB']#"HPCA" #mouse +annotDB = config['project']['ANNOTDB']#"HPCA" #mouse +citeseq = config['project']['CITESEQ'] nAnchors = "2000" groups = "YES" #YES/NO @@ -42,7 +43,7 @@ rule all: rule qc_scrna: input: - join(workpath,"{name}") + join(workpath,"{name}.h5") output: rds=join(workpath,"filtered","{name}.rds"), rdata=join(workpath,"QC","{name}.RData"), @@ -53,11 +54,12 @@ rule qc_scrna: resolution = resolution, clustAlg = clustAlg, annotDB = annotDB, + citeseq = citeseq, outDir= join(workpath,"QC") shell: """ module load R/3.6.0; -Rscript Scripts/scrnaQC.R {input} {output.rds} {output.rdata} {params.specie} {params.resolution} {params.clustAlg} {params.annotDB}; +Rscript Scripts/scrnaQC.R {input} {output.rds} {output.rdata} {params.specie} {params.resolution} {params.clustAlg} {params.annotDB} {params.citeseq}; touch {output.qc} """ @@ -72,6 +74,7 @@ rule qcReport_scrna: shell: """ module load R/3.6.0 Rscript Scripts/scrnaQC_Reports.R {input.path} +mv Scripts/samples_QC.html QC """ rule integratedBatch: @@ -93,10 +96,11 @@ rule integratedBatch: nAnchors = nAnchors, groups = groups, dir = 
join(workpath,"filtered"), + citeseq = citeseq, contrasts = "{myGroups}" shell: """ module load R/3.6.0; -Rscript Scripts/integrateBatches.R {params.dir} {output.rdsBatch} {output.mergeRDS} {params.specie} {params.resolution} {params.clustAlg} {params.annotDB} {params.nAnchors} {params.groups} {params.contrasts}; +Rscript Scripts/integrateBatches.R {params.dir} {output.rdsBatch} {output.mergeRDS} {params.specie} {params.resolution} {params.clustAlg} {params.annotDB} {params.nAnchors} {params.citeseq} {params.groups} {params.contrasts}; """ diff --git a/cluster.json b/cluster.json index b73e093..356b90e 100644 --- a/cluster.json +++ b/cluster.json @@ -455,8 +455,8 @@ }, "integratedBatch":{ "threads": "8", - "mem": "64g", - "time": "1-00:00:00" + "mem": "200g", + "time": "2-00:00:00" }, "kraken_pe": { "cpus-per-task": "24", @@ -654,8 +654,8 @@ }, "qc_scrna":{ "threads": "1", - "mem": "48g", - "time": "8:00:00" + "mem": "150g", + "time": "1-00:00:00" }, "qcReport_scrna":{ "threads": "1", diff --git a/gui/scrnaseq.py b/gui/scrnaseq.py index d18615d..a00ea94 100644 --- a/gui/scrnaseq.py +++ b/gui/scrnaseq.py @@ -81,8 +81,8 @@ def __init__(self, pipepanel, pipeline_name, *args, **kwargs) : #annotation db: human: HPCA/BP Encode; mouse: immgen/mouseRNAseq self.annotDB = annotDB = StringVar() annotLabel = Label(qcOptions,text="Annotation database: ") - annotHuman = ["HPCA","BP Encode"] - annotMouse = ["immgen","mouseRNAseq"] + annotHuman = ["Human Primary Cell Atlas","Blueprint/ENCODE","Monaco Immune","Database of Immune Cell Expression (DICE)"] + annotMouse = ["ImmGen","Mouse RNASeq"] annotDropdown = [] genome = self.global_info.annotation.get() if (genome == "GRCh38"): @@ -96,8 +96,16 @@ def __init__(self, pipepanel, pipeline_name, *args, **kwargs) : annotLabel.grid(row=10,column=1,sticky=W,padx =10, pady =5) annotDB_om.grid(row=10,column=2,sticky=W,padx=0,pady=5) + self.citeseq = citeseq = StringVar() + citeseqLabel = Label(qcOptions, text = "CITESeq Included: ") + 
citeseqDropdown = ["Yes","No"] + citeseq.set(citeseqDropdown[1]) + citeseqOptMenu = OptionMenu(qcOptions, citeseq, *citeseqDropdown) + citeseqLabel.grid(row=11,column=1,sticky=W,padx=10,pady=5) + citeseqOptMenu.grid(row=11,column=2,sticky=W,padx=0,pady=5) + #set groups, mostly for relabeling purposes - self.add_info( qcOptions ) #Position 11 + self.add_info( qcOptions ) #Position 13 # self.option_controller() # groups_buttonL = Label(qcOptions, text="Sample Information: ") @@ -111,14 +119,55 @@ def __init__(self, pipepanel, pipeline_name, *args, **kwargs) : ######### DIFFERENTIAL EXPRESSION ######### self.deOptions = deOptions = LabelFrame( eframe, text = "Differential Gene Expression") + #option for which object to run DE on + self.rdsObject = rdsObject = StringVar() + rdsLabel = Label(deOptions, text = "Use the pre- or post-batch corrected data:") + rdsDropdown = ["Merged (Pre-batch correction)","Integrated (Post-batch correction)","Both"] + rdsObject.set(rdsDropdown[2]) + rdsOptMenu=OptionMenu(deOptions,rdsObject,*rdsDropdown) + rdsLabel.grid(row=8,column=1,sticky=W,padx=10,pady=5) + rdsOptMenu.grid(row=8,column=2,sticky=W,padx=0,pady=5) + #option for cluster resolution for DE self.resolutionDE = resolutionDE = StringVar() - resolutionDE.set("0.8") + resolutionDE.set("0.4,0.6,0.8,1.0,1.2") resolutionDELabel = Label(deOptions,text="Clustering Resolution: \nChoose a previous resolution or \nselect a new resolution to run") resolutionDEEntry = Entry(deOptions,bd=2,width=25,textvariable=resolutionDE) - resolutionDELabel.grid(row=10,column=1,sticky=W,padx=10,pady=5) - resolutionDEEntry.grid(row=10,column=2,sticky=W,padx=0,pady=5) + resolutionDELabel.grid(row=9,column=1,sticky=W,padx=10,pady=5) + resolutionDEEntry.grid(row=9,column=2,sticky=W,padx=0,pady=5) + + #options for difftest + self.testDE = testDE = StringVar() + testDELabel = Label(deOptions, text = "Statistical test for differential expression:") + testDEDropdown = ["MAST","DESeq2","Likelihood 
Ratio","Logistic regression","Negative Binomial","Wilcoxon","Student's T"] + testDE.set(testDEDropdown[0]) + testDEMenu = OptionMenu(deOptions,testDE,*testDEDropdown) + testDELabel.grid(row=10,column=1,sticky=W,padx=10,pady=5) + testDEMenu.grid(row=10,column=2,sticky=W,padx=0,pady=5) + #options for filters + self.deMinPct = deMinPct = StringVar() + deMinPctLabel = Label(deOptions, text = "Minimum fraction of cells expressing DE genes:") + deMinPct.set("0.1") + deMinPctEntry = Entry(deOptions,bd=2,width = 10, textvariable=deMinPct) + deMinPctLabel.grid(row=11,column=1,sticky=W,padx=10,pady=5) + deMinPctEntry.grid(row=11,column=2,sticky=W,padx=0,pady=5) + + self.deMinFC = deMinFC = StringVar() + deMinFCLabel = Label(deOptions, text = "Minimum fold change to report DE genes:") + deMinFC.set("0.25") + deMinFCEntry = Entry(deOptions,bd=2,width = 10, textvariable=deMinFC) + deMinFCLabel.grid(row=12,column=1,sticky=W,padx=10,pady=5) + deMinFCEntry.grid(row=12,column=2,sticky=W,padx=0,pady=5) + + #use groups and contrasts for differential expression, have options to create new groups + self.om_groups = LabelFrame(deOptions, text="Sample Information") + self.groups_button = Button(self.om_groups, text="Set Groups", command = self.popup_groups ) + self.groups_button.grid(row=5, column=5, padx=10, pady=5) + self.contrasts_button = Button(self.om_groups, text="Set Contrasts", command = self.popup_groups ) + self.contrasts_button.grid(row=5, column=6, padx=10, pady=5) + self.om_groups.grid(row=13,column=0,columnspan=6,sticky=W,padx=20,pady=10) + #option for merged/integrated object # self.integrateOption = integrateOption = StringVar() # integrateLabel = Label(deOptions, text="Merged or Integrated (batch corrected): ") @@ -128,7 +177,6 @@ def __init__(self, pipepanel, pipeline_name, *args, **kwargs) : # integrateLabel.grid(row=9,column=1,sticky=W,padx=10,pady=5) # integrateOptMenu.grid(row=9,column=2,sticky=W,padx=0,pady=5) - self.add_info( deOptions ) #Position 11 
self.option_controller() ### SUBFUNCTION FOR CREATING GROUPS.TAB AND CLUSTERS.TAB WINDOW ###### @@ -148,7 +196,7 @@ def add_info( self, parent ) : self.groups_button.grid( row=5, column=5, padx=10, pady=5 ) self.contrasts_button.grid( row=5, column=6, padx=10, pady=5 ) - self.info.grid(row=11,column=0, columnspan=6, sticky=W, padx=20, pady=10) + self.info.grid(row=13,column=0, columnspan=6, sticky=W, padx=20, pady=10) def popup_groups( self ) : self.popup_window( "Groups Information", "groups.tab" ) @@ -356,15 +404,26 @@ def makejson(self, *args): PD=dict() smparams=[] + algorithmDict = {"Louvain (Original)": 1, "Louvain (with Multilevel Refinement)": 2,"SLM (Smart Local Moving)":3} - algorithm = algorithmDict[self.clustAlg.get()] + + annotationDict = {"Human Primary Cell Atlas":"HPCA", "Blueprint/ENCODE":"BP_encode","Monaco Immune":"monaco","Database of Immune Cell Expression (DICE)":"dice","ImmGen":"immgen","Mouse RNASeq":"mouseRNAseq"} + annotation = annotationDict[self.annotDB.get()] + resolutionStr = self.resolution.get() resolutionStr = re.sub("\s",",",resolutionStr) #remove whitespaces resolutionStr = re.sub("[,]+",",",resolutionStr) #remove multiple commas - + rdsDict = {"Merged (Pre-batch correction)":"merged","Integrated (Post-batch correction)":"integrated","Both":"both"} + rdsSelect = rdsDict[self.rdsObject.get()] + + deResolutionStr = self.resolutionDE.get() + deResolutionStr = re.sub("\s",",",deResolutionStr) #remove whitespace + deResolutionStr = re.sub("[,]+",",",deResolutionStr) #remove multiple commas + deTestDict = {"MAST":"MAST","DESeq2":"DESeq2","Likelihood Ratio":"bimod","Logistic regression":"LR","Negative Binomial":"negbinom","Wilcoxon":"wilcox","Student's T":"t"} + testDESelect = deTestDict[self.testDE.get()] for i in range(len(self.parameters)): @@ -414,11 +473,11 @@ def makejson(self, *args): "technique" : gi.technique.get(), "CLUSTALG" : algorithm, - "ANNOTDB" : self.annotDB.get(), + "ANNOTDB" : annotation, "CLUSTRESOLUTION": 
resolutionStr, # "INTEGRATEDE": self.integrateOption.get(), - "CLUSTERDE": self.resolutionDE.get(), "SPECIES": species, + "CITESEQ": self.citeseq.get(), # "CRID": self.scrCRID.get(), # "EXPECTED": self.scrExpected.get(), # "COUNTSPATH": self.countpath.get(), @@ -426,6 +485,11 @@ def makejson(self, *args): # "DOCYCLEREGRESS": self.docycleregress.get(), # "RESOLUTION": self.scrRes.get(), # "PCS": self.scrPCs.get(), + "FILEDE": rdsSelect, + "CLUSTERDE": deResolutionStr, + "TESTDE": testDESelect, + "DEMINPCT" : self.deMinPct.get(), + "DEMINFC": self.deMinFC.get(), "contrasts" : contrasts, "groups": groups } diff --git a/pipeliner2.py b/pipeliner2.py index bc5c485..75c6038 100755 --- a/pipeliner2.py +++ b/pipeliner2.py @@ -254,7 +254,7 @@ def set_pipeline( self, *args ) : annotation=self.annotation.get() set1=['Select the genome','hg19','mm10','mm9','hg38','hs37d5','hs38d1','hg38_30_KSHV','hg38_HPV16','canFam3','Mmul_8.0.1', 'hg38_30', 'mm10_M21'] set2=['Select the genome','hg19','mm10','mm9','hg38'] - set3=['Select the genome','GRCh38'] + set3=['Select the genome','GRCh38','mm10'] set4=['Select the genome','hg19','mm10','hg38'] if self.pipelineframe : From dbbe3bb40dc7167a99b3df4f9b54271769aaf7bb Mon Sep 17 00:00:00 2001 From: wong-nw Date: Wed, 8 Apr 2020 10:11:32 -0400 Subject: [PATCH 2/3] March updates --- Results-template/Scripts/integrateBatches.R | 51 +++++++++++++++++---- Results-template/Scripts/scrnaQC.R | 27 ++++++----- Results-template/Scripts/scrnaQC.Rmd | 4 +- Results-template/Scripts/scrnaQC_Reports.R | 2 +- Rules/scrnaQC.snakefile | 8 ++-- 5 files changed, 61 insertions(+), 31 deletions(-) diff --git a/Results-template/Scripts/integrateBatches.R b/Results-template/Scripts/integrateBatches.R index b07f3d9..e60260b 100644 --- a/Results-template/Scripts/integrateBatches.R +++ b/Results-template/Scripts/integrateBatches.R @@ -1,11 +1,14 @@ -library(Biobase,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") 
-library(farver,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") -library(S4Vectors,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") -library("SingleR",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +.libPaths("/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.1_scRNA") +print(.libPaths()) + +library(Biobase)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library(farver)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library(S4Vectors)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library("SingleR")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") library(scRNAseq) library(SingleCellExperiment) -library("destiny",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") -library("URD",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib") +library("destiny")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library("URD")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib") library("Seurat") library(dplyr) library(Matrix) @@ -79,11 +82,39 @@ for (i in 1:length(x = reference.list)) { print(length(reference.list)) print(reference.list) -combinedObj.anchors <- FindIntegrationAnchors(object.list = reference.list, dims = 1:30,anchor.features = nAnchors) -combinedObj.integrated <- IntegrateData(anchorset = combinedObj.anchors, dims = 1:30) +#combinedObj.anchors <- FindIntegrationAnchors(object.list = reference.list, dims = 1:30,anchor.features = nAnchors) +#combinedObj.integrated <- IntegrateData(anchorset = combinedObj.anchors, dims = 1:30) +#UPDATE VIA SEURAT VIGNETTE +reference.features <- SelectIntegrationFeatures(object.list = reference.list, nfeatures = 3000) +reference.list <- PrepSCTIntegration(object.list = reference.list, anchor.features = reference.features) +reference.anchors=FindIntegrationAnchors(object.list = reference.list, normalization.method = "SCT",anchor.features = reference.features) 
+combinedObj.integrated=IntegrateData(anchorset = reference.anchors, normalization.method = "SCT") + +#Recover SingleR Annotations and Statistics +singleRList=list() +for (i in 1:length(reference.list)){ + obj=reference.list[[names(reference.list)[i]]] + sample=gsub("^S_","",names(reference.list)[i]) + idTag=gsub(".*_","",names(which(combinedObj.integrated$Sample==sample))[1]) + singleR=obj@misc$SingleR + for (j in 1:length(singleR)){ + if(names(singleR)[j]%in%names(singleRList)){ + indivSingleR=singleR[[j]] + rownames(indivSingleR)=paste(rownames(indivSingleR),idTag,sep="_") + indivSingleR=indivSingleR[intersect(rownames(indivSingleR),colnames(combinedObj.integrated)),] + singleRList[[names(singleR)[j]]]=rbind(singleRList[[names(singleR)[j]]],indivSingleR) + }else{ + indivSingleR=singleR[[j]] + rownames(indivSingleR)=paste(rownames(indivSingleR),idTag,sep="_") + indivSingleR=indivSingleR[intersect(rownames(indivSingleR),colnames(combinedObj.integrated)),] + singleRList[[names(singleR)[j]]]=indivSingleR + } + } +} +combinedObj.integrated@misc$SingleR=singleRList DefaultAssay(object = combinedObj.integrated) <- "integrated" -combinedObj.integrated <- ScaleData(object = combinedObj.integrated, verbose = FALSE) +#combinedObj.integrated <- ScaleData(object = combinedObj.integrated, verbose = FALSE) #DO NOT SCALE AFTER INTEGRATION, PER SEURAT VIGNETTE combinedObj.integratedRNA = combinedObj.integrated DefaultAssay(object = combinedObj.integratedRNA) <- "SCT" @@ -119,7 +150,7 @@ runInt = function(obj,res,npcs){ obj <- RunPCA(object = obj, npcs = 50, verbose = FALSE) obj <- FindNeighbors(obj,dims = 1:npcs) for (i in 1:length(res)){ - obj <- FindClusters(obj, reduction.type = "pca", dims.use = 1:npcs, save.SNN = T,resolution = res[i],algorithm = clusterAlg) + obj <- FindClusters(obj, reduction = "pca", dims = 1:npcs, save.SNN = T,resolution = res[i],algorithm = clusterAlg) } obj <- RunUMAP(object = obj, reduction = "pca",dims = 1:npcs,n.components = 3) obj$groups = 
groupFile$V2[match(obj$Sample, groupFile$V1,nomatch = F)] diff --git a/Results-template/Scripts/scrnaQC.R b/Results-template/Scripts/scrnaQC.R index e914285..a3b37f9 100644 --- a/Results-template/Scripts/scrnaQC.R +++ b/Results-template/Scripts/scrnaQC.R @@ -1,6 +1,7 @@ #.libPaths(c("/data//CCBR_Pipeliner/db/PipeDB/scrna_lib/R-3.6.1/library",.libPaths()[1],.libPaths()[2],.libPaths()[3])) -.libPaths(c("/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA",.libPaths()[1],.libPaths()[2],.libPaths()[3])) - +#.libPaths(c("/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA",.libPaths()[1],.libPaths()[2],.libPaths()[3])) +.libPaths("/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.1_scRNA") +print(.libPaths()) args <- commandArgs(trailingOnly = TRUE) @@ -27,27 +28,25 @@ resolution = as.numeric(strsplit(gsub(",+",",",resolution),split=",")[[1]]) #rem #library(scran,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") #library(scater,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") library(BiocGenerics) -library(Biobase,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") -library(farver,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") -library(S4Vectors,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") -library("Seurat",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") -library("DoubletFinder",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library(Biobase)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library(farver)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library(S4Vectors)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library("Seurat")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library("DoubletFinder")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") library(AnnotationDbi) -library("modes",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library("modes")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") library(SingleR) library(scRNAseq) 
library(SingleCellExperiment) -library("modes",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") -#library("SingleR",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") -library("URD",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") -library(Routliers,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library("URD")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library(Routliers)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") library(dplyr) library(Matrix) library(reshape2) library(tools) -library("DoubletFinder",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") -library("modes",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library("DoubletFinder")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") + ###Run Seurat Clustering diff --git a/Results-template/Scripts/scrnaQC.Rmd b/Results-template/Scripts/scrnaQC.Rmd index aa5883f..5b03698 100755 --- a/Results-template/Scripts/scrnaQC.Rmd +++ b/Results-template/Scripts/scrnaQC.Rmd @@ -11,8 +11,8 @@ dir<-params$dir ```{r setup, echo=FALSE, warning=FALSE,message=FALSE,fig.keep='all'} - -library("Seurat",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +.libPaths("/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.1_scRNA") +library("Seurat")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") #opts_chunk$set(root.dir = dir) ``` diff --git a/Results-template/Scripts/scrnaQC_Reports.R b/Results-template/Scripts/scrnaQC_Reports.R index 32fe1ae..d75d593 100755 --- a/Results-template/Scripts/scrnaQC_Reports.R +++ b/Results-template/Scripts/scrnaQC_Reports.R @@ -4,6 +4,6 @@ dir <- as.character(args[1]) setwd(dir) #sample=strsplit(dir,".Rdat")[[1]][1] #outDir = as.character(args[2]) -rmarkdown::render("Scripts/scrnaQC.Rmd", output_file="samples_QC.html", params = list( +rmarkdown::render("../Scripts/scrnaQC.Rmd", output_file="samples_QC.html", params = list( dir=dir )) diff --git a/Rules/scrnaQC.snakefile b/Rules/scrnaQC.snakefile index 154bacc..3a9c8b6 100644 --- a/Rules/scrnaQC.snakefile 
+++ b/Rules/scrnaQC.snakefile @@ -58,7 +58,7 @@ rule qc_scrna: outDir= join(workpath,"QC") shell: """ -module load R/3.6.0; +module load R/3.6.1; Rscript Scripts/scrnaQC.R {input} {output.rds} {output.rdata} {params.specie} {params.resolution} {params.clustAlg} {params.annotDB} {params.citeseq}; touch {output.qc} """ @@ -72,8 +72,8 @@ rule qcReport_scrna: params: rname='pl:qcReport_scrna', shell: """ -module load R/3.6.0 -Rscript Scripts/scrnaQC_Reports.R {input.path} +module load R/3.6.1; +Rscript Scripts/scrnaQC_Reports.R {input.path}; mv Scripts/samples_QC.html QC """ @@ -99,7 +99,7 @@ rule integratedBatch: citeseq = citeseq, contrasts = "{myGroups}" shell: """ -module load R/3.6.0; +module load R/3.6.1; Rscript Scripts/integrateBatches.R {params.dir} {output.rdsBatch} {output.mergeRDS} {params.specie} {params.resolution} {params.clustAlg} {params.annotDB} {params.nAnchors} {params.citeseq} {params.groups} {params.contrasts}; """ From 57ca868f7cbb96eb805d37afbd07e56659958e8e Mon Sep 17 00:00:00 2001 From: Skyler Kuhn Date: Fri, 10 Apr 2020 12:20:45 -0400 Subject: [PATCH 3/3] Merging activeDev for v4.0.1 release (#444) * Fix sequenza error * Scrnaseq (#437) Scrnaseq: miRNA-seq changes and scRNA-seq changes * Symlink counts matrix if "counts/" exists in users rawdata directory * Dectect if users is starting from a raw counts matrix * Updating pop-up box dialogue after detection of new counts matrix filetype * Find differential expression metadata in "/path/to/rawdata/counts/" * Readability: checking for counts matrix outside conditional statement * Adding "--alignEndsProtrude 10 ConcordantPair --peOverlapNbasesMin 10" to the first and second pass of STAR * Explicitly loading 'python/2.7': default version of python changing to '3.7' Co-authored-by: Mayank Tandon Co-authored-by: wong-nw <38703762+wong-nw@users.noreply.github.com> Co-authored-by: jlac --- Results-template/Scripts/doublets.R | 63 +++ Results-template/Scripts/integrateBatches.R | 196 +++++++++ 
Results-template/Scripts/run_sequenza.R | 56 ++- Results-template/Scripts/scrnaQC.R | 212 ++++++++++ Results-template/Scripts/scrnaQC.Rmd | 44 ++ Results-template/Scripts/scrnaQC_Reports.R | 9 + Rules/InitialChIPseqQC.snakefile | 8 +- Rules/initialqcrnaseq.snakefile | 8 +- Rules/miRSeq.snakefile | 437 ++++++++++++++++++++ Rules/rnaseq.snakefile | 26 +- Rules/rnaseqforfusions.rl | 2 +- Rules/scrnaQC.snakefile | 103 +++++ Rules/strelka.rl | 2 +- Rules/wgs_strelka.rl | 2 +- cluster.json | 36 ++ gui/frame.py | 44 +- gui/mirseq.py | 42 +- gui/rnaseq.py | 13 +- gui/scrnaseq.py | 264 ++++++------ hg38.json | 7 +- mm10.json | 9 +- rules.json | 5 +- standard-bin.json | 21 + 23 files changed, 1429 insertions(+), 180 deletions(-) create mode 100644 Results-template/Scripts/doublets.R create mode 100644 Results-template/Scripts/integrateBatches.R create mode 100644 Results-template/Scripts/scrnaQC.R create mode 100755 Results-template/Scripts/scrnaQC.Rmd create mode 100755 Results-template/Scripts/scrnaQC_Reports.R create mode 100644 Rules/miRSeq.snakefile create mode 100644 Rules/scrnaQC.snakefile diff --git a/Results-template/Scripts/doublets.R b/Results-template/Scripts/doublets.R new file mode 100644 index 0000000..b9441bd --- /dev/null +++ b/Results-template/Scripts/doublets.R @@ -0,0 +1,63 @@ +library("DoubletFinder",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library("modes",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library("Seurat") + +args <- commandArgs(trailingOnly = TRUE) + +rds <- as.character(args[1]) +sample = basename(rds) +rdata = as.character(args[2]) +load(rdata) + + +doublets <-function(dfso){ + dfso <- NormalizeData(dfso, verbose = F) + dfso <- ScaleData(dfso,verbose = F) + dfso <- FindVariableFeatures(dfso, do.plot=FALSE) + dfso <- RunPCA(dfso, pc.genes = dfso@var.genes, pcs.print = 0,verbose = F,npcs =30) + npcs = 10 + dfso <- RunUMAP(dfso, verbose=TRUE,dims = 1:npcs) + + + sweep.res.list_kidney <- paramSweep_v3(dfso,PCs = 1:10, sct 
= FALSE) + sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = FALSE) + bcmvn_kidney <- find.pK(sweep.stats_kidney) + + ## pK Identification (ground-truth) ------------------------------------------------------------------------------------------ +# sweep.res.list_kidney <- paramSweep_v3(dfso, PCs = 1:10, sct = FALSE) +# gt.calls <- dfso@meta.data[rownames(sweep.res.list_kidney[[1]]), "GT"] +# sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = TRUE, GT.calls = gt.calls) +# bcmvn_kidney <- find.pK(sweep.stats_kidney) + + + ## Homotypic Doublet Proportion Estimate ------------------------------------------------------------------------------------- + homotypic.prop <- modelHomotypic(dfso$annot) + perc = 0.008 * (length(colnames(dfso))/1000) + nExp_poi <- round(perc*length(colnames(dfso)))#dfso@cell.names + nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop)) + + ## Run DoubletFinder with varying classification stringencies ---------------------------------------------------------------- + dfso <- doubletFinder_v3(dfso, pN = 0.25, pK = 0.09, nExp = nExp_poi, reuse.pANN = FALSE,PCs = 1:10) + pAAN=tail(names(dfso@meta.data),2)[1] + dfso <- doubletFinder_v3(dfso, pN = 0.25, pK = 0.09, nExp = nExp_poi.adj, reuse.pANN = pAAN,PCs = 1:10) + + ## Plot results -------------------------------------------------------------------------------------------------------------- + DF1=tail(names(dfso@meta.data),2)[1] + DF2=tail(names(dfso@meta.data),2)[2] + + dfso@meta.data[,"DF_hi.lo"] <- dfso[[DF1]] + dfso@meta.data$DF_hi.lo[which(dfso@meta.data$DF_hi.lo == "Doublet" & dfso@meta.data[[DF2]] == "Singlet")] <- "Doublet_lo" + dfso@meta.data$DF_hi.lo[which(dfso@meta.data$DF_hi.lo == "Doublet")] <- "Doublet_hi" + return(dfso$DF_hi.lo) +} + +#so = readRDS(file) +so$annot = so$HPCA +doublets = doublets(so) +so$DF_hi.lo = doublets +#saveRDS(so,"x2.rds") +so=SubsetData(so,cells=names(so$DF_hi.lo)[so$DF_hi.lo =="Singlet"]) +saveRDS(so,rds) 
+save(list=c("sce_BC","sce_filt","so"),file = rdata) + + diff --git a/Results-template/Scripts/integrateBatches.R b/Results-template/Scripts/integrateBatches.R new file mode 100644 index 0000000..3ed6a17 --- /dev/null +++ b/Results-template/Scripts/integrateBatches.R @@ -0,0 +1,196 @@ +library(Biobase,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library(farver,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library(S4Vectors,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library("SingleR",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library(scRNAseq) +library(SingleCellExperiment) +library("destiny",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library("URD",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib") +library("Seurat") +library(dplyr) +library(Matrix) +library(tools) +library(stringr) + + + +args <- commandArgs(trailingOnly = TRUE) + +matrix <- as.character(args[1]) +#output = as.character(args[2]) +outDirSeurat = as.character(args[2]) +outDirMerge = as.character(args[3]) +specie = as.character(args[4]) +resolution = as.numeric(args[5]) +clusterAlg = as.numeric(args[6]) +annotDB = as.character(args[7]) +nAnchors = as.numeric(args[8]) +groups = as.character(args[9]) +contrasts = as.character(args[10]) + + + +file.names <- dir(path = matrix,pattern ="rds") + +if (groups == "YES") { + +groupFile = read.delim("groups.tab",header=F,stringsAsFactors = F) +groupFile=groupFile[groupFile$V2 %in% stringr::str_split_fixed(contrasts,pattern = "-",n = Inf)[1,],] +#groupFile = groupFile[groupFile$V2 == strsplit(contrasts,"-")[[1]][1] | groupFile$V2 == strsplit(contrasts,"-")[[1]][2] ,] + +splitFiles = gsub(".rds","",file.names)#str_split_fixed(file.names,pattern = "[.rd]",n = 2) +file.names=file.names[match(groupFile$V1,splitFiles,nomatch = F)] +print(groupFile$V1) +print(splitFiles) +print(file.names) +} + +readObj = list() +for (obj in file.names) { + 
Name=strsplit(obj,".rds")[[1]][1] + assign(paste0("S_",Name),readRDS(paste0(matrix,"/",obj))) + readObj = append(readObj,paste0("S_",Name)) + } + +#for (obj in readObj) { + # print(obj) + #sample = CreateSeuratObject(GetAssayData(object = eval(parse(text =obj)),slot = "counts")[,colnames(x = eval(parse(text =obj)))]) + #sample@assays$RNA@data = GetAssayData(object = eval(parse(text =obj)),slot = "data")[,colnames(x = eval(parse(text =obj)))] + #sample@meta.data =eval(parse(text =obj))@meta.data + #assign(obj,sample) + +#} + + +combinedObj.list=list() +i=1 +for (p in readObj){ + combinedObj.list[[p]] <- eval(parse(text = readObj[[i]])) + i <- i + 1 + } + + +reference.list <- combinedObj.list[unlist(readObj)] +print(reference.list) +for (i in 1:length(x = reference.list)) { +# reference.list[[i]] <- NormalizeData(object = reference.list[[i]], verbose = FALSE) + reference.list[[i]] <- FindVariableFeatures(object = reference.list[[i]], + selection.method = "vst", nfeatures = nAnchors, verbose = FALSE) +} + +print(length(reference.list)) +print(reference.list) +combinedObj.anchors <- FindIntegrationAnchors(object.list = reference.list, dims = 1:30,anchor.features = nAnchors) +combinedObj.integrated <- IntegrateData(anchorset = combinedObj.anchors, dims = 1:30) + + +DefaultAssay(object = combinedObj.integrated) <- "integrated" +combinedObj.integrated <- ScaleData(object = combinedObj.integrated, verbose = FALSE) + + + +combinedObj.integratedRNA = combinedObj.integrated +DefaultAssay(object = combinedObj.integratedRNA) <- "SCT" + +combinedObj.integratedRNA = FindVariableFeatures(combinedObj.integratedRNA,mean.cutoff = c(0.0125, 3),dispersion.cutoff = c(0.5, Inf),selection.method = "vst") +combinedObj.integratedRNA <- ScaleData(object = combinedObj.integratedRNA, verbose = FALSE) + + +mat1 <- t(as.matrix(FetchData(object = combinedObj.integratedRNA,slot = "counts",vars = rownames(combinedObj.integratedRNA)))) +urdObj <- createURD(count.data = mat1, min.cells=3, 
min.counts=3) +varGenes_batch = VariableFeatures(combinedObj.integrated)[VariableFeatures(combinedObj.integrated) %in% rownames(urdObj@logupx.data)] +varGenes_merge = VariableFeatures(combinedObj.integratedRNA)[VariableFeatures(combinedObj.integratedRNA) %in% rownames(urdObj@logupx.data)] + +pcs_batch <- calcPCA(urdObj, pcs.store = 50,genes.use=varGenes_batch,mp.factor = 1.2) +npcs_batch = sum(pcs_batch@pca.sig) +print(npcs_batch) + +pcs_merge <- calcPCA(urdObj, pcs.store = 50,genes.use=varGenes_merge,mp.factor = 1.2) +npcs_merge = sum(pcs_merge@pca.sig) +print(npcs_merge) + + +runInt = function(obj,res,npcs){ +obj <- RunPCA(object = obj, npcs = 50, verbose = FALSE) +obj <- FindNeighbors(obj,dims = 1:npcs) +obj <- FindClusters(obj, reduction.type = "pca", dims.use = 1:npcs, save.SNN = T,resolution = res,algorithm = clusterAlg) + +obj <- RunUMAP(object = obj, reduction = "pca", + dims = 1:npcs,n.components = 3) + +obj$groups = groupFile$V2[match(obj$Sample, groupFile$V1,nomatch = F)] + + + +runSingleR = function(obj,refFile,fineORmain){ +avg = AverageExpression(obj,assays = "SCT") +avg = as.data.frame(avg) +ref = refFile +s = SingleR(test = as.matrix(avg),ref = ref,labels = ref[[fineORmain]]) + +clustAnnot = s$labels +names(clustAnnot) = colnames(avg) +names(clustAnnot) = gsub("SCT.","",names(clustAnnot)) + +obj$clustAnnot = clustAnnot[match(obj$seurat_clusters,names(clustAnnot))] +return(obj$clustAnnot) +} + +if(annotDB == "HPCA"){ +obj$clustAnnot <- runSingleR(obj,HumanPrimaryCellAtlasData(),"label.main") +obj$clustAnnotDetail <- runSingleR(obj,HumanPrimaryCellAtlasData(),"label.fine") +} + +if(annotDB == "BP_encode"){ +obj$clustAnnot <- runSingleR(obj,BlueprintEncodeData(),"label.main") +obj$clustAnnotDetail <- runSingleR(obj,BlueprintEncodeData(),"label.fine") +} +if(annotDB == "monaco"){ +obj$clustAnnot <- runSingleR(obj,MonacoImmuneData(),"label.main") +obj$clustAnnotDetail <- runSingleR(obj,MonacoImmuneData(),"label.fine") +} +if(annotDB == "immu_cell_exp"){ 
+obj$clustAnnot <- runSingleR(obj,DatabaseImmuneCellExpressionData(),"label.main") +obj$clustAnnotDetail <- runSingleR(obj,DatabaseImmuneCellExpressionData(),"label.fine") +} + +if(annotDB == "immgen"){ +obj$clustAnnot <- runSingleR(obj,ImmGenData(),"label.main") +obj$clustAnnotDetail <- runSingleR(obj,ImmGenData(),"label.fine") +} +if(annotDB == "mouseRNAseq"){ +obj$clustAnnot <- runSingleR(obj,MouseRNAseqData(),"label.main") +obj$clustAnnotDetail <- runSingleR(obj,MouseRNAseqData(),"label.fine") +} +return(obj) +} + +combinedObj.integrated = runInt(combinedObj.integrated,0.3,npcs_batch) +saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.3.rds")) +combinedObj.integrated = runInt(combinedObj.integrated,0.6,npcs_batch) +saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.6.rds")) +combinedObj.integrated = runInt(combinedObj.integrated,0.8,npcs_batch) +saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.8.rds")) +combinedObj.integrated = runInt(combinedObj.integrated,1.0,npcs_batch) +saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_1.0.rds")) +combinedObj.integrated = runInt(combinedObj.integrated,1.2,npcs_batch) +saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_1.2.rds")) + + + +combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.3,npcs_merge) +saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.3.rds")) + +combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.6,npcs_merge) +saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.6.rds")) + +combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.8,npcs_merge) +saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.8.rds")) + +combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,1.0,npcs_merge) +saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_1.0.rds")) + +combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,1.2,npcs_merge) +saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_1.2.rds")) + + diff --git 
a/Results-template/Scripts/run_sequenza.R b/Results-template/Scripts/run_sequenza.R index 61f5fbc..e193474 100644 --- a/Results-template/Scripts/run_sequenza.R +++ b/Results-template/Scripts/run_sequenza.R @@ -2,40 +2,62 @@ args = commandArgs(trailingOnly=TRUE) library(sequenza) -library(parallel) -seqz_file = args[1] -out_dir = args[2] -n_cores = as.numeric(args[3]) -sampleid = args[4] - -if (! file.exists(seqz_file)) { +if (length(args)==0) { + stop("Must provide a seqz file") +} else { + seqz_file = args[1] + if (! file.exists(seqz_file)) { stop(paste0("Can't find this SEQZ output file: ", seqz_file)) + } +} + +if (length(args) > 1) { + out_dir = args[2] +} else { + out_dir = dirname(seqz_file) +} + +if (length(args) > 2) { + sampleid = args[3] +} else { + sampleid = gsub(".seqz.gz","",basename(seqz_file)) +} + +if (length(args) > 3) { + n_cores = as.numeric(args[4]) +} else { + n_cores = as.numeric(Sys.getenv("SLURM_CPUS_PER_TASK")) } + if (is.na(n_cores)) { n_cores = 1 } -if (is.na(out_dir)) { - out_dir = dirname(seqz_file) -} print(paste0("Using ",n_cores," cores...")) -#date() -#print("Reading seqz file...") -#seqz.data <- read.seqz(seqz_file) -#str(seqz.data, vec.len = 2) - date() print("Extracting seqz data...") -seqzdata <- sequenza.extract(seqz_file, min.reads = 30, min.reads.normal= 10) +seqzdata <- sequenza.extract(seqz_file, min.reads = 30, min.reads.normal= 10, parallel=n_cores) date() print("Fitting model...") CP.example <- sequenza.fit(seqzdata, mc.cores = n_cores) +## Sequenza.extract seems to fail if too few mutations +num_mutations <- unlist(lapply(seqzdata$mutations, nrow)) +chrom_list <- names(num_mutations)[num_mutations > 3] +## But it might actually be segments, idk? 
+#num_segments <- unlist(lapply(seqzdata$segments, nrow)) +#chrom_list <- names(num_mutations)[num_segments > 1] + +not_included <- setdiff(names(num_mutations), chrom_list) print("Printing results...") -sequenza.results(sequenza.extract = seqzdata,cp.table = CP.example, sample.id = sampleid, out.dir=out_dir) +if (length(not_included) > 0) { + print("Excluding these chromosomes because of too few mutations...") + print(not_included) +} +sequenza.results(sequenza.extract = seqzdata,cp.table = CP.example, sample.id = sampleid, out.dir=out_dir, chromosome.list=chrom_list) date() print("Done") diff --git a/Results-template/Scripts/scrnaQC.R b/Results-template/Scripts/scrnaQC.R new file mode 100644 index 0000000..2640968 --- /dev/null +++ b/Results-template/Scripts/scrnaQC.R @@ -0,0 +1,212 @@ +#.libPaths(c("/data//CCBR_Pipeliner/db/PipeDB/scrna_lib/R-3.6.1/library",.libPaths()[1],.libPaths()[2],.libPaths()[3])) +.libPaths(c("/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA",.libPaths()[1],.libPaths()[2],.libPaths()[3])) + + +args <- commandArgs(trailingOnly = TRUE) + +file <- as.character(args[1]) +sample = basename(file) +outRDS = as.character(args[2]) +outRdata = as.character(args[3]) +species = as.character(args[4]) +resolution = as.numeric(args[5]) +clusterAlg = as.numeric(args[6]) +annotDB = as.character(args[7]) +#matrix = paste0(file,"/outs/filtered_feature_bc_matrix.h5") + +matrix=file + +#library(future) +#plan("multiprocess", workers = 8) + + +#library(DropletUtils,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +#library(scran,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +#library(scater,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library(Biobase,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library(farver,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library(S4Vectors,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") 
+library("Seurat",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library("DoubletFinder",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library(AnnotationDbi) +library("modes",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") + +library(SingleR) +library(scRNAseq) +library(SingleCellExperiment) +library("modes",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +#library("SingleR",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library("URD",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library(Routliers,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library(dplyr) +library(Matrix) +library(reshape2) +library(tools) +library("DoubletFinder",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library("modes",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") + + +###Run Seurat Clustering +seuratClustering = function(so){ + + +tail(strsplit(file,"/")[[1]],1) +so$Sample=tail(strsplit(file,"/")[[1]],1) +so = SCTransform(so) + +so <- RunPCA(object = so, features = VariableFeatures(object = so), do.print = TRUE, pcs.print = 1:5,genes.print = 0,verbose=F,npcs = 30) + +if(ncol(so)<50000) { +mat1 <- t(as.matrix(FetchData(object = so,slot = "counts",vars = rownames(so)))) +urdObj <- createURD(count.data = mat1, min.cells=3, min.counts=3) +varGenes = so@assays$RNA@var.features[so@assays$RNA@var.features %in% rownames(urdObj@logupx.data)] +pcs <- calcPCA(urdObj, mp.factor = 1,pcs.store = 30,genes.use=varGenes) +npcs = sum(pcs@pca.sig) +} + +else { +npcs = 30 + +} + +so <- FindNeighbors(so,dims = 1:npcs) +so <- FindClusters(so,dims = 1:npcs, print.output = 0, resolution = resolution,algorithm = clusterAlg) +so <- RunUMAP(so,dims = 1:npcs) + + +return(so) +} + +doublets <-function(dfso){ + dfso <- SCTransform(dfso) + dfso <- RunPCA(dfso, pc.genes = dfso@var.genes, pcs.print = 0,verbose = F,npcs =30) + npcs = 10 + dfso <- RunUMAP(dfso, verbose=TRUE,dims = 1:npcs) + + + sweep.res.list_kidney <- paramSweep_v3(dfso,PCs = 1:10, sct = T) + 
sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = FALSE) + print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") + bcmvn_kidney <- find.pK(sweep.stats_kidney) + ## pK Identification (ground-truth) ------------------------------------------------------------------------------------------ +# sweep.res.list_kidney <- paramSweep_v3(dfso, PCs = 1:10, sct = FALSE) +# gt.calls <- dfso@meta.data[rownames(sweep.res.list_kidney[[1]]), "GT"] +# sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = TRUE, GT.calls = gt.calls) +# bcmvn_kidney <- find.pK(sweep.stats_kidney) + + + ## Homotypic Doublet Proportion Estimate ------------------------------------------------------------------------------------- + homotypic.prop <- modelHomotypic(dfso$annot) + perc = 0.008 * (length(colnames(dfso))/1000) + nExp_poi <- round(perc*length(colnames(dfso)))#dfso@cell.names + nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop)) + print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx") + ## Run DoubletFinder with varying classification stringencies ---------------------------------------------------------------- + dfso <- doubletFinder_v3(dfso, pN = 0.25, pK = 0.09, nExp = nExp_poi, reuse.pANN = FALSE,PCs = 1:10,sct = T) + pAAN=tail(names(dfso@meta.data),2)[1] + dfso <- doubletFinder_v3(dfso, pN = 0.25, pK = 0.09, nExp = nExp_poi.adj, reuse.pANN = pAAN,PCs = 1:10,sct = T) + + ## Plot results -------------------------------------------------------------------------------------------------------------- + #DF1=tail(names(dfso@meta.data),2)[1] + # DF2=tail(names(dfso@meta.data),2)[2] +# dfso@meta.data[,"DF_hi.lo"] <- dfso[[DF2]] +# dfso@meta.data$DF_hi.lo[which(dfso@meta.data$DF_hi.lo == "Doublet" & dfso@meta.data[[DF2]] == "Singlet")] <- "Doublet_lo" + # dfso@meta.data$DF_hi.lo[which(dfso@meta.data$DF_hi.lo == "Doublet")] <- "Doublet_hi" + return(dfso) +} + + + +convertHumanGeneList <- function(x){ + + require("biomaRt") + human = useMart("ensembl", dataset = "hsapiens_gene_ensembl") + mouse = 
useMart("ensembl", dataset = "mmusculus_gene_ensembl")
+
+  genesV2 = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = x , mart = human, attributesL = c("mgi_symbol"), martL = mouse, uniqueRows=T)
+
+  humanx <- unique(genesV2[, 2])
+
+  return(humanx)
+}
+
+so_BC = Read10X_h5(matrix)
+so_BC = CreateSeuratObject(so_BC)
+
+if(species=="human"){so_BC[["percent.mt"]] <- PercentageFeatureSet(so_BC, pattern = "^MT-")}
+if(species=="mouse"){so_BC[["percent.mt"]] <- PercentageFeatureSet(so_BC, pattern = "^mt-")}
+
+
+
+nCount_out = outliers_mad(so_BC$nCount_RNA,threshold = 3)$LL_CI_MAD
+nFeature_out = outliers_mad(so_BC$nFeature_RNA,threshold = 3)$LL_CI_MAD
+mt_out = outliers_mad(so_BC$percent.mt,threshold = 3)$UL_CI_MAD
+
+so_filt <- subset(so_BC, subset = nFeature_RNA > nFeature_out & nCount_RNA > nCount_out & percent.mt < mt_out)
+
+
+load("/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/immgenMod.RData")
+
+#rownames(sce_norm) = gsub("_","-",rownames(sce_norm))
+#rownames(sce_norm) = make.names(sce_norm@rowRanges@elementMetadata$Symbol,unique = T)
+
+
+so = seuratClustering(so_filt)
+
+if(species=="human"){
+s.genes <- cc.genes$s.genes
+g2m.genes <- cc.genes$g2m.genes
+}
+
+if(species=="mouse"){
+s.genes <- convertHumanGeneList(cc.genes$s.genes)
+g2m.genes <- convertHumanGeneList(cc.genes$g2m.genes)
+}
+
+
+so = CellCycleScoring(so,s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
+
+runSingleR = function(obj,refFile,fineORmain){
+sce = as.SingleCellExperiment(obj,assay = "SCT")
+ref = refFile
+s = SingleR(test = sce, ref = ref,labels = ref[[fineORmain]])
+print(head(s$pruned.labels))
+return(s$pruned.labels)
+}
+
+if(species == "human"){
+so$HPCA_main <- runSingleR(so,HumanPrimaryCellAtlasData(),"label.main")
+so$HPCA <- runSingleR(so,HumanPrimaryCellAtlasData(),"label.fine")
+so$BP_encode_main <- runSingleR(so,BlueprintEncodeData(),"label.main")
+so$BP_encode <- runSingleR(so,BlueprintEncodeData(),"label.fine")
+so$monaco_main <- 
runSingleR(so,MonacoImmuneData(),"label.main") +so$monaco <- runSingleR(so,MonacoImmuneData(),"label.fine") +so$immu_cell_exp_main <- runSingleR(so,DatabaseImmuneCellExpressionData(),"label.main") +so$immu_cell_exp <- runSingleR(so,DatabaseImmuneCellExpressionData(),"label.fine") + +} + +if(species == "mouse"){ + +so$immgen_main <- runSingleR(so,ImmGenData(),"label.main") +so$immgen <- runSingleR(so,ImmGenData(),"label.fine") +so$mouseRNAseq_main <- runSingleR(so,MouseRNAseqData(),"label.main") +so$mouseRNAseq <- runSingleR(so,MouseRNAseqData(),"label.fine") + +} + + + + +so$annot = so[[paste0(annotDB,"_main")]] + +so = doublets(so) +so$DF_hi.lo = so[[tail(names(so@meta.data),1)]] +so_noDoublet=subset(so,cells=names(so$DF_hi.lo)[so$DF_hi.lo =="Singlet"]) + + +saveRDS(so_noDoublet,outRDS) +save.image(outRdata) + + + diff --git a/Results-template/Scripts/scrnaQC.Rmd b/Results-template/Scripts/scrnaQC.Rmd new file mode 100755 index 0000000..aa5883f --- /dev/null +++ b/Results-template/Scripts/scrnaQC.Rmd @@ -0,0 +1,44 @@ +--- +params: + dir: "path/to/count/matrix" +--- + +```{r headers, include=FALSE, warning=FALSE, message=FALSE} +dateandtime<-format(Sys.time(), "%a %b %d %Y - %X") +dir<-params$dir +``` + + + +```{r setup, echo=FALSE, warning=FALSE,message=FALSE,fig.keep='all'} + +library("Seurat",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +#opts_chunk$set(root.dir = dir) +``` + + +```{r plots, echo=FALSE , warning=FALSE,fig.keep='all',results='asis'} + +setwd(dir) +file.names <- dir(path = dir ,pattern =".RData") + +for (file in file.names){ +fullPath=paste0(dir,"/",file) +load(file) + +Name=strsplit(file,".RData")[[1]][1] + +print(Name) + +print(VlnPlot(so_BC, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)) #before + +print(VlnPlot(so_filt, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)) #after + +print(DimPlot(so,group.by = "annot")) #annot + +print(DimPlot(so,group.by = "DF_hi.lo")) #double + +} + + +``` diff 
--git a/Results-template/Scripts/scrnaQC_Reports.R b/Results-template/Scripts/scrnaQC_Reports.R new file mode 100755 index 0000000..32fe1ae --- /dev/null +++ b/Results-template/Scripts/scrnaQC_Reports.R @@ -0,0 +1,9 @@ +args <- commandArgs(trailingOnly = TRUE) + +dir <- as.character(args[1]) +setwd(dir) +#sample=strsplit(dir,".Rdat")[[1]][1] +#outDir = as.character(args[2]) +rmarkdown::render("Scripts/scrnaQC.Rmd", output_file="samples_QC.html", params = list( + dir=dir +)) diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile index 677cb89..452679b 100644 --- a/Rules/InitialChIPseqQC.snakefile +++ b/Rules/InitialChIPseqQC.snakefile @@ -610,7 +610,7 @@ rule deeptools_QC: deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'], run: import re - commoncmd="module load {params.deeptoolsver}; module load python;" + commoncmd="module load {params.deeptoolsver}; module load python/2.7;" listfile=list(map(lambda z:z.strip().split(),open(input[0],'r').readlines())) ext=listfile[0][0] bws=listfile[1] @@ -639,7 +639,7 @@ rule deeptools_fingerprint: nthreads="8" run: import re - commoncmd="module load {params.deeptoolsver}; module load python;" + commoncmd="module load {params.deeptoolsver}; module load python/2.7;" listfile=list(map(lambda z:z.strip().split(),open(input.prep,'r').readlines())) ext=listfile[0][0] bams=listfile[1] @@ -662,7 +662,7 @@ rule deeptools_fingerprint_Q5DD: nthreads="8" run: import re - commoncmd="module load {params.deeptoolsver}; module load python;" + commoncmd="module load {params.deeptoolsver}; module load python/2.7;" listfile=list(map(lambda z:z.strip().split(),open(input[0],'r').readlines())) ext=listfile[0][0] bams=listfile[1] @@ -690,7 +690,7 @@ rule deeptools_genes: nthreads="16" run: import re - commoncmd="module load {params.deeptoolsver}; module load python;" + commoncmd="module load {params.deeptoolsver}; module load python/2.7;" listfile=list(map(lambda 
z:z.strip().split(),open(input[0],'r').readlines())) ext=listfile[0][0] bws=listfile[1] diff --git a/Rules/initialqcrnaseq.snakefile b/Rules/initialqcrnaseq.snakefile index ef35bb0..63a2ca2 100644 --- a/Rules/initialqcrnaseq.snakefile +++ b/Rules/initialqcrnaseq.snakefile @@ -272,7 +272,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml} rl=int(open(join(input.qcdir,"readlength.txt")).readlines()[0].strip())-1 dbrl=sorted(list(map(lambda x:int(re.findall("genes-(\d+)",x)[0]),glob.glob(params.stardir+'*/',recursive=False)))) bestdbrl=next(x[1] for x in enumerate(dbrl) if x[1] >= rl) - cmd="cd {star_dir}; module load "+params.starver+"; STAR --genomeDir "+params.stardir+str(bestdbrl)+" --outFilterIntronMotifs "+params.filterintronmotifs+" --outSAMstrandField "+params.samstrandfield+" --outFilterType "+params.filtertype+" --outFilterMultimapNmax "+str(params.filtermultimapnmax)+" --alignSJoverhangMin "+str(params.alignsjoverhangmin)+" --alignSJDBoverhangMin "+str(params.alignsjdboverhangmin)+" --outFilterMismatchNmax "+str(params.filtermismatchnmax)+" --outFilterMismatchNoverLmax "+str(params.filtermismatchnoverlmax)+" --alignIntronMin "+str(params.alignintronmin)+" --alignIntronMax "+str(params.alignintronmax)+" --alignMatesGapMax "+str(params.alignmatesgapmax)+" --clip3pAdapterSeq "+params.adapter1+" "+params.adapter2+" --readFilesIn "+input.file1+" "+input.file2+" --readFilesCommand zcat --runThreadN "+str(threads)+" --outFileNamePrefix "+params.prefix+". 
--outSAMtype BAM Unsorted" + cmd="cd {star_dir}; module load "+params.starver+"; STAR --genomeDir "+params.stardir+str(bestdbrl)+" --outFilterIntronMotifs "+params.filterintronmotifs+" --outSAMstrandField "+params.samstrandfield+" --outFilterType "+params.filtertype+" --outFilterMultimapNmax "+str(params.filtermultimapnmax)+" --alignSJoverhangMin "+str(params.alignsjoverhangmin)+" --alignSJDBoverhangMin "+str(params.alignsjdboverhangmin)+" --outFilterMismatchNmax "+str(params.filtermismatchnmax)+" --outFilterMismatchNoverLmax "+str(params.filtermismatchnoverlmax)+" --alignIntronMin "+str(params.alignintronmin)+" --alignIntronMax "+str(params.alignintronmax)+" --alignMatesGapMax "+str(params.alignmatesgapmax)+" --clip3pAdapterSeq "+params.adapter1+" "+params.adapter2+" --readFilesIn "+input.file1+" "+input.file2+" --readFilesCommand zcat --runThreadN "+str(threads)+" --outFileNamePrefix "+params.prefix+". --outSAMtype BAM Unsorted --alignEndsProtrude 10 ConcordantPair --peOverlapNbasesMin 10" shell(cmd) A=open(join(workpath,"run.json"),'r') a=eval(A.read()) @@ -339,7 +339,7 @@ cat {input.files} |sort|uniq|awk -F \"\\t\" '{{if ($5>0 && $6==1) {{print}}}}'|c rl=int(open(join(input.qcdir,"readlength.txt")).readlines()[0].strip())-1 dbrl=sorted(list(map(lambda x:int(re.findall("genes-(\d+)",x)[0]),glob.glob(params.stardir+'*/',recursive=False)))) bestdbrl=next(x[1] for x in enumerate(dbrl) if x[1] >= rl) - cmd="cd {star_dir}; module load "+params.starver+"; STAR --genomeDir "+params.stardir+str(bestdbrl)+" --outFilterIntronMotifs "+params.filterintronmotifs+" --outSAMstrandField "+params.samstrandfield+" --outFilterType "+params.filtertype+" --outFilterMultimapNmax "+str(params.filtermultimapnmax)+" --alignSJoverhangMin "+str(params.alignsjoverhangmin)+" --alignSJDBoverhangMin "+str(params.alignsjdboverhangmin)+" --outFilterMismatchNmax "+str(params.filtermismatchnmax)+" --outFilterMismatchNoverLmax "+str(params.filtermismatchnoverlmax)+" --alignIntronMin 
"+str(params.alignintronmin)+" --alignIntronMax "+str(params.alignintronmax)+" --alignMatesGapMax "+str(params.alignmatesgapmax)+" --clip3pAdapterSeq "+params.adapter1+" "+params.adapter2+" --readFilesIn "+input.file1+" "+input.file2+" --readFilesCommand zcat --runThreadN "+str(threads)+" --outFileNamePrefix "+params.prefix+". --outSAMunmapped "+params.outsamunmapped+" --outWigType "+params.wigtype+" --outWigStrand "+params.wigstrand+" --sjdbFileChrStartEnd "+str(input.tab)+" --sjdbGTFfile "+params.gtffile+" --limitSjdbInsertNsj "+str(params.nbjuncs)+" --quantMode TranscriptomeSAM GeneCounts --outSAMtype BAM SortedByCoordinate;" + cmd="cd {star_dir}; module load "+params.starver+"; STAR --genomeDir "+params.stardir+str(bestdbrl)+" --outFilterIntronMotifs "+params.filterintronmotifs+" --outSAMstrandField "+params.samstrandfield+" --outFilterType "+params.filtertype+" --outFilterMultimapNmax "+str(params.filtermultimapnmax)+" --alignSJoverhangMin "+str(params.alignsjoverhangmin)+" --alignSJDBoverhangMin "+str(params.alignsjdboverhangmin)+" --outFilterMismatchNmax "+str(params.filtermismatchnmax)+" --outFilterMismatchNoverLmax "+str(params.filtermismatchnoverlmax)+" --alignIntronMin "+str(params.alignintronmin)+" --alignIntronMax "+str(params.alignintronmax)+" --alignMatesGapMax "+str(params.alignmatesgapmax)+" --clip3pAdapterSeq "+params.adapter1+" "+params.adapter2+" --readFilesIn "+input.file1+" "+input.file2+" --readFilesCommand zcat --runThreadN "+str(threads)+" --outFileNamePrefix "+params.prefix+". 
--outSAMunmapped "+params.outsamunmapped+" --outWigType "+params.wigtype+" --outWigStrand "+params.wigstrand+" --sjdbFileChrStartEnd "+str(input.tab)+" --sjdbGTFfile "+params.gtffile+" --limitSjdbInsertNsj "+str(params.nbjuncs)+" --quantMode TranscriptomeSAM GeneCounts --outSAMtype BAM SortedByCoordinate --alignEndsProtrude 10 ConcordantPair --peOverlapNbasesMin 10;" shell(cmd) cmd="sleep 120;cd {workpath};mv {workpath}/{star_dir}/{params.prefix}.Aligned.toTranscriptome.out.bam {workpath}/{bams_dir}; mv {workpath}/{star_dir}/{params.prefix}.Log.final.out {workpath}/{log_dir}" shell(cmd) @@ -563,7 +563,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml} rl=int(open(join(input.qcdir,"readlength.txt")).readlines()[0].strip())-1 dbrl=sorted(list(map(lambda x:int(re.findall("genes-(\d+)",x)[0]),glob.glob(params.stardir+'*/',recursive=False)))) bestdbrl=next(x[1] for x in enumerate(dbrl) if x[1] >= rl) - cmd="cd {star_dir}; module load "+params.starver+"; STAR --genomeDir "+params.stardir+str(bestdbrl)+" --outFilterIntronMotifs "+params.filterintronmotifs+" --outSAMstrandField "+params.samstrandfield+" --outFilterType "+params.filtertype+" --outFilterMultimapNmax "+str(params.filtermultimapnmax)+" --alignSJoverhangMin "+str(params.alignsjoverhangmin)+" --alignSJDBoverhangMin "+str(params.alignsjdboverhangmin)+" --outFilterMismatchNmax "+str(params.filtermismatchnmax)+" --outFilterMismatchNoverLmax "+str(params.filtermismatchnoverlmax)+" --alignIntronMin "+str(params.alignintronmin)+" --alignIntronMax "+str(params.alignintronmax)+" --alignMatesGapMax "+str(params.alignmatesgapmax)+" --clip3pAdapterSeq "+params.adapter1+" "+params.adapter2+" --readFilesIn "+input.file1+" --readFilesCommand zcat --runThreadN "+str(threads)+" --outFileNamePrefix "+params.prefix+". 
--outSAMtype BAM Unsorted" + cmd="cd {star_dir}; module load "+params.starver+"; STAR --genomeDir "+params.stardir+str(bestdbrl)+" --outFilterIntronMotifs "+params.filterintronmotifs+" --outSAMstrandField "+params.samstrandfield+" --outFilterType "+params.filtertype+" --outFilterMultimapNmax "+str(params.filtermultimapnmax)+" --alignSJoverhangMin "+str(params.alignsjoverhangmin)+" --alignSJDBoverhangMin "+str(params.alignsjdboverhangmin)+" --outFilterMismatchNmax "+str(params.filtermismatchnmax)+" --outFilterMismatchNoverLmax "+str(params.filtermismatchnoverlmax)+" --alignIntronMin "+str(params.alignintronmin)+" --alignIntronMax "+str(params.alignintronmax)+" --alignMatesGapMax "+str(params.alignmatesgapmax)+" --clip3pAdapterSeq "+params.adapter1+" "+params.adapter2+" --readFilesIn "+input.file1+" --readFilesCommand zcat --runThreadN "+str(threads)+" --outFileNamePrefix "+params.prefix+". --outSAMtype BAM Unsorted --alignEndsProtrude 10 ConcordantPair --peOverlapNbasesMin 10" shell(cmd) A=open(join(workpath,"run.json"),'r') a=eval(A.read()) @@ -629,7 +629,7 @@ cat {input.files} |sort|uniq|awk -F \"\\t\" '{{if ($5>0 && $6==1) {{print}}}}'|c rl=int(open(join(input.qcdir,"readlength.txt")).readlines()[0].strip())-1 dbrl=sorted(list(map(lambda x:int(re.findall("genes-(\d+)",x)[0]),glob.glob(params.stardir+'*/',recursive=False)))) bestdbrl=next(x[1] for x in enumerate(dbrl) if x[1] >= rl) - cmd="cd {star_dir}; module load "+params.starver+"; STAR --genomeDir "+params.stardir+str(bestdbrl)+" --outFilterIntronMotifs "+params.filterintronmotifs+" --outSAMstrandField "+params.samstrandfield+" --outFilterType "+params.filtertype+" --outFilterMultimapNmax "+str(params.filtermultimapnmax)+" --alignSJoverhangMin "+str(params.alignsjoverhangmin)+" --alignSJDBoverhangMin "+str(params.alignsjdboverhangmin)+" --outFilterMismatchNmax "+str(params.filtermismatchnmax)+" --outFilterMismatchNoverLmax "+str(params.filtermismatchnoverlmax)+" --alignIntronMin "+str(params.alignintronmin)+" 
--alignIntronMax "+str(params.alignintronmax)+" --alignMatesGapMax "+str(params.alignmatesgapmax)+" --clip3pAdapterSeq "+params.adapter1+" "+params.adapter2+" --readFilesIn "+input.file1+" --readFilesCommand zcat --runThreadN "+str(threads)+" --outFileNamePrefix "+params.prefix+". --outSAMunmapped "+params.outsamunmapped+" --outWigType "+params.wigtype+" --outWigStrand "+params.wigstrand+" --sjdbFileChrStartEnd "+str(input.tab)+" --sjdbGTFfile "+params.gtffile+" --limitSjdbInsertNsj "+str(params.nbjuncs)+" --quantMode TranscriptomeSAM GeneCounts --outSAMtype BAM SortedByCoordinate" + cmd="cd {star_dir}; module load "+params.starver+"; STAR --genomeDir "+params.stardir+str(bestdbrl)+" --outFilterIntronMotifs "+params.filterintronmotifs+" --outSAMstrandField "+params.samstrandfield+" --outFilterType "+params.filtertype+" --outFilterMultimapNmax "+str(params.filtermultimapnmax)+" --alignSJoverhangMin "+str(params.alignsjoverhangmin)+" --alignSJDBoverhangMin "+str(params.alignsjdboverhangmin)+" --outFilterMismatchNmax "+str(params.filtermismatchnmax)+" --outFilterMismatchNoverLmax "+str(params.filtermismatchnoverlmax)+" --alignIntronMin "+str(params.alignintronmin)+" --alignIntronMax "+str(params.alignintronmax)+" --alignMatesGapMax "+str(params.alignmatesgapmax)+" --clip3pAdapterSeq "+params.adapter1+" "+params.adapter2+" --readFilesIn "+input.file1+" --readFilesCommand zcat --runThreadN "+str(threads)+" --outFileNamePrefix "+params.prefix+". 
--outSAMunmapped "+params.outsamunmapped+" --outWigType "+params.wigtype+" --outWigStrand "+params.wigstrand+" --sjdbFileChrStartEnd "+str(input.tab)+" --sjdbGTFfile "+params.gtffile+" --limitSjdbInsertNsj "+str(params.nbjuncs)+" --quantMode TranscriptomeSAM GeneCounts --outSAMtype BAM SortedByCoordinate --alignEndsProtrude 10 ConcordantPair --peOverlapNbasesMin 10" shell(cmd) cmd="sleep 120;cd {workpath};mv {workpath}/{star_dir}/{params.prefix}.Aligned.toTranscriptome.out.bam {workpath}/{bams_dir}; mv {workpath}/{star_dir}/{params.prefix}.Log.final.out {workpath}/{log_dir}" shell(cmd) diff --git a/Rules/miRSeq.snakefile b/Rules/miRSeq.snakefile new file mode 100644 index 0000000..ad31dbc --- /dev/null +++ b/Rules/miRSeq.snakefile @@ -0,0 +1,437 @@ +from snakemake.utils import R +from os.path import join +configfile: "run.json" +# configfile: "run1.json.1" +from os import listdir +import os + +#various subroutines +def check_existence(filename): + if not os.path.exists(filename): + exit("File: %s does not exists!"%(filename)) + +def check_readaccess(filename): + check_existence(filename) + if not os.access(filename,os.R_OK): + exit("File: %s exists, but cannot be read!"%(filename)) + +def check_writeaccess(filename): + check_existence(filename) + if not os.access(filename,os.W_OK): + exit("File: %s exists, but cannot be read!"%(filename)) + +def createConstrasts(cList): + contrastsList = [] + for i in range(0, len(cList)-1, 2): + contrastsList.append("-".join(cList[i:i+2])) + return contrastsList + +def get_cpm_cutoff(degdir,contrasts_w_cpm_cutoff_list,cpm_cutoff_list): + return cpm_cutoff_list[contrasts_w_cpm_cutoff_list.index(degdir)] + +samples=config['project']['groups']['rsamps'] + +workpath=config['project']['workpath'] + +# contrastsList = createConstrasts(config['project']['contrasts']['rcontrasts']) +# cpm_cutoff_list = config['project']['contrasts']['rcontrasts_cpm_cutoff'] +# mincounts = config['project']['contrasts']['rcontrasts_min_counts'] + +# 
contrasts_w_cpm_cutoff_list = [] +# for i in zip(contrastsList,cpm_cutoff_list,mincounts): +# contrasts_w_cpm_cutoff_list.append(str(i[0]) + "_" + str(i[1]) + "_" + str(i[2])) +# print(contrasts_w_cpm_cutoff_list, contrastsList) + + +#trim #cutadapt +se = "" +pe = "" +workpath = config['project']['workpath'] +# +# if config['project']['nends'] == 2 : +# se="no" +# elif config['project']['nends'] == 1 : +# se="yes" + +se = "yes" +trim_dir='trim' +kraken_dir='kraken' +log_dir="logfiles" +# bams_dir="bams" +# degall_dir="DEG_ALL" +pfamily = config['project']['pfamily'] + +for d in [trim_dir,kraken_dir,log_dir]:#trim_dir,kraken_dir,bams_dir,star_dir,log_dir,rseqc_dir,preseq_dir,degall_dir + if not os.path.exists(join(workpath,d)): + os.mkdir(join(workpath,d)) + +if not os.path.exists("slurmfiles"): + os.mkdir ("slurmfiles") + +if se=="yes": + if (config['project']['novelMirs'] == "YES"): + rule all: + params: + batch = '--time=168:00:00' + input: + join(workpath,'rawQC'), + join(workpath,'QC'), + expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.taxa.txt"),name=samples), + expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.krona.html"),name=samples), + expand(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),name=samples), + join(workpath,'Reports/multiqc_report.html'), + expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples), + expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.txt"),name=samples), + expand(join(workpath,"FQscreen","{name}.R1.trim_screen.png"),name=samples), + expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.png"),name=samples), + # # # # expand(join(workpath,"fasta/trim/{name}.trim.fasta.gz"),name=samples), + # # # # expand(join(workpath,"fasta/trim/{name}.trim.fasta"),name=samples), + expand(join(workpath,"fasta","{name}.trimfasta.done"),name=samples), + join(workpath,"fasta","collapsed_trim.fa"), + join(workpath,"mirdeep2_1p/miRNAs_expressed_all_samples_1p.txt"), + 
join(workpath,"fasta/novel_mature.fa"), + join(workpath,"fasta/novel_hairpin.fa"), + join(workpath,"mirdeep2_2p/novel_miRNAs_expressed_all_samples_2p.txt"), + join(workpath,"mirdeep2_results/annotatedCounts.txt"), + join(workpath,"mirdeep2_results/novelCounts.txt") + elif(config['project']['novelMirs'] == "NO"): + rule all: + params: + batch = '--time=72:00:00' + input: + join(workpath,'rawQC'), + join(workpath,'QC'), + expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.taxa.txt"),name=samples), + expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.krona.html"),name=samples), + expand(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),name=samples), + join(workpath,'Reports/multiqc_report.html'), + expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples), + expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.txt"),name=samples), + expand(join(workpath,"FQscreen","{name}.R1.trim_screen.png"),name=samples), + expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.png"),name=samples), + # # # # expand(join(workpath,"fasta/trim/{name}.trim.fasta.gz"),name=samples), + # # # # expand(join(workpath,"fasta/trim/{name}.trim.fasta"),name=samples), + expand(join(workpath,"fasta","{name}.trimfasta.done"),name=samples), + join(workpath,"fasta","collapsed_trim.fa"), + join(workpath,"mirdeep2_out","miRNAs_expressed_all_samples.txt"), + join(workpath,"mirdeep2_results/annotated_counts.txt") + + + rule raw_fastqc: + input: + expand("{name}.R1.fastq.gz", name = samples) + output: + join(workpath,"rawQC") + priority: 2 + params: + rname = 'pl:rawfastqc', + batch = '--cpus-per-task=32 --mem=110g --time=48:00:00', + fastqcver = config['bin'][pfamily]['tool_versions']['FASTQCVER'], + threads: 32 + shell: """ + mkdir -p {output}; + module load {params.fastqcver}; + fastqc {input} -t {threads} -o {output}; + """ + rule trim_se: + input: + infq=join(workpath,"{name}.R1.fastq.gz") #+config['project']['filetype'] used if needed for file 
flexibility + output: + out11=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz") + #err=join(workpath,"QC","{name}_run_cutadapt.err") + params: + rname='pl:trim_se', + batch='--cpus-per-task=32 --mem=110g --time=48:00:00', + cutadaptver=config['bin'][pfamily]['tool_versions']['CUTADAPTVER'], + fastawithadaptersetd=config['bin'][pfamily]['tool_parameters']['FASTAWITHADAPTERSETD'], + leadingquality=config['bin'][pfamily]['tool_parameters']['LEADINGQUALITY'], + trailingquality=config['bin'][pfamily]['tool_parameters']['TRAILINGQUALITY'], + minlen=config['bin'][pfamily]['tool_parameters']['MINLEN'] + threads: 32 + shell:""" + module load {params.cutadaptver}; + cutadapt --nextseq-trim=2 --trim-n -n 5 -O 5 -q {params.leadingquality},{params.trailingquality} -m {params.minlen} -b file:{params.fastawithadaptersetd} -j {threads} -o {output.out11} {input.infq} + """ + + rule kraken: + input: + fq=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz") + output: + krakentaxa=join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.taxa.txt"), + kronahtml=join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.krona.html") + params: + rname='pl:kraken', + prefix="{name}", + outdir=join(workpath,kraken_dir), + bacdb=config['bin'][pfamily]['tool_parameters']['KRAKENBACDB'], + krakenver=config['bin'][pfamily]['tool_versions']['KRAKENVER'], + kronatoolsver=config['bin'][pfamily]['tool_versions']['KRONATOOLSVER'], + threads: 24 + shell: """ + module load {params.krakenver}; + module load {params.kronatoolsver}; + if [ ! -d {params.outdir} ]; then mkdir {params.outdir}; fi + if [ ! 
-e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID ; fi + cd /lscratch/$SLURM_JOBID; + cp -rv {params.bacdb} /lscratch/$SLURM_JOBID/; + dbName=`echo {params.bacdb}|awk -F "/" '{{print \$NF}}'`; + kraken --db /lscratch/$SLURM_JOBID/$dbName --fastq-input --gzip-compressed --threads {threads} --output /lscratch/$SLURM_JOBID/{params.prefix}.krakenout --preload {input.fq} + kraken-translate --mpa-format --db /lscratch/$SLURM_JOBID/`echo {params.bacdb}|awk -F "/" '{{print \$NF}}'` /lscratch/$SLURM_JOBID/{params.prefix}.krakenout |cut -f2|sort|uniq -c|sort -k1,1nr > /lscratch/$SLURM_JOBID/{params.prefix}.krakentaxa + cut -f2,3 /lscratch/$SLURM_JOBID/{params.prefix}.krakenout | ktImportTaxonomy - -o /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml + mv /lscratch/$SLURM_JOBID/{params.prefix}.krakentaxa {output.krakentaxa} + mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml} + """ + + rule fastqc: + input: + expand(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),name=samples), + output: + join(workpath,"QC") + priority: 2 + params: + rname='pl:fastqc', + batch='--cpus-per-task=32 --mem=110g --time=48:00:00', + fastqcver=config['bin'][pfamily]['tool_versions']['FASTQCVER'], + threads: 32 + shell: """ + if [ ! 
-d {output} ]; then mkdir -p {output}; fi + module load {params.fastqcver}; + fastqc {input} -t {threads} -o {output}; + module load python/3.5; + python Scripts/get_read_length.py {output} > {output}/readlength.txt 2> {output}/readlength.err + """ + + rule fastq_screen: + input: + file1=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz") + output: + out1=join(workpath,"FQscreen","{name}.R1.trim_screen.txt"), + out2=join(workpath,"FQscreen","{name}.R1.trim_screen.png"), + out3=join(workpath,"FQscreen2","{name}.R1.trim_screen.txt"), + out4=join(workpath,"FQscreen2","{name}.R1.trim_screen.png"), + params: + rname='pl:fqscreen', + batch='--cpus-per-task=24 --mem=64g --time=10:00:00', + fastq_screen=config['bin'][pfamily]['tool_versions']['FASTQ_SCREEN'], + outdir = join(workpath,"FQscreen"), + outdir2 = join(workpath,"FQscreen2"), + fastq_screen_config=config['bin'][pfamily]['tool_parameters']['FASTQ_SCREEN_CONFIG'], + fastq_screen_config2=config['bin'][pfamily]['tool_parameters']['FASTQ_SCREEN_CONFIG2'], + perlver=config['bin'][pfamily]['tool_versions']['PERLVER'], + bowtie2ver=config['bin'][pfamily]['tool_versions']['BOWTIE2VER'], + threads: 24 + shell: """ + module load {params.bowtie2ver}; + module load {params.perlver}; + {params.fastq_screen} --conf {params.fastq_screen_config} --outdir {params.outdir} --threads {threads} --subset 1000000 --aligner bowtie2 --force {input.file1} + {params.fastq_screen} --conf {params.fastq_screen_config2} --outdir {params.outdir2} --threads {threads} --subset 1000000 --aligner bowtie2 --force {input.file1} + """ + rule multiqc: + input: + expand(join(workpath,"FQscreen","{name}.R1.trim_screen.png"),name=samples), + output: + join(workpath,"Reports","multiqc_report.html") + params: + rname="pl:multiqc", + logsdir=join(workpath,log_dir), + outdir=join(workpath,"Reports"), + multiqcver=config['bin'][pfamily]['tool_versions']['MULTIQCVER'], + qcconfig=config['bin'][pfamily]['tool_parameters']['CONFMULTIQC'] + threads: 1 + shell: """ + 
module load {params.multiqcver} + cd {params.outdir} + multiqc -f -c {params.qcconfig} --interactive -e cutadapt -d ../ + cd {workpath}/slurmfiles + multiqc -f --interactive . + """ + + rule fastaConversion: + input: + join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"), + output: + join(workpath,"fasta","{name}.trimfasta.done") + params: + rname='pl:fastq2fasta', + batch='--cpus-per-task=32 --mem=64g --time=24:00:00', + fastxtoolkitver = config['bin'][pfamily]['tool_versions']['FASTXTOOLKITVER'], + outfile = join(workpath,"fasta/trim/{name}.trim.fasta"), + outdir = join(workpath,"fasta","trim") + + threads: 32 + shell: """ + module load {params.fastxtoolkitver}; + mkdir -p {params.outdir} + zcat {input} | fastq_to_fasta -n -v -o {params.outfile} + sed -i -e 's/\s/|/g' {params.outfile} + touch {output} + """ + + rule mirdeep2_mapper: + input: + configFile=join(workpath,"mirdeep_config.txt"), + # file1=expand(join(workpath,"fasta/trim/{name}.trim.fasta.gz"),name=samples) + fileCatch=expand(join(workpath,"fasta","{name}.trimfasta.done"),name=samples) + # fileCatch=join(workpath,"fasta/trim/{name}.trimfasta.done") + output: + #join(workpath,"fasta/trim/{name}.trim.fasta.gz"), + collapsedFile=join(workpath,"fasta","collapsed_trim.fa"), + arfFile=join(workpath,"trimmed.arf") + params: + rname='pl:mirdeep2_mapper', + #outdir = join(workpath,"fasta/collapsed"), + mirdeepver=config['bin'][pfamily]['tool_versions']['MIRDEEP2VER'], + bowtieindex = config['references'][pfamily]['BOWTIE_REF'], + trimmedFolder = join(workpath,"fasta","trim"), + out_tar = join(workpath,"fasta","trim.tar.gz"), + # file1=expand(join(workpath,"fasta/trim/{name}.trim.fasta"),name=samples) + threads: 8 + shell: """ + module load {params.mirdeepver} + mapper.pl {input.configFile} -d -c -j -l 18 -m -q -p {params.bowtieindex} -s {output.collapsedFile} -t {output.arfFile} -v -o 8 2>> logfiles/mapper.log + tar -czvf {params.out_tar} {params.trimmedFolder} --remove-files + """ + + rule mirdeep2_1p: + input: 
+ collapsedFile = join(workpath,"fasta","collapsed_trim.fa"), + arfFile = join(workpath,"trimmed.arf") + output: + annotAlign=join(workpath,"mirdeep2_1p","miRNAs_expressed_all_samples_1p.txt"), + novelmaturefasta = join(workpath,"fasta/novel_mature.fa"), + novelhairpinfasta = join(workpath,"fasta/novel_hairpin.fa") + params: + rname="pl:mirdeep2_1p", + mirdeepver=config['bin'][pfamily]['tool_versions']['MIRDEEP2VER'], + genomefasta = config['references'][pfamily]['FASTA_REF'], + maturefasta = config['references'][pfamily]['MIRBASE_MATURE'], + hairpinfasta = config['references'][pfamily]['MIRBASE_HAIRPIN'], + organism=config['references'][pfamily]['ORGANISM'], + resultdir=join(workpath,"mirdeep2_1p"), + + threads: 2 + shell: """ + module load {params.mirdeepver} + mkdir -p /lscratch/$SLURM_JOBID + mkdir -p {params.resultdir} + cd {params.resultdir} + sed -e 's/\s.*$//g' {params.genomefasta} > /lscratch/$SLURM_JOBID/genome.fa #remove whitespace, per miRDeep2 + sed -e 's/Homo sapiens/Homo_sapiens/g' {params.maturefasta}| sed -e 's/\s/|/g' > /lscratch/$SLURM_JOBID/mature.fa + sed -e 's/Homo sapiens/Homo_sapiens/g' {params.hairpinfasta}| sed -e 's/\s/|/g' > /lscratch/$SLURM_JOBID/hairpin.fa + + miRDeep2.pl {input.collapsedFile} /lscratch/$SLURM_JOBID/genome.fa {input.arfFile} /lscratch/$SLURM_JOBID/mature.fa none /lscratch/$SLURM_JOBID/hairpin.fa -t {params.organism} -P + cat mirna_results*/novel_mature_*.fa <(sed '/^>.*/s/$/_star/' mirdeep2_1p/mirna_results*/novel_star*.fa) > {output.novelmaturefasta} + cp mirna_results*/novel_pres_*.fa {output.novelhairpinfasta} + mv miRNAs_expressed_all*.csv {output.annotAlign} + """ + rule mirdeep2_2p: + input: + collapsedFile = join(workpath,"fasta","collapsed_trim.fa"), + arfFile = join(workpath,"trimmed.arf"), + novelmaturefasta = join(workpath,"fasta","novel_mature.fa"), + novelhairpinfasta = join(workpath,"fasta","novel_hairpin.fa") + output: + join(workpath,"mirdeep2_2p","novel_miRNAs_expressed_all_samples_2p.txt") + params: + 
rname="pl:mirdeep2_2p", + mirdeepver=config['bin'][pfamily]['tool_versions']['MIRDEEP2VER'], + genomefasta = config['references'][pfamily]['FASTA_REF'], + organism=config['references'][pfamily]['ORGANISM'], + resultdir=join(workpath,"mirdeep2_2p") + threads: 16 + shell: """ + module load {params.mirdeepver} + mkdir -p /lscratch/$SLURM_JOBID + mkdir -p {params.resultdir} + cd {params.resultdir} + sed -e 's/\s.*$//g' {params.genomefasta} > /lscratch/$SLURM_JOBID/genome.fa #remove whitespace, per miRDeep2 + quantifier.pl -r {input.collapsedFile} -m {input.novelmaturefasta} -p {input.novelhairpinfasta} -T 8 + mv miRNAs_expressed_all*.csv {output} + """ + rule capture_counts_2p: + input: + mirbaseResults=join(workpath,"mirdeep2_1p/miRNAs_expressed_all_samples_1p.txt"), + novelResults=join(workpath,"mirdeep2_2p/novel_miRNAs_expressed_all_samples_2p.txt") + output: + mirbaseCts=join(workpath,"mirdeep2_results/annotatedCounts.txt"), + novelCts=join(workpath,"mirdeep2_results/novelCounts.txt") + params: + rname="pl:capture_counts_2p", + configFile=join(workpath,"mirdeep_config.txt"), + resultdir = join(workpath,"mirdeep2_results"), + sampleCounts = len(samples)+4 + threads: 4 + shell: """ + mkdir -p {params.resultdir} + cut -f 1-{params.sampleCounts} {input.mirbaseResults} > {output.mirbaseCts} + cut -f 1-{params.sampleCounts} {input.novelResults} > {output.novelCts} + """ + # + rule mirdeep2_single: + input: + collapsedFile = join(workpath,"fasta/collapsed_trim.fa"), + configFile=join(workpath,"mirdeep_config.txt"), + output: + join(workpath,"mirdeep2_out","miRNAs_expressed_all_samples.txt") + params: + rname="pl:mirdeep2_single", + maturefasta = config['references'][pfamily]['MIRBASE_MATURE'], + hairpinfasta = config['references'][pfamily]['MIRBASE_HAIRPIN'], + mirdeepver=config['bin'][pfamily]['tool_versions']['MIRDEEP2VER'], + genomefasta = config['references'][pfamily]['FASTA_REF'], + organism=config['references'][pfamily]['ORGANISM'], + 
resultdir=join(workpath,"mirdeep2_out") + threads:16 + shell: """ + module load {params.mirdeepver} + mkdir -p /lscratch/$SLURM_JOBID + mkdir -p {params.resultdir} + cd {params.resultdir} + #sed -e 's/\s.*$//g' {params.genomefasta} > /lscratch/$SLURM_JOBID/genome.fa #remove whitespace, per miRDeep2 + sed -e 's/Homo sapiens/Homo_sapiens/g' {params.maturefasta}| sed -e 's/\s/|/g' > /lscratch/$SLURM_JOBID/mature.fa + sed -e 's/Homo sapiens/Homo_sapiens/g' {params.hairpinfasta}| sed -e 's/\s/|/g' > /lscratch/$SLURM_JOBID/hairpin.fa + quantifier.pl -c {input.configFile} -r {input.collapsedFile} -m /lscratch/$SLURM_JOBID/mature.fa -p /lscratch/$SLURM_JOBID/hairpin.fa -T 8 -t {params.organism} -P + mv miRNAs_expressed_all*.csv {output} + """ + + rule capture_counts_single: + input: + mirbaseResults=join(workpath,"mirdeep2_out/miRNAs_expressed_all_samples.txt"), + output: + mirbaseCts=join(workpath,"mirdeep2_results/annotated_counts.txt"), + params: + rname="pl:capture_counts_single", + configFile=join(workpath,"mirdeep_config.txt"), + resultdir = join(workpath,"mirdeep2_results"), + sampleCounts = len(samples)+4, + groupsFile = join(workpath,"groups.tab") + threads: 4 + shell: """ + mkdir -p {params.resultdir} + cut -f 1-{params.sampleCounts} {input.mirbaseResults} > {output.mirbaseCts} + while read FILE ABB; do FILE=${{FILE%%.trim*}};FILE=${{FILE##*trim/}}; sed -i "s/$ABB/$FILE/" {output.mirbaseCts}; done < {params.configFile} + while read FILE GROUP NAME; do sed -i "s/$FILE/$NAME/" {output.mirbaseCts}; done < {params.groupsFile} + sed -i 's/^#//' {output.mirbaseCts} + """ + + # + + +#needed rules: +#QC Side +#trim: cutadapt - done +#kraken: -done +# +#post-trim fastqc- done +#multiqc - done +#Alignment +#fastq to fasta:fastxtoolkit - done + +#collapse (miRDeep2) - done +#align to mature miRs (Bowtie1) -done +#align to precursor miRs (Bowtie1) - done +#align to genome (Bowtie1/miRDeep2 pass1) - done +#align to novel (Bowtie1/miRDeep2 pass2) - done +#expand counts 
(miRDeep2) +#ShortStack integration +#Differential expression analysis (edgeR) diff --git a/Rules/rnaseq.snakefile b/Rules/rnaseq.snakefile index 7745cdb..d2e6018 100644 --- a/Rules/rnaseq.snakefile +++ b/Rules/rnaseq.snakefile @@ -67,7 +67,7 @@ debug = expand(join(workpath,"DEG_{con}_{cpm}_{minsamples}", "DESeq2_DEG_{con}_a #print(debug) -if config['project']['DEG'] == "yes" and config['project']['TRIM'] == "yes": +if config['project']['DEG'] == "yes" and config['project']['from_counts'] == "False": rule all: params: batch="--time=168:00:00", @@ -96,6 +96,30 @@ if config['project']['DEG'] == "yes" and config['project']['TRIM'] == "yes": #Venn Diagram (Overlap of sig genes between limma, DESeq2, edgeR) expand(join(workpath,"DEG_{deg_dir}","limma_edgeR_DESeq2_vennDiagram.png"), deg_dir=contrasts_w_cpm_cutoff_list), +elif config['project']['DEG'] == "yes" and config['project']['from_counts'] == "True": + rule all: + params: + batch="--time=168:00:00", + + input: + #Initialize DEG (Filter raw counts matrix by cpm and min sample thresholds) + expand(join(workpath,"DEG_{deg_dir}", "RawCountFile_RSEM_genes_filtered.txt"), deg_dir=contrasts_w_cpm_cutoff_list), + expand(join(workpath,"DEG_{deg_dir}", "RawCountFile_RSEM_genes.txt"), deg_dir=contrasts_w_cpm_cutoff_list), + + #DEG (limma, DESeq2, edgeR) + expand(join(workpath,"DEG_{deg_dir}", "DESeq2_PCA.png"), deg_dir=contrasts_w_cpm_cutoff_list), + expand(join(workpath,"DEG_{deg_dir}", "edgeR_prcomp.png"), deg_dir=contrasts_w_cpm_cutoff_list), + expand(join(workpath,"DEG_{deg_dir}", "limma_MDS.png"), deg_dir=contrasts_w_cpm_cutoff_list), + expand(join(workpath,"DEG_{con}_{cpm}_{minsamples}", "DESeq2_DEG_{con}_all_genes.txt"), zip, con=contrastsList, cpm=cpm_cutoff_list,minsamples=mincounts), + expand(join(workpath,"DEG_{con}_{cpm}_{minsamples}", "limma_DEG_{con}_all_genes.txt"), zip, con=contrastsList, cpm=cpm_cutoff_list,minsamples=mincounts), + expand(join(workpath,"DEG_{con}_{cpm}_{minsamples}", 
"edgeR_DEG_{con}_all_genes.txt"), zip, con=contrastsList, cpm=cpm_cutoff_list,minsamples=mincounts), + + #PCA (limma, DESeq2, edgeR) + expand(join(workpath,"DEG_{deg_dir}","PcaReport.html"), deg_dir=contrasts_w_cpm_cutoff_list), + + #Venn Diagram (Overlap of sig genes between limma, DESeq2, edgeR) + expand(join(workpath,"DEG_{deg_dir}","limma_edgeR_DESeq2_vennDiagram.png"), deg_dir=contrasts_w_cpm_cutoff_list), + rule init_deg: input: diff --git a/Rules/rnaseqforfusions.rl b/Rules/rnaseqforfusions.rl index f523a43..e685ea7 100644 --- a/Rules/rnaseqforfusions.rl +++ b/Rules/rnaseqforfusions.rl @@ -69,7 +69,7 @@ rule stats: input: file1= "{name}.star_rg_added.sorted.dmark.bam" output: outstar1="{name}.RnaSeqMetrics.txt", outstar2="{name}.flagstat.concord.txt", outstar3="{name}.InsertSizeMetrics.txt", outstar4="{name}.InsertSizeHisto.pdf" params: rname='pl:stats',batch='--mem=24g --time=10:00:00 --gres=lscratch:800',refflat=config['references'][pfamily]['REFFLAT'],rrnalist=config['references'][pfamily]['RRNALIST'],picardstrand=config['bin'][pfamily]['PICARDSTRAND'] - shell: "module load R/3.5;module load picard/2.17.11; java -Xmx10g -jar $PICARDJARPATH/picard.jar CollectRnaSeqMetrics REF_FLAT={params.refflat} INPUT={input.file1} OUTPUT={output.outstar1} RIBOSOMAL_INTERVALS={params.rrnalist} STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND TMP_DIR=/lscratch/$SLURM_JOBID VALIDATION_STRINGENCY=SILENT; java -Xmx10g -jar $PICARDJARPATH/picard.jar CollectInsertSizeMetrics INPUT={input.file1} OUTPUT={output.outstar3} HISTOGRAM_FILE={output.outstar4} MINIMUM_PCT=0.5 TMP_DIR=/lscratch/$SLURM_JOBID ;module load samtools; samtools flagstat {input.file1} > {output.outstar2}; module load python; python Scripts/bam_count_concord_stats.py {input.file1} >> {output.outstar2} " + shell: "module load R/3.5;module load picard/2.17.11; java -Xmx10g -jar $PICARDJARPATH/picard.jar CollectRnaSeqMetrics REF_FLAT={params.refflat} INPUT={input.file1} OUTPUT={output.outstar1} 
RIBOSOMAL_INTERVALS={params.rrnalist} STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND TMP_DIR=/lscratch/$SLURM_JOBID VALIDATION_STRINGENCY=SILENT; java -Xmx10g -jar $PICARDJARPATH/picard.jar CollectInsertSizeMetrics INPUT={input.file1} OUTPUT={output.outstar3} HISTOGRAM_FILE={output.outstar4} MINIMUM_PCT=0.5 TMP_DIR=/lscratch/$SLURM_JOBID ;module load samtools; samtools flagstat {input.file1} > {output.outstar2}; module load python/2.7; python Scripts/bam_count_concord_stats.py {input.file1} >> {output.outstar2} " rule prernaseqc: input: expand("{name}.star_rg_added.sorted.dmark.bam", name=samples) diff --git a/Rules/scrnaQC.snakefile b/Rules/scrnaQC.snakefile new file mode 100644 index 0000000..e59bf6c --- /dev/null +++ b/Rules/scrnaQC.snakefile @@ -0,0 +1,103 @@ +from snakemake.utils import R +from os.path import join +from os import listdir +import os +import re + +configfile: "run.json" +samples=config['project']['groups']['rsamps'] +workpath=config['project']['workpath'] + +# workpath="/data/abdelmaksoudaa/klion" +# samples = [f for f in os.listdir('.') if re.search('4H', f)] + +specie = config['project']['SPECIES'] #human +resolution = config['project']['CLUSTRESOLUTION']#"0.8" +clustAlg = config['project']['CLUSTALG'] #1-3 +annotDB = config['project']['ANNOTDB']#"HPCA" #mouse +nAnchors = "2000" +groups = "YES" #YES/NO + + +contList = [] +contrasts = open("contrasts.tab") +for line in contrasts: + line = line.strip("\n") +# print(re.sub('\t', '-', line)) + contList.append(re.sub('\t', '-', line)) + + +intType = ["seurat_batch_corrected","merged"] + +rule all: + params: + batch='--time=168:00:00', + input: + expand(join(workpath,"QC","{name}.RData"),name=samples), + expand(join(workpath,"filtered","{name}.rds"),name=samples), + expand(join(workpath,"flags","{name}.txt"),name=samples), + join(workpath,"QC","samples_QC.html"), + expand(join(workpath,"integration","seurat_batch_corrected","{myGroups}","{myGroups}.rds"),myGroups=contList), + 
expand(join(workpath,"integration","merged","{myGroups}","{myGroups}.rds"),myGroups=contList), + +rule qc_scrna: + input: + join(workpath,"{name}") + output: + rds=join(workpath,"filtered","{name}.rds"), + rdata=join(workpath,"QC","{name}.RData"), + qc = join(workpath,"flags","{name}.txt"), + params: + rname='pl:qc_scrna', + specie = specie, + resolution = resolution, + clustAlg = clustAlg, + annotDB = annotDB, + outDir= join(workpath,"QC") + shell: """ + +module load R/3.6.0; +Rscript Scripts/scrnaQC.R {input} {output.rds} {output.rdata} {params.specie} {params.resolution} {params.clustAlg} {params.annotDB}; +touch {output.qc} + """ + +rule qcReport_scrna: + input: + files = expand(join(workpath,"QC","{name}.RData"),name=samples), + path=join (workpath, "QC") + output: + join(workpath,"QC","samples_QC.html") + params: + rname='pl:qcReport_scrna', + shell: """ +module load R/3.6.0 +Rscript Scripts/scrnaQC_Reports.R {input.path} + """ + +rule integratedBatch: + input: + rds=expand(join(workpath,"filtered","{name}.rds"),name=samples), + flag=expand(join(workpath,"flags","{name}.txt"),name=samples) + output: + rdsBatch=join(workpath,"integration","seurat_batch_corrected","{myGroups}","{myGroups}.rds"), + mergeRDS=join(workpath,"integration","merged","{myGroups}","{myGroups}.rds"), + params: + rname='pl:integratedBatch', + batch='--cpus-per-task=8 --mem=48g --time=24:00:00', + specie = specie, + resolution = resolution, + clustAlg = clustAlg, + annotDB = annotDB, +# outDirSeurat= rdsBatch#join(workpath,"integration","seurat_batch_corrected_RDS"), +# outDirMerged= mergeRDS#join(workpath,"integration","mergedRDS"), + nAnchors = nAnchors, + groups = groups, + dir = join(workpath,"filtered"), + contrasts = "{myGroups}" + shell: """ +module load R/3.6.0; +Rscript Scripts/integrateBatches.R {params.dir} {output.rdsBatch} {output.mergeRDS} {params.specie} {params.resolution} {params.clustAlg} {params.annotDB} {params.nAnchors} {params.groups} {params.contrasts}; + """ + + + 
diff --git a/Rules/strelka.rl b/Rules/strelka.rl index 67e8827..749856c 100755 --- a/Rules/strelka.rl +++ b/Rules/strelka.rl @@ -13,4 +13,4 @@ rule strelka: names=temp(config['project']['workpath']+"/strelka_out/{x}.samples"), params: normalsample=lambda wildcards: config['project']['pairs'][wildcards.x][0],tumorsample=lambda wildcards: config['project']['pairs'][wildcards.x][1],dir=config['project']['workpath'],gatk=config['bin'][pfamily]['GATK'],genome=config['references'][pfamily]['GENOME'],targets="exome_targets.bed",snpeff=config['bin'][pfamily]['SNPEFF'],snpeffgenome=config['references'][pfamily]['SNPEFF_GENOME'],effconfig=config['references'][pfamily]['SNPEFF_CONFIG'],rname="pl:strelka_calls" threads: 4 - shell: "module load strelka/2.9.0; module load python; configureStrelkaSomaticWorkflow.py --ref={params.genome} --tumor={params.dir}/{input.tumor} --normal={params.dir}/{input.normal} --runDir={output.outdir} --exome; cd {output.outdir}; ./runWorkflow.py -m local -j {threads}; {params.gatk} -T CombineVariants -R {params.genome} --variant results/variants/somatic.snvs.vcf.gz --variant results/variants/somatic.indels.vcf.gz --assumeIdenticalSamples --filteredrecordsmergetype KEEP_UNCONDITIONAL -o {output.vcf}; {params.gatk} -T SelectVariants -R {params.genome} --variant {output.vcf} --excludeFiltered -o {output.vcffiltered}; cd {params.dir}/strelka_out; module load samtools/1.6; echo $'NORMAL {params.normalsample}\nTUMOR {params.tumorsample}' > {output.names}; bcftools reheader -o {output.final} -s {output.names} {output.vcffiltered}; module load snpEff/4.3t; java -Xmx12g -jar $SNPEFF_JAR -v {params.snpeffgenome} -c {params.effconfig} -cancer -canon -csvStats {output.csvstats} -stats {output.htmlstats} -cancerSamples {params.dir}/pairs {output.final} > {output.out}" \ No newline at end of file + shell: "module load strelka/2.9.0; module load python/2.7; configureStrelkaSomaticWorkflow.py --ref={params.genome} --tumor={params.dir}/{input.tumor} 
--normal={params.dir}/{input.normal} --runDir={output.outdir} --exome; cd {output.outdir}; ./runWorkflow.py -m local -j {threads}; {params.gatk} -T CombineVariants -R {params.genome} --variant results/variants/somatic.snvs.vcf.gz --variant results/variants/somatic.indels.vcf.gz --assumeIdenticalSamples --filteredrecordsmergetype KEEP_UNCONDITIONAL -o {output.vcf}; {params.gatk} -T SelectVariants -R {params.genome} --variant {output.vcf} --excludeFiltered -o {output.vcffiltered}; cd {params.dir}/strelka_out; module load samtools/1.6; echo $'NORMAL {params.normalsample}\nTUMOR {params.tumorsample}' > {output.names}; bcftools reheader -o {output.final} -s {output.names} {output.vcffiltered}; module load snpEff/4.3t; java -Xmx12g -jar $SNPEFF_JAR -v {params.snpeffgenome} -c {params.effconfig} -cancer -canon -csvStats {output.csvstats} -stats {output.htmlstats} -cancerSamples {params.dir}/pairs {output.final} > {output.out}" \ No newline at end of file diff --git a/Rules/wgs_strelka.rl b/Rules/wgs_strelka.rl index f6cc947..10ccb6a 100644 --- a/Rules/wgs_strelka.rl +++ b/Rules/wgs_strelka.rl @@ -13,4 +13,4 @@ rule wgs_strelka: names=temp(config['project']['workpath']+"/strelka_out/{x}.samples"), params: normalsample=lambda wildcards: config['project']['pairs'][wildcards.x][0],tumorsample=lambda wildcards: config['project']['pairs'][wildcards.x][1],dir=config['project']['workpath'],gatk=config['bin'][pfamily]['GATK'],genome=config['references'][pfamily]['GENOME'],snpeff=config['bin'][pfamily]['SNPEFF'],snpeffgenome=config['references'][pfamily]['SNPEFF_GENOME'],effconfig=config['references'][pfamily]['SNPEFF_CONFIG'],rname="pl:strelka_calls" threads: 4 - shell: "module load strelka/2.9.0; module load python; configureStrelkaSomaticWorkflow.py --ref={params.genome} --tumor={params.dir}/{input.tumor} --normal={params.dir}/{input.normal} --runDir={output.outdir}; cd {output.outdir}; ./runWorkflow.py -m local -j {threads}; {params.gatk} -T CombineVariants -R {params.genome} 
--variant results/variants/somatic.snvs.vcf.gz --variant results/variants/somatic.indels.vcf.gz --assumeIdenticalSamples --filteredrecordsmergetype KEEP_UNCONDITIONAL -o {output.vcf}; {params.gatk} -T SelectVariants -R {params.genome} --variant {output.vcf} --excludeFiltered -o {output.vcffiltered}; cd {params.dir}/strelka_out; module load samtools/1.6; echo $'NORMAL {params.normalsample}\nTUMOR {params.tumorsample}' > {output.names}; bcftools reheader -o {output.final} -s {output.names} {output.vcffiltered}; module load snpEff/4.3t; java -Xmx12g -jar $SNPEFF_JAR -v {params.snpeffgenome} -c {params.effconfig} -cancer -canon -csvStats {output.csvstats} -stats {output.htmlstats} -cancerSamples {params.dir}/pairs {output.final} > {output.out}" \ No newline at end of file + shell: "module load strelka/2.9.0; module load python/2.7; configureStrelkaSomaticWorkflow.py --ref={params.genome} --tumor={params.dir}/{input.tumor} --normal={params.dir}/{input.normal} --runDir={output.outdir}; cd {output.outdir}; ./runWorkflow.py -m local -j {threads}; {params.gatk} -T CombineVariants -R {params.genome} --variant results/variants/somatic.snvs.vcf.gz --variant results/variants/somatic.indels.vcf.gz --assumeIdenticalSamples --filteredrecordsmergetype KEEP_UNCONDITIONAL -o {output.vcf}; {params.gatk} -T SelectVariants -R {params.genome} --variant {output.vcf} --excludeFiltered -o {output.vcffiltered}; cd {params.dir}/strelka_out; module load samtools/1.6; echo $'NORMAL {params.normalsample}\nTUMOR {params.tumorsample}' > {output.names}; bcftools reheader -o {output.final} -s {output.names} {output.vcffiltered}; module load snpEff/4.3t; java -Xmx12g -jar $SNPEFF_JAR -v {params.snpeffgenome} -c {params.effconfig} -cancer -canon -csvStats {output.csvstats} -stats {output.htmlstats} -cancerSamples {params.dir}/pairs {output.final} > {output.out}" \ No newline at end of file diff --git a/cluster.json b/cluster.json index 636b99d..03335e6 100644 --- a/cluster.json +++ b/cluster.json @@ 
-453,6 +453,11 @@ "HOMER_motif": { "threads": "48" }, + "integratedBatch":{ + "threads": "8", + "mem": "64g", + "time": "1-00:00:00" + }, "kraken_pe": { "cpus-per-task": "24", "gres": "lscratch:256", @@ -497,11 +502,26 @@ "threads": "1", "time": "2-00:00:00" }, + "mirdeep2_1p": { + "mem": "16g", + "threads": "4", + "time": "1-00:00:00" + }, + "mirdeep2_2p": { + "mem": "16g", + "threads": "16", + "time": "1-00:00:00" + }, "mirdeep2_mapper": { "mem": "16g", "threads": "1", "time": "4-00:00:00" }, + "mirdeep2_single":{ + "mem": "16g", + "threads": "16", + "time": "1-00:00:00" + }, "mirseq_gencode_classification": { "mem": "16g", "threads": "1", @@ -632,6 +652,16 @@ "preseq": { "mem": "24g" }, + "qc_scrna":{ + "threads": "1", + "mem": "48g", + "time": "8:00:00" + }, + "qcReport_scrna":{ + "threads": "1", + "mem": "48g", + "time": "1:00:00" + }, "qualibam": { "threads": "8", "mem": "12g", @@ -736,6 +766,11 @@ "mem": "100g", "threads": "4" }, + "sequenza": { + "mem": "32g", + "threads": "24", + "time": "4:00:00" + }, "sleuth": { "mem": "110g", "threads": "4" @@ -873,3 +908,4 @@ "time": "2-00:00:00" } } + diff --git a/gui/frame.py b/gui/frame.py index d967a08..dc1e50b 100755 --- a/gui/frame.py +++ b/gui/frame.py @@ -217,6 +217,13 @@ def set_data_directory( self ): outtxt="\n" print(outtxt) + # Check existence of counts matrix + try: + countdir = listdir(os.path.join(self.datapath.get(),"counts")) + counts_exist = [f for f in countdir if f.endswith('.txt')] + except FileNotFoundError: + counts_exist = [] + self.data_count['text'] = str( len(self.datafiles) ) outtxt="\n" print( "Found", self.data_count['text'], filetype, "files!" ) @@ -234,6 +241,11 @@ def set_data_directory( self ): outtxt+="Paired - end files:\n" for f,g in zip(sorted(fR1),sorted(fR2)): outtxt+="%s\t%s\n"%(f,g) + elif counts_exist and self.pipeline_name == 'RNAseq': + print ("Raw counts matrix detected!") + outtxt_short="File found ... counts matrix".format() + self.data_count['text'] += '... 
counts matrix' + outtxt+="Counts file: {}\n".format(', '.join([f for f in listdir(os.path.join(self.datapath.get(),"counts")) if f.endswith('.txt')])) else: outtxt_short="Some files many be missing or misnamed!!\n" self.data_count['text'] += " ... FILES MAY BE MISSING!!!" @@ -638,7 +650,18 @@ def make_symlinks(self): print("Found 'peakcall.tab': Symlinking file!") cmd="cp {0}peakcall.tab {1}".format(data, self.workpath.get()) p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True) - + elif pl == 'RNAseq': + print("\n\nChecking for 'counts/' directory...") + if os.path.isdir(os.path.join(data,"counts")): + print("Found 'counts/' directory: Symlinking counts matrix!") + countspath = os.path.join(data,"counts") + cmd="mkdir {1}/DEG_ALL/ {1}/STAR_files/ && touch {1}/STAR_files/sampletable.txt && ln -s {0}/*[._]txt {1}/DEG_ALL/RawCountFile_RSEM_genes.txt".format(countspath,self.workpath.get()) + #print(cmd,"\n","Pipeline_name", pl,"\n" , "Data", data) + p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True) + if os.path.isfile(os.path.join(data,"counts","groups.tab")) and os.path.isfile(os.path.join(data,"counts","contrasts.tab")): + print("Found 'groups.tab' and 'contrasts.tab': Symlinking sample metadata!") + cmd="ln -s {0}/groups.tab {1}/; ln -s {0}/contrasts.tab {1}/;".format(countspath,self.workpath.get()) + p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True) else: cmd="for f in `ls {0}*[._]{1}`;do ln -s $f {2};done".format(data,FT, self.workpath.get()) p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True) @@ -664,6 +687,25 @@ def make_symlinks(self): print("Found 'peakcall.tab': Symlinking file!") cmd="cp {0}peakcall.tab {1}".format(data, self.workpath.get()) p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True) + elif pl == 'scRNAseq': + print (os.path.join(data,"h5")) + if os.path.isdir(os.path.join(data,"h5")): + print 
("Found 'h5/' directory: Symlinking h5 files") + h5Path = os.path.join(data,"h5") + cmd = "for f in `ls {0}/*[._]h5`;do ln -s $f {1};done".format(h5Path,self.workpath.get()) + p = Popen(cmd, shell=True, stdin = PIPE, stdout=PIPE, stderr = STDOUT, close_fds=True) + elif pl == 'RNAseq': + print("\n\nChecking for 'counts/' directory...") + if os.path.isdir(os.path.join(data,"counts")): + print("Found 'counts/' directory: Symlinking counts matrix!") + countspath = os.path.join(data,"counts") + cmd="mkdir {1}/DEG_ALL/ {1}/STAR_files/ && touch {1}/STAR_files/sampletable.txt && ln -s {0}/*[._]txt {1}/DEG_ALL/RawCountFile_RSEM_genes.txt".format(countspath,self.workpath.get()) + #print(cmd,"\n","Pipeline_name", pl,"\n" , "Data", data) + p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True) + if os.path.isfile(os.path.join(data,"counts","groups.tab")) and os.path.isfile(os.path.join(data,"counts","contrasts.tab")): + print("Found 'groups.tab' and 'contrasts.tab': Symlinking sample metadata!") + cmd="ln -s {0}/groups.tab {1}/; ln -s {0}/contrasts.tab {1}/;".format(countspath,self.workpath.get()) + p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True) except Exception as e: showerror("Error",str(e)) diff --git a/gui/mirseq.py b/gui/mirseq.py index 70941f1..6e5e72e 100755 --- a/gui/mirseq.py +++ b/gui/mirseq.py @@ -40,18 +40,35 @@ def __init__(self, pipepanel, pipeline_name, *args, **kwargs) : label = Label(eframe,text="Pipeline")#,fg=textLightColor,bg=baseColor) label.grid(row=3,column=0,sticky=W,padx=10,pady=5) - Pipelines=["CAPmirseq-plus"] + PipelineLabels=["miRSeq_v2","CAPmirseq-plus"] + Pipelines=["miRSeq_v2","CAPmirseq-plus"] + + self.label2pipeline = { k:v for k,v in zip(PipelineLabels, Pipelines)} + PipelineLabel = self.PipelineLabel = StringVar() Pipeline = self.Pipeline = StringVar() - Pipeline.set(Pipelines[0]) + PipelineLabel.set(PipelineLabels[0]) - om = OptionMenu(eframe, Pipeline, *Pipelines, 
command=self.option_controller) + om = OptionMenu(eframe, PipelineLabel, *PipelineLabels, command=self.option_controller) om.config()#bg = widgetBgColor,fg=widgetFgColor) om["menu"].config()#bg = widgetBgColor,fg=widgetFgColor) #om.pack(side=LEFT,padx=20,pady=5) om.grid(row=3,column=1,sticky=W,padx=10,pady=5) + + # self.mirseqOpts = mirseqOpts = LabelFrame(eframe,text="miRSeq Settings") + self.novelMirsFrame = novelMirsFrame = Frame(eframe) + self.novelMirs = novelMirs = StringVar() + novelMirLabel = Label(novelMirsFrame, text= "Identify Novel miRNAs: ") + novelMirsDropdown = ["YES","NO"] + novelMirs.set(novelMirsDropdown[1]) + novelMirs_om = OptionMenu(novelMirsFrame,novelMirs,*novelMirsDropdown) + novelMirLabel.grid(row=5,column=1,sticky=W,padx=10,pady=5) + novelMirs_om.grid(row=5,column=2,sticky=W,padx=10,pady=0) + # novelMirsFrame.grid(row=5,column=0, columnspan=4, sticky=W, padx=20, pady=10 ) + + self.add_info( eframe ) - + self.option_controller() def add_info( self, parent ) : if not self.info : @@ -70,7 +87,7 @@ def add_info( self, parent ) : self.contrasts_button.grid( row=5, column=6, padx=10, pady=5 ) - self.info.grid(row=10,column=0, columnspan=6, sticky=W, padx=20, pady=10 ) + self.info.grid(row=11,column=0, columnspan=6, sticky=W, padx=20, pady=10 ) def popup_groups( self ) : @@ -115,7 +132,19 @@ def loadfunc() : info.grid(row=7,column=0, columnspan=6, sticky=W, padx=20, pady=10 ) top.focus_force() - + def option_controller( self, *args, **kwargs ) : + + PipelineFrame.option_controller( self ) + + self.Pipeline.set( self.label2pipeline[self.PipelineLabel.get()] ) + print( self.Pipeline.get() ) + + if self.Pipeline.get() == 'miRSeq_v2' : + self.novelMirsFrame.grid(row=5,column=0, columnspan=4, sticky=W, padx=20, pady=10 ) + elif self.Pipeline.get() == "CAPmirseq-plus": + self.novelMirsFrame.grid_forget() + + def makejson(self, *args): #print(args[0]) caller=args[0] @@ -302,6 +331,7 @@ def makejson(self, *args): "TRIM": "yes", "groups": groups, 
"contrasts": contrasts, + "novelMirs": self.novelMirs.get(), } } diff --git a/gui/rnaseq.py b/gui/rnaseq.py index 19b1fc1..f197dc0 100755 --- a/gui/rnaseq.py +++ b/gui/rnaseq.py @@ -37,7 +37,7 @@ def __init__(self, pipepanel, pipeline_name, *args, **kwargs) : self.info = None eframe = self.eframe = LabelFrame(self,text="Options") - #,fg=textLightColor,bg=baseColor) + #fg=textLightColor,bg=baseColor) eframe.grid( row=5, column=1, sticky=W, columnspan=7, padx=10, pady=5 ) label = Label(eframe,text="Pipeline")#,fg=textLightColor,bg=baseColor) @@ -83,7 +83,7 @@ def __init__(self, pipepanel, pipeline_name, *args, **kwargs) : self.om4 = OptionMenu(eframe, rDeg, *rDegs, command=self.option_controller) self.om4.grid(row=6,column=1,sticky=W,padx=10,pady=5) - # Show Groups Only (For initialqcrnaseq) + # Show Groups Only (For initialqcrnaseq) self.om_groups = LabelFrame(eframe, text="Sample Information") self.groups_button = Button(self.om_groups, text="Set Groups", command = self.popup_groups ) self.groups_button.grid(row=5, column=5, padx=10, pady=5) @@ -382,6 +382,12 @@ def makejson(self, *args): SD=AD['references']['rnaseq']['STARDIR'] # tkinter.messagebox.showinfo("initLock","SD={0}".format(SD)) gi = self.global_info + + # Boolean to indicate whether Pipliner should start from raw counts matrix + from_counts = False + if os.path.isdir(os.path.join(self.datapath.get(),"counts")): + from_counts = True + PD={ 'project': { 'pfamily': gi.pfamily.get(), @@ -397,7 +403,8 @@ def makejson(self, *args): 'pipeline': self.Pipeline.get(), 'version':"4.0", 'annotation': gi.annotation.get(), - 'datapath': self.datapath.get(), + 'datapath': self.datapath.get(), + 'from_counts': "{}".format(from_counts), 'targetspath': self.targetspath.get(), 'nends': self.nends, 'filetype': filetype , diff --git a/gui/scrnaseq.py b/gui/scrnaseq.py index 340a566..d18615d 100644 --- a/gui/scrnaseq.py +++ b/gui/scrnaseq.py @@ -42,8 +42,8 @@ def __init__(self, pipepanel, pipeline_name, *args, **kwargs) : 
label = Label(eframe,text="Pipeline")#,fg=textLightColor,bg=baseColor) label.grid(row=3,column=0,sticky=W,padx=10,pady=5) - PipelineLabels=["CellRanger","Initial/QC","Clustering","Multi-Sample Clustering" ] - Pipelines=["cellranger","scrnaseqinit","scrnaseqcluster", "scrnaseqmulticluster"] + PipelineLabels=["Initial QC","Differential Expression"] + Pipelines=["scrnaQC","scrnaDE"] self.label2pipeline = { k:v for k,v in zip(PipelineLabels, Pipelines)} @@ -56,124 +56,104 @@ def __init__(self, pipepanel, pipeline_name, *args, **kwargs) : om["menu"].config()#bg = widgetBgColor,fg=widgetFgColor) #om.pack(side=LEFT,padx=20,pady=5) om.grid(row=3,column=1,sticky=W,padx=10,pady=5) - - self.crOpts = crOpts = LabelFrame( eframe, - text="CellRanger Settings" ) - self.scrCRID = scrCRID = StringVar() - scrCRID.set("SPECIFY_PREFIX_HERE") - self.scrExpected = scrExpected = StringVar() - scrExpected.set("3000") - - scrcridL = Label(crOpts, text="CellRanger Sample ID: ") - scrcridE = Entry(crOpts, bd =2, width=25, textvariable=scrCRID) - scrcridL.grid(row=9,column=1,sticky=W,padx=10,pady=5) - scrcridE.grid(row=9,column=2,sticky=W,padx=0,pady=5) - - screxpectedL = Label(crOpts, text="Expected number of cells: ") - screxpectedE = Entry(crOpts, bd =2, width=8, textvariable=scrExpected) - screxpectedL.grid(row=10,column=1,sticky=W,padx=10,pady=5) - screxpectedE.grid(row=10,column=2,sticky=W,padx=0,pady=5) - - self.clusterOpts = clusterOpts = LabelFrame( eframe, - text="Clustering and tSNE Options" ) - - self.scrPCs = scrPCs = StringVar() - scrPCs.set("12") - self.scrRes = scrRes = StringVar() - scrRes.set("0.6") - - #scrPCs.trace('w', lambda a,b,c,x="scrPCs": makejson(x)) - #Filter out genes < [5] read counts in < [2] samples - scrpcsL = Label(clusterOpts, text="Include principal components 1 through ") - scrpcsE = Entry(clusterOpts, bd =2, width=3, textvariable=scrPCs) - scrresL = Label(clusterOpts, text="with clustering resolution: ") - scrresE = Entry(clusterOpts, bd =2, width=3, 
textvariable=scrRes) + +######## QUALITY CONTROL/FILTERING/CLUSTERING ############### + self.qcOptions = qcOptions = LabelFrame( eframe, text = "QC, Filtering, and Initial Clustering") + + #algorithm selection + self.clustAlg = clustAlg = StringVar() + clustAlgLabel = Label(qcOptions, text="Clustering Algorithm: ") + clustAlgDropdown = ["SLM (Smart Local Moving)", "Louvain (Original)","Louvain (with Multilevel Refinement)"] + clustAlg.set(clustAlgDropdown[0]) + clustAlgOptMenu = OptionMenu(qcOptions, clustAlg, *clustAlgDropdown) + clustAlgLabel.grid(row=8,column=1,sticky=W,padx=10,pady=5) + clustAlgOptMenu.grid(row=8,column=2,sticky=W,padx=0,pady=5) - scrpcsL.grid(row=9,column=1,sticky=W,padx=10,pady=5) - scrpcsE.grid(row=9,column=2,sticky=W,padx=0,pady=5) - scrresL.grid(row=9,column=3,sticky=W,padx=5,pady=5) - scrresE.grid(row=9,column=4,sticky=W,padx=0,pady=5) - #scrRes.trace('w', lambda a,b,c,x="scrPCs": makejson(x)) + #clustering resolutions + self.resolution = resolution = StringVar() + resolution.set("0.4,0.6,0.8,1.0,1.2") + resolutionLabel = Label(qcOptions,text="Clustering Resolution(s): \nSeparate with commas") + resolutionEntry = Entry(qcOptions,bd=2,width=25,textvariable=resolution) + resolutionLabel.grid(row=9,column=1,sticky=W,padx=10,pady=5) + resolutionEntry.grid(row=9,column=2,sticky=W,padx=0,pady=5) - clusterOpts.grid( row=8, column=0, columnspan=4, sticky=W, padx=20, pady=10 ) + #annotation db: human: HPCA/BP Encode; mouse: immgen/mouseRNAseq + self.annotDB = annotDB = StringVar() + annotLabel = Label(qcOptions,text="Annotation database: ") + annotHuman = ["HPCA","BP Encode"] + annotMouse = ["immgen","mouseRNAseq"] + annotDropdown = [] + genome = self.global_info.annotation.get() + if (genome == "GRCh38"): + annotDropdown = annotHuman.copy() + elif (genome == "mm10"): + annotDropdown = annotMouse.copy() + else: + annotDropdown.append("No genome selected") + annotDB.set(annotDropdown[0]) + annotDB_om = OptionMenu(qcOptions,annotDB,*annotDropdown) + 
annotLabel.grid(row=10,column=1,sticky=W,padx =10, pady =5) + annotDB_om.grid(row=10,column=2,sticky=W,padx=0,pady=5) - - self.multiclusterOpts = multiclusterOpts = LabelFrame( eframe, - text = "Multi-Sample Clustering and tSNE Options") - - scrccsL = Label(multiclusterOpts, text="Include canonical components 1 through ") - scrccsE = Entry(multiclusterOpts, bd =2, width=3, textvariable=scrPCs) - scrmcresL = Label(multiclusterOpts, text="with clustering resolution: ") - scrmcresE = Entry(multiclusterOpts, bd =2, width=3, textvariable=scrRes) - - scrccsL.grid(row=9,column=1,sticky=W,padx=10,pady=5) - scrccsE.grid(row=9,column=2,sticky=W,padx=0,pady=5) - scrmcresL.grid(row=9,column=3,sticky=W,padx=5,pady=5) - scrmcresE.grid(row=9,column=4,sticky=W,padx=0,pady=5) - - - self.qcOpts = qcOpts = LabelFrame( eframe, - text="Initial Settings" ) - countL = Label( qcOpts, text="Counts/Matrix Dir:" ) - countL.grid(row=9, column=1, sticky=W, padx=10, pady=5 ) - countpath=StringVar() - self.countpath = countpath - count_entry = Entry(qcOpts, - bd =2, - width = 50, - #bg = entryBgColor, - #fg = entryFgColor, - textvariable = countpath, state='normal' - ) - count_entry.grid( row=9, column=2, columnspan=3 ) - self.count_button = count_button = Button( qcOpts, - text="Open Directory", - command=self.set_count_directory ) - count_button.grid( row=9, column=5 ) - - self.mattype = mattype = StringVar() - mattypeL = Label(qcOpts, text="Count matrix format: ") - scrMatTypeDropdown = ["cellranger", "cellranger_raw", "zumi", "biorad"] - mattype.set(scrMatTypeDropdown[0]) - mattype_om = OptionMenu(qcOpts, mattype, *scrMatTypeDropdown) - mattypeL.grid(row=10,column=1,sticky=W,padx=10,pady=5) - mattype_om.grid(row=10,column=2,sticky=W,padx=0,pady=5) + #set groups, mostly for relabeling purposes + self.add_info( qcOptions ) #Position 11 + # self.option_controller() + + # groups_buttonL = Label(qcOptions, text="Sample Information: ") + # groups_button = Button(qcOptions, + # text="Set Groups", + 
# command = self.popup_groups ) + # groups_buttonL.grid(row=12,column=1,sticky=W,padx=10,pady=5) + # groups_button.grid(row=12,column=2,sticky=W,padx=0,pady=5) + # + # +######### DIFFERENTIAL EXPRESSION ######### + self.deOptions = deOptions = LabelFrame( eframe, text = "Differential Gene Expression") - self.docycleregress = docycleregress = StringVar() - docycleregressL = Label(qcOpts, text="Do cell cycle regression? ") - scrCycleDropdown = ["TRUE", "FALSE"] - docycleregress.set(scrCycleDropdown[0]) - cycle_om = OptionMenu(qcOpts, docycleregress, *scrCycleDropdown) - docycleregressL.grid(row=11,column=1,sticky=W,padx=10,pady=5) - cycle_om.grid(row=11,column=2,sticky=W,padx=0,pady=5) - - usecycleregressL_c = Label(clusterOpts, text="Use cell cycle regressed data? ") - docycleregress.set(scrCycleDropdown[0]) - cycle_om_c = OptionMenu(clusterOpts, docycleregress, *scrCycleDropdown) - usecycleregressL_c.grid(row=10,column=1,sticky=W,padx=10,pady=5) - cycle_om_c.grid(row=10,column=2,sticky=W,padx=0,pady=5) - - usecycleregressL_mc = Label(multiclusterOpts, text="Use cell cycle regressed data? 
") - docycleregress.set(scrCycleDropdown[0]) - cycle_om_mc = OptionMenu(multiclusterOpts, docycleregress, *scrCycleDropdown) - usecycleregressL_mc.grid(row=10,column=1,sticky=W,padx=10,pady=5) - cycle_om_mc.grid(row=10,column=2,sticky=W,padx=0,pady=5) - - - groups_buttonL = Label(qcOpts, text="SAMPLE INFORMATION: ") - groups_button = Button(qcOpts, - text="Set Groups", - command = self.popup_groups ) - groups_buttonL.grid(row=12,column=1,sticky=W,padx=10,pady=5) - groups_button.grid(row=12,column=2,sticky=W,padx=0,pady=5) - ##################### + #option for cluster resolution for DE + self.resolutionDE = resolutionDE = StringVar() + resolutionDE.set("0.8") + resolutionDELabel = Label(deOptions,text="Clustering Resolution: \nChoose a previous resolution or \nselect a new resolution to run") + resolutionDEEntry = Entry(deOptions,bd=2,width=25,textvariable=resolutionDE) + resolutionDELabel.grid(row=10,column=1,sticky=W,padx=10,pady=5) + resolutionDEEntry.grid(row=10,column=2,sticky=W,padx=0,pady=5) + #option for merged/integrated object + # self.integrateOption = integrateOption = StringVar() + # integrateLabel = Label(deOptions, text="Merged or Integrated (batch corrected): ") + # integrateDropdown = ["Merged","Integrated"] + # integrateOption.set(integrateDropdown[0]) + # integrateOptMenu = OptionMenu(deOptions, integrateOption, *integrateDropdown) + # integrateLabel.grid(row=9,column=1,sticky=W,padx=10,pady=5) + # integrateOptMenu.grid(row=9,column=2,sticky=W,padx=0,pady=5) + + self.add_info( deOptions ) #Position 11 self.option_controller() +### SUBFUNCTION FOR CREATING GROUPS.TAB AND CLUSTERS.TAB WINDOW ###### + def add_info( self, parent ) : + if not self.info : + self.info = LabelFrame(parent, text="Sample Information") + self.groups_button = Button(self.info, + text="Set Groups", + command = self.popup_groups ) + self.contrasts_button = Button(self.info, + text="Set Contrasts", + command = self.popup_contrasts ) + + #self.pairs_load_button.pack( side=BOTTOM, 
padx=5, pady=5 ) + #self.pairs_save_button.pack( side=BOTTOM, padx=5, pady=5 ) + + self.groups_button.grid( row=5, column=5, padx=10, pady=5 ) + self.contrasts_button.grid( row=5, column=6, padx=10, pady=5 ) + + self.info.grid(row=11,column=0, columnspan=6, sticky=W, padx=20, pady=10) + def popup_groups( self ) : self.popup_window( "Groups Information", "groups.tab" ) - + def popup_contrasts( self ) : + self.popup_window( "Contrasts Information", "contrasts.tab" ) def popup_window( self, text, filename ) : top = Toplevel() @@ -223,27 +203,13 @@ def option_controller( self, *args, **kwargs ) : self.Pipeline.set( self.label2pipeline[self.PipelineLabel.get()] ) print( self.Pipeline.get() ) - if self.Pipeline.get() == 'cellranger' : - self.clusterOpts.grid_forget() - self.crOpts.grid(row=8,column=0, columnspan=6, sticky=W, padx=20, pady=10 ) - self.qcOpts.grid_forget() - self.multiclusterOpts.grid_forget() - elif self.Pipeline.get() == 'scrnaseqcluster' : - self.clusterOpts.grid( row=8, column=0, columnspan=4, sticky=W, padx=20, pady=10 ) - self.crOpts.grid_forget() - self.qcOpts.grid_forget() - self.multiclusterOpts.grid_forget() - elif self.Pipeline.get() == 'scrnaseqinit' : - self.clusterOpts.grid_forget() - self.crOpts.grid_forget() - self.qcOpts.grid( row=8, column=0, columnspan=4, sticky=W, padx=20, pady=10 ) - self.multiclusterOpts.grid_forget() - elif self.Pipeline.get() == 'scrnaseqmulticluster' : - self.clusterOpts.grid_forget() - self.crOpts.grid_forget() - self.qcOpts.grid_forget() - self.multiclusterOpts.grid( row=8, column=0, columnspan=4, sticky=W, padx=20, pady=10 ) + if self.Pipeline.get() == 'scrnaQC' : + self.deOptions.grid_forget() + self.qcOptions.grid(row=8,column=0, columnspan=6, sticky=W, padx=20, pady=10 ) + elif self.Pipeline.get() == 'scrnaDE' : + self.deOptions.grid( row=8, column=0, columnspan=4, sticky=W, padx=20, pady=10 ) + self.qcOptions.grid_forget() def makejson_wrapper( self, *args, **kwargs ) : @@ -390,6 +356,15 @@ def 
makejson(self, *args): PD=dict() smparams=[] + algorithmDict = {"Louvain (Original)": 1, "Louvain (with Multilevel Refinement)": 2,"SLM (Smart Local Moving)":3} + + algorithm = algorithmDict[self.clustAlg.get()] + resolutionStr = self.resolution.get() + resolutionStr = re.sub("\s",",",resolutionStr) #remove whitespaces + resolutionStr = re.sub("[,]+",",",resolutionStr) #remove multiple commas + + + for i in range(len(self.parameters)): @@ -401,7 +376,12 @@ def makejson(self, *args): ), "r" ).read() ) - gi = self.global_info + gi = self.global_info + species = "" + if gi.annotation.get() == "GRCh38": + species = "human" + elif gi.annotation.get() == "mm10": + species = "mouse" PD={ 'project': { 'pfamily': gi.pfamily.get(), @@ -432,13 +412,21 @@ def makejson(self, *args): "cluster": "cluster_medium.json", "description": gi.description.get('1.0',END), "technique" : gi.technique.get(), - "CRID": self.scrCRID.get(), - "EXPECTED": self.scrExpected.get(), - "COUNTSPATH": self.countpath.get(), - "MATTYPE": self.mattype.get(), - "DOCYCLEREGRESS": self.docycleregress.get(), - "RESOLUTION": self.scrRes.get(), - "PCS": self.scrPCs.get(), + + "CLUSTALG" : algorithm, + "ANNOTDB" : self.annotDB.get(), + "CLUSTRESOLUTION": resolutionStr, + # "INTEGRATEDE": self.integrateOption.get(), + "CLUSTERDE": self.resolutionDE.get(), + "SPECIES": species, + # "CRID": self.scrCRID.get(), + # "EXPECTED": self.scrExpected.get(), + # "COUNTSPATH": self.countpath.get(), + # "MATTYPE": self.mattype.get(), + # "DOCYCLEREGRESS": self.docycleregress.get(), + # "RESOLUTION": self.scrRes.get(), + # "PCS": self.scrPCs.get(), + "contrasts" : contrasts, "groups": groups } } diff --git a/hg38.json b/hg38.json index 82424dc..90d01bf 100755 --- a/hg38.json +++ b/hg38.json @@ -289,7 +289,12 @@ "MUTECTGENOME": "/fdb/muTect/human_g1k_v37.fasta", "MUTECTVARIANTS": "--dbsnp /fdb/muTect/dbsnp_137.b37.vcf --cosmic /data/CCBR_Pipeliner/db/PipeDB/lib/COSMIC_82_hg38.vcf.gz" }, -"mirseq": { + "mirseq": { + 
"BOWTIE_REF": "/data/CCBR_Pipeliner/db/PipeDB/miRseq/bowtie_indices/hg38", + "FASTA_REF": "/data/CCBR_Pipeliner/db/PipeDB/Indices/hg38_basic/hg38_basic.fa", + "MIRBASE_MATURE": "/data/CCBR_Pipeliner/db/PipeDB/miRseq/miRBase_v22/hsa_mature.fa", + "MIRBASE_HAIRPIN": "/data/CCBR_Pipeliner/db/PipeDB/miRseq/miRBase_v22/hsa_hairpin.fa", + "ORGANISM": "Human", "run_info":{ "TOOL": "miRNA", "VERSION": "1.1", diff --git a/mm10.json b/mm10.json index aade463..0961b35 100755 --- a/mm10.json +++ b/mm10.json @@ -1,4 +1,4 @@ - {"references": { +{"references": { "exomeseq": { "VARDICTBED": "/data/CCBR_Pipeliner/db/PipeDB/lib/mm10.chroms.bed", "BWAGENOME": "/fdb/igenomes/Mus_musculus/UCSC/mm10/Sequence/BWAIndex/genome.fa", @@ -256,6 +256,13 @@ "DB29": "/data/CCBR_Pipeliner/db/PipeDB/lib/snpEff_db/dbNSFP2.9.txt.gz", "MUTECTCOSMIC": "", "MUTECTSNP": "" +}, +"mirseq":{ + "BOWTIE_REF": "/data/CCBR_Pipeliner/db/PipeDB/miRseq/bowtie_indices/mm10", + "FASTA_REF": "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/mm10_basic.fa", + "MIRBASE_MATURE": "/data/CCBR_Pipeliner/db/PipeDB/miRseq/miRBase_v22/mmu_mature.fa", + "MIRBASE_HAIRPIN": "/data/CCBR_Pipeliner/db/PipeDB/miRseq/miRBase_v22/mmu_hairpin.fa", + "ORGANISM": "Mouse" }, "ChIPseq":{ "BLACKLISTBWAINDEX":"/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/indexes/mm10.blacklist.chrM.chr_rDNA", diff --git a/rules.json b/rules.json index a57723f..a6a0dae 100755 --- a/rules.json +++ b/rules.json @@ -183,7 +183,10 @@ "scrnaseqinit": "scrnaseq.snakefile", "scrnaseqcluster": "scrnaseq.snakefile", "CAPmirseq-plus": "CAPmirseq-plus.snakefile", - + "miRSeq_v2": "miRSeq.snakefile", + "scRNA_v2": "scrna.snakefile", + "scrnaQC": "scrnaQC.snakefile", + "scrnaDE": "scrnaDE.snakefile", "InitialChIPseqQC":"InitialChIPseqQC.snakefile", "ChIPseq":"ChIPseq.snakefile", } diff --git a/standard-bin.json b/standard-bin.json index ef042cd..4fbc029 100644 --- a/standard-bin.json +++ b/standard-bin.json @@ -339,6 +339,18 @@ }, "mirseq":{ + "tool_versions": { + 
"FASTQCVER": "fastqc/0.11.5", + "CUTADAPTVER": "cutadapt/1.18", + "KRAKENVER": "kraken/1.1", + "KRONATOOLSVER": "kronatools/2.7", + "FASTXTOOLKITVER": "fastxtoolkit/0.0.14", + "FASTQ_SCREEN": "/data/CCBR_Pipeliner/db/PipeDB/bin/fastq_screen_v0.9.3/fastq_screen", + "BOWTIE2VER": "bowtie/2-2.3.4", + "PERLVER": "perl/5.24.3", + "MULTIQCVER": "multiqc/1.4", + "MIRDEEP2VER": "mirdeep/2.0.0.8" + }, "tool_paths":{ @@ -370,6 +382,15 @@ # "CUTADAPT_PARAMS": "-b AATCTCGTATGCCGTCTTCTGCTTGC -O 3 -m 17 -f fastq", # "CUTADAPT_PARAMS": "-a CAAGCAGAAGACGGCATACGAGAT -O 3 -m 17 -f fastq", # "CUTADAPT_PARAMS": "-b CAAGCAGAAGACGGCATACGAGATNNNNNNGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA -b TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG -O 3 -m 17 -f fastq", + "FASTAWITHADAPTERSETD": "/data/CCBR_Pipeliner/db/PipeDB/dev/TruSeq_and_nextera_adapters.consolidated.fa", + "LEADINGQUALITY": 10, + "TRAILINGQUALITY": 10, + "MINLEN": 17, + "KRAKENBACDB": "/fdb/kraken/20170202_bacteria", + "FASTQ_SCREEN_CONFIG": "/data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen.conf", + "FASTQ_SCREEN_CONFIG2": "/data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_2.conf", + "CONFMULTIQC": "/data/CCBR_Pipeliner/db/PipeDB/Rnaseq/multiqc_config.yaml", + "CUTADAPT_PARAMS": "-b CAAGCAGAAGACGGCATACGAGAT -b TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -O 3 -m 17 -f fastq", "TRIMMOMATIC": "java -Xmx8g -Djava.io.tmpdir=/scratch -jar /usr/local/apps/trimmomatic/Trimmomatic-0.33/trimmomatic-0.33.jar", "TRIMMOMATIC.ADAPTERS": "/data/CCBR_Pipeliner/db/PipeDB/miRseq/References/adapters2.fa",