diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py index 0bdfc7d2..b473d34f 100644 --- a/drop/config/SampleAnnotation.py +++ b/drop/config/SampleAnnotation.py @@ -9,7 +9,7 @@ class SampleAnnotation: - FILE_TYPES = ["RNA_BAM_FILE", "DNA_VCF_FILE", "GENE_COUNTS_FILE"] + FILE_TYPES = ["RNA_BAM_FILE", "DNA_VCF_FILE", "GENE_COUNTS_FILE", "SPLICE_COUNTS_DIR"] SAMPLE_ANNOTATION_COLUMNS = FILE_TYPES + [ "RNA_ID", "DNA_ID", "DROP_GROUP", "GENE_ANNOTATION", "PAIRED_END", "COUNT_MODE", "COUNT_OVERLAPS", "STRAND", "GENOME" @@ -32,6 +32,7 @@ def __init__(self, file, root, genome): self.dnaIDs = self.createGroupIds(file_type="DNA_VCF_FILE", sep=',') # external counts self.extGeneCountIDs = self.createGroupIds(file_type="GENE_COUNTS_FILE", sep=',') + self.extSpliceCountIDs = self.createGroupIds(file_type="SPLICE_COUNTS_DIR", sep=',') def parse(self, sep='\t'): """ @@ -80,10 +81,12 @@ def createSampleFileMapping(self): columns: [ID | ASSAY | FILE_TYPE | FILE_PATH ] """ - assay_mapping = {'RNA_ID': ['RNA_BAM_FILE', 'GENE_COUNTS_FILE'], 'DNA_ID': ['DNA_VCF_FILE']} + assay_mapping = {'RNA_ID': ['RNA_BAM_FILE', 'GENE_COUNTS_FILE', 'SPLICE_COUNTS_DIR'], 'DNA_ID': ['DNA_VCF_FILE']} assay_subsets = [] for id_, file_types in assay_mapping.items(): for file_type in file_types: + if file_type not in self.sa.columns: + continue df = self.annotationTable[[id_, file_type]].dropna().drop_duplicates().copy() df.rename(columns={id_: 'ID', file_type: 'FILE_PATH'}, inplace=True) df['ASSAY'] = id_ @@ -249,17 +252,24 @@ def getGenomes(self, value, group, file_type="RNA_ID", return {sample_id: value for sample_id in subset[file_type].tolist()} - def getImportCountFiles(self, annotation, group, file_type="GENE_COUNTS_FILE", - annotation_key="GENE_ANNOTATION", group_key="DROP_GROUP",exact_match = True): + def getImportCountFiles(self, annotation, group, file_type: str = "GENE_COUNTS_FILE", + annotation_key: str = "GENE_ANNOTATION", group_key: str = "DROP_GROUP", + exact_match = True, asSet: bool = True): """ - :param annotation: annotation name as specified in config and GENE_ANNOTATION column - :param group: a group of the DROP_GROUP column. exact match is passed to subsetter, false allows for substring matching + :param annotation: annotation name as specified in config and GENE_ANNOTATION column. Can be None + :param group: a group of the DROP_GROUP column. + :param exact_match: exact_match is passed to subsetter, False allows for substring matching + :param asSet: if True, default, a set is returned else a list. :return: set of unique external count file names """ + #subset for the annotation_key in the annotation group and the group_key in the group - subset = self.subsetSampleAnnotation(annotation_key, annotation,exact_match=exact_match) - subset = self.subsetSampleAnnotation(group_key, group, subset,exact_match=exact_match) - return set(subset[file_type].tolist()) + subset = self.subsetSampleAnnotation(annotation_key, annotation, exact_match=exact_match) + subset = self.subsetSampleAnnotation(group_key, group, subset, exact_match=exact_match) + ans = subset[file_type].tolist() + if asSet: + ans = set(ans) + return ans def getRow(self, column, value): sa = self.annotationTable @@ -277,7 +287,8 @@ def getGroupedIDs(self, assays): Get group to IDs mapping :param assays: list of or single assay the IDs should be from. Can be file_type or 'RNA'/'DNA' """ - assays = [assays] if isinstance(assays, str) else assays + if isinstance(assays, str): + assays = [assays] groupedIDs = defaultdict(list) for assay in assays: if "RNA" in assay: @@ -286,6 +297,8 @@ def getGroupedIDs(self, assays): groupedIDs.update(self.dnaIDs) elif "GENE_COUNT" in assay: groupedIDs.update(self.extGeneCountIDs) + elif "SPLICE_COUNT" in assay: + groupedIDs.update(self.extSpliceCountIDs) else: raise ValueError(f"'{assay}' is not a valid assay name") return groupedIDs diff --git a/drop/config/submodules/AberrantSplicing.py b/drop/config/submodules/AberrantSplicing.py index 4d3f3651..d5954cb8 100644 --- a/drop/config/submodules/AberrantSplicing.py +++ b/drop/config/submodules/AberrantSplicing.py @@ -1,3 +1,6 @@ +import numpy as np +import pandas as pd + from snakemake.io import expand from drop import utils @@ -43,10 +46,10 @@ def getSplitCountFiles(self, dataset): :return: list of files """ ids = self.sampleAnnotation.getIDsByGroup(dataset, assay="RNA") - file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "cache" / f"raw-{dataset}" / \ - "sample_tmp" / "splitCounts" - done_files = str(file_stump / "sample_{sample_id}.done") - return expand(done_files, sample_id=ids) + file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "cache" + count_file = str(file_stump / "splitCounts-{sample_id}.RDS") + return expand(count_file, sample_id=ids) + def getNonSplitCountFiles(self, dataset): """ @@ -55,7 +58,25 @@ def getNonSplitCountFiles(self, dataset): :return: list of files """ ids = self.sampleAnnotation.getIDsByGroup(dataset, assay="RNA") - file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "cache" / f"raw-{dataset}" / \ - "sample_tmp" / "nonSplitCounts" - done_files = str(file_stump / "sample_{sample_id}.done") + file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / \ + "cache" / "nonSplicedCounts" / f"raw-{dataset}" + done_files = str(file_stump / "nonSplicedCounts-{sample_id}.h5") return expand(done_files, sample_id=ids) + + + def getExternalCounts(self, group: str, fileType: str = "k_j_counts"): + """ + Get externally provided splice count data dir based on the given group. + If a file type is given the corresponding file within the folder is returned. + :param group: DROP group name from wildcard + :param fileType: name of the file without extension which is to be returned + :return: list of directories or files + """ + ids = self.sa.getIDsByGroup(group, assay="SPLICE_COUNT") + extCountFiles = self.sa.getImportCountFiles(annotation=None, group=group, + file_type="SPLICE_COUNTS_DIR", asSet=False) + if fileType is not None: + extCountFiles = np.asarray(extCountFiles)[pd.isna(extCountFiles) == False].tolist() + extCountFiles = [x + "/" + fileType + ".tsv.gz" for x in extCountFiles] + return extCountFiles + diff --git a/drop/demo/config_relative.yaml b/drop/demo/config_relative.yaml index f79852d0..ca9cc6c9 100755 --- a/drop/demo/config_relative.yaml +++ b/drop/demo/config_relative.yaml @@ -17,6 +17,7 @@ exportCounts: excludeGroups: - mae - import_exp + - fraser_ex aberrantExpression: run: true @@ -36,6 +37,7 @@ aberrantSplicing: run: true groups: - fraser + - fraser_ex recount: true longRead: false keepNonStandardChrs: true diff --git a/drop/demo/sample_annotation_relative.tsv b/drop/demo/sample_annotation_relative.tsv old mode 100755 new mode 100644 index 0da07a27..8f6d12ba --- a/drop/demo/sample_annotation_relative.tsv +++ b/drop/demo/sample_annotation_relative.tsv @@ -1,23 +1,25 @@ -RNA_ID RNA_BAM_FILE DNA_VCF_FILE DNA_ID DROP_GROUP PAIRED_END COUNT_MODE COUNT_OVERLAPS STRAND HPO_TERMS GENE_COUNTS_FILE GENE_ANNOTATION GENOME -HG00096.1.M_111124_6 Data/rna_bam/HG00096.1.M_111124_6_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00096 outrider,mae TRUE IntersectionStrict TRUE no HP:0009802,HP:0010896 -HG00103.4.M_120208_3 Data/rna_bam/HG00103.4.M_120208_3_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00103 outrider,mae TRUE IntersectionStrict TRUE no HP:0004582,HP:0031959 -HG00106.4.M_120208_5 Data/rna_bam/HG00106.4.M_120208_5_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00106 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0002895,HP:0006731 -HG00111.2.M_111215_4 Data/rna_bam/HG00111.2.M_111215_4_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00111 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0100491,HP:0100871 -HG00116.2.M_120131_1 Data/rna_bam/HG00116.2.M_120131_1_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00116 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0030613,HP:0012767 -HG00126.1.M_111124_8 Data/rna_bam/HG00126.1.M_111124_8_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00126 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0000290,HP:0000293 -HG00132.2.M_111215_4 Data/rna_bam/HG00132.2.M_111215_4_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00132 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0006489,HP:0006490 -HG00149.1.M_111124_6 Data/rna_bam/HG00149.1.M_111124_6_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00149 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0000014,HP:0000020,HP:0032663 -HG00150.4.M_120208_7 Data/rna_bam/HG00150.4.M_120208_7_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00150 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0030809,HP:0006144 -HG00176.4.M_120208_2 Data/rna_bam/HG00176.4.M_120208_2_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00176 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0005215,HP:0010234 -HG00096.1.M_111124_6_trunc Data/rna_bam/HG00096.1.M_111124_6_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00096 fraser TRUE IntersectionStrict TRUE no HP:0009802,HP:0010896 -HG00103.4.M_120208_3_trunc Data/rna_bam/HG00103.4.M_120208_3_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00103 fraser TRUE IntersectionStrict TRUE no HP:0004582,HP:0031959 -HG00106.4.M_120208_5_trunc Data/rna_bam/HG00106.4.M_120208_5_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00106 fraser TRUE IntersectionStrict TRUE no HP:0002895,HP:0006731 -HG00111.2.M_111215_4_trunc Data/rna_bam/HG00111.2.M_111215_4_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00111 fraser TRUE IntersectionStrict TRUE no HP:0100491,HP:0100871 -HG00116.2.M_120131_1_trunc Data/rna_bam/HG00116.2.M_120131_1_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00116 fraser TRUE IntersectionStrict TRUE no HP:0030613,HP:0012767 -HG00126.1.M_111124_8_trunc Data/rna_bam/HG00126.1.M_111124_8_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00126 fraser TRUE IntersectionStrict TRUE no HP:0000290,HP:0000293 -HG00132.2.M_111215_4_trunc Data/rna_bam/HG00132.2.M_111215_4_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00132 fraser TRUE IntersectionStrict TRUE no HP:0006489,HP:0006490 -HG00149.1.M_111124_6_trunc Data/rna_bam/HG00149.1.M_111124_6_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00149 fraser TRUE IntersectionStrict TRUE no HP:0000014,HP:0000020,HP:0032663 -HG00150.4.M_120208_7_trunc Data/rna_bam/HG00150.4.M_120208_7_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00150 fraser TRUE IntersectionStrict TRUE no HP:0030809,HP:0006144 -HG00176.4.M_120208_2_trunc Data/rna_bam/HG00176.4.M_120208_2_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00176 fraser TRUE IntersectionStrict TRUE no HP:0005215,HP:0010234 -HG00178.4.M_120208_8 import_exp Data/external_geneCounts.tsv.gz v29 -HG00181.4.M_120208_4 import_exp Data/external_geneCounts.tsv.gz v29 +RNA_ID RNA_BAM_FILE DNA_VCF_FILE DNA_ID DROP_GROUP PAIRED_END COUNT_MODE COUNT_OVERLAPS STRAND HPO_TERMS GENE_COUNTS_FILE GENE_ANNOTATION GENOME SPLICE_COUNTS_DIR +HG00096.1.M_111124_6 Data/rna_bam/HG00096.1.M_111124_6_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00096 outrider,mae TRUE IntersectionStrict TRUE no HP:0009802,HP:0010896 +HG00103.4.M_120208_3 Data/rna_bam/HG00103.4.M_120208_3_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00103 outrider,mae TRUE IntersectionStrict TRUE no HP:0004582,HP:0031959 +HG00106.4.M_120208_5 Data/rna_bam/HG00106.4.M_120208_5_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00106 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0002895,HP:0006731 +HG00111.2.M_111215_4 Data/rna_bam/HG00111.2.M_111215_4_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00111 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0100491,HP:0100871 +HG00116.2.M_120131_1 Data/rna_bam/HG00116.2.M_120131_1_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00116 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0030613,HP:0012767 +HG00126.1.M_111124_8 Data/rna_bam/HG00126.1.M_111124_8_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00126 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0000290,HP:0000293 +HG00132.2.M_111215_4 Data/rna_bam/HG00132.2.M_111215_4_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00132 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0006489,HP:0006490 +HG00149.1.M_111124_6 Data/rna_bam/HG00149.1.M_111124_6_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00149 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0000014,HP:0000020,HP:0032663 +HG00150.4.M_120208_7 Data/rna_bam/HG00150.4.M_120208_7_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00150 fraser_ex,outrider,import_exp TRUE IntersectionStrict TRUE no HP:0030809,HP:0006144 +HG00176.4.M_120208_2 Data/rna_bam/HG00176.4.M_120208_2_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00176 fraser_ex,outrider,import_exp TRUE IntersectionStrict TRUE no HP:0005215,HP:0010234 +HG00096.1.M_111124_6_trunc Data/rna_bam/HG00096.1.M_111124_6_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00096 fraser,fraser_ex TRUE IntersectionStrict TRUE no HP:0009802,HP:0010896 +HG00103.4.M_120208_3_trunc Data/rna_bam/HG00103.4.M_120208_3_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00103 fraser,fraser_ex TRUE IntersectionStrict TRUE no HP:0004582,HP:0031959 +HG00106.4.M_120208_5_trunc Data/rna_bam/HG00106.4.M_120208_5_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00106 fraser,fraser_ex TRUE IntersectionStrict TRUE no HP:0002895,HP:0006731 +HG00111.2.M_111215_4_trunc Data/rna_bam/HG00111.2.M_111215_4_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00111 fraser,fraser_ex TRUE IntersectionStrict TRUE no HP:0100491,HP:0100871 +HG00116.2.M_120131_1_trunc Data/rna_bam/HG00116.2.M_120131_1_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00116 fraser,fraser_ex TRUE IntersectionStrict TRUE no HP:0030613,HP:0012767 +HG00126.1.M_111124_8_trunc Data/rna_bam/HG00126.1.M_111124_8_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00126 fraser,fraser_ex TRUE IntersectionStrict TRUE no HP:0000290,HP:0000293 +HG00132.2.M_111215_4_trunc Data/rna_bam/HG00132.2.M_111215_4_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00132 fraser,fraser_ex TRUE IntersectionStrict TRUE no HP:0006489,HP:0006490 +HG00149.1.M_111124_6_trunc Data/rna_bam/HG00149.1.M_111124_6_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00149 fraser,fraser_ex TRUE IntersectionStrict TRUE no HP:0000014,HP:0000020,HP:0032663 +HG00150.4.M_120208_7_trunc Data/rna_bam/HG00150.4.M_120208_7_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00150 fraser TRUE IntersectionStrict TRUE no HP:0030809,HP:0006144 +HG00176.4.M_120208_2_trunc Data/rna_bam/HG00176.4.M_120208_2_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00176 fraser TRUE IntersectionStrict TRUE no HP:0005215,HP:0010234 +HG00178.4.M_120208_8 import_exp Data/external_count_data/geneCounts.tsv.gz v29 +HG00181.4.M_120208_4 fraser_ex,import_exp Data/external_count_data/geneCounts.tsv.gz v29 Data/external_count_data +HG00191.3.M_120208_3 fraser_ex Data/external_count_data +HG00201.1.M_120208_6 fraser_ex Data/external_count_data diff --git a/drop/download_data.sh b/drop/download_data.sh index ba2ac064..a48faad4 100644 --- a/drop/download_data.sh +++ b/drop/download_data.sh @@ -2,9 +2,10 @@ set -e # get data -resource_url="https://www.cmm.in.tum.de/public/paper/drop_analysis/resource.tar.gz" -tmpdir="$(dirname "$(mktemp)")" -wget -nc -P $tmpdir $resource_url +resource_url="https://www.cmm.in.tum.de/public/paper/drop_analysis/resource_splice_merge.tar.gz" +tmpdir=$(dirname "$(mktemp -u)") +wget -nc -O "$tmpdir/resource.tar.gz" "$resource_url" + mkdir -p Data if [ -z "$(ls Data)" ]; then tar -zxvf "$tmpdir/resource.tar.gz" -C . diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/00_define_datasets_from_anno.R b/drop/modules/aberrant-splicing-pipeline/Counting/00_define_datasets_from_anno.R index 72e17db9..b8a46381 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/00_define_datasets_from_anno.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/00_define_datasets_from_anno.R @@ -5,9 +5,7 @@ #' log: #' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "00_defineDataset.Rds")`' #' params: -#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' #' - ids: '`sm lambda w: sa.getIDsByGroup(w.dataset, assay="RNA")`' -#' - fileMappingFile: '`sm cfg.getRoot() + "/file_mapping.csv"`' #' input: #' - sampleAnnoFile: '`sm config["sampleAnnotation"]`' #' output: @@ -23,30 +21,30 @@ #'--- saveRDS(snakemake, snakemake@log$snakemake) -source(snakemake@params$setup, echo=FALSE) +suppressPackageStartupMessages(library(data.table)) #+ input outFile <- snakemake@output$colData annoFile <- snakemake@input$sampleAnnoFile -fileMapFile <- snakemake@params$fileMapping #+ dataset name name <- snakemake@wildcards$dataset anno <- fread(annoFile) -mapping <- fread(fileMapFile) subset_ids <- snakemake@params$ids annoSub <- anno[RNA_ID %in% subset_ids] -colData <- merge( - annoSub[,.(sampleID = RNA_ID, STRAND, PAIRED_END)], - mapping[FILE_TYPE == "RNA_BAM_FILE", .(sampleID=ID, bamFile=FILE_PATH)]) +setnames(annoSub, "RNA_ID", "sampleID") +setnames(annoSub, "RNA_BAM_FILE", "bamFile") +setnames(annoSub, "PAIRED_END", "pairedEnd") +setcolorder(annoSub, unique(c("sampleID", "STRAND", "pairedEnd", "bamFile"), + colnames(annoSub))) #' #' ## Dataset: `r name` #' #+ echo=FALSE -finalTable <- colData +finalTable <- annoSub #' #' ## Final sample table `r name` @@ -55,4 +53,4 @@ finalTable <- colData DT::datatable(finalTable, options=list(scrollX=TRUE)) dim(finalTable) -write_tsv(finalTable, file=outFile) +write.table(x=finalTable, file=outFile, quote=FALSE, sep='\t', row.names=FALSE) diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R index c54948ee..9c40fa14 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R @@ -4,37 +4,32 @@ #' wb: #' log: #' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "01_0_init.Rds")`' -#' params: -#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' -#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`' #' input: #' - colData: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/annotations/{dataset}.tsv"`' #' output: -#' - fdsobj: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/fds-object.RDS"`' -#' - done_fds: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}/fds.done" `' +#' - fds_init: '`sm cfg.getProcessedDataDir() + +#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/fds-init.done"`' #' type: script #'--- saveRDS(snakemake, snakemake@log$snakemake) -source(snakemake@params$setup, echo=FALSE) +suppressPackageStartupMessages(library(FRASER)) -dataset <- snakemake@wildcards$dataset +# input +dataset <- snakemake@wildcards$dataset colDataFile <- snakemake@input$colData -workingDir <- snakemake@params$workingDir -params <- snakemake@config$aberrantSplicing +fds_init <- snakemake@output$fds_init +params <- snakemake@config$aberrantSplicing # Create initial FRASER object col_data <- fread(colDataFile) fds <- FraserDataSet(colData = col_data, - workingDir = workingDir, + workingDir = dirname(dirname(dirname(fds_init))), name = paste0("raw-", dataset)) -# Add paired end and strand specificity to the fds -pairedEnd(fds) <- colData(fds)$PAIRED_END +# Add strand specificity to the fds strandSpecific(fds) <- 'no' if(uniqueN(colData(fds)$STRAND) == 1){ strandSpecific(fds) <- unique(colData(fds)$STRAND) @@ -45,4 +40,4 @@ fds <- saveFraserDataSet(fds) message(date(), ": FRASER object initialized for ", dataset) -file.create(snakemake@output$done_fds) +file.create(fds_init) diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R index 5da348af..6659f467 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R @@ -3,51 +3,64 @@ #' author: Luise Schuller #' wb: #' log: -#' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "splitReads" / "{sample_id}.Rds")`' +#' - snakemake: '`sm str(tmp_dir / "AS" / "splitReads" / "{sampleID}.Rds")`' #' params: -#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' -#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`' +#' - COUNT_PARAMS: '`sm lambda w: cfg.AE.getCountParams(w.sampleID)`' #' input: -#' - done_fds: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}/fds.done"`' +#' - sample_bam: '`sm lambda w: sa.getFilePath(w.sampleID, file_type="RNA_BAM_FILE") `' #' output: -#' - done_sample_splitCounts: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}" -#' +"/sample_tmp/splitCounts/sample_{sample_id}.done"`' +#' - sample_splitCounts: '`sm cfg.getProcessedDataDir() + +#' "/aberrant_splicing/datasets/cache/splitCounts-{sampleID}.RDS"`' #' threads: 3 #' type: script #'--- saveRDS(snakemake, snakemake@log$snakemake) -source(snakemake@params$setup, echo=FALSE) -library(BSgenome) +suppressPackageStartupMessages({ + library(FRASER) + library(BSgenome) +}) -dataset <- snakemake@wildcards$dataset -workingDir <- snakemake@params$workingDir +sampleID <- snakemake@wildcards$sampleID +sample_bam <- snakemake@input$sample_bam +fds_init <- snakemake@input$fds_init +out_rds <- snakemake@output$sample_splitCounts params <- snakemake@config$aberrantSplicing genomeAssembly <- snakemake@config$genomeAssembly +strand <- snakemake@params$COUNT_PARAMS$STRAND +paired <- snakemake@params$COUNT_PARAMS$PAIRED_END +coldata <- fread(snakemake@config$sampleAnnotation)[RNA_ID == sampleID] -# Read FRASER object -fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset)) -# Get sample id from wildcard -sample_id <- snakemake@wildcards[["sample_id"]] +# Get strand specificity of sample ('' does not exists in switch -> x_...) +strand <- switch(paste0("x_", tolower(strand)), + 'x_' = 0L, + 'x_no' = 0L, + 'x_unstranded' = 0L, + 'x_yes' = 1L, + 'x_stranded' = 1L, + 'x_reverse' = 2L, + stop("Strand parameter not known! It was: '", strand, "'.")) # If data is not strand specific, add genome info genome <- NULL -if(strandSpecific(fds) == 0){ +if(strand == 0){ genome <- getBSgenome(genomeAssembly) } # Count splitReads for a given sample id -sample_result <- countSplitReads(sampleID = sample_id, - fds = fds, - NcpuPerSample = snakemake@threads, - recount = params$recount, - keepNonStandardChromosomes = params$keepNonStandardChrs, - genome = genome) +sample_result <- countSplitReads( + sampleID = sampleID, + NcpuPerSample = snakemake@threads, + genome = genome, + recount = TRUE, + keepNonStandardChromosomes=params$keepNonStandardChrs, + bamfile = sample_bam, + pairedend = paired, + strandmode = strand, + cacheFile = out_rds, + scanbamparam = ScanBamParam(mapqFilter=0), + coldata = coldata) -message(date(), ": ", dataset, ", ", sample_id, - " no. splice junctions (split counts) = ", length(sample_result)) - -file.create(snakemake@output$done_sample_splitCounts) +message(date(), ": ", sampleID," no. splice junctions (split counts) = ", + length(sample_result)) diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R index 1ea2a86e..28f755fe 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R @@ -4,50 +4,53 @@ #' wb: #' log: #' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "01_2_splitReadsMerge.Rds")`' -#' params: -#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' -#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`' -#' threads: 20 +#' threads: 10 #' input: #' - sample_counts: '`sm lambda w: cfg.AS.getSplitCountFiles(w.dataset)`' +#' - fds_init: '`sm cfg.getProcessedDataDir() + +#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/fds-init.done"`' #' output: -#' - countsJ: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsJ.h5"`' +#' - countsJ: '`sm cfg.getProcessedDataDir() + +#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsJ.h5"`' #' - gRangesSplitCounts: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_splitCounts.rds"`' +#' "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_splitCounts.rds"`' #' - gRangesNonSplitCounts: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_NonSplitCounts.rds"`' +#' "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_NonSplitCounts.rds"`' #' - spliceSites: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/cache/raw-{dataset}/spliceSites_splitCounts.rds"`' #' type: script #'--- saveRDS(snakemake, snakemake@log$snakemake) -source(snakemake@params$setup, echo=FALSE) +suppressPackageStartupMessages(library(FRASER)) -dataset <- snakemake@wildcards$dataset -workingDir <- snakemake@params$workingDir -params <- snakemake@config$aberrantSplicing +# input +dataset <- snakemake@wildcards$dataset +fdsIn <- file.path(dirname(snakemake@input$fds_init), "fds-object.RDS") +params <- snakemake@config$aberrantSplicing minExpressionInOneSample <- params$minExpressionInOneSample +BPPARAM <- MulticoreParam(snakemake@threads) +# Set number of threads including for DelayedArray operations +register(BPPARAM) +DelayedArray::setAutoBPPARAM(BPPARAM) -register(MulticoreParam(snakemake@threads)) -# Limit number of threads for DelayedArray operations -setAutoBPPARAM(MulticoreParam(snakemake@threads)) +# Force writing HDF5 files +options(FRASER.maxSamplesNoHDF5=-1) +options(FRASER.maxJunctionsNoHDF5=-1) # Read FRASER object -fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset)) +fds <- loadFraserDataSet(file=fdsIn) -# If samples are recounted, remove the merged ones -splitCountsDir <- file.path(workingDir, "savedObjects", - paste0("raw-", dataset), 'splitCounts') -if(params$recount == TRUE & dir.exists(splitCountsDir)){ - unlink(splitCountsDir, recursive = TRUE) +# enforce remerging if this script has to be run +splitCountsDir <- file.path(workingDir(fds), "savedObjects", name(fds), "splitCounts") +if(dir.exists(splitCountsDir)){ + unlink(splitCountsDir, recursive=TRUE) } # Get and merge splitReads for all sample ids -splitCounts <- getSplitReadCountsForAllSamples(fds=fds, - recount=FALSE) +splitCounts <- getSplitReadCountsForAllSamples(fds=fds, outDir=splitCountsDir, recount=FALSE) + # Extract, annotate and save granges splitCountRanges <- rowRanges(splitCounts) @@ -65,7 +68,7 @@ splitCountRanges <- splitCountRanges[passed,] saveRDS(splitCountRanges, snakemake@output$gRangesNonSplitCounts) # Extract splitSiteCoodinates: extract donor and acceptor sites -# take either filtered or full fds +# take filtered splicesite map spliceSiteCoords <- FRASER:::extractSpliceSiteCoordinates(splitCountRanges, fds) saveRDS(spliceSiteCoords, snakemake@output$spliceSites) diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R index c5d9e6a7..aff3f4d1 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R @@ -4,48 +4,42 @@ #' wb: #' log: #' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "nonsplitReads" / "{sample_id}.Rds")`' -#' params: -#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' -#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`' #' input: #' - spliceSites: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/cache/raw-{dataset}/spliceSites_splitCounts.rds"`' +#' - fds_init: '`sm cfg.getProcessedDataDir() + +#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/fds-init.done"`' #' output: -#' - done_sample_nonSplitCounts : '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}/sample_tmp/nonSplitCounts/sample_{sample_id}.done"`' +#' - nonSplitCounts_sample : '`sm cfg.getProcessedDataDir() + +#' "/aberrant_splicing/datasets/cache/nonSplicedCounts/raw-{dataset}/nonSplicedCounts-{sample_id}.h5"`' #' threads: 3 #' type: script #'--- saveRDS(snakemake, snakemake@log$snakemake) -source(snakemake@params$setup, echo=FALSE) +suppressPackageStartupMessages(library(FRASER)) dataset <- snakemake@wildcards$dataset -colDataFile <- snakemake@input$colData -workingDir <- snakemake@params$workingDir -params <- snakemake@config$aberrantSplicing +sample_id <- snakemake@wildcards$sample_id +fds_init <- snakemake@input$fds_init +params <- snakemake@config$aberrantSplicing # Read FRASER object -fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset)) - -# Get sample id from wildcard -sample_id <- snakemake@wildcards[["sample_id"]] - +fds <- loadFraserDataSet(file=file.path(dirname(fds_init), "fds-object.RDS")) # Read splice site coordinates from RDS spliceSiteCoords <- readRDS(snakemake@input$spliceSites) # Count nonSplitReads for given sample id sample_result <- countNonSplicedReads(sample_id, - splitCountRanges = NULL, - fds = fds, - NcpuPerSample = snakemake@threads, - minAnchor=5, - recount=params$recount, - spliceSiteCoords=spliceSiteCoords, - longRead=params$longRead) + splitCountRanges = NULL, + fds = fds, + NcpuPerSample = snakemake@threads, + minAnchor=5, + # TODO should be TRUE in the future as snakemake is checking this + recount=params$recount, + spliceSiteCoords=spliceSiteCoords, + longRead=params$longRead) message(date(), ": ", dataset, ", ", sample_id, " no. splice junctions (non split counts) = ", length(sample_result)) - -file.create(snakemake@output$done_sample_nonSplitCounts) diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R index 83c48939..7ceacd00 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R @@ -4,10 +4,7 @@ #' wb: #' log: #' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "01_4_nonSplitReadsMerge.Rds")`' -#' params: -#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' -#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`' -#' threads: 20 +#' threads: 10 #' input: #' - sample_counts: '`sm lambda w: cfg.AS.getNonSplitCountFiles(w.dataset)`' #' - gRangesNonSplitCounts: '`sm cfg.getProcessedDataDir() + @@ -19,34 +16,39 @@ #'--- saveRDS(snakemake, snakemake@log$snakemake) -source(snakemake@params$setup, echo=FALSE) +suppressPackageStartupMessages(library(FRASER)) -dataset <- snakemake@wildcards$dataset -workingDir <- snakemake@params$workingDir -params <- snakemake@config$aberrantSplicing +dataset <- snakemake@wildcards$dataset +fds_in <- file.path(dirname(snakemake@output$countsSS), "fds-object.RDS") +params <- snakemake@config$aberrantSplicing +BPPARAM <- MulticoreParam(snakemake@threads) -register(MulticoreParam(snakemake@threads)) -# Limit number of threads for DelayedArray operations -setAutoBPPARAM(MulticoreParam(snakemake@threads)) +# Set number of threads including for DelayedArray operations +register(BPPARAM) +DelayedArray::setAutoBPPARAM(BPPARAM) + +# Force writing HDF5 files +options(FRASER.maxSamplesNoHDF5=-1) +options(FRASER.maxJunctionsNoHDF5=-1) # Read FRASER object -fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset)) +fds <- loadFraserDataSet(file=fds_in) # Read splice site coordinates from RDS splitCounts_gRanges <- readRDS(snakemake@input$gRangesNonSplitCounts) # If samples are recounted, remove the merged ones -nonSplitCountsDir <- file.path(workingDir, "savedObjects", - paste0("raw-", dataset), 'nonSplitCounts') -if(params$recount == TRUE & dir.exists(nonSplitCountsDir)){ - unlink(nonSplitCountsDir, recursive = TRUE) +nonSplitCountsDir <- file.path(workingDir(fds), "savedObjects", + name(fds), 'nonSplitCounts') +if(dir.exists(nonSplitCountsDir)){ + unlink(nonSplitCountsDir, recursive=TRUE) } # Get and merge nonSplitReads for all sample ids nonSplitCounts <- getNonSplitReadCountsForAllSamples(fds=fds, - splitCountRanges=splitCounts_gRanges, - minAnchor=5, - recount=FALSE, - longRead=params$longRead) + splitCountRanges=splitCounts_gRanges, + minAnchor=5, + recount=FALSE, + longRead=params$longRead) message(date(), ":", dataset, " nonSplit counts done") diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R index da3d38b6..1af594db 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R @@ -4,9 +4,6 @@ #' wb: #' log: #' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "01_5_collect.Rds")`' -#' params: -#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' -#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`' #' input: #' - countsJ: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsJ.h5"`' @@ -23,19 +20,23 @@ #'--- saveRDS(snakemake, snakemake@log$snakemake) -source(snakemake@params$setup, echo=FALSE) +suppressPackageStartupMessages(library(FRASER)) -dataset <- snakemake@wildcards$dataset -workingDir <- snakemake@params$workingDir +dataset <- snakemake@wildcards$dataset +j_counts <- snakemake@input$countsJ +theta_counts <- snakemake@input$countsSS +# Force writing HDF5 files +options(FRASER.maxSamplesNoHDF5=-1) +options(FRASER.maxJunctionsNoHDF5=-1) # Read FRASER object -fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset)) +fds <- loadFraserDataSet(file=j_counts) splitCounts_gRanges <- readRDS(snakemake@input$gRangesSplitCounts) spliceSiteCoords <- readRDS(snakemake@input$spliceSites) # Get splitReads and nonSplitRead counts in order to store them in FRASER object -splitCounts_h5 <- HDF5Array::HDF5Array(snakemake@input$countsJ, "rawCountsJ") +splitCounts_h5 <- HDF5Array::HDF5Array(j_counts, "rawCountsJ") splitCounts_se <- SummarizedExperiment( colData = colData(fds), rowRanges = splitCounts_gRanges, @@ -43,7 +44,7 @@ splitCounts_se <- SummarizedExperiment( ) -nonSplitCounts_h5 <- HDF5Array::HDF5Array(snakemake@input$countsSS, "rawCountsSS") +nonSplitCounts_h5 <- HDF5Array::HDF5Array(theta_counts, "rawCountsSS") nonSplitCounts_se <- SummarizedExperiment( colData = colData(fds), rowRanges = spliceSiteCoords, @@ -51,8 +52,9 @@ nonSplitCounts_se <- SummarizedExperiment( ) # Add Counts to FRASER object -fds <- addCountsToFraserDataSet(fds=fds, splitCounts=splitCounts_se, - nonSplitCounts=nonSplitCounts_se) +fds <- addCountsToFraserDataSet(fds=fds, + splitCounts=splitCounts_se, + nonSplitCounts=nonSplitCounts_se) # Save final FRASER object fds <- saveFraserDataSet(fds) diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R b/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R index 24e5cb9b..37e39abc 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R @@ -4,31 +4,35 @@ #' wb: #' log: #' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "02_PSIcalc.Rds")`' -#' params: -#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' -#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`' -#' threads: 30 +#' threads: 10 #' input: #' - counting_done: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/counting.done" `' #' output: #' - theta: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/theta.h5"`' +#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/delta_theta.h5"`' #' type: script #'--- saveRDS(snakemake, snakemake@log$snakemake) -source(snakemake@params$setup, echo=FALSE) +suppressPackageStartupMessages(library(FRASER)) -dataset <- snakemake@wildcards$dataset -workingDir <- snakemake@params$workingDir -params <- snakemake@config$aberrantSplicing +# input +dataset <- snakemake@wildcards$dataset +fds_in <- file.path(dirname(snakemake@input$counting_done), "fds-object.RDS") +params <- snakemake@config$aberrantSplicing +BPPARAM <- MulticoreParam(snakemake@threads) -register(MulticoreParam(snakemake@threads)) -# Limit number of threads for DelayedArray operations -setAutoBPPARAM(MulticoreParam(snakemake@threads)) +# Set number of threads including for DelayedArray operations +register(BPPARAM) +DelayedArray::setAutoBPPARAM(BPPARAM) -fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset)) +# Force writing HDF5 files +options(FRASER.maxSamplesNoHDF5=-1) +options(FRASER.maxJunctionsNoHDF5=-1) + +# load FraserDataSet object +fds <- loadFraserDataSet(file=fds_in) # Calculating PSI values fds <- calculatePSIValues(fds) diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R index 731300ae..3d191b23 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R @@ -5,53 +5,76 @@ #' log: #' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "03_filter.Rds")`' #' params: -#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' -#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`' +#' - exCountIDs: '`sm lambda w: sa.getIDsByGroup(w.dataset, assay="SPLICE_COUNT")`' #' input: #' - theta: '`sm cfg.getProcessedDataDir()+ -#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/theta.h5"`' +#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/delta_theta.h5"`' +#' - exCounts: '`sm lambda w: cfg.AS.getExternalCounts(w.dataset, "k_j_counts")`' #' output: -#' - fds: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/{dataset}/fds-object.RDS"`' -#' - done: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/{dataset}/filter.done" `' +#' - fds_filter_done: '`sm cfg.getProcessedDataDir() + +#' "/aberrant_splicing/datasets/savedObjects/{dataset}/filtering.done"`' #' threads: 3 #' type: script #'--- saveRDS(snakemake, snakemake@log$snakemake) -source(snakemake@params$setup, echo=FALSE) - -opts_chunk$set(fig.width=12, fig.height=8) +suppressPackageStartupMessages(library(FRASER)) +knitr::opts_chunk$set(fig.width=12, fig.height=8) # input dataset <- snakemake@wildcards$dataset -workingDir <- snakemake@params$workingDir -params <- snakemake@config$aberrantSplicing +fdsIn <- snakemake@input$theta +fdsOut <- snakemake@output$fds_filter_done +params <- snakemake@config$aberrantSplicing +exCountIDs <- snakemake@params$exCountIDs +exCountFiles <- snakemake@input$exCounts +sample_anno_file <- snakemake@config$sampleAnnotation +minExpressionInOneSample <- params$minExpressionInOneSample +minDeltaPsi <- params$minDeltaPsi +BPPARAM <- MulticoreParam(snakemake@threads) -fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset)) +# Set number of threads including for DelayedArray operations +register(BPPARAM) +DelayedArray::setAutoBPPARAM(BPPARAM) -register(MulticoreParam(snakemake@threads)) -# Limit number of threads for DelayedArray operations -setAutoBPPARAM(MulticoreParam(snakemake@threads)) +# Force writing HDF5 files +options(FRASER.maxSamplesNoHDF5=-1) +options(FRASER.maxJunctionsNoHDF5=-1) -# Apply filter -minExpressionInOneSample <- params$minExpressionInOneSample -minDeltaPsi <- params$minDeltaPsi +# Load FraserDataSet object +fds <- loadFraserDataSet(file=fdsIn) +# Apply filter fds <- filterExpressionAndVariability(fds, - minExpressionInOneSample = minExpressionInOneSample, - minDeltaPsi = minDeltaPsi, - filter=FALSE) + minExpressionInOneSample = minExpressionInOneSample, + minDeltaPsi = minDeltaPsi, + filter=FALSE) devNull <- saveFraserDataSet(fds) -# Keep junctions that pass filter -name(fds) <- dataset -if (params$filter == TRUE) { - filtered <- mcols(fds, type="j")[,"passed"] - fds <- fds[filtered,] - message(paste("filtered to", nrow(fds), "junctions")) +# TODO move it before applying filter to have it included in the summary +# Add external data if provided by dataset +dontWriteHDF5(fds) <- TRUE +if(length(exCountIDs) > 0){ + for(resource in unique(exCountFiles)){ + exSampleIDs <- exCountIDs[exCountFiles == resource] + exAnno <- fread(sample_anno_file, key="RNA_ID")[J(exSampleIDs)] + setnames(exAnno, "RNA_ID", "sampleID") + + ctsNames <- c("k_j", "k_theta", "n_psi3", "n_psi5", "n_theta") + ctsFiles <- paste0(dirname(resource), "/", ctsNames, "_counts.tsv.gz") + fds <- mergeExternalData(fds=fds, countFiles=ctsFiles, + sampleIDs=exSampleIDs, annotation=exAnno) + } } -fds <- saveFraserDataSet(fds) -file.create(snakemake@output$done) +fds <- filterExpressionAndVariability(fds, + minExpressionInOneSample = minExpressionInOneSample, + minDeltaPsi = minDeltaPsi, + filter=params$filter) + +# save filtered data into new dataset object +dontWriteHDF5(fds) <- FALSE +name(fds) <- dataset +fds <- saveFraserDataSet(fds, rewrite=TRUE) + +file.create(fdsOut) \ No newline at end of file diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R index e2c61d86..f68fa1e7 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R @@ -9,7 +9,7 @@ #' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`' #' input: #' - filter: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/{dataset}/filter.done" `' +#' "/aberrant_splicing/datasets/savedObjects/{dataset}/filtering.done"`' #' output: #' - wBhtml: '`sm config["htmlOutputPath"] + #' "/AberrantSplicing/{dataset}_countSummary.html"`' diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/04_fit_hyperparameters_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/04_fit_hyperparameters_FraseR.R index 1a99b259..b2db290d 100644 --- a/drop/modules/aberrant-splicing-pipeline/FRASER/04_fit_hyperparameters_FraseR.R +++ b/drop/modules/aberrant-splicing-pipeline/FRASER/04_fit_hyperparameters_FraseR.R @@ -4,13 +4,10 @@ #' wb: #' log: #' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "04_hyper.Rds")`' -#' params: -#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' -#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`' #' threads: 12 #' input: #' - filter: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/{dataset}/filter.done" `' +#' "/aberrant_splicing/datasets/savedObjects/{dataset}/filtering.done"`' #' output: #' - hyper: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/{dataset}/hyper.done" `' @@ -18,7 +15,10 @@ #'--- saveRDS(snakemake, snakemake@log$snakemake) -source(snakemake@params$setup, echo=FALSE) +suppressPackageStartupMessages({ + library(FRASER) + library(tidyr) +}) if ("random_seed" %in% names(snakemake@config)){ rseed <- snakemake@config$random_seed @@ -30,15 +30,20 @@ if ("random_seed" %in% names(snakemake@config)){ } #+ input -dataset <- snakemake@wildcards$dataset -workingDir <- snakemake@params$workingDir +dataset <- snakemake@wildcards$dataset +fds_file <- file.path(dirname(snakemake@input$filter), "fds-object.RDS") +BPPARAM <- MulticoreParam(snakemake@threads) -register(MulticoreParam(snakemake@threads)) -# Limit number of threads for DelayedArray operations -setAutoBPPARAM(MulticoreParam(snakemake@threads)) +# Set number of threads including for DelayedArray operations +register(BPPARAM) +DelayedArray::setAutoBPPARAM(BPPARAM) + +# Force writing HDF5 files +options(FRASER.maxSamplesNoHDF5=-1) +options(FRASER.maxJunctionsNoHDF5=-1) # Load PSI data -fds <- loadFraserDataSet(dir=workingDir, name=dataset) +fds <- loadFraserDataSet(file=fds_file) # Run hyper parameter optimization implementation <- snakemake@config$aberrantSplicing$implementation diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/05_fit_autoencoder_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/05_fit_autoencoder_FraseR.R index f5e760c8..e87cf310 100644 --- a/drop/modules/aberrant-splicing-pipeline/FRASER/05_fit_autoencoder_FraseR.R +++ b/drop/modules/aberrant-splicing-pipeline/FRASER/05_fit_autoencoder_FraseR.R @@ -4,9 +4,6 @@ #' wb: #' log: #' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "05_fit.Rds")`' -#' params: -#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' -#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`' #' threads: 20 #' input: #' - hyper: '`sm cfg.getProcessedDataDir() + @@ -18,16 +15,23 @@ #'--- saveRDS(snakemake, snakemake@log$snakemake) -source(snakemake@params$setup, echo=FALSE) +suppressPackageStartupMessages(library(FRASER)) -dataset <- snakemake@wildcards$dataset -workingDir <- snakemake@params$workingDir +# input +dataset <- snakemake@wildcards$dataset +fds_in <- file.path(dirname(snakemake@input$hyper), "fds-object.RDS") +BPPARAM <- MulticoreParam(snakemake@threads) -register(MulticoreParam(snakemake@threads)) -# Limit number of threads for DelayedArray operations -setAutoBPPARAM(MulticoreParam(snakemake@threads)) +# Set number of threads including for DelayedArray operations +register(BPPARAM) +DelayedArray::setAutoBPPARAM(BPPARAM) -fds <- loadFraserDataSet(dir=workingDir, name=dataset) +# Force writing HDF5 files +options(FRASER.maxSamplesNoHDF5=-1) +options(FRASER.maxJunctionsNoHDF5=-1) + +# load FraserDataSet object +fds <- loadFraserDataSet(file=fds_in) # Fit autoencoder # run it for every type diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/06_calculation_stats_AE_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/06_calculation_stats_AE_FraseR.R index 84b222cb..c2caf9fb 100644 --- a/drop/modules/aberrant-splicing-pipeline/FRASER/06_calculation_stats_AE_FraseR.R +++ b/drop/modules/aberrant-splicing-pipeline/FRASER/06_calculation_stats_AE_FraseR.R @@ -4,9 +4,6 @@ #' wb: #' log: #' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "06_stats.Rds")`' -#' params: -#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' -#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`' #' threads: 20 #' input: #' - fdsin: '`sm cfg.getProcessedDataDir() + @@ -20,18 +17,23 @@ #'--- saveRDS(snakemake, snakemake@log$snakemake) -source(snakemake@params$setup, echo=FALSE) +suppressPackageStartupMessages(library(FRASER)) -dataset <- snakemake@wildcards$dataset -fdsFile <- snakemake@input$fdsin -workingDir <- snakemake@params$workingDir +#+ input +dataset <- snakemake@wildcards$dataset +fdsFile <- snakemake@input$fdsin +BPPARAM <- MulticoreParam(snakemake@threads) -register(MulticoreParam(snakemake@threads)) -# Limit number of threads for DelayedArray operations -setAutoBPPARAM(MulticoreParam(snakemake@threads)) +# Set number of threads including for DelayedArray operations +register(BPPARAM) +DelayedArray::setAutoBPPARAM(BPPARAM) + +# Force writing HDF5 files +options(FRASER.maxSamplesNoHDF5=-1) +options(FRASER.maxJunctionsNoHDF5=-1) # Load Zscores data -fds <- loadFraserDataSet(dir=workingDir, name=dataset) +fds <- loadFraserDataSet(file=fdsFile) # Calculate stats for (type in psiTypes) { diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R index dce06ba0..dca8bdb5 100644 --- a/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R +++ b/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R @@ -14,7 +14,6 @@ #' - assemblyVersion: '`sm cfg.genome.getBSGenomeVersion()`' #' threads: 10 #' input: -#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' #' - add_HPO_cols: '`sm str(projectDir / ".drop" / "helpers" / "add_HPO_cols.R")`' #' - fdsin: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/datasets/savedObjects/{dataset}/" + @@ -32,22 +31,33 @@ #'--- saveRDS(snakemake, snakemake@log$snakemake) -source(snakemake@input$setup, echo=FALSE) +suppressPackageStartupMessages({ + library(FRASER) + library(tidyr) + library(AnnotationDbi) +}) source(snakemake@input$add_HPO_cols) -library(AnnotationDbi) +knitr::opts_chunk$set(fig.width=12, fig.height=8) -opts_chunk$set(fig.width=12, fig.height=8) - -annotation <- snakemake@wildcards$annotation +# input dataset <- snakemake@wildcards$dataset fdsFile <- snakemake@input$fdsin +BPPARAM <- MulticoreParam(snakemake@threads) +annotation <- snakemake@wildcards$annotation workingDir <- snakemake@params$workingDir outputDir <- snakemake@params$outputDir assemblyVersion <- snakemake@params$assemblyVersion +outResJunc <- snakemake@output$resultTableJunc +outResGene <- snakemake@output$resultTableGene + +# Set number of threads including for DelayedArray operations +register(BPPARAM) +DelayedArray::setAutoBPPARAM(BPPARAM) + +# Force writing HDF5 files +options(FRASER.maxSamplesNoHDF5=-1) +options(FRASER.maxJunctionsNoHDF5=-1) -register(MulticoreParam(snakemake@threads)) -# Limit number of threads for DelayedArray operations -setAutoBPPARAM(MulticoreParam(snakemake@threads)) # Load fds and create a new one fds_input <- loadFraserDataSet(dir=workingDir, name=dataset) @@ -106,5 +116,5 @@ if(length(res_junc) > 0){ } else res_genes_dt <- data.table() # Results -write_tsv(res_junc_dt, file=snakemake@output$resultTableJunc) -write_tsv(res_genes_dt, file=snakemake@output$resultTableGene) +write.table(x=res_junc_dt, file=outResJunc, quote=FALSE, sep='\t', row.names=FALSE) +write.table(x=res_genes_dt, file=outResGene, quote=FALSE, sep='\t', row.names=FALSE) diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R index a24168a5..d8b549c9 100644 --- a/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R +++ b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R @@ -62,7 +62,7 @@ topN <- 30000 topJ <- 10000 for(type in psiTypes){ before <- plotCountCorHeatmap( - fds, + object=fds, type = type, logit = TRUE, topN = topN, @@ -78,7 +78,7 @@ for(type in psiTypes){ ) before after <- plotCountCorHeatmap( - fds, + object=fds, type = type, logit = TRUE, topN = topN, diff --git a/drop/requirementsR.txt b/drop/requirementsR.txt index 02151c2d..b6fb8718 100644 --- a/drop/requirementsR.txt +++ b/drop/requirementsR.txt @@ -1,6 +1,6 @@ package version gagneurlab/OUTRIDER 1.6.1 -c-mertes/FRASER 1.2.1 +c-mertes/FRASER 1.2.2 mumichae/tMAE 1.0.0 VariantAnnotation rmarkdown diff --git a/drop/utils.py b/drop/utils.py index b325f38f..975c80b6 100644 --- a/drop/utils.py +++ b/drop/utils.py @@ -76,11 +76,20 @@ def subsetBy(df, column, values, exact_match=True): """ if values is None: return df - elif isinstance(values, str) and exact_match : - return df[df[column] == values] - elif not isinstance(values,str) and exact_match: - return df[df[column].isin(values)] - elif isinstance(values,str) and not exact_match: +# elif isinstance(values, str): +# return df[df[column].str.contains("(^|,)" + values + "(,|$)")] +# else: +# if(values.__len__() > 1): +# raise ValueError(f"Values too long! Please report this. Values are: {values}.") + elif exact_match: + if isinstance(values, str): + return df[df[column] == values] + else: + return df[df[column].isin(values)] + elif isinstance(values,str): return df[df[column].str.contains(values)] else: + # this does not work. If you have a group drop and drop_2 + # first group will match the second. Also if you have drop,drop_2 + # it will not detect on or the other. return df[df[column].str.contains("|".join(values))] diff --git a/tests/config/test_AE.py b/tests/config/test_AE.py index fa1e1023..8643130d 100644 --- a/tests/config/test_AE.py +++ b/tests/config/test_AE.py @@ -35,7 +35,7 @@ def test_getCountsFiles(self, demo_dir, dropConfig): # import count counts_files_true = counts_files_true[2:] - counts_files_true.append(f"{demo_dir}/Data/external_geneCounts.tsv.gz") + counts_files_true.append(f"{demo_dir}/Data/external_count_data/geneCounts.tsv.gz") counts_files_true.sort() counts_files_test = dropConfig.AE.getCountFiles(annotation="v29", group="import_exp") counts_files_test.sort() diff --git a/tests/config/test_AS.py b/tests/config/test_AS.py index df6b527d..c3e3dc38 100644 --- a/tests/config/test_AS.py +++ b/tests/config/test_AS.py @@ -3,7 +3,7 @@ class Test_AS_Config: def test_config(self, dropConfig,demo_dir): assert dropConfig.AS.getWorkdir() == f"{demo_dir}/Scripts/AberrantSplicing/pipeline" dict_ = { - 'groups': ['fraser'], + 'groups': ['fraser', 'fraser_ex'], 'recount': True, 'longRead': False, 'keepNonStandardChrs': True, diff --git a/tests/config/test_SampleAnnotation.py b/tests/config/test_SampleAnnotation.py index ba93966f..f1e1d21e 100644 --- a/tests/config/test_SampleAnnotation.py +++ b/tests/config/test_SampleAnnotation.py @@ -13,9 +13,9 @@ def test_columns(self, sampleAnnotation): def test_mapping(self, sampleAnnotation): # ID mappings/groups - assert sampleAnnotation.idMapping.shape == (22, 2) - assert sampleAnnotation.sampleFileMapping.shape == (32, 4) - true_mapping = {'mae': 2, 'import_exp': 8, 'outrider': 10, 'fraser': 10} + assert sampleAnnotation.idMapping.shape == (24, 2) + assert sampleAnnotation.sampleFileMapping.shape == (35, 4) + true_mapping = {'mae': 2, 'import_exp': 8, 'outrider': 10, 'fraser': 10, 'fraser_ex': 10} assert true_mapping == {k: len(v) for k, v in sampleAnnotation.rnaIDs.items()} assert true_mapping == {k: len(v) for k, v in sampleAnnotation.dnaIDs.items()} @@ -23,8 +23,9 @@ def test_mapping(self, sampleAnnotation): "sample_id,file_type,file_name", [ ("HG00096.1.M_111124_6", "RNA_BAM_FILE", "Data/rna_bam/HG00096.1.M_111124_6_chr21.bam"), - ("HG00178.4.M_120208_8", "GENE_COUNTS_FILE", "Data/external_geneCounts.tsv.gz"), - ("HG00096", "DNA_VCF_FILE", "Data/dna_vcf/demo_chr21.vcf.gz") + ("HG00178.4.M_120208_8", "GENE_COUNTS_FILE", "Data/external_count_data/geneCounts.tsv.gz"), + ("HG00096", "DNA_VCF_FILE", "Data/dna_vcf/demo_chr21.vcf.gz"), + ("HG00201.1.M_120208_6", "SPLICE_COUNTS_DIR", "Data/external_count_data") ] ) def test_filePaths(self, demo_dir, sampleAnnotation, sample_id, file_type, file_name): @@ -35,7 +36,7 @@ def test_filePaths(self, demo_dir, sampleAnnotation, sample_id, file_type, file_ @pytest.mark.parametrize( "annotation,group,files", [ - ("v29", "import_exp", {'Data/external_geneCounts.tsv.gz'}) + ("v29", "import_exp", {'Data/external_count_data/geneCounts.tsv.gz'}) ] ) def test_import(self, demo_dir, sampleAnnotation, annotation, group, files):