From 28dcae861fa0bc0826a3dc94180a95c8b2297deb Mon Sep 17 00:00:00 2001 From: JMencius <553843771@qq.com> Date: Sun, 14 Apr 2024 11:39:18 +0800 Subject: [PATCH 1/2] Update cluster.pyx --- dysgu/cluster.pyx | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx index 55e8a80..bb07efd 100644 --- a/dysgu/cluster.pyx +++ b/dysgu/cluster.pyx @@ -1160,6 +1160,31 @@ def cluster_reads(args): raise ValueError("--ibam must be a .bam/cam/sam file") ibam = pysam.AlignmentFile(args["ibam"], opts[kind2], threads=1, reference_filename=None if kind2 != "cram" else args["reference"]) + + # Double check if bam file is sorted + # first check SO tag in HD + if "HD" in infile.header: + hd = infile.header["HD"] + if "SO" in hd: + if hd["SO"] == "unsorted": + # raise ValueError for SO:unsorted + raise ValueError("Input bam file must be sorted") + # directly check bam just check the first 1000 alignment for speed + max_alignment_check = 10**3 + alignment_count = 0 + prev_alignment = None + for aln in infile: + alignment_count += 1 + if alignment_count > max_alignment_check: + break + if prev_alignment and prev_alignment.reference_id > aln.reference_id: + raise ValueError("Input bam file must be sorted") + prev_alignment = aln + logging.info("Check PASS : Input BAM file is sorted") + # reopen infile + infile = pysam.AlignmentFile(args["sv_aligns"], bam_mode, threads=args["procs"], + reference_filename=None if kind != "cram" else args["reference"]) + if "RG" in infile.header: rg = infile.header["RG"] if "SM" in rg[0]: From b664d5a834053aebcc4c54d40fc8e3c7625a43b5 Mon Sep 17 00:00:00 2001 From: JMencius <553843771@qq.com> Date: Tue, 16 Apr 2024 01:31:51 +0800 Subject: [PATCH 2/2] move to coverage.pyx --- dysgu/cluster.pyx | 26 +------------------------- dysgu/coverage.pyx | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx index bb07efd..6358395 100644 --- a/dysgu/cluster.pyx +++ b/dysgu/cluster.pyx @@ -1159,31 +1159,7 @@ def cluster_reads(args): if kind2 == "stdin" or kind2 == "-" or kind2 not in opts: raise ValueError("--ibam must be a .bam/cam/sam file") ibam = pysam.AlignmentFile(args["ibam"], opts[kind2], threads=1, - reference_filename=None if kind2 != "cram" else args["reference"]) - - # Double check if bam file is sorted - # first check SO tag in HD - if "HD" in infile.header: - hd = infile.header["HD"] - if "SO" in hd: - if hd["SO"] == "unsorted": - # raise ValueError for SO:unsorted - raise ValueError("Input bam file must be sorted") - # directly check bam just check the first 1000 alignment for speed - max_alignment_check = 10**3 - alignment_count = 0 - prev_alignment = None - for aln in infile: - alignment_count += 1 - if alignment_count > max_alignment_check: - break - if prev_alignment and prev_alignment.reference_id > aln.reference_id: - raise ValueError("Input bam file must be sorted") - prev_alignment = aln - logging.info("Check PASS : Input BAM file is sorted") - # reopen infile - infile = pysam.AlignmentFile(args["sv_aligns"], bam_mode, threads=args["procs"], - reference_filename=None if kind != "cram" else args["reference"]) + reference_filename=None if kind2 != "cram" else args["reference"]) if "RG" in infile.header: rg = infile.header["RG"] diff --git a/dysgu/coverage.pyx b/dysgu/coverage.pyx index 8e526fd..712e676 100644 --- a/dysgu/coverage.pyx +++ b/dysgu/coverage.pyx @@ -326,7 +326,24 @@ cdef class GenomeScanner: cdef uint32_t *cigar_p cdef int i cdef int non_match_count, matched_bases + + # first check SO tag in HD + if "HD" in file_iter.header: + hd = file_iter.header["HD"] + if "SO" in hd: + if hd["SO"] == "unsorted": + # raise ValueError for SO:unsorted + raise ValueError("Input bam file must be sorted") + else: + logging.info("Check PASS : Input BAM file has sorted header") + + prev_alignment = None for a in file_iter: + # double check with index + if prev_alignment and prev_alignment.reference_id > a.reference_id: + raise ValueError("Input bam file must be sorted") + prev_alignment = a + if ibam is None: if a.flag & 1284 or a.mapq < self.mapq_threshold or a.cigartuples is None: continue