Dysgu dev (#113)

* dysgu_dev <- master (#108) * correct usage of update_filter_value in filter_normals It seems like at some point the usage of update_filter_value changed, because in several spots, it is missing the sample_name argument. This breaks `dysgu filter --keep-all`. * Update main.yml * Update main.yml * Update requirements.txt * updated build process to pyproject.toml (#103) * v1.6.6 updated README * Update main.yml * Added pyproject.toml * Updated pyproject.toml and setup.py * Update pyproject.toml --------- Co-authored-by: Dr. K. D. Murray <[email protected]> * Better recall+precision for long-reads. Improved merging pipeline. Faster runtime. Code refactoring. WIP # [skip ci] * Fixed issues with overriding presets. --mode option updated # [skip ci] * Fixed issues with overriding presets. --mode option updated to support r10 and revio, new presets available. --no-gt flat removed # [skip ci] * Fixed CLI issues for tests # [skip ci] * Fixed API issue * v1.7.0 * added deprecated warning for mode * Added filter for high divergence reads (long-reads only) # [skip ci] --------- Co-authored-by: Dr. K. D. Murray <[email protected]>
kcleal · Oct 25, 2024 · ee92885 · ee92885
1 parent 14a2338
commit ee92885
Show file tree

Hide file tree

Showing 5 changed files with 50 additions and 31 deletions.
diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx
@@ -10,7 +10,7 @@ import itertools
 from dysgu import consensus
 from dysgu.map_set_utils import echo
 from dysgu.map_set_utils cimport hash as xxhasher
-from dysgu.map_set_utils cimport is_overlapping, clip_sizes_hard, EventResult, clip_sizes, min_fractional_overlapping
+from dysgu.map_set_utils cimport is_overlapping, clip_sizes_hard, EventResult, clip_sizes
 from dysgu.sv_category cimport AlignmentItem, classify_d
 from dysgu.extra_metrics cimport soft_clip_qual_corr
 from dysgu.extra_metrics import filter_poorly_aligned_ends, gap_size_upper_bound
@@ -49,7 +49,7 @@ cpdef n_aligned_bases(AlignedSegment aln):
         cigar_value = cigar_p[i]
         opp = <int> cigar_value & 15
         l = <int> cigar_value >> 4
-        if opp == 0:
+        if opp == 0 or opp == 7 or opp == 8:
             aligned += l
         elif opp == 1 or opp == 2:
             if l >= 30:
@@ -761,15 +761,18 @@ def bicluster_spanning_lr(spanning, informative):
     return [(part_a, []), (part_b, [])]
 
 
+
 def process_spanning(paired_end, spanning_alignments, divergence, length_extend, informative,
                      generic_insertions, insert_ppf, to_assemble):
     cdef int min_found_support = 0
     cdef str svtype, jointype
     cdef bint passed
     cdef EventResult_t er
+    cdef AlignmentItem align
     # todo
     if not paired_end:
         spanning_alignments, rate_poor_ends = filter_poorly_aligned_ends(spanning_alignments, divergence)
+
         if not spanning_alignments or rate_poor_ends > 0.7:
             return None
 
@@ -965,22 +968,6 @@ cdef tuple informative_pair(u, v):
         return pri_u, pri_v
     return None
 
-# cdef tuple informative_pair(u, v, bint paired_end):
-#     # fetch either a split read or pair1 and pair2
-#     for i in u:
-#         i_info = i[0]
-#         for j in v:
-#             j_info = j[0]
-#             if j_info.hash_name == i_info.hash_name:
-#                 continue
-#             if not paired_end:
-#                 if i_info.read_enum == 1 and j_info.read_enum == 1:
-#                     return i, j
-#             elif i_info.read_enum == j_info.read_enum:
-#                 return i, j
-#     return None
-
-
 
 cdef tuple break_ops(positions, precise, int limit, float median_pos):
     # Inspired by mosdepth algorithm +1 for start -1 for end using intervals where break site could occur

diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx
@@ -180,7 +180,7 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
             args["divergence"] = divergence
         else:
             args["divergence"] = float(args["divergence"])
-            logging.info(f"Sequence divergence upper bound {args['divergence']}")
+            logging.info(f"Sequence divergence {args['divergence']}")
         if args["mode"] == "pacbio-sequel2":
             max_dist, max_clust_dist = 35, 500000
             if args["merge_dist"] is None:
@@ -204,6 +204,8 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
     min_size = args["min_size"]
     length_extend = args["length_extend"]
     divergence = args["divergence"]
+    max_divergence = divergence * 13
+
     read_buffer = genome_scanner.read_buffer
     sites_info = sites_utils.vcf_reader(args["sites"], infile, args["parse_probs"], sample_name, args["ignore_sample_sites"] == "True", args["sites_prob"], args["sites_pass_only"] == "True")
 
@@ -234,7 +236,8 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
                                             temp_dir=tdir,
                                             find_n_aligned_bases=find_n_aligned_bases,
                                             position_distance_thresh=args['sd'],
-                                            max_search_depth=args['search_depth']
+                                            max_search_depth=args['search_depth'],
+                                            max_divergence=max_divergence
     )
     sites_index = None
     if sites_adder:

diff --git a/dysgu/extra_metrics.pxd b/dysgu/extra_metrics.pxd
@@ -1,4 +1,14 @@
 #cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True
 
+from libc.stdint cimport uint32_t
 
-cdef float soft_clip_qual_corr(reads)
+
+cdef float soft_clip_qual_corr(reads)
+
+
+cdef struct WindowRate:
+    float rate
+    int index
+
+
+cdef void window_rate(WindowRate *result, uint32_t cigar_l, uint32_t *cigar_p, int index, int window_size, bint reverse)
diff --git a/dysgu/extra_metrics.pyx b/dysgu/extra_metrics.pyx
@@ -172,12 +172,7 @@ cdef int log2_32(uint32_t value):
     return tab32[<uint32_t>(value*0x07C4ACDD) >> 27]
 
 
-cdef struct WindowRate:
-    float rate
-    int index
-
-
-cdef void window_rate(WindowRate *result, uint32_t cigar_l, uint32_t *cigar_p, int index, int window_size, bint reverse=False):
+cdef void window_rate(WindowRate *result, uint32_t cigar_l, uint32_t *cigar_p, int index, int window_size, bint reverse):
     cdef int n = 0
     cdef int matches = 0
     cdef int covered = 0
@@ -521,4 +516,4 @@ def find_repeat_expansions(events, insert_stdev):
                 e.stride = r["stride"]
                 e.exp_seq = r["exp_seq"]
                 e.ref_poly_bases += r["ref_poly_bases"]
-    return events
+    return events
diff --git a/dysgu/graph.pyx b/dysgu/graph.pyx
@@ -1163,14 +1163,34 @@ class SiteAdder:
                 break
 
 
+cdef bint edit_distance_too_high(uint32_t *cigar_p, uint32_t cigar_l, float max_rate, int edit_distance):
+    # ignore large gaps when counting edit distance rate
+    cdef uint32_t cigar_value, opp, l, i
+    cdef uint32_t aligned = 0
+    for i in range(cigar_l):
+        cigar_value = cigar_p[i]
+        opp = cigar_value & 15
+        l = cigar_value >> 4
+        if opp == 0 or opp == 7 or opp == 8:
+            aligned += l
+        elif l < 30 and (opp == 1 or opp == 2):
+            aligned += l
+    cdef int max_edit = <int> ( <float>aligned * max_rate )
+    if max_edit > edit_distance:
+        return <bint>False
+    return <bint>True
+
+
 cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering_dist, int k=16, int m=7, int clip_l=21,
                             int min_sv_size=30,
                             int minimizer_support_thresh=2, int minimizer_breadth=3,
-                            int minimizer_dist=10, int mapq_thresh=1, debug=None, procs=1,
+                            int minimizer_dist=10, int mapq_thresh=1, debug=None, int procs=1,
                             int paired_end=1, int read_length=150, bint contigs=True,
                             float norm_thresh=100, float spd_thresh=0.3, bint mm_only=False,
-                            sites=None, bint trust_ins_len=True, low_mem=False, temp_dir=".",
-                            find_n_aligned_bases=True, position_distance_thresh=0.8, max_search_depth=20):
+                            sites=None, bint trust_ins_len=True, bint low_mem=False, temp_dir=".",
+                            bint find_n_aligned_bases=True, float position_distance_thresh=0.8,
+                            int max_search_depth=20, float max_divergence=0.2):
+
     logging.info("Building graph with clustering {} bp".format(clustering_dist))
     cdef TemplateEdges_t template_edges = TemplateEdges()  # Edges are added between alignments from same template, after building main graph
     cdef int event_pos, cigar_index, opp, length
@@ -1213,6 +1233,10 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering
             events_to_add.clear()
             cigar_l = r._delegate.core.n_cigar
             cigar_p = bam_get_cigar(r._delegate)
+
+            if not paired_end and r.has_tag("NM") and edit_distance_too_high(cigar_p, cigar_l, max_divergence, r.get_tag("NM")):
+                continue
+
             if cigar_l > 1:
                 if r.has_tag("SA"):
                     # Set cigar-index to -1 means it is unset, will be determined during SA parsing