diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index 1fdd423..1c13a34 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -10,7 +10,7 @@ import itertools from dysgu import consensus from dysgu.map_set_utils import echo from dysgu.map_set_utils cimport hash as xxhasher -from dysgu.map_set_utils cimport is_overlapping, clip_sizes_hard, EventResult, clip_sizes, min_fractional_overlapping +from dysgu.map_set_utils cimport is_overlapping, clip_sizes_hard, EventResult, clip_sizes from dysgu.sv_category cimport AlignmentItem, classify_d from dysgu.extra_metrics cimport soft_clip_qual_corr from dysgu.extra_metrics import filter_poorly_aligned_ends, gap_size_upper_bound @@ -49,7 +49,7 @@ cpdef n_aligned_bases(AlignedSegment aln): cigar_value = cigar_p[i] opp = cigar_value & 15 l = cigar_value >> 4 - if opp == 0: + if opp == 0 or opp == 7 or opp == 8: aligned += l elif opp == 1 or opp == 2: if l >= 30: @@ -761,15 +761,18 @@ def bicluster_spanning_lr(spanning, informative): return [(part_a, []), (part_b, [])] + def process_spanning(paired_end, spanning_alignments, divergence, length_extend, informative, generic_insertions, insert_ppf, to_assemble): cdef int min_found_support = 0 cdef str svtype, jointype cdef bint passed cdef EventResult_t er + cdef AlignmentItem align # todo if not paired_end: spanning_alignments, rate_poor_ends = filter_poorly_aligned_ends(spanning_alignments, divergence) + if not spanning_alignments or rate_poor_ends > 0.7: return None @@ -965,22 +968,6 @@ cdef tuple informative_pair(u, v): return pri_u, pri_v return None -# cdef tuple informative_pair(u, v, bint paired_end): -# # fetch either a split read or pair1 and pair2 -# for i in u: -# i_info = i[0] -# for j in v: -# j_info = j[0] -# if j_info.hash_name == i_info.hash_name: -# continue -# if not paired_end: -# if i_info.read_enum == 1 and j_info.read_enum == 1: -# return i, j -# elif i_info.read_enum == j_info.read_enum: -# return i, j -# return None - - cdef tuple break_ops(positions, precise, int limit, float median_pos): # Inspired by mosdepth algorithm +1 for start -1 for end using intervals where break site could occur diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx index 14cf4b3..5d77be2 100644 --- a/dysgu/cluster.pyx +++ b/dysgu/cluster.pyx @@ -180,7 +180,7 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N args["divergence"] = divergence else: args["divergence"] = float(args["divergence"]) - logging.info(f"Sequence divergence upper bound {args['divergence']}") + logging.info(f"Sequence divergence {args['divergence']}") if args["mode"] == "pacbio-sequel2": max_dist, max_clust_dist = 35, 500000 if args["merge_dist"] is None: @@ -204,6 +204,8 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N min_size = args["min_size"] length_extend = args["length_extend"] divergence = args["divergence"] + max_divergence = divergence * 13 + read_buffer = genome_scanner.read_buffer sites_info = sites_utils.vcf_reader(args["sites"], infile, args["parse_probs"], sample_name, args["ignore_sample_sites"] == "True", args["sites_prob"], args["sites_pass_only"] == "True") @@ -234,7 +236,8 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N temp_dir=tdir, find_n_aligned_bases=find_n_aligned_bases, position_distance_thresh=args['sd'], - max_search_depth=args['search_depth'] + max_search_depth=args['search_depth'], + max_divergence=max_divergence ) sites_index = None if sites_adder: diff --git a/dysgu/extra_metrics.pxd b/dysgu/extra_metrics.pxd index 2fc7078..d4bae53 100644 --- a/dysgu/extra_metrics.pxd +++ b/dysgu/extra_metrics.pxd @@ -1,4 +1,14 @@ #cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True +from libc.stdint cimport uint32_t -cdef float soft_clip_qual_corr(reads) \ No newline at end of file + +cdef float soft_clip_qual_corr(reads) + + +cdef struct WindowRate: + float rate + int index + + +cdef void window_rate(WindowRate *result, uint32_t cigar_l, uint32_t *cigar_p, int index, int window_size, bint reverse) diff --git a/dysgu/extra_metrics.pyx b/dysgu/extra_metrics.pyx index 8a829dc..d4bef24 100644 --- a/dysgu/extra_metrics.pyx +++ b/dysgu/extra_metrics.pyx @@ -172,12 +172,7 @@ cdef int log2_32(uint32_t value): return tab32[(value*0x07C4ACDD) >> 27] -cdef struct WindowRate: - float rate - int index - - -cdef void window_rate(WindowRate *result, uint32_t cigar_l, uint32_t *cigar_p, int index, int window_size, bint reverse=False): +cdef void window_rate(WindowRate *result, uint32_t cigar_l, uint32_t *cigar_p, int index, int window_size, bint reverse): cdef int n = 0 cdef int matches = 0 cdef int covered = 0 @@ -521,4 +516,4 @@ def find_repeat_expansions(events, insert_stdev): e.stride = r["stride"] e.exp_seq = r["exp_seq"] e.ref_poly_bases += r["ref_poly_bases"] - return events \ No newline at end of file + return events diff --git a/dysgu/graph.pyx b/dysgu/graph.pyx index 7365a96..5d641ba 100644 --- a/dysgu/graph.pyx +++ b/dysgu/graph.pyx @@ -1163,14 +1163,34 @@ class SiteAdder: break +cdef bint edit_distance_too_high(uint32_t *cigar_p, uint32_t cigar_l, float max_rate, int edit_distance): + # ignore large gaps when counting edit distance rate + cdef uint32_t cigar_value, opp, l, i + cdef uint32_t aligned = 0 + for i in range(cigar_l): + cigar_value = cigar_p[i] + opp = cigar_value & 15 + l = cigar_value >> 4 + if opp == 0 or opp == 7 or opp == 8: + aligned += l + elif l < 30 and (opp == 1 or opp == 2): + aligned += l + cdef int max_edit = ( aligned * max_rate ) + if max_edit > edit_distance: + return False + return True + + cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering_dist, int k=16, int m=7, int clip_l=21, int min_sv_size=30, int minimizer_support_thresh=2, int minimizer_breadth=3, - int minimizer_dist=10, int mapq_thresh=1, debug=None, procs=1, + int minimizer_dist=10, int mapq_thresh=1, debug=None, int procs=1, int paired_end=1, int read_length=150, bint contigs=True, float norm_thresh=100, float spd_thresh=0.3, bint mm_only=False, - sites=None, bint trust_ins_len=True, low_mem=False, temp_dir=".", - find_n_aligned_bases=True, position_distance_thresh=0.8, max_search_depth=20): + sites=None, bint trust_ins_len=True, bint low_mem=False, temp_dir=".", + bint find_n_aligned_bases=True, float position_distance_thresh=0.8, + int max_search_depth=20, float max_divergence=0.2): + logging.info("Building graph with clustering {} bp".format(clustering_dist)) cdef TemplateEdges_t template_edges = TemplateEdges() # Edges are added between alignments from same template, after building main graph cdef int event_pos, cigar_index, opp, length @@ -1213,6 +1233,10 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering events_to_add.clear() cigar_l = r._delegate.core.n_cigar cigar_p = bam_get_cigar(r._delegate) + + if not paired_end and r.has_tag("NM") and edit_distance_too_high(cigar_p, cigar_l, max_divergence, r.get_tag("NM")): + continue + if cigar_l > 1: if r.has_tag("SA"): # Set cigar-index to -1 means it is unset, will be determined during SA parsing