Skip to content

Commit

Permalink
Dysgu dev (#113)
Browse files Browse the repository at this point in the history
* dysgu_dev <- master (#108)

* correct usage of update_filter_value in filter_normals

It seems like at some point the usage of update_filter_value changed, because in several spots, it is missing the sample_name argument. This breaks `dysgu filter --keep-all`.

* Update main.yml

* Update main.yml

* Update requirements.txt

* updated build process to pyproject.toml (#103)

* v1.6.6 updated README

* Update main.yml

* Added pyproject.toml

* Updated pyproject.toml and setup.py

* Update pyproject.toml

---------

Co-authored-by: Dr. K. D. Murray <[email protected]>

* Better recall+precision for long-reads. Improved merging pipeline. Faster runtime. Code refactoring. WIP # [skip ci]

* Fixed issues with overriding presets. --mode option updated # [skip ci]

* Fixed issues with overriding presets. --mode option updated to support r10 and revio, new presets available. --no-gt flat removed # [skip ci]

* Fixed CLI issues for tests # [skip ci]

* Fixed API issue

* v1.7.0

* added deprecated warning for mode

* Added filter for high divergence reads (long-reads only) # [skip ci]

---------

Co-authored-by: Dr. K. D. Murray <[email protected]>
  • Loading branch information
kcleal and kdm9 authored Oct 25, 2024
1 parent 14a2338 commit ee92885
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 31 deletions.
23 changes: 5 additions & 18 deletions dysgu/call_component.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import itertools
from dysgu import consensus
from dysgu.map_set_utils import echo
from dysgu.map_set_utils cimport hash as xxhasher
from dysgu.map_set_utils cimport is_overlapping, clip_sizes_hard, EventResult, clip_sizes, min_fractional_overlapping
from dysgu.map_set_utils cimport is_overlapping, clip_sizes_hard, EventResult, clip_sizes
from dysgu.sv_category cimport AlignmentItem, classify_d
from dysgu.extra_metrics cimport soft_clip_qual_corr
from dysgu.extra_metrics import filter_poorly_aligned_ends, gap_size_upper_bound
Expand Down Expand Up @@ -49,7 +49,7 @@ cpdef n_aligned_bases(AlignedSegment aln):
cigar_value = cigar_p[i]
opp = <int> cigar_value & 15
l = <int> cigar_value >> 4
if opp == 0:
if opp == 0 or opp == 7 or opp == 8:
aligned += l
elif opp == 1 or opp == 2:
if l >= 30:
Expand Down Expand Up @@ -761,15 +761,18 @@ def bicluster_spanning_lr(spanning, informative):
return [(part_a, []), (part_b, [])]



def process_spanning(paired_end, spanning_alignments, divergence, length_extend, informative,
generic_insertions, insert_ppf, to_assemble):
cdef int min_found_support = 0
cdef str svtype, jointype
cdef bint passed
cdef EventResult_t er
cdef AlignmentItem align
# todo
if not paired_end:
spanning_alignments, rate_poor_ends = filter_poorly_aligned_ends(spanning_alignments, divergence)

if not spanning_alignments or rate_poor_ends > 0.7:
return None

Expand Down Expand Up @@ -965,22 +968,6 @@ cdef tuple informative_pair(u, v):
return pri_u, pri_v
return None

# cdef tuple informative_pair(u, v, bint paired_end):
# # fetch either a split read or pair1 and pair2
# for i in u:
# i_info = i[0]
# for j in v:
# j_info = j[0]
# if j_info.hash_name == i_info.hash_name:
# continue
# if not paired_end:
# if i_info.read_enum == 1 and j_info.read_enum == 1:
# return i, j
# elif i_info.read_enum == j_info.read_enum:
# return i, j
# return None



cdef tuple break_ops(positions, precise, int limit, float median_pos):
# Inspired by mosdepth algorithm +1 for start -1 for end using intervals where break site could occur
Expand Down
7 changes: 5 additions & 2 deletions dysgu/cluster.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
args["divergence"] = divergence
else:
args["divergence"] = float(args["divergence"])
logging.info(f"Sequence divergence upper bound {args['divergence']}")
logging.info(f"Sequence divergence {args['divergence']}")
if args["mode"] == "pacbio-sequel2":
max_dist, max_clust_dist = 35, 500000
if args["merge_dist"] is None:
Expand All @@ -204,6 +204,8 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
min_size = args["min_size"]
length_extend = args["length_extend"]
divergence = args["divergence"]
max_divergence = divergence * 13

read_buffer = genome_scanner.read_buffer
sites_info = sites_utils.vcf_reader(args["sites"], infile, args["parse_probs"], sample_name, args["ignore_sample_sites"] == "True", args["sites_prob"], args["sites_pass_only"] == "True")

Expand Down Expand Up @@ -234,7 +236,8 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
temp_dir=tdir,
find_n_aligned_bases=find_n_aligned_bases,
position_distance_thresh=args['sd'],
max_search_depth=args['search_depth']
max_search_depth=args['search_depth'],
max_divergence=max_divergence
)
sites_index = None
if sites_adder:
Expand Down
12 changes: 11 additions & 1 deletion dysgu/extra_metrics.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,14 @@
#cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True

from libc.stdint cimport uint32_t

cdef float soft_clip_qual_corr(reads)

cdef float soft_clip_qual_corr(reads)


cdef struct WindowRate:
float rate
int index


cdef void window_rate(WindowRate *result, uint32_t cigar_l, uint32_t *cigar_p, int index, int window_size, bint reverse)
9 changes: 2 additions & 7 deletions dysgu/extra_metrics.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -172,12 +172,7 @@ cdef int log2_32(uint32_t value):
return tab32[<uint32_t>(value*0x07C4ACDD) >> 27]


cdef struct WindowRate:
float rate
int index


cdef void window_rate(WindowRate *result, uint32_t cigar_l, uint32_t *cigar_p, int index, int window_size, bint reverse=False):
cdef void window_rate(WindowRate *result, uint32_t cigar_l, uint32_t *cigar_p, int index, int window_size, bint reverse):
cdef int n = 0
cdef int matches = 0
cdef int covered = 0
Expand Down Expand Up @@ -521,4 +516,4 @@ def find_repeat_expansions(events, insert_stdev):
e.stride = r["stride"]
e.exp_seq = r["exp_seq"]
e.ref_poly_bases += r["ref_poly_bases"]
return events
return events
30 changes: 27 additions & 3 deletions dysgu/graph.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1163,14 +1163,34 @@ class SiteAdder:
break


cdef bint edit_distance_too_high(uint32_t *cigar_p, uint32_t cigar_l, float max_rate, int edit_distance):
# ignore large gaps when counting edit distance rate
cdef uint32_t cigar_value, opp, l, i
cdef uint32_t aligned = 0
for i in range(cigar_l):
cigar_value = cigar_p[i]
opp = cigar_value & 15
l = cigar_value >> 4
if opp == 0 or opp == 7 or opp == 8:
aligned += l
elif l < 30 and (opp == 1 or opp == 2):
aligned += l
cdef int max_edit = <int> ( <float>aligned * max_rate )
if max_edit > edit_distance:
return <bint>False
return <bint>True


cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering_dist, int k=16, int m=7, int clip_l=21,
int min_sv_size=30,
int minimizer_support_thresh=2, int minimizer_breadth=3,
int minimizer_dist=10, int mapq_thresh=1, debug=None, procs=1,
int minimizer_dist=10, int mapq_thresh=1, debug=None, int procs=1,
int paired_end=1, int read_length=150, bint contigs=True,
float norm_thresh=100, float spd_thresh=0.3, bint mm_only=False,
sites=None, bint trust_ins_len=True, low_mem=False, temp_dir=".",
find_n_aligned_bases=True, position_distance_thresh=0.8, max_search_depth=20):
sites=None, bint trust_ins_len=True, bint low_mem=False, temp_dir=".",
bint find_n_aligned_bases=True, float position_distance_thresh=0.8,
int max_search_depth=20, float max_divergence=0.2):

logging.info("Building graph with clustering {} bp".format(clustering_dist))
cdef TemplateEdges_t template_edges = TemplateEdges() # Edges are added between alignments from same template, after building main graph
cdef int event_pos, cigar_index, opp, length
Expand Down Expand Up @@ -1213,6 +1233,10 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering
events_to_add.clear()
cigar_l = r._delegate.core.n_cigar
cigar_p = bam_get_cigar(r._delegate)

if not paired_end and r.has_tag("NM") and edit_distance_too_high(cigar_p, cigar_l, max_divergence, r.get_tag("NM")):
continue

if cigar_l > 1:
if r.has_tag("SA"):
# Set cigar-index to -1 means it is unset, will be determined during SA parsing
Expand Down

0 comments on commit ee92885

Please sign in to comment.