Skip to content

Commit

Permalink
DEV: Support multiple sources and sinks to fix #53
Browse files Browse the repository at this point in the history
  • Loading branch information
Vini2 committed Jun 12, 2024
1 parent a09b845 commit b71d313
Showing 1 changed file with 85 additions and 43 deletions.
128 changes: 85 additions & 43 deletions phables/workflow/scripts/phables_utils/long_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ def resolve_long(
in_degree = []
out_degree = []

case_name = ""

# Case 2 components
if len(candidate_nodes) == 2:
all_self_looped = False
Expand Down Expand Up @@ -131,6 +133,8 @@ def resolve_long(

if unitig1 != "" and unitig2 != "":
# Case 2 - both are circular
case_name = "case2_circular"

if all_self_looped:
case2_found.add(my_count)

Expand Down Expand Up @@ -184,7 +188,7 @@ def resolve_long(

genome_path = GenomePath(
id=f"{prefix}phage_comp_{my_count}_cycle_{cycle_number}",
bubble_case="case2_circular",
bubble_case=case_name,
node_order=[
f"{repeat_unitig_name}+",
f"{unitig_name}+",
Expand All @@ -210,6 +214,9 @@ def resolve_long(

# Case 2 - only one is circular
elif one_circular:

case_name = "case2_linear"

case2_found.add(my_count)

cycle_components.add(my_count)
Expand Down Expand Up @@ -293,7 +300,7 @@ def resolve_long(

genome_path = GenomePath(
id=f"{prefix}phage_comp_{my_count}_cycle_{cycle_number}",
bubble_case="case2_linear",
bubble_case=case_name,
node_order=path_with_repeats,
node_order_human=path_with_repeats_human,
node_id_order=node_id_order_with_repeats,
Expand All @@ -311,6 +318,9 @@ def resolve_long(

# Case 3 components
elif len(candidate_nodes) > 2 and len(candidate_nodes) <= compcount:

case_name = "case3_circular"

# Create initial directed graph with coverage values
# ----------------------------------------------------------------------
G_edge = nx.DiGraph()
Expand Down Expand Up @@ -804,7 +814,7 @@ def resolve_long(
# Create GenomePath object with path details
genome_path = GenomePath(
id=f"{prefix}phage_comp_{my_count}_cycle_{cycle_number}",
bubble_case="case3_circular",
bubble_case=case_name,
node_order=[x for x in path_order],
node_order_human=path_node_order_human,
node_id_order=[
Expand Down Expand Up @@ -845,6 +855,8 @@ def resolve_long(
else:
logger.debug(f"No cycles detected. Found a complex linear component.")

case_name = "case3_linear"

linear_components.add(my_count)

# Identify source/sink vertex
Expand All @@ -858,19 +870,10 @@ def resolve_long(
logger.debug(f"Identified candidate sources: {source_candidates}")
logger.debug(f"Identified candidate sinks: {sink_candidates}")

if len(source_candidates) == 1 and len(sink_candidates) == 1:
logger.debug(f"Found source: {source_candidates[0]}")
logger.debug(f"Found sink: {sink_candidates[0]}")

source_node = unitig_names_rev[source_candidates[0][:-1]]
sink_node = unitig_names_rev[sink_candidates[0][:-1]]
if len(source_candidates) > 0 and len(sink_candidates) > 0:

candidate_nodes.remove(source_node)
candidate_nodes.insert(0, source_node)
candidate_nodes.remove(sink_node)
candidate_nodes.append(sink_node)

logger.debug(f"Ordered candidate_nodes: {candidate_nodes}")
source_node_indices = [unitig_names_rev[x[:-1]] for x in source_candidates]
sink_node_indices = [unitig_names_rev[x[:-1]] for x in sink_candidates]

# Create refined directed graph for flow network
# ----------------------------------------------------------------------
Expand Down Expand Up @@ -943,8 +946,8 @@ def resolve_long(
u_name = unitig_names_rev[u[:-1]]
v_name = unitig_names_rev[v[:-1]]

u_index = candidate_nodes.index(u_name)
v_index = candidate_nodes.index(v_name)
u_index = candidate_nodes.index(u_name) + 1
v_index = candidate_nodes.index(v_name) + 1

edge_list_indices[u_index] = u
edge_list_indices[v_index] = v
Expand Down Expand Up @@ -997,13 +1000,13 @@ def resolve_long(
# Extend subpath using coverages of predecessors
for u_pred in G_edge.predecessors(u):
u_pred_name = unitig_names_rev[u_pred[:-1]]
u_pred_index = candidate_nodes.index(u_pred_name)
u_pred_index = candidate_nodes.index(u_pred_name) + 1
u_pred_cov = unitig_coverages[u_pred[:-1]]
u_cov = unitig_coverages[u[:-1]]

if (
v_index != 0
and u_index != 0
(v_index - 1) not in source_node_indices
and (u_index - 1) not in source_node_indices
and u_pred_index != v_index
):
if (
Expand All @@ -1023,15 +1026,15 @@ def resolve_long(
# Extend subpath using coverages of successors
for v_succ in G_edge.successors(v):
v_succ_name = unitig_names_rev[v_succ[:-1]]
v_succ_index = candidate_nodes.index(v_succ_name)
v_succ_index = candidate_nodes.index(v_succ_name) + 1
v_succ_cov = unitig_coverages[v_succ[:-1]]
v_cov = unitig_coverages[v[:-1]]

if (
v_succ_index != 0
and u_index != 0
and v_index != 0
and v_index != len(candidate_nodes)
(v_succ_index - 1) not in source_node_indices
and (u_index - 1) not in source_node_indices
and (v_index - 1) not in source_node_indices
and (v_index - 1) not in sink_node_indices
and v_succ_index != u_index
):
if (
Expand Down Expand Up @@ -1067,12 +1070,10 @@ def resolve_long(

if subpath_coverage > covtol:
u_pred_name = unitig_names_rev[u_pred[:-1]]
u_pred_index = candidate_nodes.index(
u_pred_name
)
u_pred_index = candidate_nodes.index(u_pred_name) + 1
if (
v_index != 0
and u_index != 0
(v_index - 1) not in source_node_indices
and (u_index - 1) not in source_node_indices
and u_pred_index != v_index
):
subpaths[subpath_count] = [
Expand All @@ -1098,14 +1099,12 @@ def resolve_long(

if subpath_coverage > covtol:
v_succ_name = unitig_names_rev[v_succ[:-1]]
v_succ_index = candidate_nodes.index(
v_succ_name
)
v_succ_index = candidate_nodes.index(v_succ_name) + 1
if (
v_succ_index != 0
and u_index != 0
and v_index != 0
and v_index != len(candidate_nodes)
(v_succ_index - 1) not in source_node_indices
and (u_index - 1) not in source_node_indices
and (v_index - 1) not in source_node_indices
and (v_index - 1) not in sink_node_indices
and v_succ_index != u_index
):
subpaths[subpath_count] = [
Expand All @@ -1118,6 +1117,43 @@ def resolve_long(
)
subpath_count += 1

# Add common start to source links
for source_v in source_candidates:
source_node_index = candidate_nodes.index(unitig_names_rev[source_v[:-1]]) + 1
source_node_cov = unitig_coverages[source_v[:-1]]
cov_upper_bound = int(max_comp_cov * alpha)

network_edges.append(
(
0,
source_node_index,
source_node_cov,
cov_upper_bound,
)
)

subpaths[subpath_count] = [0, source_node_index]
subpath_count += 1

# Add common sink to end links
for sink_v in sink_candidates:
sink_node_index = candidate_nodes.index(unitig_names_rev[sink_v[:-1]]) + 1
sink_node_cov = unitig_coverages[sink_v[:-1]]
cov_upper_bound = int(max_comp_cov * alpha)

network_edges.append(
(
sink_node_index,
len(candidate_nodes) + 1,
sink_node_cov,
cov_upper_bound,
)
)

subpaths[subpath_count] = [sink_node_index, len(candidate_nodes) + 1]
subpath_count += 1


logger.debug(f"edge_list_indices: {edge_list_indices}")
logger.debug(f"subpaths: {subpaths}")

Expand Down Expand Up @@ -1170,7 +1206,7 @@ def resolve_long(
try:
candidate_paths = list(
nx.all_simple_paths(
G_path, 0, len(candidate_nodes) - 1
G_path, 0, len(candidate_nodes) + 1
)
)

Expand All @@ -1182,9 +1218,10 @@ def resolve_long(
# Get mapped unitigs in order from the flow network
path_order = []
for path_edge in candidate_paths[0]:
path_order.append(
edge_list_indices[path_edge]
)
if not( path_edge == 0 or path_edge == len(candidate_nodes) + 1):
path_order.append(
edge_list_indices[path_edge]
)

logger.debug(f"path_order: {path_order}")

Expand Down Expand Up @@ -1239,7 +1276,7 @@ def resolve_long(
# Create GenomePath object with path details
genome_path = GenomePath(
id=f"{prefix}phage_comp_{my_count}_cycle_{cycle_number}",
bubble_case="case3_linear",
bubble_case=case_name,
node_order=[x for x in path_order],
node_order_human=path_node_order_human,
node_id_order=[
Expand Down Expand Up @@ -1350,6 +1387,9 @@ def resolve_long(
if genomic_path.length > largest_length * LEN_THRESHOLD:
passed = True

if case_name == "case3_linear":
passed = True

path_node_order_string = ",".join(genomic_path.node_order)

if path_node_order_string in comp_resolved_paths:
Expand Down Expand Up @@ -1382,6 +1422,8 @@ def resolve_long(
and len(in_degree) > 0
and len(out_degree) > 0
):
coverage_frac = max(path_coverages) / min(path_coverages) if min(path_coverages) > 0 else 1

# Create GenomeComponent object with component details
genome_comp = GenomeComponent(
f"{prefix}phage_comp_{my_count}",
Expand All @@ -1404,7 +1446,7 @@ def resolve_long(
/ path_lengths[path_coverages.index(min(path_coverages))],
max(path_coverages),
min(path_coverages),
max(path_coverages) / min(path_coverages),
coverage_frac,
frac_unitigs,
)
all_components.append(genome_comp)
Expand Down

0 comments on commit b71d313

Please sign in to comment.