From 925edca3062783f9251bdec3605adf2f9e4600e2 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Wed, 16 Sep 2020 14:53:07 -0400 Subject: [PATCH 1/3] [pygenomeworks] Fix incorrect output of number of mismatched starts/ends in evaluate_paf. The `evaluate_paf` script now reports mismatched starts/ends only from the best match, rather than adding on every iteration. --- pygenomeworks/bin/evaluate_paf | 44 ++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/pygenomeworks/bin/evaluate_paf b/pygenomeworks/bin/evaluate_paf index b29aacbd1..f86f66954 100755 --- a/pygenomeworks/bin/evaluate_paf +++ b/pygenomeworks/bin/evaluate_paf @@ -90,8 +90,8 @@ def match_overlaps(record, other, pos_tolerance, min_reciprocal_overlap): """ equal, query_start_valid, query_end_valid, target_start_valid, target_end_valid, strands_equal = records_equal(record, other, pos_tolerance) - - reciprocal = calculate_reciprocal_overlap(record, other) > min_reciprocal_overlap + pct_recip = calculate_reciprocal_overlap(record, other) + reciprocal = pct_recip > min_reciprocal_overlap match = equal or reciprocal @@ -100,6 +100,7 @@ def match_overlaps(record, other, pos_tolerance, min_reciprocal_overlap): "target_start_valid": target_start_valid, "target_end_valid": target_end_valid, "reciprocal_overlaps": reciprocal, + "percent_reciprocal": pct_recip, "strands_equal" : strands_equal, "equal" : equal, "match": match} @@ -186,28 +187,35 @@ def evaluate_paf(truth_paf_filepath, test_paf_filepath, pos_tolerance, min_recip # seen_test_overlap_keys.add(key) # seen_test_overlap_keys.add(key_reversed) - + #incorrect_query_start += not match_statistics["query_start_valid"] + #incorrect_query_end += not match_statistics["query_end_valid"] + #incorrect_target_start += not match_statistics["target_start_valid"] + #incorrect_target_end += not match_statistics["target_end_valid"] + best_pct_match = 0.0 + best_ends = [1, 1, 1, 1] + found_match = False if key in truth_keys: for truth_interval in truth_query_intervals[test_overlap.query_sequence_name]: truth_overlap = truth_interval.data match_statistics = match_overlaps(truth_overlap, test_overlap, pos_tolerance, min_reciprocal) - incorrect_query_start += not match_statistics["query_start_valid"] - incorrect_query_end += not match_statistics["query_end_valid"] - incorrect_target_start += not match_statistics["target_start_valid"] - incorrect_target_end += not match_statistics["target_end_valid"] if match_statistics["match"]: true_positive_count += 1 found_match = True + best_pct_match = match_statistics["percent_reciprocal"] + best_ends = [0, 0, 0, 0] break + pct_match = match_statistics["percent_reciprocal"] + if pct_match > best_pct_match: + best_pct_match = pct_match + best_ends[0] = 1 if not match_statistics["query_start_valid"] else 0 + best_ends[1] = 1 if not match_statistics["query_end_valid"] else 0 + best_ends[2] = 1 if not match_statistics["target_start_valid"] else 0 + best_ends[3] = 1 if not match_statistics["target_end_valid"] else 0 if not found_match: for truth_interval in truth_target_intervals[test_overlap.target_sequence_name]: truth_overlap = truth_interval.data match_statistics = match_overlaps(truth_overlap, test_overlap, pos_tolerance, min_reciprocal) - incorrect_query_start += not match_statistics["query_start_valid"] - incorrect_query_end += not match_statistics["query_end_valid"] - incorrect_target_start += not match_statistics["target_start_valid"] - incorrect_target_end += not match_statistics["target_end_valid"] if match_statistics["match"]: true_positive_count += 1 found_match = True @@ -217,10 +225,6 @@ def evaluate_paf(truth_paf_filepath, test_paf_filepath, pos_tolerance, min_recip for truth_interval in truth_query_intervals[key_reversed]: truth_overlap = truth_interval.data match_statistics = match_overlaps(truth_overlap, test_overlap) - incorrect_query_start += not match_statistics["query_start_valid"] - incorrect_query_end += not match_statistics["query_end_valid"] - incorrect_target_start += not match_statistics["target_start_valid"] - incorrect_target_end += not match_statistics["target_end_valid"] if match_statistics["match"]: true_positive_count += 1 found_match = True @@ -229,15 +233,15 @@ def evaluate_paf(truth_paf_filepath, test_paf_filepath, pos_tolerance, min_recip for truth_interval in truth_target_intervals[key_reversed]: truth_overlap = truth_interval.data match_statistics = match_overlaps(truth_overlap, test_overlap) - incorrect_query_start += not match_statistics["query_start_valid"] - incorrect_query_end += not match_statistics["query_end_valid"] - incorrect_target_start += not match_statistics["target_start_valid"] - incorrect_target_end += not match_statistics["target_end_valid"] if match_statistics["match"]: true_positive_count += 1 found_match = True break - + incorrect_query_start += best_ends[0] + incorrect_query_end += best_ends[1] + incorrect_target_start += best_ends[2] + incorrect_target_end += best_ends[3] + if not found_match: false_positive_count += 1 From a47641ffcb7cec55f3cfb1818ded7f5229897555 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Wed, 16 Sep 2020 14:59:27 -0400 Subject: [PATCH 2/3] [pygenomeworks] Add tabulation for mismatched starts/ends to target and searches where the query/target keys are reversed. --- pygenomeworks/bin/evaluate_paf | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pygenomeworks/bin/evaluate_paf b/pygenomeworks/bin/evaluate_paf index f86f66954..2d6a47daf 100755 --- a/pygenomeworks/bin/evaluate_paf +++ b/pygenomeworks/bin/evaluate_paf @@ -220,6 +220,13 @@ def evaluate_paf(truth_paf_filepath, test_paf_filepath, pos_tolerance, min_recip true_positive_count += 1 found_match = True break + pct_match = match_statistics["percent_reciprocal"] + if pct_match > best_pct_match: + best_pct_match = pct_match + best_ends[0] = 1 if not match_statistics["query_start_valid"] else 0 + best_ends[1] = 1 if not match_statistics["query_end_valid"] else 0 + best_ends[2] = 1 if not match_statistics["target_start_valid"] else 0 + best_ends[3] = 1 if not match_statistics["target_end_valid"] else 0 if not found_match and key_reversed in truth_keys: test_overlap = reverse_record(test_overlap) for truth_interval in truth_query_intervals[key_reversed]: @@ -229,6 +236,13 @@ def evaluate_paf(truth_paf_filepath, test_paf_filepath, pos_tolerance, min_recip true_positive_count += 1 found_match = True break + pct_match = match_statistics["percent_reciprocal"] + if pct_match > best_pct_match: + best_pct_match = pct_match + best_ends[0] = 1 if not match_statistics["query_start_valid"] else 0 + best_ends[1] = 1 if not match_statistics["query_end_valid"] else 0 + best_ends[2] = 1 if not match_statistics["target_start_valid"] else 0 + best_ends[3] = 1 if not match_statistics["target_end_valid"] else 0 if not found_match: for truth_interval in truth_target_intervals[key_reversed]: truth_overlap = truth_interval.data @@ -237,6 +251,13 @@ def evaluate_paf(truth_paf_filepath, test_paf_filepath, pos_tolerance, min_recip true_positive_count += 1 found_match = True break + pct_match = match_statistics["percent_reciprocal"] + if pct_match > best_pct_match: + best_pct_match = pct_match + best_ends[0] = 1 if not match_statistics["query_start_valid"] else 0 + best_ends[1] = 1 if not match_statistics["query_end_valid"] else 0 + best_ends[2] = 1 if not match_statistics["target_start_valid"] else 0 + best_ends[3] = 1 if not match_statistics["target_end_valid"] else 0 incorrect_query_start += best_ends[0] incorrect_query_end += best_ends[1] incorrect_target_start += best_ends[2] From b3b0a651a0ac8cee6d31cad5dff50f9bb09a33d9 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Mon, 21 Sep 2020 12:32:21 -0400 Subject: [PATCH 3/3] [pygenomeworks] Remove commented out code from evaluate_paf. --- pygenomeworks/bin/evaluate_paf | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/pygenomeworks/bin/evaluate_paf b/pygenomeworks/bin/evaluate_paf index 2d6a47daf..f248311c6 100755 --- a/pygenomeworks/bin/evaluate_paf +++ b/pygenomeworks/bin/evaluate_paf @@ -176,21 +176,10 @@ def evaluate_paf(truth_paf_filepath, test_paf_filepath, pos_tolerance, min_recip (test_overlap.query_sequence_name == test_overlap.target_sequence_name): continue test_overlap_count += 1 - # query_0 = (test_overlap.query_start, test_overlap.query_end) - # target_0 = (test_overlap.target_start, test_overlap.target_end) key = generate_key(test_overlap.query_sequence_name, test_overlap.target_sequence_name) key_reversed = generate_key(test_overlap.target_sequence_name, test_overlap.query_sequence_name) - # if (key in seen_test_overlap_keys) or (key_reversed in seen_test_overlap_keys): - # continue - - # seen_test_overlap_keys.add(key) - # seen_test_overlap_keys.add(key_reversed) - #incorrect_query_start += not match_statistics["query_start_valid"] - #incorrect_query_end += not match_statistics["query_end_valid"] - #incorrect_target_start += not match_statistics["target_start_valid"] - #incorrect_target_end += not match_statistics["target_end_valid"] best_pct_match = 0.0 best_ends = [1, 1, 1, 1]