From 925edca3062783f9251bdec3605adf2f9e4600e2 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Wed, 16 Sep 2020 14:53:07 -0400
Subject: [PATCH 1/3] [pygenomeworks] Fix incorrect output of number of
 mismatched starts/ends in evaluate_paf.

The `evaluate_paf` script now counts mismatched starts/ends once per test
overlap, using only its best-matching truth overlap, rather than incrementing
the counters for every truth overlap it is compared against.
---
 pygenomeworks/bin/evaluate_paf | 44 ++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/pygenomeworks/bin/evaluate_paf b/pygenomeworks/bin/evaluate_paf
index b29aacbd1..f86f66954 100755
--- a/pygenomeworks/bin/evaluate_paf
+++ b/pygenomeworks/bin/evaluate_paf
@@ -90,8 +90,8 @@ def match_overlaps(record, other, pos_tolerance, min_reciprocal_overlap):
     """
 
     equal, query_start_valid, query_end_valid, target_start_valid, target_end_valid, strands_equal = records_equal(record, other, pos_tolerance)
-    
-    reciprocal =  calculate_reciprocal_overlap(record, other) > min_reciprocal_overlap
+    pct_recip = calculate_reciprocal_overlap(record, other)
+    reciprocal =  pct_recip > min_reciprocal_overlap
 
     match = equal or reciprocal
 
@@ -100,6 +100,7 @@ def match_overlaps(record, other, pos_tolerance, min_reciprocal_overlap):
             "target_start_valid": target_start_valid,
             "target_end_valid": target_end_valid,
             "reciprocal_overlaps": reciprocal,
+            "percent_reciprocal": pct_recip,
             "strands_equal" : strands_equal,
             "equal" : equal,
             "match": match}
@@ -186,28 +187,35 @@ def evaluate_paf(truth_paf_filepath, test_paf_filepath, pos_tolerance, min_recip
 
         # seen_test_overlap_keys.add(key)
         # seen_test_overlap_keys.add(key_reversed)
-
+        #incorrect_query_start += not match_statistics["query_start_valid"]
+        #incorrect_query_end += not match_statistics["query_end_valid"]
+        #incorrect_target_start += not match_statistics["target_start_valid"]
+        #incorrect_target_end += not match_statistics["target_end_valid"]
+        best_pct_match = 0.0
+        best_ends = [1, 1, 1, 1]
+        
         found_match = False
         if key in truth_keys:
             for truth_interval in truth_query_intervals[test_overlap.query_sequence_name]:
                 truth_overlap = truth_interval.data
                 match_statistics = match_overlaps(truth_overlap, test_overlap, pos_tolerance, min_reciprocal)
-                incorrect_query_start += not match_statistics["query_start_valid"]
-                incorrect_query_end += not match_statistics["query_end_valid"]
-                incorrect_target_start += not match_statistics["target_start_valid"]
-                incorrect_target_end += not match_statistics["target_end_valid"]
                 if match_statistics["match"]:
                     true_positive_count += 1
                     found_match = True
+                    best_pct_match = match_statistics["percent_reciprocal"]
+                    best_ends = [0, 0, 0, 0]
                     break
+                pct_match = match_statistics["percent_reciprocal"]
+                if pct_match > best_pct_match:
+                    best_pct_match = pct_match
+                    best_ends[0] = 1 if not match_statistics["query_start_valid"] else 0
+                    best_ends[1] = 1 if not match_statistics["query_end_valid"] else 0
+                    best_ends[2] = 1 if not match_statistics["target_start_valid"] else 0
+                    best_ends[3] = 1 if not match_statistics["target_end_valid"] else 0
             if not found_match:
                 for truth_interval in truth_target_intervals[test_overlap.target_sequence_name]:
                     truth_overlap = truth_interval.data
                     match_statistics = match_overlaps(truth_overlap, test_overlap, pos_tolerance, min_reciprocal)
-                    incorrect_query_start += not match_statistics["query_start_valid"]
-                    incorrect_query_end += not match_statistics["query_end_valid"]
-                    incorrect_target_start += not match_statistics["target_start_valid"]
-                    incorrect_target_end += not match_statistics["target_end_valid"]
                     if match_statistics["match"]:
                         true_positive_count += 1
                         found_match = True
@@ -217,10 +225,6 @@ def evaluate_paf(truth_paf_filepath, test_paf_filepath, pos_tolerance, min_recip
             for truth_interval in truth_query_intervals[key_reversed]:
                 truth_overlap = truth_interval.data
                 match_statistics = match_overlaps(truth_overlap, test_overlap)
-                incorrect_query_start += not match_statistics["query_start_valid"]
-                incorrect_query_end += not match_statistics["query_end_valid"]
-                incorrect_target_start += not match_statistics["target_start_valid"]
-                incorrect_target_end += not match_statistics["target_end_valid"]
                 if match_statistics["match"]:
                     true_positive_count += 1
                     found_match = True
@@ -229,15 +233,15 @@ def evaluate_paf(truth_paf_filepath, test_paf_filepath, pos_tolerance, min_recip
                 for truth_interval in truth_target_intervals[key_reversed]:
                     truth_overlap = truth_interval.data
                     match_statistics = match_overlaps(truth_overlap, test_overlap)
-                    incorrect_query_start += not match_statistics["query_start_valid"]
-                    incorrect_query_end += not match_statistics["query_end_valid"]
-                    incorrect_target_start += not match_statistics["target_start_valid"]
-                    incorrect_target_end += not match_statistics["target_end_valid"]
                     if match_statistics["match"]:
                         true_positive_count += 1
                         found_match = True
                         break
-
+        incorrect_query_start += best_ends[0]
+        incorrect_query_end += best_ends[1]
+        incorrect_target_start += best_ends[2]
+        incorrect_target_end += best_ends[3]
+        
         if not found_match:
             false_positive_count += 1
 

From a47641ffcb7cec55f3cfb1818ded7f5229897555 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Wed, 16 Sep 2020 14:59:27 -0400
Subject: [PATCH 2/3] [pygenomeworks] Tabulate mismatched starts/ends in
 target-interval searches and in searches with reversed query/target keys.

---
 pygenomeworks/bin/evaluate_paf | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/pygenomeworks/bin/evaluate_paf b/pygenomeworks/bin/evaluate_paf
index f86f66954..2d6a47daf 100755
--- a/pygenomeworks/bin/evaluate_paf
+++ b/pygenomeworks/bin/evaluate_paf
@@ -220,6 +220,13 @@ def evaluate_paf(truth_paf_filepath, test_paf_filepath, pos_tolerance, min_recip
                         true_positive_count += 1
                         found_match = True
                         break
+                    pct_match = match_statistics["percent_reciprocal"]
+                    if pct_match > best_pct_match:
+                        best_pct_match = pct_match
+                        best_ends[0] = 1 if not match_statistics["query_start_valid"] else 0
+                        best_ends[1] = 1 if not match_statistics["query_end_valid"] else 0
+                        best_ends[2] = 1 if not match_statistics["target_start_valid"] else 0
+                        best_ends[3] = 1 if not match_statistics["target_end_valid"] else 0
         if not found_match and key_reversed in truth_keys:
             test_overlap = reverse_record(test_overlap)
             for truth_interval in truth_query_intervals[key_reversed]:
@@ -229,6 +236,13 @@ def evaluate_paf(truth_paf_filepath, test_paf_filepath, pos_tolerance, min_recip
                     true_positive_count += 1
                     found_match = True
                     break
+                pct_match = match_statistics["percent_reciprocal"]
+                if pct_match > best_pct_match:
+                    best_pct_match = pct_match
+                    best_ends[0] = 1 if not match_statistics["query_start_valid"] else 0
+                    best_ends[1] = 1 if not match_statistics["query_end_valid"] else 0
+                    best_ends[2] = 1 if not match_statistics["target_start_valid"] else 0
+                    best_ends[3] = 1 if not match_statistics["target_end_valid"] else 0
             if not found_match:
                 for truth_interval in truth_target_intervals[key_reversed]:
                     truth_overlap = truth_interval.data
@@ -237,6 +251,13 @@ def evaluate_paf(truth_paf_filepath, test_paf_filepath, pos_tolerance, min_recip
                         true_positive_count += 1
                         found_match = True
                         break
+                    pct_match = match_statistics["percent_reciprocal"]
+                    if pct_match > best_pct_match:
+                        best_pct_match = pct_match
+                        best_ends[0] = 1 if not match_statistics["query_start_valid"] else 0
+                        best_ends[1] = 1 if not match_statistics["query_end_valid"] else 0
+                        best_ends[2] = 1 if not match_statistics["target_start_valid"] else 0
+                        best_ends[3] = 1 if not match_statistics["target_end_valid"] else 0
         incorrect_query_start += best_ends[0]
         incorrect_query_end += best_ends[1]
         incorrect_target_start += best_ends[2]

From b3b0a651a0ac8cee6d31cad5dff50f9bb09a33d9 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Mon, 21 Sep 2020 12:32:21 -0400
Subject: [PATCH 3/3] [pygenomeworks] Remove commented out code from
 evaluate_paf.

---
 pygenomeworks/bin/evaluate_paf | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/pygenomeworks/bin/evaluate_paf b/pygenomeworks/bin/evaluate_paf
index 2d6a47daf..f248311c6 100755
--- a/pygenomeworks/bin/evaluate_paf
+++ b/pygenomeworks/bin/evaluate_paf
@@ -176,21 +176,10 @@ def evaluate_paf(truth_paf_filepath, test_paf_filepath, pos_tolerance, min_recip
                 (test_overlap.query_sequence_name == test_overlap.target_sequence_name):
             continue
         test_overlap_count += 1
-        # query_0 = (test_overlap.query_start, test_overlap.query_end)
-        # target_0 = (test_overlap.target_start, test_overlap.target_end)
 
         key = generate_key(test_overlap.query_sequence_name, test_overlap.target_sequence_name)
         key_reversed = generate_key(test_overlap.target_sequence_name, test_overlap.query_sequence_name)
 
-        # if (key in seen_test_overlap_keys) or (key_reversed in seen_test_overlap_keys):
-        #     continue
-
-        # seen_test_overlap_keys.add(key)
-        # seen_test_overlap_keys.add(key_reversed)
-        #incorrect_query_start += not match_statistics["query_start_valid"]
-        #incorrect_query_end += not match_statistics["query_end_valid"]
-        #incorrect_target_start += not match_statistics["target_start_valid"]
-        #incorrect_target_end += not match_statistics["target_end_valid"]
         best_pct_match = 0.0
         best_ends = [1, 1, 1, 1]