Fixed #755 (#756)

## Summary of Changes

Updated the text reading (TR) grading code to stop double-counting automated extractions, which led to an incorrect number of true positives.

## Expanded explanation

There is a many-to-one relationship between SKEMA extractions and manual annotations: multiple extractions can match the same annotation (i.e., be correct). To compute P, R and F1, all the extractions associated with a given manual annotation should be counted as a single true positive. This change addresses this issue.

### Related issues

Resolves #755
ml4ai · Jan 16, 2024 · e017c99 · e017c99
1 parent ea7e2b6
commit e017c99
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 16 deletions.
diff --git a/skema/rest/tests/test_integrated_text_reading_proxy.py b/skema/rest/tests/test_integrated_text_reading_proxy.py
@@ -114,9 +114,9 @@ def test_extraction_evaluation():
     results = response.json()
 
     assert results['num_manual_annotations'] == 220, "There should be 220 gt manual annotations"
-    assert results['precision'] == approx(0.7230769230768118), "Precision drastically different from the expected value"
-    assert results['recall'] == approx(0.21363636363636362), "Recall drastically different from the expected value"
-    assert results['f1'] == approx(0.32982456136828636), "F1 drastically different from the expected value"
+    assert results['precision'] == approx(0.5230769230768426), "Precision drastically different from the expected value"
+    assert results['recall'] == approx(0.154545454545454542), "Recall drastically different from the expected value"
+    assert results['f1'] == approx(0.23859649119285095), "F1 drastically different from the expected value"
 
 
 def test_healthcheck():

diff --git a/skema/rest/utils.py b/skema/rest/utils.py
@@ -18,6 +18,7 @@ async def get_client():
         yield client
         # close the client when the request is done
 
+
 def fn_preprocessor(function_network: Dict[str, Any]):
     fn_data = function_network.copy()
 
@@ -180,23 +181,32 @@ def compute_text_reading_evaluation(gt_data: list, attributes: AttributeCollecti
             page = a["page"]
             annotations_by_page[page].append(a)
 
+    def annotation_key(a: Dict):
+        return a['page'], tuple(a['start_xy']), a['text']
+
     # Count the matches
     tp, tn, fp, fn = 0, 0, 0, 0
+    matched_annotations = set()
     for e in extractions:
+        matched = False
         for m in e.mentions:
-            if m.extraction_source is not None:
-                te = m.extraction_source
-                if te.page is not None:
-                    e_page = te.page
-                    page_annotations = annotations_by_page[e_page]
-                    matched = False
-                    for a in page_annotations:
-                        if extraction_matches_annotation(m, a, json_contents):
-                            matched = True
-                            tp += 1
-                            break
-                    if not matched:
-                        fp += 1
+            if not matched:
+                if m.extraction_source is not None:
+                    te = m.extraction_source
+                    if te.page is not None:
+                        e_page = te.page
+                        page_annotations = annotations_by_page[e_page]
+
+                        for a in page_annotations:
+                            key = annotation_key(a)
+                            if key not in matched_annotations:
+                                if extraction_matches_annotation(m, a, json_contents):
+                                    matched_annotations.add(key)
+                                    matched = True
+                                    tp += 1
+                                    break
+                        if not matched:
+                            fp += 1
 
     recall = tp / len(gt_data)
     precision = tp / (tp + fp + 0.00000000001)