Skip to content

Commit

Permalink
Fixed #755 (#756)
Browse files Browse the repository at this point in the history
## Summary of Changes
Updated the TR grading code to stop double counting automated
extractions, which led to an incorrect number of true positives.

## Expanded explanation
There is a many-to-one relationship between SKEMA extractions and manual
annotations: multiple extractions can match the same annotation (i.e., be correct).
To compute P, R and F1, all the extractions associated with a manual
annotation should be counted as a single true positive.

This change addresses this issue.

### Related issues

Resolves #755
  • Loading branch information
enoriega authored Jan 16, 2024
1 parent ea7e2b6 commit e017c99
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 16 deletions.
6 changes: 3 additions & 3 deletions skema/rest/tests/test_integrated_text_reading_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,9 @@ def test_extraction_evaluation():
results = response.json()

assert results['num_manual_annotations'] == 220, "There should be 220 gt manual annotations"
assert results['precision'] == approx(0.7230769230768118), "Precision drastically different from the expected value"
assert results['recall'] == approx(0.21363636363636362), "Recall drastically different from the expected value"
assert results['f1'] == approx(0.32982456136828636), "F1 drastically different from the expected value"
assert results['precision'] == approx(0.5230769230768426), "Precision drastically different from the expected value"
assert results['recall'] == approx(0.154545454545454542), "Recall drastically different from the expected value"
assert results['f1'] == approx(0.23859649119285095), "F1 drastically different from the expected value"


def test_healthcheck():
Expand Down
36 changes: 23 additions & 13 deletions skema/rest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ async def get_client():
yield client
# close the client when the request is done


def fn_preprocessor(function_network: Dict[str, Any]):
fn_data = function_network.copy()

Expand Down Expand Up @@ -180,23 +181,32 @@ def compute_text_reading_evaluation(gt_data: list, attributes: AttributeCollecti
page = a["page"]
annotations_by_page[page].append(a)

def annotation_key(a: Dict):
return a['page'], tuple(a['start_xy']), a['text']

# Count the matches
tp, tn, fp, fn = 0, 0, 0, 0
matched_annotations = set()
for e in extractions:
matched = False
for m in e.mentions:
if m.extraction_source is not None:
te = m.extraction_source
if te.page is not None:
e_page = te.page
page_annotations = annotations_by_page[e_page]
matched = False
for a in page_annotations:
if extraction_matches_annotation(m, a, json_contents):
matched = True
tp += 1
break
if not matched:
fp += 1
if not matched:
if m.extraction_source is not None:
te = m.extraction_source
if te.page is not None:
e_page = te.page
page_annotations = annotations_by_page[e_page]

for a in page_annotations:
key = annotation_key(a)
if key not in matched_annotations:
if extraction_matches_annotation(m, a, json_contents):
matched_annotations.add(key)
matched = True
tp += 1
break
if not matched:
fp += 1

recall = tp / len(gt_data)
precision = tp / (tp + fp + 0.00000000001)
Expand Down

0 comments on commit e017c99

Please sign in to comment.