Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Permalink
Even better token annotation (#27)
Browse files Browse the repository at this point in the history
* Handle cases where the annotated character span extends beyond the end of the last token.

* Add tests for the new behavior
  • Loading branch information
dirkgr authored Apr 27, 2020
1 parent 2cd9a73 commit 5209e07
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 0 deletions.
9 changes: 9 additions & 0 deletions allennlp_models/rc/common/reader_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,15 @@ def char_span_to_token_span(
token_offsets[end_index] is None or token_offsets[end_index][1] < character_span[1]
):
end_index += 1
if end_index == len(token_offsets):
# We want a character span that goes beyond the last token. Let's see if this is salvageable.
# We consider this salvageable if the span we're looking for starts before the last token ends.
# In other words, we don't salvage if the whole span comes after the tokens end.
if character_span[0] < token_offsets[-1][1]:
# We also want to make sure we aren't way off. We need to be within 8 characters to salvage.
if character_span[1] - 8 < token_offsets[-1][1]:
end_index -= 1

if end_index >= len(token_offsets):
raise ValueError(f"Character span %r outside the range of the given tokens.")
if end_index == start_index and token_offsets[end_index][1] > character_span[1]:
Expand Down
6 changes: 6 additions & 0 deletions tests/rc/reader_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,13 @@
([(0, 3), (4, 4), (5, 8)], (0, 8), ((0, 2), False)),
([(0, 3), (4, 4), (5, 8)], (1, 8), ((0, 2), True)),
([(0, 3), (4, 4), (5, 8)], (7, 8), ((2, 2), True)),
([(0, 3), (4, 4), (5, 8)], (7, 9), ((2, 2), True)),
],
)
def test_char_span_to_token_span(token_offsets, character_span, expected_result):
    """Check one parametrized case of ``char_span_to_token_span``.

    The function is called with ``token_offsets`` and ``character_span`` and
    its return value must equal ``expected_result``.
    """
    actual_result = char_span_to_token_span(token_offsets, character_span)
    assert actual_result == expected_result


def test_char_span_to_token_span_throws():
    """Check that a span ending far beyond the last token raises ``ValueError``.

    The last token ends at character 8, while the requested span ends at
    character 19.
    """
    offsets = [(0, 3), (4, 4), (5, 8)]
    overshooting_span = (7, 19)
    with pytest.raises(ValueError):
        char_span_to_token_span(offsets, overshooting_span)

0 comments on commit 5209e07

Please sign in to comment.