Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Even better token annotation #27

Merged
merged 4 commits into from
Apr 27, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions allennlp_models/rc/common/reader_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,15 @@ def char_span_to_token_span(
token_offsets[end_index] is None or token_offsets[end_index][1] < character_span[1]
):
end_index += 1
if end_index == len(token_offsets):
# We want a character span that goes beyond the last token. Let's see if this is salvageable.
# We consider this salvageable if the span we're looking for starts before the last token ends.
# In other words, we don't salvage if the whole span comes after the tokens end.
if character_span[0] < token_offsets[-1][1]:
# We also want to make sure we aren't way off. We need to be within 8 characters to salvage.
if character_span[1] - 8 < token_offsets[-1][1]:
end_index -= 1

if end_index >= len(token_offsets):
raise ValueError(f"Character span %r outside the range of the given tokens.")
if end_index == start_index and token_offsets[end_index][1] > character_span[1]:
Expand Down
6 changes: 6 additions & 0 deletions tests/rc/reader_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,13 @@
([(0, 3), (4, 4), (5, 8)], (0, 8), ((0, 2), False)),
([(0, 3), (4, 4), (5, 8)], (1, 8), ((0, 2), True)),
([(0, 3), (4, 4), (5, 8)], (7, 8), ((2, 2), True)),
([(0, 3), (4, 4), (5, 8)], (7, 9), ((2, 2), True)),
],
)
def test_char_span_to_token_span(token_offsets, character_span, expected_result):
    """Check one conversion case of ``char_span_to_token_span``.

    The converter is called with the given token offsets and character
    span, and its return value must equal ``expected_result`` exactly.
    """
    actual_result = char_span_to_token_span(token_offsets, character_span)
    assert actual_result == expected_result


def test_char_span_to_token_span_throws():
    """Check that a span ending far past the last token raises ``ValueError``.

    The last token ends at character 8 while the requested span ends at
    character 19, so the conversion is expected to fail rather than snap
    the span back onto the final token.
    """
    offsets = [(0, 3), (4, 4), (5, 8)]
    out_of_range_span = (7, 19)
    with pytest.raises(ValueError):
        char_span_to_token_span(offsets, out_of_range_span)