Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Fix the IndexError when CNNDailyMailDatasetReader reads test data. #306

Merged
merged 8 commits into from
Nov 1, 2021
Merged
9 changes: 5 additions & 4 deletions allennlp_models/generation/dataset_readers/cnn_dm.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,6 @@ def _sanitize_story_line(line):

sentence_endings = [".", "!", "?", "...", "'", "`", '"', ")", "\u2019", "\u201d"]

# CNN stories always start with "(CNN)"
if line.startswith("(CNN)"):
line = line[len("(CNN)") :]

# Highlights are essentially bullet points and don't have proper sentence endings
if line[-1] not in sentence_endings:
line += "."
Expand All @@ -104,6 +100,11 @@ def _read_story(story_path: str):
with open(story_path, "r") as f:
for line in f:
line = line.strip()

# CNN stories always start with "(CNN)"
if line.startswith("(CNN)"):
line = line[len("(CNN)") :]

if line == "":
continue

Expand Down