Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix crashing index command when targeted directory contains subject files #705

Merged
4 commits merged on
May 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions annif/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,9 +318,7 @@ def run_index(
raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
backend_params = cli_util.parse_backend_params(backend_param, project)

documents = annif.corpus.DocumentDirectory(
directory, None, None, require_subjects=False
)
documents = annif.corpus.DocumentDirectory(directory, require_subjects=False)
results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)

for (docfilename, _), suggestions in zip(documents, results):
Expand Down
33 changes: 17 additions & 16 deletions annif/corpus/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,40 +17,41 @@
class DocumentDirectory(DocumentCorpus):
"""A directory of files as a full text document corpus"""

def __init__(self, path, subject_index, language, require_subjects=False):
def __init__(self, path, subject_index=None, language=None, require_subjects=False):
self.path = path
self.subject_index = subject_index
self.language = language
self.require_subjects = require_subjects

def __iter__(self):
"""Iterate through the directory, yielding tuples of (docfile,
subjectfile) containing file paths. If there is no key file and
require_subjects is False, the subjectfile will be returned as None."""
subjectfile) containing file paths. If require_subjects is False, the
subjectfile will be returned as None."""

for filename in sorted(glob.glob(os.path.join(self.path, "*.txt"))):
tsvfilename = re.sub(r"\.txt$", ".tsv", filename)
if os.path.exists(tsvfilename):
yield (filename, tsvfilename)
continue
keyfilename = re.sub(r"\.txt$", ".key", filename)
if os.path.exists(keyfilename):
yield (filename, keyfilename)
continue
if not self.require_subjects:
if self.require_subjects:
tsvfilename = re.sub(r"\.txt$", ".tsv", filename)
if os.path.exists(tsvfilename):
yield (filename, tsvfilename)
continue
keyfilename = re.sub(r"\.txt$", ".key", filename)
if os.path.exists(keyfilename):
yield (filename, keyfilename)
continue
else:
yield (filename, None)

@property
def documents(self):
for docfilename, keyfilename in self:
for docfilename, subjfilename in self:
with open(docfilename, errors="replace", encoding="utf-8-sig") as docfile:
text = docfile.read()
if keyfilename is None:
if subjfilename is None:
yield Document(text=text, subject_set=None)
continue
with open(keyfilename, encoding="utf-8-sig") as keyfile:
with open(subjfilename, encoding="utf-8-sig") as subjfile:
subjects = SubjectSet.from_string(
keyfile.read(), self.subject_index, self.language
subjfile.read(), self.subject_index, self.language
)
yield Document(text=text, subject_set=subjects)

Expand Down
4 changes: 3 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,9 @@ def fulltext_corpus(subject_index):
ftdir = os.path.join(
os.path.dirname(__file__), "corpora", "archaeology", "fulltext"
)
ft_corpus = annif.corpus.DocumentDirectory(ftdir, subject_index, "fi")
ft_corpus = annif.corpus.DocumentDirectory(
ftdir, subject_index, "fi", require_subjects=True
)
return ft_corpus


Expand Down
4 changes: 3 additions & 1 deletion tests/test_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ def test_learn_dummy(project, tmpdir):
tmpdir.join("doc1.tsv").write("<http://www.yso.fi/onto/yso/p10849>\tarchaeologists")
tmpdir.join("doc2.txt").write("doc2")
tmpdir.join("doc2.tsv").write("<http://example.org/dummy>\tdummy")
docdir = annif.corpus.DocumentDirectory(str(tmpdir), project.subjects, "en")
docdir = annif.corpus.DocumentDirectory(
str(tmpdir), project.subjects, "en", require_subjects=True
)

dummy.learn(docdir)

Expand Down
3 changes: 3 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,9 @@ def test_suggest_dash_path():

def test_index(tmpdir):
tmpdir.join("doc1.txt").write("nothing special")
# Existing subject files should not have an effect
tmpdir.join("doc1.tsv").write("<http://example.org/dummy>\tdummy")
tmpdir.join("doc1.key").write("<http://example.org/dummy>\tdummy")

result = runner.invoke(annif.cli.cli, ["index", "dummy-en", str(tmpdir)])
assert not result.exception
Expand Down
20 changes: 11 additions & 9 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,38 +80,38 @@ def test_subjectset_as_vector_destination(subject_index):
assert vector is destination


def test_docdir_key(tmpdir, subject_index):
def test_docdir_key(tmpdir):
tmpdir.join("doc1.txt").write("doc1")
tmpdir.join("doc1.key").write("key1")
tmpdir.join("doc2.txt").write("doc2")
tmpdir.join("doc2.key").write("key2")
tmpdir.join("doc3.txt").write("doc3")

docdir = annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "en")
docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=False)
files = sorted(list(docdir))
assert len(files) == 3
assert files[0][0] == str(tmpdir.join("doc1.txt"))
assert files[0][1] == str(tmpdir.join("doc1.key"))
assert files[0][1] is None
assert files[1][0] == str(tmpdir.join("doc2.txt"))
assert files[1][1] == str(tmpdir.join("doc2.key"))
assert files[1][1] is None
assert files[2][0] == str(tmpdir.join("doc3.txt"))
assert files[2][1] is None


def test_docdir_tsv(tmpdir, subject_index):
def test_docdir_tsv(tmpdir):
tmpdir.join("doc1.txt").write("doc1")
tmpdir.join("doc1.tsv").write("<http://example.org/key1>\tkey1")
tmpdir.join("doc2.txt").write("doc2")
tmpdir.join("doc2.tsv").write("<http://example.org/key2>\tkey2")
tmpdir.join("doc3.txt").write("doc3")

docdir = annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "en")
docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=False)
files = sorted(list(docdir))
assert len(files) == 3
assert files[0][0] == str(tmpdir.join("doc1.txt"))
assert files[0][1] == str(tmpdir.join("doc1.tsv"))
assert files[0][1] is None
assert files[1][0] == str(tmpdir.join("doc2.txt"))
assert files[1][1] == str(tmpdir.join("doc2.tsv"))
assert files[1][1] is None
assert files[2][0] == str(tmpdir.join("doc3.txt"))
assert files[2][1] is None

Expand All @@ -126,7 +126,9 @@ def test_docdir_tsv_bom(tmpdir, subject_index):
"<http://www.yso.fi/onto/yso/p2558>\trautakausi".encode("utf-8-sig")
)

docdir = annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "fi")
docdir = annif.corpus.DocumentDirectory(
str(tmpdir), subject_index, "fi", require_subjects=True
)
docs = list(docdir.documents)
assert docs[0].text == "doc1"
assert (
Expand Down
4 changes: 3 additions & 1 deletion tests/test_project.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,9 @@ def test_project_learn(registry, tmpdir):
tmpdir.join("doc2.tsv").write("<http://example.org/dummy>\tdummy")

project = registry.get_project("dummy-fi")
docdir = annif.corpus.DocumentDirectory(str(tmpdir), project.subjects, "en")
docdir = annif.corpus.DocumentDirectory(
str(tmpdir), project.subjects, "en", require_subjects=True
)
project.learn(docdir)
result = project.suggest(["this is some text"])[0]
assert len(result) == 1
Expand Down