Skip to content

Commit

Permalink
Use variable for NLTK tokenizer datapackage name (punkt_tab)
Browse files: browse the repository at this point in the history
  • Loading branch information
juhoinkinen committed Sep 19, 2024
1 parent 642281e commit c047aea
Showing 1 changed file with 6 additions and 4 deletions.
10 changes: 6 additions & 4 deletions annif/analyzer/analyzer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,6 +11,7 @@
logger = annif.logger

_KEY_TOKEN_MIN_LENGTH = "token_min_length"
_NLTK_TOKENIZER_DATA = "punkt_tab"


class Analyzer(metaclass=abc.ABCMeta):
Expand All @@ -28,14 +29,15 @@ def __init__(self, **kwargs) -> None:
import nltk.data

try:
nltk.data.find("tokenizers/punkt_tab")
nltk.data.find("tokenizers/" + _NLTK_TOKENIZER_DATA)
except LookupError as err:
logger.debug(str(err))
if "punkt_tab" in str(err): # "punkt_tab" is surrounded by color code tags
if _NLTK_TOKENIZER_DATA in str(err):
logger.warning(
'NLTK datapackage "punkt_tab" not found, downloading it now.'
f'NLTK datapackage "{_NLTK_TOKENIZER_DATA}" not found, '
"downloading it now."
)
nltk.download("punkt_tab")
nltk.download(_NLTK_TOKENIZER_DATA)

def tokenize_sentences(self, text: str) -> list[str]:
"""Tokenize a piece of text (e.g. a document) into sentences."""
Expand Down

0 comments on commit c047aea

Please sign in to comment.