Skip to content

Commit

Permalink
Merge pull request #803 from NatLibFi/automate-nltkdata-download
Browse files Browse the repository at this point in the history
Automate NLTK datapackage `punkt_tab` download
  • Loading branch information
juhoinkinen authored Sep 30, 2024
2 parents 8f2e905 + 45be777 commit 20ae1e4
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 8 deletions.
8 changes: 0 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,6 @@ The recommended way is to install Annif from
source annif-venv/bin/activate
pip install annif

You will also need NLTK data files:

python -m nltk.downloader punkt_tab

Start up the application:

annif
Expand Down Expand Up @@ -113,10 +109,6 @@ Enter the virtual environment:

poetry shell

You will also need NLTK data files:

python -m nltk.downloader punkt_tab

Start up the application:

annif
Expand Down
20 changes: 20 additions & 0 deletions annif/analyzer/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,12 @@
import functools
import unicodedata

import annif

# Logger object exposed by the top-level annif package.
logger = annif.logger

# Keyword-argument name used to configure an analyzer's minimum token length.
_KEY_TOKEN_MIN_LENGTH = "token_min_length"
# Name of the NLTK data package that is looked up under "tokenizers/" (and
# downloaded when it is missing) while an analyzer is being initialised.
_NLTK_TOKENIZER_DATA = "punkt_tab"


class Analyzer(metaclass=abc.ABCMeta):
Expand All @@ -21,6 +26,21 @@ def __init__(self, **kwargs) -> None:
if _KEY_TOKEN_MIN_LENGTH in kwargs:
self.token_min_length = int(kwargs[_KEY_TOKEN_MIN_LENGTH])

import nltk.data

try:
nltk.data.find("tokenizers/" + _NLTK_TOKENIZER_DATA)
except LookupError as err:
logger.debug(str(err))
if _NLTK_TOKENIZER_DATA in str(err):
logger.warning(
f'NLTK datapackage "{_NLTK_TOKENIZER_DATA}" not found, '
"downloading it now."
)
nltk.download(_NLTK_TOKENIZER_DATA)
else:
raise

def tokenize_sentences(self, text: str) -> list[str]:
"""Tokenize a piece of text (e.g. a document) into sentences."""
import nltk.tokenize
Expand Down
11 changes: 11 additions & 0 deletions tests/test_analyzer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Unit tests for analyzers in Annif"""

from unittest import mock

import pytest

import annif.analyzer
Expand All @@ -15,6 +17,15 @@ def test_get_analyzer_badspec():
annif.analyzer.get_analyzer("()")


@mock.patch("nltk.data.find", side_effect=LookupError("Resource punkt_tab not found"))
@mock.patch("nltk.download")
def test_nltk_data_missing(download, find):
    """A missing NLTK tokenizer data package triggers an automatic download.

    ``nltk.data.find`` is patched to raise a ``LookupError`` that mentions
    ``punkt_tab``, simulating an environment without the data package, and
    ``nltk.download`` is patched so that nothing is fetched over the network.
    Decorators are applied bottom-up, hence ``download`` is the first mock
    argument and ``find`` the second.
    """
    annif.analyzer.get_analyzer("snowball(english)")
    # The analyzer must have looked for the tokenizer resource...
    find.assert_any_call("tokenizers/punkt_tab")
    # ...and, having failed to find it, requested exactly that data package.
    # The mock helpers fail with a message showing expected vs. actual calls.
    download.assert_called_with("punkt_tab")


def test_english_analyzer_normalize_word():
analyzer = annif.analyzer.get_analyzer("snowball(english)")
assert analyzer._normalize_word("running") == "run"
Expand Down

0 comments on commit 20ae1e4

Please sign in to comment.