diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml
index 3161255..da4c476 100644
--- a/.github/actions/docker-build/action.yml
+++ b/.github/actions/docker-build/action.yml
@@ -8,6 +8,9 @@ inputs:
   image-name:
     description: 'Name of the Docker image'
     required: true
+  spacy_model:
+    description: 'spaCy model to download'
+    required: true
 
 runs:
   using: "composite"
@@ -56,6 +59,8 @@ runs:
       uses: docker/build-push-action@v5
       with:
         context: .
+        build-args: |
+          SPACY_MODEL=${{ inputs.spacy_model }}
         file: ${{ inputs.dockerfile }}
         provenance: false
         platforms: linux/amd64
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 5b1cca8..0c2e840 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -25,7 +25,7 @@ jobs:
       - name: Run tests
         run: ./do test
 
-  build_syntactical_analysis_docker:
+  build_syntactical_analysis_docker_german:
     needs: test
     runs-on: ubuntu-latest
     env:
@@ -39,8 +39,9 @@ jobs:
       with:
         dockerfile: ./docker/syntactical_analysis.Dockerfile
         image-name: syntactical_analysis-lambda
+        spacy_model: de_core_news_sm
 
-  build_inflection_docker:
+  build_inflection_docker_german:
     needs: test
     runs-on: ubuntu-latest
     env:
@@ -54,6 +55,7 @@ jobs:
       with:
        dockerfile: ./docker/inflection.Dockerfile
        image-name: inflection-lambda
+       spacy_model: de_core_news_sm
 
   deploy_dev_infrastructure:
-    needs: [build_syntactical_analysis_docker, build_inflection_docker]
+    needs: [build_syntactical_analysis_docker_german, build_inflection_docker_german]
diff --git a/do b/do
index ecbaa2c..46aef3a 100755
--- a/do
+++ b/do
@@ -85,11 +85,6 @@ function task_build_webserver() {
 
   while [[ $# -gt 0 ]]; do
     case $1 in
-      --source-lang)
-        SOURCE_LANG="$2"
-        shift # past argument
-        shift # past value
-        ;;
       --spacy-model)
        SPACY_MODEL="$2"
        shift # past argument
diff --git a/docker/inflection.Dockerfile b/docker/inflection.Dockerfile
index 99ca140..e473af8 100644
--- a/docker/inflection.Dockerfile
+++ b/docker/inflection.Dockerfile
@@ -7,6 +7,10 @@ ENV PYTHONPATH=/var/task/
 
 # Set up a working directory
 WORKDIR /var/task
 
+# The spaCy model id to download, e.g. "en_core_web_sm"
+ARG SPACY_MODEL
+ENV SPACY_MODEL=${SPACY_MODEL}
+
 # Copy project
 COPY lingolift/generative ./lingolift/generative/
 COPY lingolift/llm ./lingolift/llm/
@@ -21,4 +25,7 @@ COPY lingolift/nlp_lambda_context_container.py ./lingolift/nlp_lambda_context_co
 COPY package/requirements.txt ./
 RUN python3 -m pip install -r requirements.txt
 
+# Install the spaCy model selected at build time
+RUN python3 -m spacy download ${SPACY_MODEL}
+
 CMD [ "nlp_lambda_handlers.inflection_handler" ]
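The ARG/ENV pair makes the model name available both to `spacy download` at build time and to the application at runtime. As a sanity check, something like the following could be run inside the built image to confirm the model was actually baked in (an illustrative script, not part of this change; the sentence is borrowed from the unit test):

```python
# smoke_check.py -- hypothetical helper, not included in this diff.
import os

import spacy

# A KeyError here means the image was built without --build-arg SPACY_MODEL=...
model_name = os.environ["SPACY_MODEL"]
nlp = spacy.load(model_name)
doc = nlp("Satzzeichen werden nicht gezählt.")
print([(token.text, token.pos_) for token in doc])
```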
"en_core_web_sm" +ARG SPACY_MODEL +ENV SPACY_MODEL=${SPACY_MODEL} + # Copy project COPY lingolift/generative ./lingolift/generative/ COPY lingolift/llm ./lingolift/llm/ @@ -21,4 +25,7 @@ COPY lingolift/nlp_lambda_context_container.py ./lingolift/nlp_lambda_context_co COPY package/requirements.txt ./ RUN python3 -m pip install -r requirements.txt +# Install spaCy model +RUN python3 -m spacy download ${SPACY_MODEL} + CMD [ "nlp_lambda_handlers.syntactical_analysis_handler" ] diff --git a/lingolift/nlp/morphologizer.py b/lingolift/nlp/morphologizer.py index 7d1976a..f498f50 100644 --- a/lingolift/nlp/morphologizer.py +++ b/lingolift/nlp/morphologizer.py @@ -16,7 +16,7 @@ def __init__(self, morphology_generator: InflectionGenerator): @timed def retrieve_all_inflections(self, word: str) -> Inflections: # Get the part of speech tag for the word - analysis = perform_analysis(word, "DE")[ + analysis = perform_analysis(word)[ 0 ] # only analyze one word at a time right now, only support German pos = analysis.pos diff --git a/lingolift/nlp/syntactical_analysis.py b/lingolift/nlp/syntactical_analysis.py index 0657552..f9560c6 100644 --- a/lingolift/nlp/syntactical_analysis.py +++ b/lingolift/nlp/syntactical_analysis.py @@ -1,8 +1,7 @@ -from typing import Optional +import os import shared.universal_features as universal_features import spacy -from shared.exception import LanguageNotAvailableException from shared.model.syntactical_analysis import ( Morphology, PartOfSpeech, @@ -10,39 +9,16 @@ ) from spacy.tokens.token import Token -from lingolift.nlp.abstract_language_detector import AbstractLanguageDetector -from lingolift.nlp.lingua_language_detector import LinguaLanguageDetector -models = { - "DE": "de_core_news_sm", - "RU": "ru_core_news_sm", - "ES": "es_core_news_sm", - "FR": "fr_core_news_md", - "PT": "pt_core_news_sm", -} - -def perform_analysis( - sentence: str, - language_code: Optional[str] = None, - language_detector: AbstractLanguageDetector = LinguaLanguageDetector(), -) -> list[SyntacticalAnalysis]: +def perform_analysis(sentence: str) -> list[SyntacticalAnalysis]: """ Performs a syntactical analysis on a sentence in a given language. - :param language_detector: LanguageDetector to detect languages if language code is not provided. - :param language_code: Can optionally be supplied to override the language detection. :param sentence: Source sentence :return: """ - if not language_code: - language_code = str(language_detector.detect_language(sentence)) - try: - model = models[language_code] - nlp = spacy.load(model) - except KeyError: - raise LanguageNotAvailableException() + nlp = spacy.load(os.getenv("SPACY_MODEL")) doc = nlp(sentence) - return [_analyze_token(token) for token in doc if _analyze_token(token) is not None] diff --git a/lingolift/nlp_lambda_handlers.py b/lingolift/nlp_lambda_handlers.py index a3fb6dd..6c31463 100644 --- a/lingolift/nlp_lambda_handlers.py +++ b/lingolift/nlp_lambda_handlers.py @@ -1,19 +1,14 @@ import json import logging -from shared.exception import ( - LanguageNotAvailableException, - LanguageNotIdentifiedException, -) - from lingolift.nlp.syntactical_analysis import perform_analysis from lingolift.nlp_lambda_context_container import NLPLambdaContextContainer -from lingolift.util.lambda_proxy import check_pre_warm, fail, ok +from lingolift.util.lambda_proxy import check_pre_warm, ok """ -The split into lambda_handlers and lambda_handlers_nlp is unfortunately required. 
diff --git a/lingolift/nlp_lambda_handlers.py b/lingolift/nlp_lambda_handlers.py
index a3fb6dd..6c31463 100644
--- a/lingolift/nlp_lambda_handlers.py
+++ b/lingolift/nlp_lambda_handlers.py
@@ -1,19 +1,14 @@
 import json
 import logging
 
-from shared.exception import (
-    LanguageNotAvailableException,
-    LanguageNotIdentifiedException,
-)
-
 from lingolift.nlp.syntactical_analysis import perform_analysis
 from lingolift.nlp_lambda_context_container import NLPLambdaContextContainer
-from lingolift.util.lambda_proxy import check_pre_warm, fail, ok
+from lingolift.util.lambda_proxy import check_pre_warm, ok
 
 """
-The split into lambda_handlers and lambda_handlers_nlp is unfortunately required.
+The split into the lambda_handlers and lambda_handlers_nlp files is unfortunately required.
 When importing from the syntactical_analysis module, spaCy gets imported transitively.
-For memory reasons, spaCy is only included where required.
+For memory reasons, spaCy is only included where required, so its import will fail in the non-dockerized lambdas.
 """
 
 # configure logging
@@ -30,15 +25,9 @@ def syntactical_analysis_handler(event, _) -> dict:
         return pre_warm_response
     body = json.loads(event.get("body"))
     sentence = body.get("sentence")
-    language_code = body.get("language_code")
-    logger.info(
-        f"Received sentence, language: {sentence}, language_code: {language_code}"
-    )
-    try:
-        analyses = perform_analysis(sentence, language_code)
-        return ok([a.model_dump() for a in analyses])
-    except (LanguageNotAvailableException, LanguageNotIdentifiedException) as e:
-        return fail(e, 400)
+    logger.info(f"Received sentence: {sentence}")
+    analyses = perform_analysis(sentence)
+    return ok([a.model_dump() for a in analyses])
 
 
 def inflection_handler(event, _) -> dict:
diff --git a/pyproject.toml b/pyproject.toml
index 2cbbe6b..6620a35 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,8 @@ urllib3 = "<2"
 pytest-cov = "^5.0.0"
 coverage-badge = "^1.1.1"
 pytest-asyncio = "^0.23.7"
+# All tests that involve spaCy run against the German model
+de-core-news-sm = { url = "https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0.tar.gz" }
 
 [tool.poetry.group.webserver.dependencies]
 flask = "^3.0.1"
diff --git a/test/integration/conftest.py b/test/integration/conftest.py
index 48a601c..1e9c89c 100644
--- a/test/integration/conftest.py
+++ b/test/integration/conftest.py
@@ -22,6 +22,11 @@ def set_llm_response(context_container: AbstractLambdaContextContainer, response
     context_container.llm_adapter.next_response(response)
 
 
+@pytest.fixture(autouse=True)
+def spacy_model_env(monkeypatch):
+    monkeypatch.setenv("SPACY_MODEL", "de_core_news_sm")
+
+
 @pytest.fixture
 def mock_llm_adapter():
     return MockLLMAdapter()
diff --git a/test/integration/test_lambda_handler.py b/test/integration/test_lambda_handler.py
index 716c4e3..9461e56 100644
--- a/test/integration/test_lambda_handler.py
+++ b/test/integration/test_lambda_handler.py
@@ -114,14 +114,6 @@ def test_syntactical_analysis_real_event(real_event, core_context_container):
     assert len(body) == 4
 
 
-def test_syntactical_analysis_regular_call_with_exception(real_event):
-    real_event["body"] = json.dumps({"sentence": "bleep blurp"})
-    response = syntactical_analysis_handler(real_event, None)
-
-    assert response["statusCode"] == 400
-    assert "error_message" in json.loads(response["body"])
-
-
 def test_pre_warm_inflection(pre_warm_event):
     response = inflection_handler(pre_warm_event, None)
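With the language plumbing gone, the handler can also be exercised locally with a minimal event. The event shape below mirrors the tests' `real_event` fixture (a sketch under that assumption, not an official invocation path):

```python
import json
import os

# Mirror the autouse fixture from test/integration/conftest.py.
os.environ["SPACY_MODEL"] = "de_core_news_sm"

from lingolift.nlp_lambda_handlers import syntactical_analysis_handler

# Minimal API-Gateway-style event, as used by the integration tests.
event = {"body": json.dumps({"sentence": "Satzzeichen werden nicht gezählt."})}
response = syntactical_analysis_handler(event, None)
print(json.loads(response["body"]))
```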
diff --git a/test/unit/test_syntactical_analysis.py b/test/unit/test_syntactical_analysis.py
index 3a57232..7763782 100644
--- a/test/unit/test_syntactical_analysis.py
+++ b/test/unit/test_syntactical_analysis.py
@@ -1,10 +1,13 @@
 from lingolift.nlp.syntactical_analysis import perform_analysis
 
 
-def test_happy_path():
+def test_happy_path(monkeypatch):
+    # Set the environment variable so the correct spaCy model is loaded.
+    monkeypatch.setenv("SPACY_MODEL", "de_core_news_sm")
+
     # Perform one comprehensive test, because analyses are quite slow.
     sentence = "Satzzeichen werden nicht gezählt."
-    result = list(perform_analysis(sentence, "DE"))
+    result = list(perform_analysis(sentence))
 
     # ensure punctuation tokens are omitted from the analysis
     assert len(result) == 4
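If more language-specific images are added later, this test could be parametrized over models rather than duplicated per language. A possible shape (hypothetical extension; only the German case exists today, with the expected token count taken from the current test):

```python
import pytest

from lingolift.nlp.syntactical_analysis import perform_analysis


@pytest.mark.parametrize(
    "model, sentence, expected_token_count",
    [("de_core_news_sm", "Satzzeichen werden nicht gezählt.", 4)],
)
def test_analysis_per_model(monkeypatch, model, sentence, expected_token_count):
    # Each parametrized case selects its own spaCy model via the env var.
    monkeypatch.setenv("SPACY_MODEL", model)
    result = perform_analysis(sentence)
    # Punctuation tokens are omitted from the analysis.
    assert len(result) == expected_token_count
```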