This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

Parameterize lambda docker images with spaCy model
twaslowski committed Jun 20, 2024
1 parent a4d0e7a commit 7d26853
Showing 12 changed files with 45 additions and 62 deletions.
5 changes: 5 additions & 0 deletions .github/actions/docker-build/action.yml
@@ -8,6 +8,9 @@ inputs:
image-name:
description: 'Name of the Docker image'
required: true
spacy_model:
description: 'Spacy model to download'
required: true

runs:
using: "composite"
@@ -56,6 +59,8 @@ runs:
uses: docker/build-push-action@v5
with:
context: .
build-args: |
SPACY_MODEL=${{ inputs.spacy_model }}
file: ${{ inputs.dockerfile }}
provenance: false
platforms: linux/amd64
6 changes: 4 additions & 2 deletions .github/workflows/build.yml
@@ -25,7 +25,7 @@ jobs:
- name: Run tests
run: ./do test

build_syntactical_analysis_docker:
build_syntactical_analysis_docker_german:
needs: test
runs-on: ubuntu-latest
env:
@@ -39,8 +39,9 @@
with:
dockerfile: ./docker/syntactical_analysis.Dockerfile
image-name: syntactical_analysis-lambda
spacy_model: de_core_news_sm

build_inflection_docker:
build_inflection_docker_german:
needs: test
runs-on: ubuntu-latest
env:
@@ -54,6 +55,7 @@
with:
dockerfile: ./docker/inflection.Dockerfile
image-name: inflection-lambda
spacy_model: de_core_news_sm

deploy_dev_infrastructure:
needs: [build_syntactical_analysis_docker_german, build_inflection_docker_german]
5 changes: 0 additions & 5 deletions do
@@ -85,11 +85,6 @@ function task_build_webserver() {

while [[ $# -gt 0 ]]; do
case $1 in
--source-lang)
SOURCE_LANG="$2"
shift # past argument
shift # past value
;;
--spacy-model)
SPACY_MODEL="$2"
shift # past argument
7 changes: 7 additions & 0 deletions docker/inflection.Dockerfile
@@ -7,6 +7,10 @@ ENV PYTHONPATH=/var/task/
# Set up a working directory
WORKDIR /var/task

# spaCy model id to download at build time, e.g. "en_core_web_sm"
ARG SPACY_MODEL
ENV SPACY_MODEL=${SPACY_MODEL}

# Copy project
COPY lingolift/generative ./lingolift/generative/
COPY lingolift/llm ./lingolift/llm/
@@ -21,4 +25,7 @@ COPY lingolift/nlp_lambda_context_container.py ./lingolift/nlp_lambda_context_co
COPY package/requirements.txt ./
RUN python3 -m pip install -r requirements.txt

# Install spaCy model
RUN python3 -m spacy download ${SPACY_MODEL}

CMD [ "nlp_lambda_handlers.inflection_handler" ]
7 changes: 7 additions & 0 deletions docker/syntactical_analysis.Dockerfile
@@ -7,6 +7,10 @@ ENV PYTHONPATH=/var/task/
# Set up a working directory
WORKDIR /var/task

# spaCy model id to download at build time, e.g. "en_core_web_sm"
ARG SPACY_MODEL
ENV SPACY_MODEL=${SPACY_MODEL}

# Copy project
COPY lingolift/generative ./lingolift/generative/
COPY lingolift/llm ./lingolift/llm/
@@ -21,4 +25,7 @@ COPY lingolift/nlp_lambda_context_container.py ./lingolift/nlp_lambda_context_co
COPY package/requirements.txt ./
RUN python3 -m pip install -r requirements.txt

# Install spaCy model
RUN python3 -m spacy download ${SPACY_MODEL}

CMD [ "nlp_lambda_handlers.syntactical_analysis_handler" ]
2 changes: 1 addition & 1 deletion lingolift/nlp/morphologizer.py
@@ -16,7 +16,7 @@ def __init__(self, morphology_generator: InflectionGenerator):
@timed
def retrieve_all_inflections(self, word: str) -> Inflections:
# Get the part of speech tag for the word
analysis = perform_analysis(word, "DE")[
analysis = perform_analysis(word)[
0
] # only analyzes one word at a time for now; currently only German is supported
pos = analysis.pos
30 changes: 3 additions & 27 deletions lingolift/nlp/syntactical_analysis.py
@@ -1,48 +1,24 @@
from typing import Optional
import os

import shared.universal_features as universal_features
import spacy
from shared.exception import LanguageNotAvailableException
from shared.model.syntactical_analysis import (
Morphology,
PartOfSpeech,
SyntacticalAnalysis,
)
from spacy.tokens.token import Token

from lingolift.nlp.abstract_language_detector import AbstractLanguageDetector
from lingolift.nlp.lingua_language_detector import LinguaLanguageDetector

models = {
"DE": "de_core_news_sm",
"RU": "ru_core_news_sm",
"ES": "es_core_news_sm",
"FR": "fr_core_news_md",
"PT": "pt_core_news_sm",
}


def perform_analysis(
sentence: str,
language_code: Optional[str] = None,
language_detector: AbstractLanguageDetector = LinguaLanguageDetector(),
) -> list[SyntacticalAnalysis]:
def perform_analysis(sentence: str) -> list[SyntacticalAnalysis]:
"""
Performs a syntactical analysis on a sentence using the spaCy model named by the SPACY_MODEL environment variable.
:param language_detector: LanguageDetector to detect languages if language code is not provided.
:param language_code: Can optionally be supplied to override the language detection.
:param sentence: Source sentence
:return: A SyntacticalAnalysis for each analyzable token; punctuation tokens are omitted.
"""
if not language_code:
language_code = str(language_detector.detect_language(sentence))
try:
model = models[language_code]
nlp = spacy.load(model)
except KeyError:
raise LanguageNotAvailableException()
nlp = spacy.load(os.getenv("SPACY_MODEL"))
doc = nlp(sentence)

return [_analyze_token(token) for token in doc if _analyze_token(token) is not None]


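With the language-detection path removed, perform_analysis has a single contract: the caller guarantees that SPACY_MODEL points at an installed model. A hedged usage sketch (the sentence and the setdefault call are illustrative, not from the repository):

```python
import os

from lingolift.nlp.syntactical_analysis import perform_analysis

# Illustrative only: assumes de_core_news_sm is installed locally, as it is
# in the test environment pinned in pyproject.toml.
os.environ.setdefault("SPACY_MODEL", "de_core_news_sm")

for analysis in perform_analysis("Der Hund schläft."):
    print(analysis.pos)  # part-of-speech of each analyzable token
```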
23 changes: 6 additions & 17 deletions lingolift/nlp_lambda_handlers.py
@@ -1,19 +1,14 @@
import json
import logging

from shared.exception import (
LanguageNotAvailableException,
LanguageNotIdentifiedException,
)

from lingolift.nlp.syntactical_analysis import perform_analysis
from lingolift.nlp_lambda_context_container import NLPLambdaContextContainer
from lingolift.util.lambda_proxy import check_pre_warm, fail, ok
from lingolift.util.lambda_proxy import check_pre_warm, ok

"""
The split into lambda_handlers and lambda_handlers_nlp is unfortunately required.
The split into the lambda_handlers and lambda_handlers_nlp files is unfortunately required.
When importing from the syntactical_analysis module, spaCy gets imported transitively.
For memory reasons, spaCy is only included where required.
For memory reasons, spaCy is only included where required, so its import will fail in the non-dockerized lambdas.
"""

# configure logging
@@ -30,15 +25,9 @@ def syntactical_analysis_handler(event, _) -> dict:
return pre_warm_response
body = json.loads(event.get("body"))
sentence = body.get("sentence")
language_code = body.get("language_code")
logger.info(
f"Received sentence, language: {sentence}, language_code: {language_code}"
)
try:
analyses = perform_analysis(sentence, language_code)
return ok([a.model_dump() for a in analyses])
except (LanguageNotAvailableException, LanguageNotIdentifiedException) as e:
return fail(e, 400)
logger.info(f"Received sentence, language: {sentence}")
analyses = perform_analysis(sentence)
return ok([a.model_dump() for a in analyses])


def inflection_handler(event, _) -> dict:
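With the exception handling gone, the handler is a thin pass-through to perform_analysis. A hypothetical local smoke test, using the API-Gateway-style event shape the handler reads (a JSON string under "body") and assuming the configured model is installed:

```python
import json
import os

from lingolift.nlp_lambda_handlers import syntactical_analysis_handler

# Hypothetical smoke test; in the deployed container, SPACY_MODEL comes from
# the Dockerfile ENV instead of being set here.
os.environ["SPACY_MODEL"] = "de_core_news_sm"

event = {"body": json.dumps({"sentence": "Satzzeichen werden nicht gezählt."})}
response = syntactical_analysis_handler(event, None)

print(response["statusCode"])        # 200 on success
print(json.loads(response["body"]))  # serialized SyntacticalAnalysis objects
```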
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -28,6 +28,8 @@ urllib3 = "<2"
pytest-cov = "^5.0.0"
coverage-badge = "^1.1.1"
pytest-asyncio = "^0.23.7"
# All spaCy-dependent tests run against the German model
de-core-news-sm = { url = "https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0.tar.gz" }

[tool.poetry.group.webserver.dependencies]
flask = "^3.0.1"
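Pinning the model as a URL dependency means it installs as an ordinary Python package. A quick sanity check, assuming the standard spaCy model-package layout (an importable package exposing a load() helper):

```python
# Assumed convention for spaCy model wheels: the package name matches the
# model id, with hyphens replaced by underscores.
import de_core_news_sm

nlp = de_core_news_sm.load()
print(nlp.meta["lang"], nlp.meta["version"])
```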
5 changes: 5 additions & 0 deletions test/integration/conftest.py
@@ -22,6 +22,11 @@ def set_llm_response(context_container: AbstractLambdaContextContainer, response
context_container.llm_adapter.next_response(response)


@pytest.fixture(autouse=True)
def spacy_model_env(monkeypatch):
monkeypatch.setenv("SPACY_MODEL", "de_core_news_sm")


@pytest.fixture
def mock_llm_adapter():
return MockLLMAdapter()
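Because the fixture is autouse, every integration test gets SPACY_MODEL set without requesting it explicitly. A hypothetical test illustrating the effect:

```python
from lingolift.nlp.syntactical_analysis import perform_analysis

# Hypothetical: no monkeypatching needed here; the autouse fixture in
# conftest.py already points SPACY_MODEL at de_core_news_sm.
def test_analysis_picks_up_model_from_fixture():
    assert perform_analysis("Hallo Welt") != []
```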
8 changes: 0 additions & 8 deletions test/integration/test_lambda_handler.py
@@ -114,14 +114,6 @@ def test_syntactical_analysis_real_event(real_event, core_context_container):
assert len(body) == 4


def test_syntactical_analysis_regular_call_with_exception(real_event):
real_event["body"] = json.dumps({"sentence": "bleep blurp"})
response = syntactical_analysis_handler(real_event, None)

assert response["statusCode"] == 400
assert "error_message" in json.loads(response["body"])


def test_pre_warm_inflection(pre_warm_event):
response = inflection_handler(pre_warm_event, None)

7 changes: 5 additions & 2 deletions test/unit/test_syntactical_analysis.py
@@ -1,10 +1,13 @@
from lingolift.nlp.syntactical_analysis import perform_analysis


def test_happy_path():
def test_happy_path(monkeypatch):
# Define the environment variable to load the correct spaCy model.
monkeypatch.setenv("SPACY_MODEL", "de_core_news_sm")

# Perform one comprehensive test, because analyses are quite slow.
sentence = "Satzzeichen werden nicht gezählt."
result = list(perform_analysis(sentence, "DE"))
result = list(perform_analysis(sentence))

# ensure punctuation tokens are omitted from the analysis
assert len(result) == 4
