This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

Parameterize lambda docker images with spaCy model
twaslowski committed Jun 20, 2024
1 parent a4d0e7a commit 7d26853
Showing 12 changed files with 45 additions and 62 deletions.
5 changes: 5 additions & 0 deletions .github/actions/docker-build/action.yml
@@ -8,6 +8,9 @@ inputs:
image-name:
description: 'Name of the Docker image'
required: true
spacy_model:
description: 'Spacy model to download'
required: true

runs:
using: "composite"
@@ -56,6 +59,8 @@ runs:
uses: docker/build-push-action@v5
with:
context: .
build-args: |
SPACY_MODEL=${{ inputs.spacy_model }}
file: ${{ inputs.dockerfile }}
provenance: false
platforms: linux/amd64
6 changes: 4 additions & 2 deletions .github/workflows/build.yml
@@ -25,7 +25,7 @@ jobs:
- name: Run tests
run: ./do test

build_syntactical_analysis_docker:
build_syntactical_analysis_docker_german:
needs: test
runs-on: ubuntu-latest
env:
@@ -39,8 +39,9 @@
with:
dockerfile: ./docker/syntactical_analysis.Dockerfile
image-name: syntactical_analysis-lambda
spacy_model: de_core_news_sm

build_inflection_docker:
build_inflection_docker_german:
needs: test
runs-on: ubuntu-latest
env:
@@ -54,6 +55,7 @@
with:
dockerfile: ./docker/inflection.Dockerfile
image-name: inflection-lambda
spacy_model: de_core_news_sm

deploy_dev_infrastructure:
needs: [build_syntactical_analysis_docker_german, build_inflection_docker_german]
5 changes: 0 additions & 5 deletions do
@@ -85,11 +85,6 @@ function task_build_webserver() {

while [[ $# -gt 0 ]]; do
case $1 in
--source-lang)
SOURCE_LANG="$2"
shift # past argument
shift # past value
;;
--spacy-model)
SPACY_MODEL="$2"
shift # past argument
7 changes: 7 additions & 0 deletions docker/inflection.Dockerfile
@@ -7,6 +7,10 @@ ENV PYTHONPATH=/var/task/
# Set up a working directory
WORKDIR /var/task

# spaCy model id to download at build time, e.g. "en_core_web_sm"
ARG SPACY_MODEL
ENV SPACY_MODEL=${SPACY_MODEL}

# Copy project
COPY lingolift/generative ./lingolift/generative/
COPY lingolift/llm ./lingolift/llm/
@@ -21,4 +25,7 @@ COPY lingolift/nlp_lambda_context_container.py ./lingolift/nlp_lambda_context_co
COPY package/requirements.txt ./
RUN python3 -m pip install -r requirements.txt

# Install spaCy model
RUN python3 -m spacy download ${SPACY_MODEL}

CMD [ "nlp_lambda_handlers.inflection_handler" ]
7 changes: 7 additions & 0 deletions docker/syntactical_analysis.Dockerfile
@@ -7,6 +7,10 @@ ENV PYTHONPATH=/var/task/
# Set up a working directory
WORKDIR /var/task

# spaCy model id to download at build time, e.g. "en_core_web_sm"
ARG SPACY_MODEL
ENV SPACY_MODEL=${SPACY_MODEL}

# Copy project
COPY lingolift/generative ./lingolift/generative/
COPY lingolift/llm ./lingolift/llm/
@@ -21,4 +25,7 @@ COPY lingolift/nlp_lambda_context_container.py ./lingolift/nlp_lambda_context_co
COPY package/requirements.txt ./
RUN python3 -m pip install -r requirements.txt

# Install spaCy model
RUN python3 -m spacy download ${SPACY_MODEL}

CMD [ "nlp_lambda_handlers.syntactical_analysis_handler" ]
2 changes: 1 addition & 1 deletion lingolift/nlp/morphologizer.py
@@ -16,7 +16,7 @@ def __init__(self, morphology_generator: InflectionGenerator):
@timed
def retrieve_all_inflections(self, word: str) -> Inflections:
# Get the part of speech tag for the word
analysis = perform_analysis(word, "DE")[
analysis = perform_analysis(word)[
0
] # only analyzes one word at a time for now; currently only German is supported
pos = analysis.pos
30 changes: 3 additions & 27 deletions lingolift/nlp/syntactical_analysis.py
@@ -1,48 +1,24 @@
from typing import Optional
import os

import shared.universal_features as universal_features
import spacy
from shared.exception import LanguageNotAvailableException
from shared.model.syntactical_analysis import (
Morphology,
PartOfSpeech,
SyntacticalAnalysis,
)
from spacy.tokens.token import Token

from lingolift.nlp.abstract_language_detector import AbstractLanguageDetector
from lingolift.nlp.lingua_language_detector import LinguaLanguageDetector

models = {
"DE": "de_core_news_sm",
"RU": "ru_core_news_sm",
"ES": "es_core_news_sm",
"FR": "fr_core_news_md",
"PT": "pt_core_news_sm",
}


def perform_analysis(
sentence: str,
language_code: Optional[str] = None,
language_detector: AbstractLanguageDetector = LinguaLanguageDetector(),
) -> list[SyntacticalAnalysis]:
def perform_analysis(sentence: str) -> list[SyntacticalAnalysis]:
"""
Performs a syntactical analysis on a sentence using the spaCy model named by the SPACY_MODEL environment variable.
:param language_detector: LanguageDetector to detect languages if language code is not provided.
:param language_code: Can optionally be supplied to override the language detection.
:param sentence: Source sentence
:return: A SyntacticalAnalysis for each analyzable token; punctuation tokens are omitted.
"""
if not language_code:
language_code = str(language_detector.detect_language(sentence))
try:
model = models[language_code]
nlp = spacy.load(model)
except KeyError:
raise LanguageNotAvailableException()
nlp = spacy.load(os.getenv("SPACY_MODEL"))
doc = nlp(sentence)

return [_analyze_token(token) for token in doc if _analyze_token(token) is not None]


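With the language-detection path removed, perform_analysis has a single contract: the caller guarantees that SPACY_MODEL points at an installed model. A hedged usage sketch (the sentence and the setdefault call are illustrative, not from the repository):

```python
import os

from lingolift.nlp.syntactical_analysis import perform_analysis

# Illustrative only: assumes de_core_news_sm is installed locally, as it is
# in the test environment pinned in pyproject.toml.
os.environ.setdefault("SPACY_MODEL", "de_core_news_sm")

for analysis in perform_analysis("Der Hund schläft."):
    print(analysis.pos)  # part-of-speech of each analyzable token
```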
23 changes: 6 additions & 17 deletions lingolift/nlp_lambda_handlers.py
@@ -1,19 +1,14 @@
import json
import logging

from shared.exception import (
LanguageNotAvailableException,
LanguageNotIdentifiedException,
)

from lingolift.nlp.syntactical_analysis import perform_analysis
from lingolift.nlp_lambda_context_container import NLPLambdaContextContainer
from lingolift.util.lambda_proxy import check_pre_warm, fail, ok
from lingolift.util.lambda_proxy import check_pre_warm, ok

"""
The split into lambda_handlers and lambda_handlers_nlp is unfortunately required.
The split into the lambda_handlers and lambda_handlers_nlp files is unfortunately required.
When importing from the syntactical_analysis module, spaCy gets imported transitively.
For memory reasons, spaCy is only included where required.
For memory reasons, spaCy is only included where required, so its import will fail in the non-dockerized lambdas.
"""

# configure logging
@@ -30,15 +25,9 @@ def syntactical_analysis_handler(event, _) -> dict:
return pre_warm_response
body = json.loads(event.get("body"))
sentence = body.get("sentence")
language_code = body.get("language_code")
logger.info(
f"Received sentence, language: {sentence}, language_code: {language_code}"
)
try:
analyses = perform_analysis(sentence, language_code)
return ok([a.model_dump() for a in analyses])
except (LanguageNotAvailableException, LanguageNotIdentifiedException) as e:
return fail(e, 400)
logger.info(f"Received sentence, language: {sentence}")
analyses = perform_analysis(sentence)
return ok([a.model_dump() for a in analyses])


def inflection_handler(event, _) -> dict:
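With the exception handling gone, the handler is a thin pass-through to perform_analysis. A hypothetical local smoke test, using the API-Gateway-style event shape the handler reads (a JSON string under "body") and assuming the configured model is installed:

```python
import json
import os

from lingolift.nlp_lambda_handlers import syntactical_analysis_handler

# Hypothetical smoke test; in the deployed container, SPACY_MODEL comes from
# the Dockerfile ENV instead of being set here.
os.environ["SPACY_MODEL"] = "de_core_news_sm"

event = {"body": json.dumps({"sentence": "Satzzeichen werden nicht gezählt."})}
response = syntactical_analysis_handler(event, None)

print(response["statusCode"])        # 200 on success
print(json.loads(response["body"]))  # serialized SyntacticalAnalysis objects
```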
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -28,6 +28,8 @@ urllib3 = "<2"
pytest-cov = "^5.0.0"
coverage-badge = "^1.1.1"
pytest-asyncio = "^0.23.7"
# All spaCy-dependent tests run against the German model
de-core-news-sm = { url = "https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0.tar.gz" }

[tool.poetry.group.webserver.dependencies]
flask = "^3.0.1"
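Pinning the model as a URL dependency means it installs as an ordinary Python package. A quick sanity check, assuming the standard spaCy model-package layout (an importable package exposing a load() helper):

```python
# Assumed convention for spaCy model wheels: the package name matches the
# model id, with hyphens replaced by underscores.
import de_core_news_sm

nlp = de_core_news_sm.load()
print(nlp.meta["lang"], nlp.meta["version"])
```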
5 changes: 5 additions & 0 deletions test/integration/conftest.py
@@ -22,6 +22,11 @@ def set_llm_response(context_container: AbstractLambdaContextContainer, response
context_container.llm_adapter.next_response(response)


@pytest.fixture(autouse=True)
def spacy_model_env(monkeypatch):
monkeypatch.setenv("SPACY_MODEL", "de_core_news_sm")


@pytest.fixture
def mock_llm_adapter():
return MockLLMAdapter()
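Because the fixture is autouse, every integration test gets SPACY_MODEL set without requesting it explicitly. A hypothetical test illustrating the effect:

```python
from lingolift.nlp.syntactical_analysis import perform_analysis

# Hypothetical: no monkeypatching needed here; the autouse fixture in
# conftest.py already points SPACY_MODEL at de_core_news_sm.
def test_analysis_picks_up_model_from_fixture():
    assert perform_analysis("Hallo Welt") != []
```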
8 changes: 0 additions & 8 deletions test/integration/test_lambda_handler.py
@@ -114,14 +114,6 @@ def test_syntactical_analysis_real_event(real_event, core_context_container):
assert len(body) == 4


def test_syntactical_analysis_regular_call_with_exception(real_event):
real_event["body"] = json.dumps({"sentence": "bleep blurp"})
response = syntactical_analysis_handler(real_event, None)

assert response["statusCode"] == 400
assert "error_message" in json.loads(response["body"])


def test_pre_warm_inflection(pre_warm_event):
response = inflection_handler(pre_warm_event, None)

7 changes: 5 additions & 2 deletions test/unit/test_syntactical_analysis.py
@@ -1,10 +1,13 @@
from lingolift.nlp.syntactical_analysis import perform_analysis


def test_happy_path():
def test_happy_path(monkeypatch):
# Define the environment variable to load the correct spaCy model.
monkeypatch.setenv("SPACY_MODEL", "de_core_news_sm")

# Perform one comprehensive test, because analyses are quite slow.
sentence = "Satzzeichen werden nicht gezählt."
result = list(perform_analysis(sentence, "DE"))
result = list(perform_analysis(sentence))

# ensure punctuation tokens are omitted from the analysis
assert len(result) == 4
