From 68c8996c10d1b58267e0d72967d7559b9674ce2f Mon Sep 17 00:00:00 2001 From: Michael Terry Date: Mon, 25 Sep 2023 09:49:08 -0400 Subject: [PATCH] docker: add build script to ease cutting new releases --- docker/Dockerfile.cpu | 16 +++++--- docker/Dockerfile.gpu | 15 ++++--- docker/MAINTAINER.md | 45 +++++++++++---------- docker/build.py | 91 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 133 insertions(+), 34 deletions(-) create mode 100755 docker/build.py diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 162a8aa6..d80a4986 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -1,19 +1,21 @@ FROM python:3.9 as base +ARG cnlpt_version + RUN --mount=type=cache,target=/root/.cache \ pip install cython RUN --mount=type=cache,target=/root/.cache \ - pip install cnlp-transformers - -# temporary workaround for transformers/huggingface version issue -RUN pip uninstall -y huggingface-hub -RUN pip install huggingface-hub==0.10.1 + pip install cnlp-transformers==$cnlpt_version WORKDIR /home # this copy is to support the preload of train models in the downstream images COPY .. 
/home ENTRYPOINT ["/bin/bash"] +FROM base as current +run python -c "import sys;sys.path.append('/home/docker');import model_download; model_download.current()" +ENTRYPOINT ["cnlpt_current_rest", "-p", "8000"] + FROM base as dtr run python -c "import sys;sys.path.append('/home/docker');import model_download; model_download.dtr()" ENTRYPOINT ["cnlpt_dtr_rest", "-p", "8000"] @@ -28,7 +30,9 @@ ENTRYPOINT ["cnlpt_negation_rest", "-p", "8000"] FROM base as termexists run python -c "import sys;sys.path.append('/home/docker');import model_download; model_download.termexists()" -ENTRYPOINT ["cnlpt_termexists", "-p", "8000"] +# Temporary fix, remove once the released pip package has the new model +run sed -i 's/sharpseed-termexists/termexists_pubmedbert_ssm/g' /usr/local/lib/python3.9/site-packages/cnlpt/api/termexists_rest.py +ENTRYPOINT ["cnlpt_termexists_rest", "-p", "8000"] FROM base as temporal run python -c "import sys;sys.path.append('/home/docker');import model_download; model_download.temporal()" diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu index cd43c1b1..4e8a8085 100644 --- a/docker/Dockerfile.gpu +++ b/docker/Dockerfile.gpu @@ -1,13 +1,10 @@ -FROM nvidia/cuda:10.2-runtime-ubi8 as base +FROM nvidia/cuda:12.2.0-runtime-ubi8 as base -RUN yum -y install python39 python39-pip -RUN pip3.9 install cython torch -RUN pip3.9 install cnlp-transformers - -# temporary workaround for transformers/huggingface version issue -RUN pip3.9 uninstall -y huggingface-hub -RUN pip3.9 install huggingface-hub==0.11.0 +ARG cnlpt_version +RUN yum -y install python39 python39-pip +RUN pip3.9 install cython torch +RUN pip3.9 install cnlp-transformers==$cnlpt_version WORKDIR /opt/cnlp/ @@ -33,6 +30,8 @@ ENTRYPOINT ["cnlpt_negation_rest", "-p", "8000"] FROM base as termexists run /usr/bin/python3.9 -c "import sys;sys.path.append('/home/docker');import model_download; model_download.termexists()" +# Temporary fix, remove once the released pip package has the new model +run sed -i 
's/sharpseed-termexists/termexists_pubmedbert_ssm/g' /usr/local/lib/python3.9/site-packages/cnlpt/api/termexists_rest.py ENTRYPOINT ["cnlpt_termexists_rest", "-p", "8000"] FROM base as temporal diff --git a/docker/MAINTAINER.md b/docker/MAINTAINER.md index 08af2691..48bf2a2a 100644 --- a/docker/MAINTAINER.md +++ b/docker/MAINTAINER.md @@ -1,25 +1,30 @@ -To deploy images to dockerhub, first auth with docker with an account that -has access to the smartonfhir organization. Then, the following commands -should build and publish images (in the CPU case, for multiple architectures). +# Creating cNLPT Docker Images +## Setup +- First authenticate with Docker with an account that has access to the + [smartonfhir](https://hub.docker.com/u/smartonfhir/) organization. +- Make sure you have a local docker buildx setup that supports both amd64 and arm64. + - Run `docker buildx ls` to see your current setup. + - If you don't have a multi-platform instance already, you can create a new default one with: + `docker buildx create --driver docker-container --name cross-builder --platform linux/amd64,linux/arm64 --use` -MODEL should be one of: [base, dtr, event, negation, temporal, timex] -PROCESSOR should be one of: [cpu, gpu] -PLATFORMS should be linux/amd64 for GPU, and linux/amd64,linux/arm64 for CPU +## Building +Use the `./build.py` script to build new images. +Pass `--help` to see all your options. 
+
+### Local Testing
+Use the `./build.py` script to build the image you care about,
+and then run something like the following, depending on your model:
+```shell
+docker run --rm -p 8000:8000 smartonfhir/cnlp-transformers:termexists-latest-cpu
 ```
-export MAJOR=0
-export MINOR=4
-export PATCH=0
-export MODEL=negation
-export PROCESSOR=cpu
-export PLATFORMS=linux/amd64,linux/arm64
-docker buildx build \
---push --platform $PLATFORMS \
---tag smartonfhir/cnlp-transformers:$MODEL-latest-$PROCESSOR \
---tag smartonfhir/cnlp-transformers:$MODEL-$MAJOR-$PROCESSOR \
---tag smartonfhir/cnlp-transformers:$MODEL-$MAJOR.$MINOR-$PROCESSOR \
---tag smartonfhir/cnlp-transformers:$MODEL-$MAJOR.$MINOR.$PATCH-$PROCESSOR \
--f Dockerfile.$PROCESSOR \
---target $MODEL .
+With that specific example of the `termexists` model, you could smoke test it like so:
+```shell
+curl http://localhost:8000/termexists/process -H "Content-Type: application/json" -d '{"doc_text": "Patient has no cough", "entities": [[0, 6], [15, 19]]}'; echo
 ```
+Which should print `{"statuses":[1,-1]}` (the word `cough` was negated, but `Patient` was not).
+
+### Publishing to Docker Hub
+Run the same `./build.py` command you tested with, but add the `--push` flag.
+The built images will be pushed to Docker Hub.
diff --git a/docker/build.py b/docker/build.py
new file mode 100755
index 00000000..637a5940
--- /dev/null
+++ b/docker/build.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import subprocess
+
+# At time of writing this comment, the cnn and hier models seem to be works in progress, so aren't included here.
# Dockerfile build targets we know how to build and publish.
MODELS = [
    "current",
    "dtr",
    "event",
    "negation",
    "temporal",
    "termexists",
    "timex",
]


def _parse_args() -> argparse.Namespace:
    """Define and parse the command line arguments for this script."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        action="append",
        choices=["all"] + MODELS,
        help="model image to build (can be given multiple times; defaults to all)",
    )
    parser.add_argument(
        "--processor",
        choices=["all", "cpu", "gpu"],
        default="all",
        help="processor flavor to build (defaults to all)",
    )
    parser.add_argument(
        "--push",
        action="store_true",
        default=False,
        help="push the built images to Docker Hub (defaults to a local --load)",
    )
    return parser.parse_args()


def parse_latest_version(pip_output: str) -> str:
    """
    Extract the newest release number from `pip index versions` output.

    :param pip_output: the full stdout of a `pip index versions PKG` call
    :returns: the version string from the "LATEST:" line
    :raises SystemExit: if no "LATEST:" line is present in the output
    """
    # Scan every line rather than assuming LATEST is the very last one, so that
    # trailing warnings or blank lines in pip's output don't break the parse.
    for line in pip_output.strip().splitlines():
        if "LATEST:" in line:
            return line.split()[-1]
    raise SystemExit("Did not understand 'pip index versions' output")


def get_latest_pip_version(package: str) -> str:
    """Query pip for the latest release of a software package"""
    process = subprocess.run(
        # Use a python version that matches the Dockerfiles
        ["pip", "index", "--python-version=3.9", "versions", package],
        capture_output=True,
        check=True,
        text=True,  # have subprocess decode stdout for us
    )
    return parse_latest_version(process.stdout)


def build_one(model: str, processor: str, *, version: str, push: bool = False) -> None:
    """
    Builds a single docker image.

    :param model: the Dockerfile build target, one of MODELS
    :param processor: "cpu" or "gpu" (selects which Dockerfile is used)
    :param version: the cnlp-transformers release to install, in x.y.z form
    :param push: if True, push the result to Docker Hub; else load it into the local daemon
    :raises SystemExit: if the version is not in x.y.z form
    :raises subprocess.CalledProcessError: if the docker build fails
    """
    print(f"Building model {model} for processor {processor}:")

    pwd = os.path.dirname(__file__)

    # Fail with a clear message (instead of an IndexError) on odd version strings,
    # since all four tags below depend on the x.y.z structure.
    try:
        major, minor, patch = version.split(".")
    except ValueError:
        raise SystemExit(f"Did not understand version '{version}' (expected x.y.z)") from None

    platforms = "linux/amd64"
    # Only build extra platforms on push, because --load can't do multi-platforms
    if processor == "cpu" and push:
        platforms += ",linux/arm64"

    build_args = [
        # to make sure that we don't have a version mismatch, we pin cnlpt
        f"--build-arg=cnlpt_version={version}",
        f"--file={pwd}/Dockerfile.{processor}",
        f"--platform={platforms}",
        f"--tag=smartonfhir/cnlp-transformers:{model}-latest-{processor}",
        f"--tag=smartonfhir/cnlp-transformers:{model}-{major}-{processor}",
        f"--tag=smartonfhir/cnlp-transformers:{model}-{major}.{minor}-{processor}",
        f"--tag=smartonfhir/cnlp-transformers:{model}-{major}.{minor}.{patch}-{processor}",
        f"--target={model}",
        pwd,
    ]
    if push:
        build_args.append("--push")  # to push to docker hub
    else:
        build_args.append("--load")  # to load into docker locally

    subprocess.run(["docker", "buildx", "build", *build_args], check=True)


if __name__ == "__main__":
    # Parse arguments here (not at import time) so the module stays importable for testing.
    args = _parse_args()

    processors = ["cpu", "gpu"] if args.processor == "all" else [args.processor]

    models = args.model
    if not args.model or "all" in args.model:
        models = MODELS

    # Check version of cnlpt available via pip.
    # Our Dockerfiles pull directly from pip, so we want to be setting the same version as we'll install.
    # We don't want to pull the version from our sibling code in this repo, because it might not be released yet,
    # but we still want to be able to push new builds of the existing releases.
    version = get_latest_pip_version("cnlp-transformers")

    for model in models:
        for processor in processors:
            build_one(model, processor, version=version, push=args.push)