diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..37a08a0 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,10 @@ +version: 2 +updates: +- package-ecosystem: "docker" + directory: "/" + schedule: + interval: "daily" +- package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9f11b75 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c4b6a1c..ea3c4b6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -50,10 +50,6 @@ For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of opensource-codeofconduct@amazon.com with any additional questions or comments. -## Security issue notifications -If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. - - ## Licensing See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. diff --git a/README.md b/README.md index 847260c..19ef5db 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -## My Project +## Neuron Deep Learning Containers TODO: Fill this README out! @@ -9,7 +9,7 @@ Be sure to: ## Security -See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. +See [SECURITY](SECURITY.md) for more information. ## License diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..75a3b51 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,11 @@ +## Reporting Security Issues + +We take all security reports seriously. +When we receive such reports, +we will investigate and subsequently address +any potential vulnerabilities as quickly as possible. 
+If you discover a potential security issue in this project, +please notify AWS/Amazon Security via our +[vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/) +or directly via email to [AWS Security](mailto:aws-security@amazon.com). +Please do *not* create a public GitHub issue in this project. diff --git a/docker/common/deep_learning_container.py b/docker/common/deep_learning_container.py new file mode 100644 index 0000000..207df7d --- /dev/null +++ b/docker/common/deep_learning_container.py @@ -0,0 +1,365 @@ +# Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
import argparse
import json
import logging
import multiprocessing
import os
import re
import signal
import sys

import botocore.session
import requests

# Seconds main() waits for each telemetry child process before killing it.
TIMEOUT_SECS = 5

# Timeouts (seconds) tried in order for instance metadata requests.
_IMDS_TIMEOUTS = (1, 2, 3)

# Regions accepted by _retrieve_instance_region(); any other value is discarded.
_VALID_REGIONS = (
    "ap-northeast-1",
    "ap-northeast-2",
    "ap-southeast-1",
    "ap-southeast-2",
    "ap-south-1",
    "ca-central-1",
    "eu-central-1",
    "eu-north-1",
    "eu-west-1",
    "eu-west-2",
    "eu-west-3",
    "sa-east-1",
    "us-east-1",
    "us-east-2",
    "us-west-1",
    "us-west-2",
)

_INSTANCE_ID_PATTERN = re.compile(r"^(i-\S{17})")


def requests_helper(url, headers=None, timeout=0.1):
    """
    Send a single GET request and return the response
    :param url: str, url to get the request
    :param headers: dict, optional headers to send with the request
    :param timeout: float, timeout value for a request
    :return: the response object, or None when the request raised a RequestException
    """
    response = None
    try:
        # Falsy headers are sent as "no headers", exactly like omitting the argument.
        response = requests.get(url, headers=headers or None, timeout=timeout)
    except requests.exceptions.RequestException as e:
        logging.error("Request exception: %s", e)

    return response


def _imds_request(method, url, headers=None):
    """
    Call an instance metadata endpoint, retrying with a growing timeout
    :param method: callable, the requests function to use (requests.get or requests.put)
    :param url: str, url to send the request to
    :param headers: dict, optional headers to send with the request
    :return: the last response received, or None if no attempt produced one
    """
    response = None
    # The try deliberately wraps the whole loop: a RequestException (a timeout
    # included) ends the attempts instead of moving on to the next timeout.
    try:
        for timeout in _IMDS_TIMEOUTS:
            response = method(url, headers=headers, timeout=timeout)
            # A requests response is truthy when its status is not an error.
            if response:
                break
    except requests.exceptions.RequestException as e:
        logging.error("Request exception: %s", e)

    return response


def _successful_text(response):
    """
    Return the body of a response, or None when there is no response or its
    status code is in the 4xx/5xx range
    """
    if response is None or 400 <= response.status_code < 600:
        return None
    return response.text


def requests_helper_imds(url, token=None):
    """
    Requests to get instance metadata using imdsv1 and imdsv2
    :param url: str, url to get the request
    :param token: str, token is needed to use imdsv2
    :return: str response body, or None when the request failed
    """
    headers = {"X-aws-ec2-metadata-token": token} if token else None
    return _successful_text(_imds_request(requests.get, url, headers))


def get_imdsv2_token():
    """
    Retrieve token using imdsv2 service
    :return: str token, or None when the token request failed
    """
    headers = {"X-aws-ec2-metadata-token-ttl-seconds": "600"}
    url = "http://169.254.169.254/latest/api/token"
    return _successful_text(_imds_request(requests.put, url, headers))


def _validate_instance_id(instance_id):
    """
    Validate instance ID
    :param instance_id: str, text returned by the metadata service
    :return: the leading "i-" plus 17 non-whitespace characters, or None if the
        text does not start with such an ID
    """
    match = _INSTANCE_ID_PATTERN.match(instance_id)
    return match.group(1) if match else None


def _retrieve_instance_id(token=None):
    """
    Retrieve instance ID from instance metadata service
    :param token: str, optional imdsv2 token
    :return: validated instance ID; a falsy value when it could not be retrieved
    """
    instance_url = "http://169.254.169.254/latest/meta-data/instance-id"
    instance_id = requests_helper_imds(instance_url, token)

    if instance_id:
        instance_id = _validate_instance_id(instance_id)

    return instance_id


def _retrieve_instance_region(token=None):
    """
    Retrieve instance region from instance metadata service
    :param token: str, optional imdsv2 token
    :return: str region when the identity document names one of _VALID_REGIONS,
        otherwise None
    """
    region_url = "http://169.254.169.254/latest/dynamic/instance-identity/document"
    response_text = requests_helper_imds(region_url, token)
    if not response_text:
        return None

    # Telemetry is best effort: a malformed document must not crash main().
    try:
        document = json.loads(response_text)
    except ValueError as e:
        logging.error("Failed to parse instance identity document: %s", e)
        return None

    region = document.get("region") if isinstance(document, dict) else None
    return region if region in _VALID_REGIONS else None


def _retrieve_device():
    """
    Guess the accelerator type from well-known paths inside the container
    :return: "gpu", "eia", "neuron" or "cpu"; the first matching check wins
    """
    if os.path.isdir("/usr/local/cuda"):
        return "gpu"
    if os.path.isdir("/opt/ei_tools"):
        return "eia"
    # NOTE(review): neuron is detected only through the TensorFlow model server
    # binary, so images without it report "cpu" -- confirm this is intended.
    if os.path.exists("/usr/local/bin/tensorflow_model_server_neuron"):
        return "neuron"
    return "cpu"


def _retrieve_cuda():
    """
    Read the CUDA version from the target of the /usr/local/cuda symlink
    :return: str "<major>.<minor>", or "" when the link cannot be read or its
        target name contains no version
    """
    cuda_version = ""
    try:
        cuda_path = os.path.basename(os.readlink("/usr/local/cuda"))
    except OSError as e:
        logging.error("Failed to get cuda path: %s", e)
    else:
        version_match = re.search(r"\d+\.\d+", cuda_path)
        if version_match:
            cuda_version = version_match.group()
    return cuda_version


def _retrieve_os():
    """
    Build an OS identifier from /etc/os-release
    :return: str, the unquoted ID value followed by the quoted "X.Y" VERSION_ID
        value; either part is "" when its line is absent or shaped differently
    :raises OSError: if /etc/os-release cannot be opened
    """
    name = ""
    version = ""
    with open("/etc/os-release", "r") as f:
        for line in f:
            name_match = re.match(r"^ID=(\w+)$", line)
            if name_match:
                name = name_match.group(1)
            version_match = re.match(r'^VERSION_ID="(\d+\.\d+)"$', line)
            if version_match:
                version = version_match.group(1)
    return name + version


def parse_args():
    """
    Parsing function to parse input arguments.
    Return: args, which contains parsed input arguments.
    Unknown command line arguments are ignored.
    Raises: AssertionError if the framework version is not X.Y or X.Y.Z
    (optionally followed by -rcN).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--framework",
        choices=["tensorflow", "mxnet", "pytorch"],
        help="framework of container image.",
        required=True,
    )
    parser.add_argument(
        "--framework-version", help="framework version of container image.", required=True
    )
    parser.add_argument(
        "--container-type",
        choices=["training", "inference"],
        help="What kind of jobs you want to run on container. Either training or inference.",
        required=True,
    )

    args, _unknown = parser.parse_known_args()

    fw_version_pattern = r"\d+(\.\d+){1,2}(-rc\d)?"

    # PT 1.10 and above has +cpu or +cu113 string, so handle accordingly
    if args.framework == "pytorch":
        pt_fw_version_pattern = r"(\d+(\.\d+){1,2}(-rc\d)?)((\+cpu)|(\+cu\d{3})|(a0\+git\w{7}))"
        pt_fw_version_match = re.fullmatch(pt_fw_version_pattern, args.framework_version)
        if pt_fw_version_match:
            args.framework_version = pt_fw_version_match.group(1)

    # Raised explicitly (same exception type as the former assert statement) so
    # the check is not stripped when Python runs with -O.
    if not re.fullmatch(fw_version_pattern, args.framework_version):
        raise AssertionError(
            f"args.framework_version = {args.framework_version} does not match {fw_version_pattern}\n"
            f"Please specify framework version as X.Y.Z or X.Y."
        )

    # TFS 2.12.1 still uses TF 2.12.0 and breaks the telemetry check as it is checking TF version
    # instead of TFS version. We are forcing the version we want.
    if (
        args.framework == "tensorflow"
        and args.container_type == "inference"
        and args.framework_version == "2.12.0"
    ):
        args.framework_version = "2.12.1"

    return args


def query_bucket(instance_id, region):
    """
    GET request on an empty object from an Amazon S3 bucket
    :param instance_id: str, instance ID, or None when it is unknown
    :param region: str, instance region, or None when it is unknown
    :return: the response object, or None when the request was skipped or failed
    """
    response = None
    args = parse_args()
    framework, framework_version, container_type = (
        args.framework,
        args.framework_version,
        args.container_type,
    )
    py_version = sys.version.split(" ")[0]

    if instance_id is not None and region is not None:
        url = (
            "https://aws-deep-learning-containers-{0}.s3.{0}.amazonaws.com"
            "/dlc-containers-{1}.txt?x-instance-id={1}&x-framework={2}&x-framework_version={3}&x-py_version={4}&x-container_type={5}".format(
                region, instance_id, framework, framework_version, py_version, container_type
            )
        )
        response = requests_helper(url, timeout=0.2)
        # In test mode the requested URL is recorded in /tmp/test_request.txt.
        if os.environ.get("TEST_MODE") == str(1):
            with open(os.path.join(os.sep, "tmp", "test_request.txt"), "w+") as rf:
                rf.write(url)

    logging.debug("Query bucket finished: %s", response)

    return response


def tag_instance(instance_id, region):
    """
    Apply instance tag on the instance that is running the container using botocore
    :param instance_id: str, instance ID, or None when it is unknown
    :param region: str, instance region, or None when it is unknown
    :return: HTTP status code of the create_tags call, or None when tagging was
        skipped or failed
    """
    args = parse_args()
    framework, framework_version, container_type = (
        args.framework,
        args.framework_version,
        args.container_type,
    )
    py_version = sys.version.split(" ")[0]
    device = _retrieve_device()
    cuda_version = f"_cuda{_retrieve_cuda()}" if device == "gpu" else ""
    os_version = _retrieve_os()

    tag = f"{framework}_{container_type}_{framework_version}_python{py_version}_{device}{cuda_version}_{os_version}"
    tag_struct = {"Key": "aws-dlc-autogenerated-tag-do-not-delete", "Value": tag}

    request_status = None
    if instance_id and region:
        try:
            session = botocore.session.get_session()
            ec2_client = session.create_client("ec2", region_name=region)
            response = ec2_client.create_tags(Resources=[instance_id], Tags=[tag_struct])
            request_status = response.get("ResponseMetadata").get("HTTPStatusCode")
            # In test mode the applied tag is recorded in /tmp/test_tag_request.txt.
            if os.environ.get("TEST_MODE") == str(1):
                with open(os.path.join(os.sep, "tmp", "test_tag_request.txt"), "w+") as rf:
                    rf.write(json.dumps(tag_struct, indent=4))
        except Exception as e:
            # Best-effort boundary: tagging must never take the container down.
            logging.error("Error. %s", e)
        else:
            # Only reported when the whole tagging sequence completed.
            logging.debug("Instance tagged successfully: %s", request_status)
    else:
        logging.error("Failed to retrieve instance_id or region")

    return request_status


def main():
    """
    Invoke bucket query and instance tagging, each in its own process that is
    killed if it is still running TIMEOUT_SECS after being joined
    """
    # Logs are not necessary for normal run. Remove this line while debugging.
    logging.getLogger().disabled = True

    logging.basicConfig(level=logging.ERROR)

    # A missing token simply makes the lookups below go out without the token header.
    token = get_imdsv2_token()
    instance_id = _retrieve_instance_id(token)
    region = _retrieve_instance_region(token)

    bucket_process = multiprocessing.Process(target=query_bucket, args=(instance_id, region))
    tag_process = multiprocessing.Process(target=tag_instance, args=(instance_id, region))

    bucket_process.start()
    tag_process.start()

    tag_process.join(TIMEOUT_SECS)
    bucket_process.join(TIMEOUT_SECS)

    for process in (tag_process, bucket_process):
        if process.is_alive():
            os.kill(process.pid, signal.SIGKILL)
            process.join()


if __name__ == "__main__":
    main()


# ---------------------------------------------------------------------------
# The remainder of this patch line is the header of the next file in the diff.
# It is carried over unchanged, one diff line per comment line.
# ---------------------------------------------------------------------------
# diff --git a/docker/common/neuron-monitor.sh b/docker/common/neuron-monitor.sh
# new file mode 100644
# index 0000000..d623064
# --- /dev/null
# +++ b/docker/common/neuron-monitor.sh
# @@ -0,0 +1,30 @@
# +#!/bin/bash
# +
# +# Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. + +neuron_monitor_running=0 +if [[ ! -z "${NEURON_MONITOR_CW_REGION}" ]]; then + # Start neuron monitor. If namespace/region variable is set then use it. + if [[ ! -z "${NEURON_MONITOR_CONFIG_FILE}" ]]; then + config="--config-file ${NEURON_MONITOR_CONFIG_FILE:+ $NEURON_MONITOR_CONFIG_FILE}" + fi + if [[ ! -z "${NEURON_MONITOR_CW_NAMESPACE}" ]]; then + mnnamespace="--namespace ${NEURON_MONITOR_CW_NAMESPACE:+ $NEURON_MONITOR_CW_NAMESPACE}" + fi + region="--region ${NEURON_MONITOR_CW_REGION:+ $NEURON_MONITOR_CW_REGION}" + /opt/aws/neuron/bin/neuron-monitor ${config:+ $config} | /opt/aws/neuron/bin/neuron-monitor-cloudwatch.py ${mnnamespace:+ $mnnamespace} ${region:+$region} >> /tmp/nm.log 2>&1 & + nm_pid=$! 
+ echo "Neuron Monitor Started" + neuron_monitor_running=1 +fi diff --git a/docker/pytorch/inference/1.13.1/Dockerfile.neuron b/docker/pytorch/inference/1.13.1/Dockerfile.neuron new file mode 100644 index 0000000..e769c43 --- /dev/null +++ b/docker/pytorch/inference/1.13.1/Dockerfile.neuron @@ -0,0 +1,152 @@ +FROM public.ecr.aws/docker/library/ubuntu:20.04 + +LABEL dlc_major_version="1" +LABEL maintainer="Amazon AI" +LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true + +# Neuron SDK components version numbers +ARG NEURONX_TOOLS_VERSION=2.16.* +ARG NEURON_FRAMEWORK_VERSION=1.13.1.2.9.17.* +ARG NEURON_CC_VERSION=1.21.* + +ARG PYTHON=python3.10 +ARG PYTHON_VERSION=3.10.12 +ARG TORCHSERVE_VERSION=0.9.0 +ARG SM_TOOLKIT_VERSION=2.0.21 +ARG MAMBA_VERSION=23.1.0-4 + +# See http://bugs.python.org/issue19846 +ENV LANG=C.UTF-8 +ENV LD_LIBRARY_PATH=/lib/x86_64-linux-gnu:/opt/conda/lib/:$LD_LIBRARY_PATH +ENV PATH=/opt/conda/bin:/opt/aws/neuron/bin:$PATH +ENV SAGEMAKER_SERVING_MODULE=sagemaker_pytorch_serving_container.serving:main +ENV TEMP=/home/model-server/tmp + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get install -y --no-install-recommends software-properties-common \ + && add-apt-repository ppa:openjdk-r/ppa \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + build-essential \ + apt-transport-https \ + ca-certificates \ + cmake \ + curl \ + emacs \ + git \ + jq \ + libgl1-mesa-glx \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender-dev \ + openjdk-11-jdk \ + vim \ + wget \ + unzip \ + zlib1g-dev \ + libcap-dev \ + gpg-agent \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list +RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - + +RUN apt-get update \ + && apt-get install -y aws-neuronx-tools=$NEURONX_TOOLS_VERSION \ + && rm -rf 
/var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +# https://github.com/docker-library/openjdk/issues/261 https://github.com/docker-library/openjdk/pull/263/files +RUN keytool -importkeystore -srckeystore /etc/ssl/certs/java/cacerts -destkeystore /etc/ssl/certs/java/cacerts.jks -deststoretype JKS -srcstorepass changeit -deststorepass changeit -noprompt; \ + mv /etc/ssl/certs/java/cacerts.jks /etc/ssl/certs/java/cacerts; \ + /var/lib/dpkg/info/ca-certificates-java.postinst configure; + +RUN curl -L -o ~/mambaforge.sh https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-x86_64.sh \ + && chmod +x ~/mambaforge.sh \ + && ~/mambaforge.sh -b -p /opt/conda \ + && rm ~/mambaforge.sh \ + && /opt/conda/bin/conda update -y conda \ + && /opt/conda/bin/conda install -c conda-forge -y \ + python=$PYTHON_VERSION \ + pyopenssl \ + cython \ + mkl-include \ + mkl \ + parso \ + typing \ + # Below 2 are included in miniconda base, but not mamba so need to install + conda-content-trust \ + charset-normalizer \ + + && /opt/conda/bin/conda clean -ya + +RUN conda install -c conda-forge \ + scikit-learn \ + h5py \ + requests \ + && conda clean -ya \ + && pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \ + && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 \ + && pip install packaging \ + enum-compat \ + ipython + +# Version specifiers are quoted so the shell does not treat '>' as an output redirection +RUN pip install --no-cache-dir -U \ + "opencv-python>=4.8.1.78" \ + "numpy<1.24,>1.21" \ + "scipy>=1.8.0" \ + six \ + "pillow>=10.0.1" \ + "awscli<2" \ + pandas==1.* \ + boto3 \ + cryptography + +RUN pip install neuron-cc==$NEURON_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \ + torch-neuron==$NEURON_FRAMEWORK_VERSION \ + && pip install -U protobuf==3.19.5 \ + torchserve==${TORCHSERVE_VERSION} \ + torch-model-archiver==${TORCHSERVE_VERSION} \ + && pip install --no-deps --no-cache-dir -U torchvision==0.14.* + +RUN useradd -m model-server \ + && mkdir
-p /home/model-server/tmp /opt/ml/model \ + && chown -R model-server /home/model-server /opt/ml/model + +COPY neuron-entrypoint.py /usr/local/bin/dockerd-entrypoint.py +COPY neuron-monitor.sh /usr/local/bin/neuron-monitor.sh +COPY torchserve-neuron.sh /usr/local/bin/entrypoint.sh +COPY config.properties /home/model-server + +RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ + && chmod +x /usr/local/bin/neuron-monitor.sh \ + && chmod +x /usr/local/bin/entrypoint.sh + +ADD https://raw.githubusercontent.com/aws/deep-learning-containers/master/src/deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +RUN pip install --no-cache-dir "sagemaker-pytorch-inference==${SM_TOOLKIT_VERSION}" + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* \ + # conda leaves an empty /root/.cache/conda/notices.cache file which is not removed by conda clean -ya + && rm -rf ${HOME_DIR}/.cache/conda + +RUN curl https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.13/license.txt -o /license.txt + +EXPOSE 8080 8081 + +ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] +CMD ["/usr/local/bin/entrypoint.sh"] diff --git a/docker/pytorch/inference/1.13.1/Dockerfile.neuron.cve_allowlist.json b/docker/pytorch/inference/1.13.1/Dockerfile.neuron.cve_allowlist.json new file mode 100644 index 0000000..8c3e065 --- /dev/null +++ b/docker/pytorch/inference/1.13.1/Dockerfile.neuron.cve_allowlist.json @@ -0,0 +1,68 @@ +{ + "CVE-2023-5678": { + 
"vulnerability_id": "CVE-2023-5678", + "title": "CVE-2023-5678 - cryptography", + "description": "Issue summary: Generating excessively long X9.42 DH keys or checking\nexcessively long X9.42 DH keys or parameters may be very slow.\n\nImpact summary: Applications that use the functions DH_generate_key() to\ngenerate an X9.42 DH key may experience long delays. Likewise, applications\nthat use DH_check_pub_key(), DH_check_pub_key_ex() or EVP_PKEY_public_check()\nto check an X9.42 DH key or X9.42 DH parameters may experience long delays.\nWhere the key or parameters that are being checked have been obtained from\nan untrusted source this may lead to a Denial of Service.\n\nWhile DH_check() performs all the necessary checks (as of CVE-2023-3817),\nDH_check_pub_key() doesn't make any of these checks, and is therefore\nvulnerable for excessively large P and Q parameters.\n\nLikewise, while DH_generate_key() performs a check for an excessively large\nP, it doesn't check for an excessively large Q.\n\nAn application that calls DH_generate_key() or DH_check_pub_key() and\nsupplies a key or parameters obtained from an untrusted sour", + "vulnerable_packages": [ + { + "epoch": 0, + "filePath": "opt/conda/lib/python3.10/site-packages/cryptography-41.0.7.dist-info/METADATA", + "name": "cryptography", + "packageManager": "PYTHONPKG", + "version": "41.0.7" + } + ], + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "score": 5.3, + "score_details": { + "cvss": { + "adjustments": [], + "score": 5.3, + "scoreSource": "NVD", + "scoringVector": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:L", + "version": "3.1" + } + }, + "source": "NVD", + "source_url": "https://nvd.nist.gov/vuln/detail/CVE-2023-5678", + "severity": "MEDIUM", + "status": "ACTIVE" + }, + "SNYK-PYTHON-CRYPTOGRAPHY-6126975": { + "vulnerability_id": "SNYK-PYTHON-CRYPTOGRAPHY-6126975", + "title": "IN1-PYTHON-CRYPTOGRAPHY-6126975 - cryptography", + "description": "## Overview\n\nAffected versions 
of this package are vulnerable to Information Exposure. This issue may allow a remote attacker to decrypt captured messages in TLS servers that use RSA key exchanges, which may lead to exposure of confidential or sensitive data.\n\n**Note:**\n\n\nThis vulnerability exists due to an incomplete fix for [CVE-2020-25659](https://security.snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-1022152).\n## Remediation\nThere is no fixed version for `cryptography`.\n## References\n- [GitHub Issue](https://github.com/pyca/cryptography/issues/9785#issuecomment-1856209406)", + "vulnerable_packages": [ + { + "epoch": 0, + "filePath": "opt/conda/lib/python3.10/site-packages/cryptography-41.0.7.dist-info/METADATA", + "name": "cryptography", + "packageManager": "PYTHONPKG", + "version": "41.0.7" + } + ], + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "score": 5.9, + "score_details": { + "cvss": { + "adjustments": [], + "score": 5.9, + "scoreSource": "SNYK", + "scoringVector": "CVSS:3.1/AV:N/AC:H/PR:N/UI:N/S:U/C:H/I:N/A:N/E:P", + "version": "3.1" + } + }, + "source": "SNYK", + "source_url": "https://security.snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-6126975", + "severity": "MEDIUM", + "status": "ACTIVE" + } +} diff --git a/docker/pytorch/inference/1.13.1/Dockerfile.neuronx b/docker/pytorch/inference/1.13.1/Dockerfile.neuronx new file mode 100644 index 0000000..f6c57c1 --- /dev/null +++ b/docker/pytorch/inference/1.13.1/Dockerfile.neuronx @@ -0,0 +1,167 @@ +FROM public.ecr.aws/docker/library/ubuntu:20.04 + +LABEL dlc_major_version="1" +LABEL maintainer="Amazon AI" +LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true + +# Neuron SDK components version numbers +ARG NEURONX_RUNTIME_LIB_VERSION=2.19.* +ARG NEURONX_COLLECTIVES_LIB_VERSION=2.19.* +ARG NEURONX_TOOLS_VERSION=2.16.* +ARG NEURONX_FRAMEWORK_VERSION=1.13.1.1.13.* +ARG NEURONX_TRANSFORMERS_VERSION=0.9.* +ARG NEURONX_CC_VERSION=2.12.* +ARG NEURONX_DISTRIBUTED_VERSION=0.6.* + +ARG PYTHON=python3.10 
+ARG PYTHON_VERSION=3.10.12 +ARG TORCHSERVE_VERSION=0.9.0 +ARG SM_TOOLKIT_VERSION=2.0.21 +ARG MAMBA_VERSION=23.1.0-4 + +# See http://bugs.python.org/issue19846 +ENV LANG=C.UTF-8 +ENV LD_LIBRARY_PATH=/opt/aws/neuron/lib:/lib/x86_64-linux-gnu:/opt/conda/lib/:$LD_LIBRARY_PATH +ENV PATH=/opt/conda/bin:/opt/aws/neuron/bin:$PATH +ENV SAGEMAKER_SERVING_MODULE=sagemaker_pytorch_serving_container.serving:main +ENV TEMP=/home/model-server/tmp + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get install -y --no-install-recommends software-properties-common \ + && add-apt-repository ppa:openjdk-r/ppa \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + build-essential \ + apt-transport-https \ + ca-certificates \ + cmake \ + curl \ + emacs \ + git \ + jq \ + libgl1-mesa-glx \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender-dev \ + openjdk-11-jdk \ + vim \ + wget \ + unzip \ + zlib1g-dev \ + libcap-dev \ + gpg-agent \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list +RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - + +RUN apt-get update \ + && apt-get install -y \ + aws-neuronx-tools=$NEURONX_TOOLS_VERSION \ + aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \ + aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +# https://github.com/docker-library/openjdk/issues/261 https://github.com/docker-library/openjdk/pull/263/files +RUN keytool -importkeystore -srckeystore /etc/ssl/certs/java/cacerts -destkeystore /etc/ssl/certs/java/cacerts.jks -deststoretype JKS -srcstorepass changeit -deststorepass changeit -noprompt; \ + mv /etc/ssl/certs/java/cacerts.jks /etc/ssl/certs/java/cacerts; \ + /var/lib/dpkg/info/ca-certificates-java.postinst configure; + +RUN curl -L -o 
~/mambaforge.sh https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-x86_64.sh \ + && chmod +x ~/mambaforge.sh \ + && ~/mambaforge.sh -b -p /opt/conda \ + && rm ~/mambaforge.sh \ + && /opt/conda/bin/conda update -y conda \ + && /opt/conda/bin/conda install -c conda-forge -y \ + python=$PYTHON_VERSION \ + pyopenssl \ + cython \ + mkl-include \ + mkl \ + parso \ + typing \ + # Below 2 are included in miniconda base, but not mamba so need to install + conda-content-trust \ + charset-normalizer \ + + && /opt/conda/bin/conda clean -ya + +RUN conda install -c conda-forge \ + scikit-learn \ + h5py \ + requests \ + && conda clean -ya \ + && pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \ + && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 \ + && pip install packaging \ + enum-compat \ + ipython + +# Version specifiers are quoted so the shell does not treat '>' as an output redirection +RUN pip install --no-cache-dir -U \ + "opencv-python>=4.8.1.78" \ + "numpy<1.24,>1.21" \ + "scipy>=1.8.0" \ + six \ + "pillow>=10.0.1" \ + "awscli<2" \ + pandas==1.* \ + boto3 \ + cryptography + +RUN pip install -U --extra-index-url https://pip.repos.neuron.amazonaws.com \ + neuronx-cc==$NEURONX_CC_VERSION \ + neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \ + torch-neuronx==$NEURONX_FRAMEWORK_VERSION \ + transformers-neuronx==$NEURONX_TRANSFORMERS_VERSION \ + && pip install -U "protobuf>=3.18.3,<4" \ + torchserve==${TORCHSERVE_VERSION} \ + torch-model-archiver==${TORCHSERVE_VERSION} \ + && pip install --no-deps --no-cache-dir -U torchvision==0.14.* + +RUN useradd -m model-server \ + && mkdir -p /home/model-server/tmp /opt/ml/model \ + && chown -R model-server /home/model-server /opt/ml/model + +COPY neuron-entrypoint.py /usr/local/bin/dockerd-entrypoint.py +COPY neuron-monitor.sh /usr/local/bin/neuron-monitor.sh +COPY torchserve-neuron.sh /usr/local/bin/entrypoint.sh +COPY config.properties /home/model-server + +RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ + && chmod +x
/usr/local/bin/neuron-monitor.sh \ + && chmod +x /usr/local/bin/entrypoint.sh + +ADD https://raw.githubusercontent.com/aws/deep-learning-containers/master/src/deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +RUN pip install --no-cache-dir "sagemaker-pytorch-inference==${SM_TOOLKIT_VERSION}" + +# patch default_pytorch_inference_handler.py to import torch_neuronx +RUN DEST_DIR=$(python -c "import os.path, sagemaker_pytorch_serving_container; print(os.path.dirname(sagemaker_pytorch_serving_container.__file__))") \ + && DEST_FILE=${DEST_DIR}/default_pytorch_inference_handler.py \ + && sed -i "s/import torch/import torch, torch_neuronx/" ${DEST_FILE} + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* \ + # conda leaves an empty /root/.cache/conda/notices.cache file which is not removed by conda clean -ya + && rm -rf ${HOME_DIR}/.cache/conda + +RUN curl https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.13/license.txt -o /license.txt + +EXPOSE 8080 8081 + +ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] +CMD ["/usr/local/bin/entrypoint.sh"] diff --git a/docker/pytorch/inference/1.13.1/Dockerfile.neuronx.cve_allowlist.json b/docker/pytorch/inference/1.13.1/Dockerfile.neuronx.cve_allowlist.json new file mode 100644 index 0000000..8c3e065 --- /dev/null +++ b/docker/pytorch/inference/1.13.1/Dockerfile.neuronx.cve_allowlist.json @@ -0,0 +1,68 @@ +{ + "CVE-2023-5678": { + "vulnerability_id": "CVE-2023-5678", + 
"title": "CVE-2023-5678 - cryptography", + "description": "Issue summary: Generating excessively long X9.42 DH keys or checking\nexcessively long X9.42 DH keys or parameters may be very slow.\n\nImpact summary: Applications that use the functions DH_generate_key() to\ngenerate an X9.42 DH key may experience long delays. Likewise, applications\nthat use DH_check_pub_key(), DH_check_pub_key_ex() or EVP_PKEY_public_check()\nto check an X9.42 DH key or X9.42 DH parameters may experience long delays.\nWhere the key or parameters that are being checked have been obtained from\nan untrusted source this may lead to a Denial of Service.\n\nWhile DH_check() performs all the necessary checks (as of CVE-2023-3817),\nDH_check_pub_key() doesn't make any of these checks, and is therefore\nvulnerable for excessively large P and Q parameters.\n\nLikewise, while DH_generate_key() performs a check for an excessively large\nP, it doesn't check for an excessively large Q.\n\nAn application that calls DH_generate_key() or DH_check_pub_key() and\nsupplies a key or parameters obtained from an untrusted sour", + "vulnerable_packages": [ + { + "epoch": 0, + "filePath": "opt/conda/lib/python3.10/site-packages/cryptography-41.0.7.dist-info/METADATA", + "name": "cryptography", + "packageManager": "PYTHONPKG", + "version": "41.0.7" + } + ], + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "score": 5.3, + "score_details": { + "cvss": { + "adjustments": [], + "score": 5.3, + "scoreSource": "NVD", + "scoringVector": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:L", + "version": "3.1" + } + }, + "source": "NVD", + "source_url": "https://nvd.nist.gov/vuln/detail/CVE-2023-5678", + "severity": "MEDIUM", + "status": "ACTIVE" + }, + "SNYK-PYTHON-CRYPTOGRAPHY-6126975": { + "vulnerability_id": "SNYK-PYTHON-CRYPTOGRAPHY-6126975", + "title": "IN1-PYTHON-CRYPTOGRAPHY-6126975 - cryptography", + "description": "## Overview\n\nAffected versions of this package are vulnerable to 
Information Exposure. This issue may allow a remote attacker to decrypt captured messages in TLS servers that use RSA key exchanges, which may lead to exposure of confidential or sensitive data.\n\n**Note:**\n\n\nThis vulnerability exists due to an incomplete fix for [CVE-2020-25659](https://security.snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-1022152).\n## Remediation\nThere is no fixed version for `cryptography`.\n## References\n- [GitHub Issue](https://github.com/pyca/cryptography/issues/9785#issuecomment-1856209406)", + "vulnerable_packages": [ + { + "epoch": 0, + "filePath": "opt/conda/lib/python3.10/site-packages/cryptography-41.0.7.dist-info/METADATA", + "name": "cryptography", + "packageManager": "PYTHONPKG", + "version": "41.0.7" + } + ], + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "score": 5.9, + "score_details": { + "cvss": { + "adjustments": [], + "score": 5.9, + "scoreSource": "SNYK", + "scoringVector": "CVSS:3.1/AV:N/AC:H/PR:N/UI:N/S:U/C:H/I:N/A:N/E:P", + "version": "3.1" + } + }, + "source": "SNYK", + "source_url": "https://security.snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-6126975", + "severity": "MEDIUM", + "status": "ACTIVE" + } +} diff --git a/docker/pytorch/inference/common/config.properties b/docker/pytorch/inference/common/config.properties new file mode 100644 index 0000000..d0efd6e --- /dev/null +++ b/docker/pytorch/inference/common/config.properties @@ -0,0 +1,26 @@ +vmargs=-XX:+UseContainerSupport -XX:InitialRAMPercentage=8.0 -XX:MaxRAMPercentage=10.0 -XX:-UseLargePages -XX:+UseG1GC -XX:+ExitOnOutOfMemoryError +model_store=/opt/ml/model +load_models=ALL +inference_address=http://0.0.0.0:8080 +management_address=http://0.0.0.0:8081 +enable_envvars_config=true +# management_address=unix:/tmp/management.sock +# number_of_netty_threads=0 +# netty_client_threads=0 +# default_response_timeout=120 +# default_workers_per_model=0 +# job_queue_size=100 +# async_logging=false +# number_of_gpu=1 +# cors_allowed_origin +# 
cors_allowed_methods +# cors_allowed_headers +# keystore=src/test/resources/keystore.p12 +# keystore_pass=changeit +# keystore_type=PKCS12 +# private_key_file=src/test/resources/key.pem +# certificate_file=src/test/resources/certs.pem +# max_response_size=6553500 +# max_request_size=6553500 +# blacklist_env_vars= +# decode_input_request=false diff --git a/docker/pytorch/inference/common/neuron-entrypoint.py b/docker/pytorch/inference/common/neuron-entrypoint.py new file mode 100644 index 0000000..beb3043 --- /dev/null +++ b/docker/pytorch/inference/common/neuron-entrypoint.py @@ -0,0 +1,31 @@ +# Copyright 2019-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+
+from __future__ import absolute_import
+
+import shlex
+import subprocess
+import sys
+
+neuron_cmd = "/usr/local/bin/neuron-monitor.sh"
+subprocess.check_call(shlex.split(neuron_cmd))
+
+if sys.argv[1] == "serve":
+    from sagemaker_pytorch_serving_container import serving
+
+    serving.main()
+else:
+    subprocess.check_call(shlex.split(" ".join(sys.argv[1:])))
+
+# prevent docker exit
+subprocess.call(["tail", "-f", "/dev/null"])
diff --git a/docker/pytorch/inference/common/torchserve-neuron.sh b/docker/pytorch/inference/common/torchserve-neuron.sh
new file mode 100644
index 0000000..c11b208
--- /dev/null
+++ b/docker/pytorch/inference/common/torchserve-neuron.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#
+# Start TorchServe for the Neuron inference container.
+#
+# Usage: torchserve-neuron.sh [-m MODEL_PATH] [-t TS_CONFIG]
+#   -m  value handed to `torchserve --models`; when omitted, TorchServe is
+#       started against the model store directory instead
+#   -t  TorchServe config file (defaults to /home/model-server/config.properties)
+
+MODEL_STORE=/opt/ml/model
+TS_CONFIG=/home/model-server/config.properties
+MODEL_PATH=""
+
+while getopts ":m:t:" opt; do
+  case $opt in
+    m) MODEL_PATH="$OPTARG"
+    ;;
+    t) TS_CONFIG="$OPTARG"
+    ;;
+    :) echo "Option -$OPTARG requires an argument" >&2
+    ;;
+    \?) echo "Invalid option -$OPTARG" >&2
+    ;;
+  esac
+done
+
+printf "Model path: %s\n" "$MODEL_PATH"
+printf "TS_CONFIG: %s\n" "$TS_CONFIG"
+# Start the Model Server. The command is not backgrounded with `&`: a
+# backgrounded command always leaves $? at 0, which made the check below dead.
+if [[ -z "$MODEL_PATH" ]]; then
+  torchserve --start --ts-config "$TS_CONFIG" --model-store "$MODEL_STORE"
+else
+  # $MODEL_PATH is left unquoted so several models can be passed in one -m value.
+  torchserve --start --ts-config "$TS_CONFIG" --models $MODEL_PATH
+fi
+status=$?
+if [ $status -ne 0 ]; then
+  echo "Failed to start TorchServe: $status"
+  exit $status
+fi
\ No newline at end of file
diff --git a/docker/pytorch/training/1.13.1/Dockerfile.neuronx b/docker/pytorch/training/1.13.1/Dockerfile.neuronx
new file mode 100644
index 0000000..e1efdc3
--- /dev/null
+++ b/docker/pytorch/training/1.13.1/Dockerfile.neuronx
@@ -0,0 +1,223 @@
+FROM public.ecr.aws/docker/library/ubuntu:20.04
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="1"
+
+# Neuron SDK components version numbers
+ARG NEURONX_RUNTIME_LIB_VERSION=2.19.*
+ARG NEURONX_COLLECTIVES_LIB_VERSION=2.19.*
+ARG NEURONX_TOOLS_VERSION=2.16.*
+ARG NEURONX_FRAMEWORK_VERSION=1.13.1.1.13.*
+ARG NEURONX_CC_VERSION=2.12.*
+ARG NEURONX_DISTRIBUTED_VERSION=0.6.*
+
+ARG PYTHON=python3.10
+ARG PYTHON_VERSION=3.10.12
+ARG PIP=pip3
+ARG OMPI_VERSION=4.1.5
+
+# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+# Force stdin, stdout and stderr to be totally unbuffered.
Good for logging
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
+ENV PATH="/opt/aws/neuron/bin/:${PATH}"
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
+ENV DGLBACKEND=pytorch
+
+RUN apt-get update \
+ && apt-get upgrade -y \
+ && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    emacs \
+    git \
+    jq \
+    libopencv-dev \
+    openjdk-8-jdk-headless \
+    openjdk-8-jdk \
+    openjdk-8-jre \
+    libglib2.0-0 \
+    libgl1-mesa-glx \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    openjdk-11-jdk \
+    software-properties-common \
+    wget \
+    unzip \
+    vim \
+    zlib1g-dev \
+    openssl \
+    libssl-dev \
+    libsqlite3-dev \
+    libgdbm-dev \
+    libc6-dev \
+    libbz2-dev \
+    tk-dev \
+    libffi-dev \
+    libcap-dev \
+    gnupg2 \
+    gpg-agent \
+ && rm -rf /var/lib/apt/lists/* \
+ && apt-get clean
+
+RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
+RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
+
+RUN apt-get update \
+ && apt-get install -y \
+    aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
+    aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
+    aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+# Install Open MPI
+RUN mkdir -p /tmp/openmpi \
+ && cd /tmp/openmpi \
+ && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
+ && tar zxf openmpi-${OMPI_VERSION}.tar.gz \
+ && cd openmpi-${OMPI_VERSION} \
+ && ./configure --enable-orterun-prefix-by-default \
+ && make -j $(nproc) all \
+ && make install \
+ && ldconfig \
+ && rm -rf /tmp/openmpi
+
+# install Python
+RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
+ && tar -xzf Python-$PYTHON_VERSION.tgz \
+ && cd Python-$PYTHON_VERSION \
+ && ./configure --enable-shared --prefix=/usr/local \
+ && make -j $(nproc) && make install \
+ && cd .. && rm -rf Python-$PYTHON_VERSION* \
+ && ln -s /usr/local/bin/pip3 /usr/bin/pip \
+ && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
+ && ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+WORKDIR /
+
+# The ENV variables declared below are changed in the previous section
+# Grouping these ENV variables in the first section causes
+# ompi_info to fail. This is only observed in CPU containers
+ENV PATH="$PATH:/home/.openmpi/bin"
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
+RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
+
+# Copy workaround script for incorrect hostname
+COPY changehostname.c /
+COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
+
+# bokeh is not listed here: it is installed once, with the sagemaker extras further down
+RUN ${PIP} install --no-cache-dir -U \
+    "awscli<2" \
+    scipy \
+    click \
+    "cryptography" \
+    "sagemaker>=2,<2.184" \
+    "sagemaker-pytorch-training" \
+    psutil==5.6.7 \
+    dataset \
+    transformers \
+    Pillow
+
+RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
+RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \
+ && ${PIP} install --no-cache-dir --force-reinstall torch-neuronx==$NEURONX_FRAMEWORK_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
+ && ${PIP} install --no-cache-dir --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
+ && ${PIP} install --no-cache-dir --force-reinstall neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
+
+# attrs, neuronx-cc required: >=19.2.0, sagemaker <24,>=23.1.0
+# protobuf neuronx-cc<4, sagemaker-training >=3.9.2,<=3.20.3
+# awscli 1.25.47 has requirement docutils<0.17,>=0.10
+# etcd for kubernetes installation
+# awscli 1.27.127 has requirement rsa<4.8,>=3.1.2, but you have rsa 4.9.
+# awscli 1.27.127 requires urllib3 < 1.27, python-etcd requires urllib3 >= 1.7, latest urllib3 release is 2.0.2
+RUN ${PIP} install --no-cache-dir -U \
+    "attrs<24,>=23.1.0" \
+    "protobuf>=3.18.3,<=3.20.3" \
+    "docutils>=0.10,<0.17" \
+    "rsa<4.8,>=3.1.2" \
+    "python-etcd" \
+    "urllib3>=1.26.0,<1.27"
+
+# Install extra packages needed by sagemaker (for passing test_utility_packages_using_import)
+RUN ${PIP} install --no-cache-dir -U \
+    "bokeh>=3.0.1,<4" \
+    "imageio>=2.22,<3" \
+    "opencv-python>=4.8.1.78" \
+    "plotly>=5.11,<6" \
+    "seaborn>=0.12,<1" \
+    "numba>=0.56.4,<0.57" \
+    "shap>=0.41,<1" \
+    "numpy<1.24,>1.21"
+
+# The EFA installer runs apt-get itself, so refresh the package index first.
+# Index refresh, install and cleanup share one RUN so that a cached, stale index
+# is never reused and the apt lists / installer download do not stay in a layer.
+RUN apt-get update \
+ && cd $HOME \
+ && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
+ && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
+ && cat aws-efa-installer.key | gpg --fingerprint \
+ && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
+ && tar -xf aws-efa-installer-latest.tar.gz \
+ && cd aws-efa-installer \
+ && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
+ && cd $HOME \
+ && rm -rf aws-efa-installer* \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+# Install some common packages used by training scripts
+# torchvision is needed for MLP. It depends on torch, and torch (torch-neuronx)
+# is already installed, so install it with --no-deps
+RUN ${PIP} install --no-cache-dir --no-deps -U \
+    torchvision==0.14.*
+
+# Needed for running bert training scripts
+RUN ${PIP} install --no-cache-dir -U \
+    graphviz \
+    tensorboard==2.6 \
+    accelerate \
+    sentencepiece!=0.1.92 \
+    h5py \
+    requests
+
+COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
+
+RUN chmod +x /usr/local/bin/start_with_right_hostname.sh \
+ && chmod +x /usr/local/bin/deep_learning_container.py
+
+RUN HOME_DIR=/root \
+ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+ && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+ && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+ && chmod +x /usr/local/bin/testOSSCompliance \
+ && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+ && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+ && rm -rf ${HOME_DIR}/oss_compliance* \
+ && rm -rf /tmp/tmp*
+
+RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.13/license.txt
+
+# Starts framework
+ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
+CMD ["/bin/bash"]
diff --git a/docker/pytorch/training/1.13.1/Dockerfile.neuronx.cve_allowlist.json b/docker/pytorch/training/1.13.1/Dockerfile.neuronx.cve_allowlist.json
new file mode 100644
index 0000000..04cee05
--- /dev/null
+++ b/docker/pytorch/training/1.13.1/Dockerfile.neuronx.cve_allowlist.json
@@ -0,0 +1,68 @@
+{
+    "CVE-2023-5678": {
+        "vulnerability_id": "CVE-2023-5678",
+        "title": "CVE-2023-5678 - cryptography",
+        "description": "Issue summary: Generating excessively long X9.42 DH keys or checking\nexcessively long X9.42 DH keys or parameters may be very slow.\n\nImpact summary: Applications that use the functions DH_generate_key() to\ngenerate an X9.42 DH key may experience long delays.
Likewise, applications\nthat use DH_check_pub_key(), DH_check_pub_key_ex() or EVP_PKEY_public_check()\nto check an X9.42 DH key or X9.42 DH parameters may experience long delays.\nWhere the key or parameters that are being checked have been obtained from\nan untrusted source this may lead to a Denial of Service.\n\nWhile DH_check() performs all the necessary checks (as of CVE-2023-3817),\nDH_check_pub_key() doesn't make any of these checks, and is therefore\nvulnerable for excessively large P and Q parameters.\n\nLikewise, while DH_generate_key() performs a check for an excessively large\nP, it doesn't check for an excessively large Q.\n\nAn application that calls DH_generate_key() or DH_check_pub_key() and\nsupplies a key or parameters obtained from an untrusted sour", + "vulnerable_packages": [ + { + "epoch": 0, + "filePath": "usr/local/lib/python3.10/site-packages/cryptography-41.0.7.dist-info/METADATA", + "name": "cryptography", + "packageManager": "PYTHONPKG", + "version": "41.0.7" + } + ], + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "score": 5.3, + "score_details": { + "cvss": { + "adjustments": [], + "score": 5.3, + "scoreSource": "NVD", + "scoringVector": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:L", + "version": "3.1" + } + }, + "source": "NVD", + "source_url": "https://nvd.nist.gov/vuln/detail/CVE-2023-5678", + "severity": "MEDIUM", + "status": "ACTIVE" + }, + "SNYK-PYTHON-CRYPTOGRAPHY-6126975": { + "vulnerability_id": "SNYK-PYTHON-CRYPTOGRAPHY-6126975", + "title": "IN1-PYTHON-CRYPTOGRAPHY-6126975 - cryptography", + "description": "## Overview\n\nAffected versions of this package are vulnerable to Information Exposure. 
This issue may allow a remote attacker to decrypt captured messages in TLS servers that use RSA key exchanges, which may lead to exposure of confidential or sensitive data.\n\n**Note:**\n\n\nThis vulnerability exists due to an incomplete fix for [CVE-2020-25659](https://security.snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-1022152).\n## Remediation\nThere is no fixed version for `cryptography`.\n## References\n- [GitHub Issue](https://github.com/pyca/cryptography/issues/9785#issuecomment-1856209406)", + "vulnerable_packages": [ + { + "epoch": 0, + "filePath": "usr/local/lib/python3.10/site-packages/cryptography-41.0.7.dist-info/METADATA", + "name": "cryptography", + "packageManager": "PYTHONPKG", + "version": "41.0.7" + } + ], + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "score": 5.9, + "score_details": { + "cvss": { + "adjustments": [], + "score": 5.9, + "scoreSource": "SNYK", + "scoringVector": "CVSS:3.1/AV:N/AC:H/PR:N/UI:N/S:U/C:H/I:N/A:N/E:P", + "version": "3.1" + } + }, + "source": "SNYK", + "source_url": "https://security.snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-6126975", + "severity": "MEDIUM", + "status": "ACTIVE" + } +} diff --git a/docker/pytorch/training/common/changehostname.c b/docker/pytorch/training/common/changehostname.c new file mode 100644 index 0000000..0db2209 --- /dev/null +++ b/docker/pytorch/training/common/changehostname.c @@ -0,0 +1,33 @@ +#include +#include + +/** + * Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You + * may not use this file except in compliance with the License. A copy of + * the License is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. This file is + * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF + * ANY KIND, either express or implied. 
See the License for the specific
+ * language governing permissions and limitations under the License.
+ */
+
+/**
+ * Modifies gethostname to return algo-1, algo-2, etc. when running on SageMaker.
+ *
+ * Without this gethostname() on SageMaker returns 'aws', leading NCCL/MPI to think there is only one host,
+ * not realizing that it needs to use NET/Socket.
+ *
+ * When docker container starts we read 'current_host' value from /opt/ml/input/config/resourceconfig.json
+ * and replace PLACEHOLDER_HOSTNAME with it before compiling this code into a shared library.
+ */
+int gethostname(char *name, size_t len)
+{
+    strncpy(name, PLACEHOLDER_HOSTNAME, len);
+    if (len > 0) name[len - 1] = '\0'; /* strncpy writes no terminator when the name is truncated */
+    return 0;
+}
diff --git a/docker/pytorch/training/common/start_with_right_hostname.sh b/docker/pytorch/training/common/start_with_right_hostname.sh
new file mode 100644
index 0000000..63c4fee
--- /dev/null
+++ b/docker/pytorch/training/common/start_with_right_hostname.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+# Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+
+# Container start wrapper. For "train", build a gethostname() shim that returns
+# the current_host value from the resource config and preload it into `train`;
+# any other command line is evaluated unchanged.
+if [[ "$1" = "train" ]]; then
+    # jq keeps the JSON double quotes, so the value drops in as a C string literal.
+    CURRENT_HOST=$(jq .current_host /opt/ml/input/config/resourceconfig.json)
+    # Absolute paths: LD_PRELOAD below expects the library in /, whatever the working directory is.
+    sed -i -e "s/PLACEHOLDER_HOSTNAME/$CURRENT_HOST/g" /changehostname.c
+    gcc -o /changehostname.o -c -fPIC -Wall /changehostname.c
+    gcc -o /libchangehostname.so -shared -export-dynamic /changehostname.o -ldl
+    LD_PRELOAD=/libchangehostname.so train
+else
+    eval "$@"
+fi