From 3ad90d15b32d3b02e76f199426bfbcbafc720ec0 Mon Sep 17 00:00:00 2001 From: Satish Pasumarthi <35979860+satishpasumarthi@users.noreply.github.com> Date: Fri, 8 Jul 2022 15:29:43 -0700 Subject: [PATCH] fix: CI (#234) --- CONTRIBUTING.md | 6 +- buildspec-gputests.yml | 66 +++++++++---------- buildspec-release.yml | 2 +- buildspec-unittests.yml | 2 +- buildspec.yml | 28 ++++---- setup.py | 6 +- test/conftest.py | 4 +- .../{1.6.0 => 1.11.0}/Dockerfile.dlc.cpu | 0 .../{1.6.0 => 1.11.0}/Dockerfile.dlc.gpu | 2 +- .../{1.6.0 => 1.11.0}/Dockerfile.pytorch | 2 +- test/container/1.4.0/Dockerfile.dlc.cpu | 10 --- test/container/1.4.0/Dockerfile.dlc.gpu | 28 -------- test/container/1.4.0/Dockerfile.pytorch | 20 ------ test/integration/sagemaker/test_horovod.py | 2 +- tox.ini | 2 +- 15 files changed, 56 insertions(+), 124 deletions(-) rename test/container/{1.6.0 => 1.11.0}/Dockerfile.dlc.cpu (100%) rename test/container/{1.6.0 => 1.11.0}/Dockerfile.dlc.gpu (66%) rename test/container/{1.6.0 => 1.11.0}/Dockerfile.pytorch (92%) delete mode 100644 test/container/1.4.0/Dockerfile.dlc.cpu delete mode 100644 test/container/1.4.0/Dockerfile.dlc.gpu delete mode 100644 test/container/1.4.0/Dockerfile.pytorch diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 216b1a11..a58134e3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,7 +11,7 @@ information to effectively respond to your bug report or contribution. We welcome you to use the GitHub issue tracker to report bugs or suggest features. 
-When filing an issue, please check [existing open](https://github.com/aws-samples/sagemaker-pytorch-containers/issues), or [recently closed](https://github.com/aws-samples/sagemaker-pytorch-containers/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already +When filing an issue, please check [existing open](https://github.com/aws/sagemaker-pytorch-training-toolkit/issues), or [recently closed](https://github.com/aws/sagemaker-pytorch-training-toolkit/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: * A reproducible test case or series of steps @@ -41,7 +41,7 @@ GitHub provides additional document on [forking a repository](https://help.githu ## Finding contributions to work on -Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels ((enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws-samples/sagemaker-pytorch-containers/labels/help%20wanted) issues is a great place to start. +Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws/sagemaker-pytorch-training-toolkit/labels/help%20wanted) issues is a great place to start. ## Code of Conduct @@ -56,6 +56,6 @@ If you discover a potential security issue in this project we ask that you notif ## Licensing -See the [LICENSE](https://github.com/aws-samples/sagemaker-pytorch-containers/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 
+See the [LICENSE](https://github.com/aws/sagemaker-pytorch-training-toolkit/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. diff --git a/buildspec-gputests.yml b/buildspec-gputests.yml index e6b73a32..d072298a 100644 --- a/buildspec-gputests.yml +++ b/buildspec-gputests.yml @@ -2,8 +2,8 @@ version: 0.2 env: variables: - FRAMEWORK_VERSION: '1.6.0' - GPU_INSTANCE_TYPE: 'ml.p2.8xlarge' + FRAMEWORK_VERSION: '1.11.0' + GPU_INSTANCE_TYPE: 'ml.p3.16xlarge' ECR_REPO: 'sagemaker-test' GITHUB_REPO: 'sagemaker-pytorch-container' DLC_ACCOUNT: '763104351884' @@ -26,46 +26,40 @@ phases: - pip3 install -U -e .[test] # define tags - - GENERIC_TAG="$FRAMEWORK_VERSION-pytorch-$BUILD_ID" - DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID" - - # launch remote GPU instance - - prefix='ml.' - - instance_type=${GPU_INSTANCE_TYPE#"$prefix"} - - create-key-pair - - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest + - echo 'Skipping DLC creation as it is taken care in DLC pipelines' + # # launch remote GPU instance + # - prefix='ml.' + # - instance_type=${GPU_INSTANCE_TYPE#"$prefix"} + # - create-key-pair + # - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest # build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test - - python3 setup.py sdist - - build_dir="test/container/$FRAMEWORK_VERSION" - - $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION) - - build_cmd="docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION ." 
- - execute-command-if-has-matching-changes "$build_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*" - # push DLC GPU image to ECR - - $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION) - - push_cmd="docker push $PREPROD_IMAGE:$DLC_GPU_TAG" - - execute-command-if-has-matching-changes "$push_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*" + # - python3 setup.py sdist + # - build_dir="test/container/$FRAMEWORK_VERSION" + # - $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION) + # - build_cmd="docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION ." + # - execute-command-if-has-matching-changes "$build_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*" + # # push DLC GPU image to ECR + # - $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION) + # - push_cmd="docker push $PREPROD_IMAGE:$DLC_GPU_TAG" + # - execute-command-if-has-matching-changes "$push_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*" - # run GPU local integration tests - - printf "$SETUP_CMDS" > $SETUP_FILE - - generic_cmd="pytest test/integration/local --build-image --push-image --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --dockerfile-type pytorch --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG" - - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\"" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*" - - dlc_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --dockerfile-type dlc.gpu --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG" - - 
test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*" + # # run GPU local integration tests + # - printf "$SETUP_CMDS" > $SETUP_FILE + # - dlc_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --dockerfile-type dlc.gpu --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG" + # - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup" + # - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*" - # run GPU sagemaker integration tests - - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --dockerfile-type pytorch --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*" - - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --dockerfile-type dlc.gpu --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*" + # # run GPU sagemaker integration tests + # - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --dockerfile-type dlc.gpu --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG" + # - execute-command-if-has-matching-changes 
"$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*" finally: + - echo 'Done' # shut down remote GPU instance - - cleanup-gpu-instances - - cleanup-key-pairs + # - cleanup-gpu-instances + # - cleanup-key-pairs - # remove ECR image - - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GENERIC_TAG - - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG + # # remove ECR image + # - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG diff --git a/buildspec-release.yml b/buildspec-release.yml index 7285bb3b..ca45377c 100644 --- a/buildspec-release.yml +++ b/buildspec-release.yml @@ -12,7 +12,7 @@ phases: # run unit tests - AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN= AWS_CONTAINER_CREDENTIALS_RELATIVE_URI= AWS_DEFAULT_REGION= - tox -e py27,py36,py37 -- test/unit + tox -e py38 -- test/unit # run local integ tests #- $(aws ecr get-login --no-include-email --region us-west-2) diff --git a/buildspec-unittests.yml b/buildspec-unittests.yml index 7efb931a..1f77ecd4 100644 --- a/buildspec-unittests.yml +++ b/buildspec-unittests.yml @@ -13,4 +13,4 @@ phases: - tox -e flake8,twine # run unit tests - - tox -e py27,py36,py37 test/unit + - tox -e py38 test/unit diff --git a/buildspec.yml b/buildspec.yml index f43aba1a..c8ead2c4 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -2,7 +2,7 @@ version: 0.2 env: variables: - FRAMEWORK_VERSION: '1.6.0' + FRAMEWORK_VERSION: '1.11.0' CPU_INSTANCE_TYPE: 'ml.c4.xlarge' ECR_REPO: 'sagemaker-test' @@ -21,22 +21,18 @@ phases: - pip3 install -U -e .[test] # define tags - - GENERIC_TAG="$FRAMEWORK_VERSION-pytorch-$BUILD_ID" - DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID" + - echo 'Skipping DLC creation as it is taken care in DLC pipelines' + # # run local CPU integration tests (build and push the image to ECR repo) + 
# - test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG" + # # execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*" + # - "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*" - # run local CPU integration tests (build and push the image to ECR repo) - - test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type pytorch --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*" - - test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*" - - # run CPU sagemaker integration tests - - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --dockerfile-type pytorch --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*" - - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --dockerfile-type dlc.cpu --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG" - - 
execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*" + # # run CPU sagemaker integration tests + # - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --dockerfile-type dlc.cpu --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG" + # - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" "lib/*" finally: - # remove ECR image - - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GENERIC_TAG - - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_CPU_TAG + - echo 'Done' + # # remove ECR image + # - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_CPU_TAG diff --git a/setup.py b/setup.py index 8b031859..fe698668 100644 --- a/setup.py +++ b/setup.py @@ -48,12 +48,12 @@ def read(fname): "Natural Language :: English", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', ], - install_requires=['retrying', 'sagemaker-training>=3.7.0', 'six>=1.12.0'], + install_requires=['retrying', 'sagemaker-training>=4.2.0', 'six>=1.12.0'], extras_require={ 'test': test_dependencies }, diff --git a/test/conftest.py b/test/conftest.py index 0b0dc8b3..1adfc029 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -46,13 +46,13 @@ def pytest_addoption(parser): parser.addoption('--build-image', '-B', action='store_true') parser.addoption('--push-image', '-P', action='store_true') parser.addoption('--dockerfile-type', '-T', 
choices=['dlc.cpu', 'dlc.gpu', 'pytorch'], - default=None) + default='pytorch') parser.addoption('--dockerfile', '-D', default=None) parser.addoption('--aws-id', default=None) parser.addoption('--instance-type') parser.addoption('--docker-base-name', default='sagemaker-pytorch-training') parser.addoption('--region', default='us-west-2') - parser.addoption('--framework-version', default="1.4.0") + parser.addoption('--framework-version', default="1.10.0") parser.addoption('--py-version', choices=['2', '3'], default=str(sys.version_info.major)) parser.addoption('--processor', choices=['gpu', 'cpu'], default='cpu') # If not specified, will default to {framework-version}-{processor}-py{py-version} diff --git a/test/container/1.6.0/Dockerfile.dlc.cpu b/test/container/1.11.0/Dockerfile.dlc.cpu similarity index 100% rename from test/container/1.6.0/Dockerfile.dlc.cpu rename to test/container/1.11.0/Dockerfile.dlc.cpu diff --git a/test/container/1.6.0/Dockerfile.dlc.gpu b/test/container/1.11.0/Dockerfile.dlc.gpu similarity index 66% rename from test/container/1.6.0/Dockerfile.dlc.gpu rename to test/container/1.11.0/Dockerfile.dlc.gpu index 72cb7328..2dcafe7d 100644 --- a/test/container/1.6.0/Dockerfile.dlc.gpu +++ b/test/container/1.11.0/Dockerfile.dlc.gpu @@ -1,5 +1,5 @@ ARG region -from 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-training:1.6.0-gpu-py36-cu110-ubuntu18.04 +FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-training:1.11.0-gpu-py38-cu113-ubuntu20.04-sagemaker COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz RUN pip install --upgrade --no-cache-dir /sagemaker_pytorch_training.tar.gz && \ diff --git a/test/container/1.6.0/Dockerfile.pytorch b/test/container/1.11.0/Dockerfile.pytorch similarity index 92% rename from test/container/1.6.0/Dockerfile.pytorch rename to test/container/1.11.0/Dockerfile.pytorch index 0ad7b8f0..b7b6c9d4 100644 --- a/test/container/1.6.0/Dockerfile.pytorch +++ 
b/test/container/1.11.0/Dockerfile.pytorch @@ -1,4 +1,4 @@ -from pytorch/pytorch:1.6.0-cuda10.1-cudnn7-runtime +FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime RUN apt-get update && apt-get install -y --no-install-recommends \ jq \ diff --git a/test/container/1.4.0/Dockerfile.dlc.cpu b/test/container/1.4.0/Dockerfile.dlc.cpu deleted file mode 100644 index b86f14c6..00000000 --- a/test/container/1.4.0/Dockerfile.dlc.cpu +++ /dev/null @@ -1,10 +0,0 @@ -ARG region -from 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-training:1.4.0-cpu-py2 - -COPY lib/changehostname.c / -COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh -RUN chmod +x /usr/local/bin/start_with_right_hostname.sh - -COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz -RUN pip install --upgrade --no-cache-dir /sagemaker_pytorch_training.tar.gz && \ - rm /sagemaker_pytorch_training.tar.gz diff --git a/test/container/1.4.0/Dockerfile.dlc.gpu b/test/container/1.4.0/Dockerfile.dlc.gpu deleted file mode 100644 index d391f92d..00000000 --- a/test/container/1.4.0/Dockerfile.dlc.gpu +++ /dev/null @@ -1,28 +0,0 @@ -ARG region -from 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-training:1.4.0-gpu-py3 - -# TODO(@bvveeram): Remove once the 1.4.0-gpu-py3 DLC image installs mpi4py -RUN pip3 install mpi4py==3.0.3 - -# TODO(@bvveeram): Remove once the 1.4.0-gpu-py3 DLC image fixes OpenSSH config -# Configure OpenSSH so that nodes can communicate with each other -RUN mkdir -p /var/run/sshd && \ - sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd - -RUN rm -rf /root/.ssh/ && \ - mkdir -p /root/.ssh/ && \ - ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \ - cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ - && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config - -# TODO(@bvveeram): Remove once the 1.4.0-gpu-py3 DLC image fixes MPI config -# Comment line in MPI config to prevent 
mutually exclusive MCA settings -RUN sed -i '62,62 s/^/#/' /home/.openmpi/etc/openmpi-mca-params.conf - -COPY lib/changehostname.c / -COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh -RUN chmod +x /usr/local/bin/start_with_right_hostname.sh - -COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz -RUN pip install --upgrade --no-cache-dir /sagemaker_pytorch_training.tar.gz && \ - rm /sagemaker_pytorch_training.tar.gz diff --git a/test/container/1.4.0/Dockerfile.pytorch b/test/container/1.4.0/Dockerfile.pytorch deleted file mode 100644 index 9849c68b..00000000 --- a/test/container/1.4.0/Dockerfile.pytorch +++ /dev/null @@ -1,20 +0,0 @@ -from pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime - -RUN apt-get update \ - && apt-get install -y --no-install-recommends jq \ - && rm -rf /var/lib/apt/lists/* - -COPY lib/changehostname.c / -COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh -RUN chmod +x /usr/local/bin/start_with_right_hostname.sh - -COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz -RUN pip install --no-cache-dir /sagemaker_pytorch_training.tar.gz && \ - rm /sagemaker_pytorch_training.tar.gz - -ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main - -WORKDIR / - -# Starts framework -ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] diff --git a/test/integration/sagemaker/test_horovod.py b/test/integration/sagemaker/test_horovod.py index f0d3cf40..09276e44 100644 --- a/test/integration/sagemaker/test_horovod.py +++ b/test/integration/sagemaker/test_horovod.py @@ -27,7 +27,7 @@ @pytest.mark.skip_generic @pytest.mark.parametrize( "instances, processes, train_instance_type", - [(1, 8, "ml.p2.8xlarge"), (2, 4, "ml.p3.8xlarge")], + [(2, 4, "ml.p3.8xlarge")], ) def test_horovod_simple( instances, diff --git a/tox.ini b/tox.ini index a6d7983b..9b732dfe 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from 
this directory. [tox] -envlist = flake8,twine,py27,py36,py37 +envlist = flake8,twine,py38 skip_missing_interpreters = False [flake8]