From 5ff2dfcde822d4e8c8ba0aed3c01aa971a630bd0 Mon Sep 17 00:00:00 2001 From: Hanyu Cui Date: Fri, 20 Jan 2023 12:12:28 -0800 Subject: [PATCH] docs: add release notes for 0.19.10 (#5785) Add release notes for 0.19.10. --- docs/release-notes.rst | 64 +++++++++++++++++++ docs/release-notes/5567-data-layer.txt | 8 --- .../FOUNDENG-329-enroot-support.rst | 7 -- .../allow-killing-of-cancel-trial.rst | 7 -- .../cli-job-ls-resource-pool-fix.rst | 6 -- docs/release-notes/early-stopping-bug.rst | 7 -- .../experiment-tag-propagation.rst | 11 ---- docs/release-notes/fsx.rst | 6 -- .../hpc-launcher-resource-pools.rst | 10 --- docs/release-notes/job-submission-time.rst | 7 -- docs/release-notes/notebook-timeout.rst | 6 -- 11 files changed, 64 insertions(+), 75 deletions(-) delete mode 100644 docs/release-notes/5567-data-layer.txt delete mode 100644 docs/release-notes/FOUNDENG-329-enroot-support.rst delete mode 100644 docs/release-notes/allow-killing-of-cancel-trial.rst delete mode 100644 docs/release-notes/cli-job-ls-resource-pool-fix.rst delete mode 100644 docs/release-notes/early-stopping-bug.rst delete mode 100644 docs/release-notes/experiment-tag-propagation.rst delete mode 100644 docs/release-notes/fsx.rst delete mode 100644 docs/release-notes/hpc-launcher-resource-pools.rst delete mode 100644 docs/release-notes/job-submission-time.rst delete mode 100644 docs/release-notes/notebook-timeout.rst diff --git a/docs/release-notes.rst b/docs/release-notes.rst index 057344eda28..dab255b7274 100644 --- a/docs/release-notes.rst +++ b/docs/release-notes.rst @@ -10,6 +10,70 @@ Version 0.19 ************** +Version 0.19.10 +=============== + +**Release Date:** January 20, 2023 + +**Breaking Changes** + +- The name of the resource pool in Kubernetes has changed from ``"kubernetes"`` to ``"default"``. + Forked experiments will need to have their configurations manually modified to update the + resource pool name. 
+ +**New Features** + +- Cluster: Add support for experiment tag propagation. + + - The enterprise edition of Determined (`HPE Machine Learning Development + Environment <https://www.hpe.com/us/en/solutions/artificial-intelligence/machine-learning-development-environment.html>`_) + now allows for experiment tags to be propagated as labels to the associated jobs on the HPC + cluster. A number of labeling schemes are supported, controlled by the configuration item + ``resource_manager.job_project_source``. + +- Cluster: Add support for launcher-provided resource pools. + + - The enterprise edition of Determined (`HPE Machine Learning Development + Environment <https://www.hpe.com/us/en/solutions/artificial-intelligence/machine-learning-development-environment.html>`_) + now allows for custom resource pools to be defined that submit work to an underlying Slurm/PBS + partition on an HPC cluster with different submission options. + +- Cluster: Determined Enterprise Edition now supports the `NVIDIA Enroot + <https://github.com/NVIDIA/enroot>`__ container platform as an alternative to + Apptainer/Singularity/PodMan. + +**Improvements** + +- Notebooks: The default idle notebook termination timeout can now be set via the + ``notebook_timeout`` master config option. + +- Trials: Trials can now be killed when in the ``STOPPING_CANCELED`` state. Previously, if a trial + did not implement preemption correctly and was canceled, the trial did not stop and was + unkillable until the preemption timeout of an hour. + +**Bug Fixes** + +- Fix a bug where notebooks, TensorBoards, shells, and commands restored after a master restart + would have a submission time of when the master restarted rather than the original job submission + time. + +- ``det deploy aws``: Fix reliability issue in ``efs`` deployment type, fix broken ``fsx`` + deployment type. + +- Job queue: Fix an issue where the CLI command ``det job list`` would ignore the argument + ``--resource-pool``. + +- Distributed training: Fix a bug where a distributed training trial that called + ``context.set_stop_requested`` would cause the trial to error and prevent it from completing + successfully. 
+ +**Removed Features** + +- The data layer feature, which was deprecated in 0.18.0 (May 2022), has been removed. A migration + guide to use the underlying `yogadl library `_ directly + may be found `here `_. + Affected users are encouraged to follow the migration guide before upgrading to avoid downtime. + Version 0.19.8 ============== diff --git a/docs/release-notes/5567-data-layer.txt b/docs/release-notes/5567-data-layer.txt deleted file mode 100644 index 1e7f36a4df9..00000000000 --- a/docs/release-notes/5567-data-layer.txt +++ /dev/null @@ -1,8 +0,0 @@ -:orphan: - -**Removed Features** - -- The data layer feature, which was deprecated in 0.18.0 (May 2022), has been removed. A migration - guide to use the underlying `yogadl library`_ directly - may be found `here`_. - Affected users are encouraged to follow the migration guide before upgrading to avoid downtime. diff --git a/docs/release-notes/FOUNDENG-329-enroot-support.rst b/docs/release-notes/FOUNDENG-329-enroot-support.rst deleted file mode 100644 index 75e794274a7..00000000000 --- a/docs/release-notes/FOUNDENG-329-enroot-support.rst +++ /dev/null @@ -1,7 +0,0 @@ -:orphan: - -**New Features** - -- Cluster: Determined Enterprise Edition now supports the `NVIDIA Enroot - `__ container platform as an alternative to - Apptainer/Singularity/PodMan. diff --git a/docs/release-notes/allow-killing-of-cancel-trial.rst b/docs/release-notes/allow-killing-of-cancel-trial.rst deleted file mode 100644 index 512f54cb4f3..00000000000 --- a/docs/release-notes/allow-killing-of-cancel-trial.rst +++ /dev/null @@ -1,7 +0,0 @@ -:orphan: - -**Improvements** - -- Trials: Trials can now be killed when in the ``STOPPING_CANCELED`` state. Previously if a trial - did not implement preemption correctly and was canceled the trial did not stop and was unkillable - until the preemption timeout of an hour. 
diff --git a/docs/release-notes/cli-job-ls-resource-pool-fix.rst b/docs/release-notes/cli-job-ls-resource-pool-fix.rst deleted file mode 100644 index 419d65ed558..00000000000 --- a/docs/release-notes/cli-job-ls-resource-pool-fix.rst +++ /dev/null @@ -1,6 +0,0 @@ -:orphan: - -**Bug Fixes** - -- Job queue: Fix an issue where the CLI command ``det job list`` would ignore the argument - ``--resource-pool``. diff --git a/docs/release-notes/early-stopping-bug.rst b/docs/release-notes/early-stopping-bug.rst deleted file mode 100644 index a9c7f6c97c0..00000000000 --- a/docs/release-notes/early-stopping-bug.rst +++ /dev/null @@ -1,7 +0,0 @@ -:orphan: - -**Bug Fixes** - -- Distributed training: We fixed a bug where a distributed training trial that calls - context.set_stop_requested was causing the trial to error and preventing it from completing - successfully. diff --git a/docs/release-notes/experiment-tag-propagation.rst b/docs/release-notes/experiment-tag-propagation.rst deleted file mode 100644 index 83c0730ce8c..00000000000 --- a/docs/release-notes/experiment-tag-propagation.rst +++ /dev/null @@ -1,11 +0,0 @@ -:orphan: - -**New Feature** - -- Cluster: Add support for experiment tag propagation. - - - The enterprise edition of [HPE Machine Learning Development Environment] - (https://www.hpe.com/us/en/solutions/artificial-intelligence/machine-learning-development-environment.html) - now allows for experiment tags to be propagated as labels to the associated jobs on the HPC - cluster. A number of labelling schemes are supported, controlled by the configuration item - ``resource_manager.job_project_source``. diff --git a/docs/release-notes/fsx.rst b/docs/release-notes/fsx.rst deleted file mode 100644 index 17047f19a73..00000000000 --- a/docs/release-notes/fsx.rst +++ /dev/null @@ -1,6 +0,0 @@ -:orphan: - -**Bug Fixes** - -- ``det deploy aws``: fix reliability issue in ``efs`` deployment type, fix broken ``fsx`` - deployment type. 
diff --git a/docs/release-notes/hpc-launcher-resource-pools.rst b/docs/release-notes/hpc-launcher-resource-pools.rst deleted file mode 100644 index 55bb7d44aaa..00000000000 --- a/docs/release-notes/hpc-launcher-resource-pools.rst +++ /dev/null @@ -1,10 +0,0 @@ -:orphan: - -**New Feature** - -- Cluster: Add support for launcher-provided resource pools. - - - The enterprise edition of [HPE Machine Learning Development Environment] - (https://www.hpe.com/us/en/solutions/artificial-intelligence/machine-learning-development-environment.html) - now allows for custom resource pools to be defined that submit work to an underlying Slurm/PBS - partition on an HPC cluster with different submission options. diff --git a/docs/release-notes/job-submission-time.rst b/docs/release-notes/job-submission-time.rst deleted file mode 100644 index 4012faae5d3..00000000000 --- a/docs/release-notes/job-submission-time.rst +++ /dev/null @@ -1,7 +0,0 @@ -:orphan: - -**Bug Fixes** - -- Fix a bug where Notebooks, Tensorboards, Shells, Commands being restored from a master restart - would have a submission time of when master restarted rather than the original job submission - time. diff --git a/docs/release-notes/notebook-timeout.rst b/docs/release-notes/notebook-timeout.rst deleted file mode 100644 index 5c9363b7fae..00000000000 --- a/docs/release-notes/notebook-timeout.rst +++ /dev/null @@ -1,6 +0,0 @@ -:orphan: - -**Improvements** - -- Notebooks: default idle notebook termination timeout can now be set via ``notebook_timeout`` - master config option.