Skip to content

Commit

Permalink
Merge branch 'feature/ET-684/run-centric-api' into aaron_amanuel/sear…
Browse files Browse the repository at this point in the history
…ch-actions
  • Loading branch information
AmanuelAaron committed Oct 1, 2024
2 parents 16155eb + 57b7c87 commit 6c301a3
Show file tree
Hide file tree
Showing 107 changed files with 4,195 additions and 789 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.36.1-dev0
current_version = 0.37.1-dev0
commit = true
tag = true
tag_name = {new_version}
Expand Down
2 changes: 1 addition & 1 deletion .circleci/real_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ executors:
parameters:
det-version:
type: string
default: 0.36.1-dev0
default: 0.37.1-dev0
docker-image:
type: string
default: determinedai/cimg-base:latest
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.36.1-dev0
0.37.1-dev0
4 changes: 4 additions & 0 deletions docs/.redirects/all_published_urls_ever.json
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@
"integrations/notification/_index",
"integrations/notification/index",
"integrations/notification/slack",
"integrations/notification/workload-alerting",
"integrations/notification/zapier",
"integrations/observability/_index",
"integrations/pachyderm/pachyderm",
Expand All @@ -148,6 +149,7 @@
"introduction",
"join-community",
"manage/_index",
"manage/cluster-overview",
"manage/elasticsearch-logging-backend",
"manage/historical-cluster-usage-data",
"manage/security/_index",
Expand Down Expand Up @@ -434,6 +436,7 @@
"setup-cluster/on-prem/options/linux-packages",
"setup-cluster/on-prem/options/wsl",
"setup-cluster/on-prem/requirements",
"setup-cluster/rocm-support",
"setup-cluster/security/index",
"setup-cluster/security/oauth",
"setup-cluster/security/oidc",
Expand All @@ -444,6 +447,7 @@
"setup-cluster/security/tls",
"setup-cluster/setup-clients",
"setup-cluster/slurm/_index",
"setup-cluster/slurm/hpc-environment-requirements",
"setup-cluster/slurm/hpc-launching-architecture",
"setup-cluster/slurm/hpc-security-considerations",
"setup-cluster/slurm/hpc-with-agent",
Expand Down
6 changes: 5 additions & 1 deletion docs/_static/version-switcher/versions.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
[
{
"version": "0.36.1-dev0",
"version": "0.37.1-dev0",
"url": "https://docs.determined.ai/latest/"
},
{
"version": "0.37.0",
"url": "https://docs.determined.ai/0.37.0/"
},
{
"version": "0.36.0",
"url": "https://docs.determined.ai/0.36.0/"
Expand Down
1 change: 1 addition & 0 deletions docs/deploy/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import mimetypes
import os
import pathlib
import time

import boto3

Expand Down
4 changes: 2 additions & 2 deletions docs/model-dev-guide/hyperparameter/search-methods/_index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ checkpoints for downstream serving.
Adaptive Search
*****************

Our default recommended search method is `Adaptive (ASHA) <https://arxiv.org/pdf/1810.05934.pdf>`_,
a state-of-the-art early-stopping based technique that speeds up traditional techniques like random
Our default recommended search method is `Adaptive (ASHA) <http://arxiv.org/pdf/1810.05934>`_, a
state-of-the-art early-stopping based technique that speeds up traditional techniques like random
search by periodically abandoning low-performing hyperparameter configurations in a principled
fashion.

Expand Down
19 changes: 19 additions & 0 deletions docs/reference/deploy/helm-config-reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,25 @@
PersistentVolumes is disabled, users must manually create a PersistentVolume that will match
the PersistentVolumeClaim.

- ``claimSuffix``: Optional configuration that defines the persistent volume claim name for the
Determined database. If undefined, a new PVC is created, with the name
``determined-db-pvc-$releaseName``. If specified, the provided value will be used as the
suffix to ``determined-db-pvc-``.

- ``snapshotSuffix``: Optional configuration for naming the volume snapshot, which serves as a
backup of the Determined database's persistentVolume. If defined, Helm will create a snapshot
of the database during the next upgrade, creating a volumeSnapshot named
``determined-db-snapshot-$snapshotSuffix``. This **must not** match the name of an
existing persistent volume claim, such as ``determined-db-pvc-$releaseName``, because the
``snapshotSuffix`` will be used to create a new persistent volume claim when restoring.

- ``restoreSnapshotSuffix``: Optional configuration of the volumeSnapshot to restore from during
an upgrade. If this is set during an upgrade, a new persistentVolume will be created and
swapped into Determined's database by creating a new persistent volume claim named
``determined-db-pvc-$restoreSnapshotSuffix``, restoring data from
``determined-db-snapshot-$restoreSnapshotSuffix``. This **must not** match the name of an
existing persistent volume claim (PVC), such as ``$releaseName``.

- ``checkpointStorage``: Specifies where model checkpoints will be stored. This can be overridden
on a per-experiment basis in the :ref:`experiment-config-reference`. A checkpoint contains the
architecture and weights of the model being trained. Determined currently supports several kinds
Expand Down
6 changes: 3 additions & 3 deletions docs/reference/deploy/master-config-reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,8 @@ Defaults to ``false``.
=================

Defines the default `Docker registry credentials
<https://docs.docker.com/engine/api/v1.30/#tag/System/operation/SystemAuth>`__ to use when pulling a
custom base Docker image, if needed. If credentials are specified in the :ref:`experiment config
<https://docs.docker.com/reference/api/engine/version/v1.30/>`__ to use when pulling a custom base
Docker image, if needed. If credentials are specified in the :ref:`experiment config
<exp-environment-image>` this default value is overridden. Credentials are specified as the
following nested fields:

Expand Down Expand Up @@ -283,7 +283,7 @@ master config that provides a readable name for the Determined deployment.
``name``
--------

(deprecated) Specifies the resource managers name. ``cluster_name`` should be specified instead.
(deprecated) Specifies the resource manager's name. ``cluster_name`` should be specified instead.

``metadata``
============
Expand Down
4 changes: 2 additions & 2 deletions docs/reference/experiment-config-reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1313,8 +1313,8 @@ Optional. Whether the bind-mount should be a read-only mount. Defaults to ``fals
===============

Optional. `Propagation behavior
<https://docs.docker.com/storage/bind-mounts/#configure-bind-propagation>`__ for replicas of the
bind-mount. Defaults to ``rprivate``.
<https://docs.docker.com/engine/storage/bind-mounts/#configure-bind-propagation>`__ for replicas of
the bind-mount. Defaults to ``rprivate``.

For example, to mount ``/data`` on the host to the same path in the container, use:

Expand Down
46 changes: 46 additions & 0 deletions docs/release-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,52 @@
Release Notes
###############

**************
Version 0.37
**************

Version 0.37.0
==============

**Release Date:** September 27, 2024

**Breaking Changes**

- API: Remove the ``model_hub`` library from Determined.

- Starting with this release, ``MMDetTrial`` and ``BaseTransformerTrial`` are removed. HuggingFace
users should refer to the provided `HuggingFace TrainerAPI examples
<https://github.com/determined-ai/determined/tree/main/examples/hf_trainer_api>`__, which use a
custom callback instead of BaseTransformerTrial. Users of ``MMDetTrial`` can refer to :ref:`Core
API <api-core-ug>`.

**New Features**

- Webhooks: Add support for experiment monitoring and alerting. Capabilities include
workspace-level subscriptions for "All experiments" or "Specific experiment(s) with matching
configuration" options. New trigger types include ``COMPLETED``, ``ERROR``, ``TASKLOG``, and
``CUSTOM``. Support for custom triggers, code-based alerts, experiment-specific webhook
exclusions, and editable webhook URLs is also added. For details, visit
:ref:`supported-webhook-triggers`.

- Master Configuration: Add support for POSIX claims in the master configuration. It now accepts
``agent_uid_attribute_name``, ``agent_gid_attribute_name``, ``agent_user_name_attribute_name``,
or ``agent_group_name_attribute_name``. Refer to the :ref:`OIDC master configuration
<master-config-oidc>` or :ref:`SAML master configuration <master-config-saml>` for details. If
any of these fields are configured, they will sync with the database.

**Improvements**

- WebUI: Change the "Compute Slots Allocated" label to "Unspecified Slots Allocated" for resource
pools with no or multiple slot types. Add error logs for zero or multi-slot-type cases and update
the progress bar to include all agents when the slot type is ``TYPE_UNSPECIFIED``.

**Bug Fixes**

- API/Tasks: Fix a bug where a master-configured ``log_retention_days`` value is not applied to
experiments and tasks. The master-configured value is now correctly applied to new experiments,
and all pre-existing experiments will also follow the specified ``log_retention_days``.

**************
Version 0.36
**************
Expand Down
7 changes: 7 additions & 0 deletions docs/release-notes/9966-fix-grid.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
:orphan:

**Fixes**

- Fix an issue where, during a grid search, a hyperparameter that contained an empty nested
hyperparameter (that is, just an empty map) would not appear in the hparams passed to the
trial.
9 changes: 0 additions & 9 deletions docs/release-notes/auto-populate-posix.rst

This file was deleted.

7 changes: 0 additions & 7 deletions docs/release-notes/fix-master-log-retention-days.rst

This file was deleted.

6 changes: 6 additions & 0 deletions docs/release-notes/helm-db-snapshot.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
:orphan:

**New Features**

- Helm: Add support for capturing and restoring snapshots of the database persistent volume. Visit
:ref:`helm-config-reference` for more details.
11 changes: 0 additions & 11 deletions docs/release-notes/remove-model-hub.rst

This file was deleted.

7 changes: 0 additions & 7 deletions docs/release-notes/unspecified-slots.rst

This file was deleted.

9 changes: 0 additions & 9 deletions docs/release-notes/workload-alerting.rst

This file was deleted.

5 changes: 3 additions & 2 deletions docs/setup-cluster/on-prem/options/docker.rst
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,8 @@ large range of ports, as it does not require network address translation (NAT) a
The host networking driver only works on Linux hosts, and is not supported on Docker Desktop for
Mac, Docker Desktop for Windows, or Docker EE for Windows Server.

See `Docker's documentation <https://docs.docker.com/network/drivers/host/>`_ for more details.
See `Docker's documentation <https://docs.docker.com/engine/network/drivers/host/>`_ for more
details.

.. note::

Expand All @@ -209,7 +210,7 @@ See `Docker's documentation <https://docs.docker.com/network/drivers/host/>`_ fo

By default, ``docker run`` will run in the foreground, so that a container can be stopped simply by
pressing Control-C. If you wish to keep Determined running for the long term, consider running the
containers `detached <ttps://docs.docker.com/engine/reference/commandline/container_run/#detach>`_
containers `detached <https://docs.docker.com/engine/reference/commandline/container_run/#detach>`_
and/or with `restart policies
<https://docs.docker.com/engine/containers/start-containers-automatically/>`_. Using :ref:`our
deployment tool <install-using-deploy>` is also an option.
6 changes: 3 additions & 3 deletions e2e_tests/tests/cluster/test_priority_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,15 @@ def test_priortity_scheduler_noop_command(
managed_cluster_priority_scheduler.ensure_agent_ok()
assert str(conf.MASTER_PORT) == "8082"
# without slots (and default priority)
command_id = utils.run_command(sess, slots=0)
command_id = utils.run_command(sess, 0, slots=0)
utils.wait_for_command_state(sess, command_id, "TERMINATED", 40)
utils.assert_command_succeeded(sess, command_id)
# with slots (and default priority)
command_id = utils.run_command(sess, slots=1)
command_id = utils.run_command(sess, 0, slots=1)
utils.wait_for_command_state(sess, command_id, "TERMINATED", 60)
utils.assert_command_succeeded(sess, command_id)
# explicit priority
command_id = utils.run_command_set_priority(sess, slots=0, priority=60)
command_id = utils.run_command_set_priority(sess, 0, slots=0, priority=60)
utils.wait_for_command_state(sess, command_id, "TERMINATED", 60)
utils.assert_command_succeeded(sess, command_id)

Expand Down
6 changes: 3 additions & 3 deletions e2e_tests/tests/cluster/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def num_free_slots(sess: api.Session) -> int:


def run_command_set_priority(
sess: api.Session, sleep: int = 30, slots: int = 1, priority: int = 0
sess: api.Session, sleep: int, slots: int = 1, priority: int = 0
) -> str:
cmd = [
"det",
Expand All @@ -118,7 +118,7 @@ def run_command_set_priority(
return detproc.check_output(sess, cmd).strip()


def run_command(sess: api.Session, sleep: int = 30, slots: int = 1) -> str:
def run_command(sess: api.Session, sleep: int, slots: int = 1) -> str:
cmd = [
"det",
"command",
Expand All @@ -144,7 +144,7 @@ def run_command_args(sess: api.Session, entrypoint: str, args: Optional[List[str
return detproc.check_output(sess, cmd + [entrypoint]).strip()


def run_zero_slot_command(sess: api.Session, sleep: int = 30) -> str:
def run_zero_slot_command(sess: api.Session, sleep: int) -> str:
return run_command(sess, sleep=sleep, slots=0)


Expand Down
2 changes: 1 addition & 1 deletion harness/determined/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.36.1-dev0"
__version__ = "0.37.1-dev0"
Loading

0 comments on commit 6c301a3

Please sign in to comment.