feat: expconf flag to force scheduling on a single node/container/pod #8743

Merged: 5 commits, Jan 26, 2024
Changes from all commits
12 changes: 12 additions & 0 deletions docs/reference/training/experiment-config-reference.rst
@@ -1191,6 +1191,18 @@
Optional. The resource pool where this experiment will be scheduled. If no resource pool is
specified, experiments will run in the default GPU pool. Refer to :ref:`resource-pools` for more
information.

``is_single_node``
==================

Optional. When true, all requested slots for the task are forced to be scheduled in a single
container on a single node, or in a single pod. When false, the slots may be split across multiple
nodes or pods. Defaults to false for experiments. For notebooks, TensorBoards, shells, and
commands, this field is always set to true and cannot be modified.

.. note::

This option is currently not supported by Slurm RM.
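
For illustration (a sketch, not taken from this diff), the relevant ``resources`` block of an
experiment configuration might look like this, assuming a four-slot trial that must be co-located
on a single machine:

.. code:: yaml

   resources:
     slots_per_trial: 4
     is_single_node: true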

.. _exp-resources-devices:

``devices``
14 changes: 14 additions & 0 deletions docs/release-notes/command-capacity.rst
@@ -0,0 +1,14 @@
:orphan:

**New Features**

- Experiments: Add the ``resources.is_single_node`` option, which disallows scheduling the trial across
multiple nodes or pods, and forces it to be scheduled within a single container. If the requested
``slots_per_trial`` count is impossible to fulfill in the cluster, the experiment submission will
be rejected.

**Improvements**

- Notebooks, Shells, and Commands: On static agent-based clusters (not using dynamic cloud
provisioning), when a ``slots`` request for a notebook, shell, or command cannot be fulfilled,
it will now be rejected.
19 changes: 19 additions & 0 deletions e2e_tests/tests/cluster/test_resource_manager.py
@@ -106,3 +106,22 @@ def list_free_agents() -> List[bindings.v1Agent]:
pytest.fail(f"missing agents: {agents}")

return [a for a in agents.agents or [] if len(a.containers or {}) == 0]


@pytest.mark.e2e_cpu_2a
@pytest.mark.timeout(600)
def test_experiment_is_single_node() -> None:
slots = _wait_for_slots(2)
assert len(slots) == 2
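# The two available slots live on separate agents in this test cluster, so a
# 2-slot trial forced onto a single node is unfulfillable and experiment
# creation is expected to be rejected.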

with pytest.raises(AssertionError):
exp.create_experiment(
conf.fixtures_path("no_op/single.yaml"),
conf.fixtures_path("no_op"),
[
"--config",
"resources.slots_per_trial=2",
"--config",
"resources.is_single_node=true",
],
)
35 changes: 15 additions & 20 deletions e2e_tests/tests/command/test_run.py
@@ -2,7 +2,6 @@
import re
import subprocess
import tempfile
import time
from pathlib import Path
from typing import Any, Dict, List, Optional

@@ -469,27 +468,23 @@ def test_image_pull_after_remove() -> None:
)


@pytest.mark.slow
@pytest.mark.e2e_cpu
def test_killed_pending_command_terminates() -> None:
def test_outrageous_command_rejected() -> None:
# Specify an outrageous number of slots to be sure that it can't be scheduled.
# NB: slot # higher than postgres smallint (i.e. 32k) is rejected outright.
with cmd.interactive_command(
"cmd", "run", "--config", "resources.slots=10485", "sleep infinity"
) as command:
assert command.task_id is not None
for _ in range(10):
assert cmd.get_command(command.task_id)["state"] == "STATE_QUEUED"
time.sleep(1)

# The command is killed when the context is exited; now it should reach TERMINATED soon.
for _ in range(5):
if cmd.get_command(command.task_id)["state"] == "STATE_TERMINATED":
break
time.sleep(1)
else:
state = cmd.get_command(command.task_id)["state"]
raise AssertionError(f"Task was in state {state} rather than STATE_TERMINATED")
with pytest.raises(subprocess.CalledProcessError):
_run_and_verify_failure(
[
"det",
"-m",
conf.make_master_url(),
"cmd",
"run",
"--config",
"resources.slots=10485",
"sleep infinity",
],
"request unfulfillable",
)

Reviewer comment (Contributor): much cleaner!


@pytest.mark.e2e_gpu
3 changes: 2 additions & 1 deletion master/internal/core_experiment.go
@@ -298,7 +298,8 @@ func (m *Master) parseCreateExperiment(req *apiv1.CreateExperimentRequest, owner
if err != nil {
return nil, config, nil, nil, errors.Wrapf(err, "invalid resource configuration")
}
if err = m.rm.ValidateResources(poolName, resources.SlotsPerTrial(), false); err != nil {
isSingleNode := resources.IsSingleNode() != nil && *resources.IsSingleNode()
if err = m.rm.ValidateResources(poolName, resources.SlotsPerTrial(), isSingleNode); err != nil {
return nil, config, nil, nil, errors.Wrapf(err, "error validating resources")
}
taskContainerDefaults, err := m.rm.TaskContainerDefaults(
33 changes: 0 additions & 33 deletions master/internal/rm/agentrm/resource_managers_test.go
@@ -42,36 +42,3 @@ func TestResourceManagerForwardMessage(t *testing.T) {
assert.DeepEqual(t, taskSummary, make(map[model.AllocationID]sproto.AllocationSummary))
rm.stop()
}

func TestResourceManagerValidateRPResourcesUnknown(t *testing.T) {
user.InitService(nil, nil)
// We can reliably run this check only for AWS, GCP, or Kube resource pools,
// but initializing either of these in the test is not viable. So let's at least
// check if we properly return "unknown" for on-prem-like setups.
conf := &config.ResourceConfig{
ResourceManager: &config.ResourceManagerConfig{
AgentRM: &config.AgentResourceManagerConfig{
Scheduler: &config.SchedulerConfig{
FairShare: &config.FairShareSchedulerConfig{},
FittingPolicy: best,
},
},
},
ResourcePools: []config.ResourcePoolConfig{
{
PoolName: defaultResourcePoolName,
MaxAuxContainersPerAgent: 100,
},
},
}

rm := New(nil, echo.New(), conf, nil, nil)

resp, err := rm.ValidateCommandResources(sproto.ValidateCommandResourcesRequest{
ResourcePool: defaultResourcePoolName,
Slots: 1,
})
assert.Assert(t, err == nil, err)
assert.Assert(t, resp.Fulfillable)
rm.stop()
}
16 changes: 15 additions & 1 deletion master/internal/rm/agentrm/resource_pool.go
@@ -596,10 +596,24 @@ func (rp *resourcePool) ValidateCommandResources(
rp.mu.Lock()
defer rp.mu.Unlock()

fulfillable := true // Default to "true" when unknown.
var fulfillable bool

if rp.slotsPerInstance > 0 {
fulfillable = rp.slotsPerInstance >= msg.Slots
} else {
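// The pool has no fixed per-instance slot count (e.g. static agents), so check
// whether any currently connected agent has enough slots to host the request.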
rp.agentStatesCache = rp.agentService.list(rp.config.PoolName)
defer func() {
rp.agentStatesCache = nil
}()

maxSlots := 0
for _, a := range rp.agentStatesCache {
maxSlots = max(maxSlots, len(a.slotStates))
}

fulfillable = maxSlots >= msg.Slots
}

return sproto.ValidateCommandResourcesResponse{Fulfillable: fulfillable}
}

6 changes: 4 additions & 2 deletions master/internal/trial.go
@@ -383,6 +383,8 @@ func (t *trial) maybeAllocateTask() error {
return err
}

isSingleNode := t.config.Resources().IsSingleNode() != nil && *t.config.Resources().IsSingleNode()

restoredAllocation, err := t.maybeRestoreAllocation()
if err != nil {
t.syslog.WithError(err).Warn("failed to restore trial allocation")
@@ -403,7 +405,7 @@
SlotsNeeded: t.config.Resources().SlotsPerTrial(),
ResourcePool: t.config.Resources().ResourcePool(),
FittingRequirements: sproto.FittingRequirements{
SingleAgent: false,
SingleAgent: isSingleNode,
},

Preemptible: true,
@@ -448,7 +450,7 @@
SlotsNeeded: t.config.Resources().SlotsPerTrial(),
ResourcePool: t.config.Resources().ResourcePool(),
FittingRequirements: sproto.FittingRequirements{
SingleAgent: false,
SingleAgent: isSingleNode,
},

Preemptible: true,
1 change: 1 addition & 0 deletions master/pkg/model/command_config.go
@@ -68,5 +68,6 @@ func (c *CommandConfig) Validate() []error {
},
"invalid notebook idle type",
),
check.True(c.Resources.IsSingleNode == nil, "resources.is_single_node cannot be set for NTSCs"),
}
}
1 change: 1 addition & 0 deletions master/pkg/model/compat.go
@@ -41,6 +41,7 @@ func (r ResourcesConfig) ToExpconf() expconf.ResourcesConfig {
RawResourcePool: ptrs.Ptr(r.ResourcePool),
RawPriority: r.Priority,
RawDevices: r.Devices.ToExpconf(),
RawIsSingleNode: r.IsSingleNode,
})
}

1 change: 1 addition & 0 deletions master/pkg/model/experiment_config.go
@@ -90,6 +90,7 @@ type ResourcesConfig struct {
ShmSize *StorageSize `json:"shm_size,omitempty"`
ResourcePool string `json:"resource_pool"`
Priority *int `json:"priority,omitempty"`
IsSingleNode *bool `json:"is_single_node"`

Devices DevicesConfig `json:"devices"`

1 change: 1 addition & 0 deletions master/pkg/schemas/expconf/experiment_config.go
@@ -217,6 +217,7 @@ type ResourcesConfigV0 struct {
RawShmSize *int `json:"shm_size"`
RawResourcePool *string `json:"resource_pool"`
RawPriority *int `json:"priority"`
RawIsSingleNode *bool `json:"is_single_node"`

RawDevices DevicesConfigV0 `json:"devices"`
}
8 changes: 8 additions & 0 deletions master/pkg/schemas/expconf/zgen_resources_config_v0.go

(Generated file; diff not rendered.)

7 changes: 7 additions & 0 deletions master/pkg/schemas/zgen_schemas.go

(Generated file; diff not rendered.)

7 changes: 7 additions & 0 deletions schemas/expconf/v0/resources.json
@@ -22,6 +22,13 @@
"default": [],
"optionalRef": "http://determined.ai/schemas/expconf/v0/devices.json"
},
"is_single_node": {
"type": [
"boolean",
"null"
],
"default": null
},
"max_slots": {
"type": [
"integer",
1 change: 1 addition & 0 deletions schemas/test_cases/v0/defaults.yaml
@@ -215,3 +215,4 @@
max_slots: null
priority: null
resource_pool: ''
is_single_node: null
1 change: 1 addition & 0 deletions schemas/test_cases/v0/experiment.yaml
@@ -217,6 +217,7 @@
max_slots: null
priority: null
resource_pool: ''
is_single_node: null
scheduling_unit: 100
searcher:
max_length: