feat: Add report_progress to TrainContext #9826

Merged · 13 commits · Aug 19, 2024
@@ -181,7 +181,7 @@ trial ID in the checkpoint and use it to distinguish the two types of continues.
.. literalinclude:: ../../../../examples/tutorials/core_api/2_checkpoints.py
:language: python
:start-at: def main
:end-at: for batch in range(starting_batch, 100)
:end-at: for batch in range(starting_batch, max_length)

#. You can checkpoint your model as frequently as you like. For this exercise, save a checkpoint
after each training report, and check for a preemption signal after each checkpoint:
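A minimal sketch of that save-then-check-preemption pattern, assuming the usual core_context from det.core.init() (save_state and the metric value are placeholders, not part of this PR):

def train_with_checkpoints(core_context, starting_batch, max_length, save_state):
    for batch in range(starting_batch, max_length):
        steps_completed = batch + 1
        core_context.train.report_training_metrics(
            steps_completed=steps_completed, metrics={"loss": 0.0}
        )
        # Save a checkpoint after each training report.
        with core_context.checkpoint.store_path({"steps_completed": steps_completed}) as (
            path,
            storage_id,
        ):
            save_state(path, steps_completed)
        # Check for a preemption signal after each checkpoint.
        if core_context.preempt.should_preempt():
            return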
5 changes: 3 additions & 2 deletions examples/features/unmanaged/1_singleton.py
@@ -16,8 +16,8 @@ def main():
# project="...",
),
)

for i in range(100):
max_length = 100
for i in range(max_length):
print(f"training loss: {random.random()}")

core_v2.train.report_training_metrics(steps_completed=i, metrics={"loss": random.random()})
@@ -28,6 +28,7 @@ def main():
core_v2.train.report_validation_metrics(
steps_completed=i, metrics={"loss": random.random()}
)
core_v2.train.report_progress(i / float(max_length))

core_v2.close()

4 changes: 3 additions & 1 deletion examples/features/unmanaged/2_checkpoints.py
@@ -21,6 +21,7 @@ def main():

latest_checkpoint = core_v2.info.latest_checkpoint
initial_i = 0
max_length = 100
if latest_checkpoint is not None:
with core_v2.checkpoint.restore_path(latest_checkpoint) as path:
with (path / "state").open() as fin:
@@ -32,11 +33,12 @@

print("determined experiment id: ", core_v2.info._trial_info.experiment_id)
print("initial step:", initial_i)
for i in range(initial_i, initial_i + 100):
for i in range(initial_i, initial_i + max_length):
core_v2.train.report_training_metrics(steps_completed=i, metrics={"loss": random.random()})
if (i + 1) % 10 == 0:
loss = random.random()
core_v2.train.report_validation_metrics(steps_completed=i, metrics={"loss": loss})
core_v2.train.report_progress((i - initial_i) / float(max_length))

with core_v2.checkpoint.store_path({"steps_completed": i}) as (path, uuid):
with (path / "state").open("w") as fout:
5 changes: 4 additions & 1 deletion examples/tutorials/core_api/1_metrics.py
Contributor:

  1. include this change in 2_checkpoints.py, too. they're meant to be incremental tutorials, hence the "# NEW: ..."
  2. also update detached mode tutorials (and make sure this works in detached mode. it should, but just in case)

Contributor Author:

I don't think this function would work with detached mode out of the box: in the existing method, we retrieve the experiment from experiment.ExperimentRegistry, but we currently don't include unmanaged experiments in ExperimentRegistry. So we can either include unmanaged experiments in ExperimentRegistry, or retrieve the unmanaged experiment from the DB instead.

@@ -14,7 +14,8 @@

def main(core_context, increment_by):
x = 0
for batch in range(100):
max_length = 100
for batch in range(max_length):
x += increment_by
steps_completed = batch + 1
time.sleep(0.1)
Expand All @@ -24,6 +25,8 @@ def main(core_context, increment_by):
core_context.train.report_training_metrics(
steps_completed=steps_completed, metrics={"x": x}
)
# NEW: report training progress.
core_context.train.report_progress(steps_completed / float(max_length))
# NEW: report a "validation" metric at the end.
core_context.train.report_validation_metrics(steps_completed=steps_completed, metrics={"x": x})

5 changes: 3 additions & 2 deletions examples/tutorials/core_api/2_checkpoints.py
@@ -41,14 +41,14 @@ def load_state(trial_id, checkpoint_directory):

def main(core_context, latest_checkpoint, trial_id, increment_by):
x = 0

max_length = 100
# NEW: load a checkpoint if one was provided.
starting_batch = 0
if latest_checkpoint is not None:
with core_context.checkpoint.restore_path(latest_checkpoint) as path:
x, starting_batch = load_state(trial_id, path)

for batch in range(starting_batch, 100):
for batch in range(starting_batch, max_length):
x += increment_by
steps_completed = batch + 1
time.sleep(0.1)
@@ -57,6 +57,7 @@ def main(core_context, latest_checkpoint, trial_id, increment_by):
core_context.train.report_training_metrics(
steps_completed=steps_completed, metrics={"x": x}
)
core_context.train.report_progress(steps_completed / float(max_length))

# NEW: write checkpoints at regular intervals to limit lost progress
# in case of a crash during training.
43 changes: 39 additions & 4 deletions harness/determined/common/api/bindings.py

(Generated file; diff not rendered by default.)

2 changes: 1 addition & 1 deletion harness/determined/core/_searcher.py
@@ -86,7 +86,7 @@ def report_progress(self, length: float) -> None:
logger.debug(f"op.report_progress({length})")
self._session.post(
f"/api/v1/trials/{self._trial_id}/progress",
data=det.util.json_encode(length),
data=det.util.json_encode({"progress": length}),
)

def report_completed(self, searcher_metric: Any) -> None:
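For contrast with the new TrainContext.report_progress added below, a rough sketch of reporting progress through this Searcher API path (train_one_step and validate are hypothetical callables; note that op.report_progress takes completed units, not a 0-1 fraction):

def run_searcher_ops(core_context, train_one_step, validate):
    steps_completed = 0
    for op in core_context.searcher.operations():
        # op.length is the cumulative number of units this trial should train to.
        while steps_completed < op.length:
            train_one_step()
            steps_completed += 1
            op.report_progress(steps_completed)
        op.report_completed(validate())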
27 changes: 27 additions & 0 deletions harness/determined/core/_train.py
@@ -260,6 +260,30 @@ def report_early_exit(self, reason: EarlyExitReason) -> None:
if r.status_code == 400:
logger.warn("early exit has already been reported for this trial, ignoring new value")

def report_progress(self, progress: float) -> None:
Contributor:

This method also needs to be in DummyTrainContext; otherwise local training will not work.

"""
Report training progress to the master.

This is optional for training, but will be used by the WebUI to render completion status.

Progress must be reported as a float between 0 and 1.0, where 1.0 is 100% completion. It
should represent the current iteration step as a fraction of maximum training steps
(i.e.: `report_progress(step_num / max_steps)`).

Note that for hyperparameter search, progress should be reported through
``SearcherOperation.report_progress()`` in the Searcher API instead.

Arguments:
progress (float): completion progress in the range [0, 1.0].
"""
Contributor:

nit: wording and style. I know we're not consistent with docstring style in this class, but let's do it for new methods; we mostly try to follow the Google style guide.

suggestion: (the docstring text as applied above)

logger.debug(f"report_progress with progress={progress}")
if progress < 0 or progress > 1:
raise ValueError(f"Progress should be between 0 and 1, not {progress}")
self._session.post(
f"/api/v1/trials/{self._trial_id}/progress",
data=det.util.json_encode({"progress": progress, "is_raw": True}),
)

def get_experiment_best_validation(self) -> Optional[float]:
"""
Get the best reported validation metric reported so far, across the whole experiment.
@@ -312,6 +336,9 @@ def upload_tensorboard_files(
def report_early_exit(self, reason: EarlyExitReason) -> None:
logger.info(f"report_early_exit({reason})")

def report_progress(self, progress: float) -> None:
logger.info(f"report_progress with progress={progress}")

def get_experiment_best_validation(self) -> Optional[float]:
return None

8 changes: 7 additions & 1 deletion master/internal/api_trials.go
@@ -1393,19 +1393,25 @@ func (a *apiServer) ReportTrialProgress(
experiment.AuthZProvider.Get().CanEditExperiment); err != nil {
return nil, err
}

eID, rID, err := a.m.db.TrialExperimentAndRequestID(int(req.TrialId))
if err != nil {
return nil, err
}

e, ok := experiment.ExperimentRegistry.Load(eID)
if !ok {
return nil, api.NotFoundErrs("experiment", strconv.Itoa(eID), true)
// Unmanaged experiments are not included in the ExperimentRegistry.
if err := a.m.db.SaveExperimentProgress(eID, &req.Progress); err != nil {
return nil, err
}
return &apiv1.ReportTrialProgressResponse{}, nil
}

msg := experiment.TrialReportProgress{
RequestID: rID,
Progress: searcher.PartialUnits(req.Progress),
Contributor:

If it's not too much effort, it'd be great if we could get rid of this searcher.PartialUnits type; it's just a float anyway. We'll need to do it sooner or later, but it's not strictly necessary as part of this PR.

Contributor Author:

Since it touches multiple files and isn't directly related to this PR, I created a ticket for it.

IsRaw: req.IsRaw,
}
if err := e.TrialReportProgress(msg); err != nil {
return nil, err
3 changes: 3 additions & 0 deletions master/internal/db/postgres_experiments.go
@@ -808,6 +808,9 @@ EXISTS(

// SaveExperimentProgress stores the progress for an experiment in the database.
func (db *PgDB) SaveExperimentProgress(id int, progress *float64) error {
if progress != nil && (*progress < 0 || *progress > 1) {
return errors.Errorf("invalid progress value: %f. Progress value should be between 0 and 1", *progress)
}
res, err := db.sql.Exec(`UPDATE experiments SET progress = $1 WHERE id = $2`, progress, id)
if err != nil {
return errors.Wrap(err, "saving experiment progress")
8 changes: 6 additions & 2 deletions master/internal/experiment.go
@@ -356,8 +356,12 @@ func (e *internalExperiment) TrialReportProgress(msg experiment.TrialReportProgress
e.mu.Lock()
defer e.mu.Unlock()

e.searcher.SetTrialProgress(msg.RequestID, msg.Progress)
progress := e.searcher.Progress()
progress := float64(msg.Progress)
if !msg.IsRaw {
e.searcher.SetTrialProgress(msg.RequestID, msg.Progress)
progress = e.searcher.Progress()
}

if err := e.db.SaveExperimentProgress(e.ID, &progress); err != nil {
e.syslog.WithError(err).Error("failed to save experiment progress")
}
1 change: 1 addition & 0 deletions master/internal/experiment/experiment_iface.go
@@ -30,6 +30,7 @@ type (
TrialReportProgress struct {
RequestID model.RequestID
Progress searcher.PartialUnits
IsRaw bool
}

// UserInitiatedEarlyTrialExit is a user-injected message, provided through the early exit API. It
5 changes: 5 additions & 0 deletions master/internal/trials/postgres_trials.go
@@ -333,6 +333,11 @@ func UpdateUnmanagedExperimentStatesTx(
endTime = ptrs.Ptr(time.Now())
}
exp.EndTime = endTime

if exp.State == model.CompletedState {
columns = append(columns, "progress")
exp.Progress = ptrs.Ptr(1.0)
}
}

if _, err := tx.NewUpdate().Model(exp).Column(columns...).WherePK().Exec(ctx); err != nil {