chore: refactor searcher operations out of master side searchers
rb-determined-ai authored and azhou-determined committed Oct 25, 2024
1 parent db8d4b0 commit 137f6ea
Showing 73 changed files with 3,451 additions and 4,837 deletions.
1 change: 1 addition & 0 deletions .circleci/real_config.yml
@@ -2603,6 +2603,7 @@ jobs:
- run: pip install mypy pytest coverage
- install-codecov
- setup-paths
- run: make -C harness install
- run: COVERAGE_FILE=$PWD/test-unit-harness-tf2-pycov make -C harness test-tf2
- run: coverage xml -i --data-file=./test-unit-harness-tf2-pycov
- run: codecov -v -t $CODECOV_TOKEN -F harness
78 changes: 70 additions & 8 deletions docs/reference/experiment-config-reference.rst
@@ -943,6 +943,76 @@ the model architecture of this experiment.
Optional. Like ``source_trial_id``, but specifies an arbitrary checkpoint from which to initialize
weights. At most one of ``source_trial_id`` or ``source_checkpoint_uuid`` should be set.

.. _experiment-configuration-searcher-asha:

Asynchronous Halving (ASHA)
===========================

The ``async_halving`` search performs a version of the asynchronous successive halving algorithm
(`ASHA <https://arxiv.org/pdf/1810.05934.pdf>`_) that stops trials early if there is enough evidence
to terminate training. Once trials are stopped, they will not be resumed.

``metric``
----------

Required. The name of the validation metric used to evaluate the performance of a hyperparameter
configuration.

``time_metric``
---------------

Required. The name of the validation metric used to evaluate the progress of a given trial.

``max_time``
------------

Required. The maximum value that ``time_metric`` should take when a trial finishes training. Early
stopping is decided based on how far the ``time_metric`` has progressed towards this ``max_time``
value.

``max_trials``
--------------

Required. The number of trials, i.e., hyperparameter configurations, to evaluate.

``num_rungs``
-------------

Required. The number of rounds of successive halving to perform.

``smaller_is_better``
---------------------

Optional. Whether to minimize or maximize the metric defined above. The default value is ``true``
(minimize).

``divisor``
-----------

Optional. Determines both the fraction of trials kept at each rung (``1/divisor``) and the factor
by which the training length grows from one rung to the next. The default setting is ``4``; only
advanced users should consider changing this value.
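
As a rough illustration only (not necessarily the exact schedule this implementation produces), the
usual ASHA formulation trains the top rung to ``max_time`` and gives each lower rung ``1/divisor``
of the budget of the rung above it:

.. code:: python

   # Rough sketch of the usual ASHA geometric schedule; assumes the top rung
   # trains to max_time and each lower rung gets 1/divisor of the rung above.
   def rung_budgets(max_time: int, num_rungs: int, divisor: int) -> list:
       return [max_time // divisor ** (num_rungs - 1 - i) for i in range(num_rungs)]

   rung_budgets(max_time=1000, num_rungs=3, divisor=4)  # -> [62, 250, 1000]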

``max_concurrent_trials``
-------------------------

Optional. The maximum number of trials that can be worked on simultaneously. The default value is
``16``; reasonable values depend on ``max_trials`` and the number of rungs in the brackets. This is
akin to controlling the degree of parallelism of the experiment. If this value is less than the
number of brackets produced by the adaptive algorithm, it will be rounded up.

``source_trial_id``
-------------------

Optional. If specified, the weights of *every* trial in the search will be initialized to the most
recent checkpoint of the given trial ID. This will fail if the source trial's model architecture is
inconsistent with the model architecture of any of the trials in this experiment.

``source_checkpoint_uuid``
--------------------------

Optional. Like ``source_trial_id``, but specifies an arbitrary checkpoint from which to initialize
weights. At most one of ``source_trial_id`` or ``source_checkpoint_uuid`` should be set.
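
Putting the fields above together, here is a minimal sketch of an ``async_halving`` searcher
section, shown as the Python dictionary the harness sees once the experiment YAML is parsed; the
metric names and numeric values are illustrative only:

.. code:: python

   searcher = {
       "name": "async_halving",
       "metric": "validation_loss",  # illustrative validation metric name
       "smaller_is_better": True,
       "time_metric": "batches",  # illustrative progress metric name
       "max_time": 1000,
       "max_trials": 16,
       "num_rungs": 3,
       "divisor": 4,
       "max_concurrent_trials": 16,
   }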

.. _experiment-configuration-searcher-adaptive:

Adaptive ASHA
@@ -994,14 +1064,6 @@ end of the spectrum, ``conservative`` mode performs significantly less downsampling, and as a
consequence does not explore as many configurations given the same budget. We recommend using either
``aggressive`` or ``standard`` mode.

``stop_once``
-------------

Optional. If ``stop_once`` is set to ``true``, we will use a variant of ASHA that will not resume
trials once they are stopped. This variant defaults to continuing training and only stops trials
when there is enough evidence to terminate training. We recommend this version of ASHA when it is
important to train a trial to the maximum length as quickly as possible, or when fault tolerance is
too expensive.

``divisor``
-----------

4 changes: 1 addition & 3 deletions e2e_tests/tests/experiment/test_pending_hpc.py
@@ -34,17 +34,15 @@ def test_hpc_job_pending_reason() -> None:
config = conf.load_config(conf.tutorials_path("mnist_pytorch/const.yaml"))
config = conf.set_slots_per_trial(config, 1)
config = conf.set_profiling_enabled(config)
config["max_restarts"] = 0
# Shorten training to 64 batches.
assert "--epochs 1" in config["entrypoint"], "update test to match tutorial"
config["entrypoint"] = config["entrypoint"].replace("--epochs 1", "--batches 64")
config["max_restarts"] = 0

# The experiment will request 6 CPUs
config.setdefault("slurm", {})
config["slurm"]["slots_per_node"] = 6
config.setdefault("pbs", {})
config["pbs"]["slots_per_node"] = 6
# Wrap entrypoint in torch_distributed for dtrain support.
assert "torch_distributed" not in config["entrypoint"], "update test to match tutorial"
config["entrypoint"] = "python3 -m determined.launch.torch_distributed " + config["entrypoint"]

2 changes: 1 addition & 1 deletion e2e_tests/tests/nightly/test_distributed.py
@@ -79,7 +79,7 @@ def test_textual_inversion_stable_diffusion_finetune() -> None:
"textual_inversion_stable_diffusion/finetune_const_advanced.yaml"
)
)
config["hyperparameters"]["training"]["num_sgd_steps"] = 10
config["searcher"]["max_length"] = 10
try:
config = conf.set_environment_variables(
config, [f'HF_AUTH_TOKEN={os.environ["HF_READ_ONLY_TOKEN"]}']
86 changes: 36 additions & 50 deletions harness/determined/cli/cli.py
@@ -47,66 +47,52 @@
version,
workspace,
)
from determined.common import api, util, yaml
from determined.common import api, util
from determined.common.api import bindings, certs


def _render_search_summary(resp: bindings.v1PreviewHPSearchResponse) -> str:
output = [
termcolor.colored("Using search configuration:", "green"),
]

# For mypy
assert resp.summary and resp.summary.config and resp.summary.trials
# Exclude empty configs from rendering.
searcher_config = {k: v for k, v in resp.summary.config.items() if v is not None}

config_str = render.format_object_as_yaml(searcher_config)
output.append(config_str)
headers = ["Trials", "Training Time"]
trial_summaries = []
for trial_summary in resp.summary.trials:
num_trials = trial_summary.count
trial_unit = trial_summary.unit
if trial_unit.maxLength:
summary = "train to completion"
else:
summary = f"train for {trial_unit.value} {trial_unit.name}"
trial_summaries.append([num_trials, summary])

output.append(tabulate.tabulate(trial_summaries, headers, tablefmt="presto"))
return "\n".join(output)


def preview_search(args: argparse.Namespace) -> None:
sess = cli.setup_session(args)
experiment_config = util.safe_load_yaml_with_exceptions(args.config_file)
args.config_file.close()

if "searcher" not in experiment_config:
print("Experiment configuration must have 'searcher' section")
sys.exit(1)
r = sess.post("searcher/preview", json=experiment_config)
j = r.json()
raise errors.CliError("Missing 'searcher' config section in experiment config.")

def to_full_name(kind: str) -> str:
try:
# The unitless searcher case, for masters newer than 0.17.6.
length = int(kind)
return f"train for {length}"
except ValueError:
pass
if kind[-1] == "R":
return "train {} records".format(kind[:-1])
if kind[-1] == "B":
return "train {} batch(es)".format(kind[:-1])
if kind[-1] == "E":
return "train {} epoch(s)".format(kind[:-1])
if kind == "V":
return "validation"
raise ValueError("unexpected kind: {}".format(kind))

def render_sequence(sequence: List[str]) -> str:
if not sequence:
return "N/A"
instructions = []
current = sequence[0]
count = 0
for k in sequence:
if k != current:
instructions.append("{} x {}".format(count, to_full_name(current)))
current = k
count = 1
else:
count += 1
instructions.append("{} x {}".format(count, to_full_name(current)))
return ", ".join(instructions)

headers = ["Trials", "Breakdown"]
values = [
(count, render_sequence(operations.split())) for operations, count in j["results"].items()
]

print(termcolor.colored("Using search configuration:", "green"))
yml = yaml.YAML()
yml.indent(mapping=2, sequence=4, offset=2)
yml.dump(experiment_config["searcher"], sys.stdout)
print()
print("This search will create a total of {} trial(s).".format(sum(j["results"].values())))
print(tabulate.tabulate(values, headers, tablefmt="presto"), flush=False)
resp = bindings.post_PreviewHPSearch(
session=sess,
body=bindings.v1PreviewHPSearchRequest(
config=experiment_config,
),
)
print(_render_search_summary(resp=resp))
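
# Rough, invented sketch of the response shape consumed by
# _render_search_summary above; the attribute names mirror the accesses in
# that function (summary.config, summary.trials[i].count, .unit.maxLength,
# .unit.value, .unit.name), but the values here are made up.
from types import SimpleNamespace as _NS

_fake_resp = _NS(
    summary=_NS(
        config={"name": "async_halving", "metric": "validation_loss", "max_trials": 16},
        trials=[
            _NS(count=12, unit=_NS(maxLength=False, value=1000, name="batches")),
            _NS(count=4, unit=_NS(maxLength=True, value=None, name=None)),
        ],
    ),
)
# _render_search_summary(_fake_resp) would render the searcher config as YAML
# followed by a "Trials | Training Time" table: 12 trials "train for 1000
# batches" and 4 trials "train to completion".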


args_description = [
12 changes: 2 additions & 10 deletions harness/determined/transformers/_hf_callback.py
@@ -98,20 +98,11 @@ def _check_searcher_config(
if args.max_steps > -1:
args_unit = "batches"
args_len = args.max_steps
len_arg = "--max_steps"
else:
args_unit = "epochs"
args_len = args.num_train_epochs
len_arg = "--num_train_epochs"

if isinstance(cfg.get("max_length"), int):
# Legacy searcher config (unitless). Has never been supported, actually.
raise ValueError(
"HF trainer no longer respects the deprecated searcher.max_length "
"field. searcher.max_length is deprecated; please remove it and rely "
f"on {len_arg} instead to avoid ambiguous training specifications."
)
elif isinstance(cfg.get("max_length"), dict):
if "max_length" in cfg:
# Legacy searcher config; max_length must match provided args.
search_unit, search_len = next(iter(cfg["max_length"].items()))
if (search_unit, search_len) != (args_unit, args_len):
@@ -130,6 +121,7 @@
self.required_metrics.append(search_unit)
elif (search_unit, search_len) != (args_unit, args_len):
name = cfg["name"]
len_arg = "--max_steps" if args_unit == "batches" else "--num_train_epochs"
raise ValueError(
"HF trainer units does not match configured the max_time configured for "
f"{name} searcher ({args_unit}={args_len} != {search_unit}={search_len}. "
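
# Illustration of the agreement _check_searcher_config enforces (invented
# values): a legacy searcher config carrying max_length must line up with the
# HF TrainingArguments. With TrainingArguments(max_steps=500) the trainer sees
# args_unit="batches" and args_len=500, which matches the max_length below, so
# no ValueError is raised; {"batches": 400} or {"epochs": 500} would not.
legacy_searcher_cfg = {"name": "adaptive_asha", "max_length": {"batches": 500}}
search_unit, search_len = next(iter(legacy_searcher_cfg["max_length"].items()))
assert (search_unit, search_len) == ("batches", 500)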