
fix: ensure proper wrapping in pytorch. [DET-3745] (#1114)
* fix: ensure user code using the old pytorch API has proper wrapping. [DET-3745]

* docs: add a Pytorch FAQ section
shiyuann authored Aug 24, 2020
1 parent 250e4f4 commit 7d0ef9d
Showing 4 changed files with 34 additions and 6 deletions.
15 changes: 15 additions & 0 deletions docs/faq.txt
@@ -135,6 +135,21 @@ Core APIs, we recommend porting your model to use :ref:`Estimator Trial <estimat
<https://github.com/determined-ai/determined/blob/master/examples/experimental/trial/mnist_tp_to_estimator/model_def.py>`_.


PyTorch Support
----------------

Why do trials that are paused and later continued show significantly different metrics (e.g., jumps in loss, worse convergence) than trials that are never paused?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Please verify the following in the model code (see the sketch after this list):

* The model is wrapped with :meth:`wrap_model <determined.pytorch.PyTorchTrialContext.wrap_model>`.
* The optimizer is wrapped with :meth:`wrap_optimizer <determined.pytorch.PyTorchTrialContext.wrap_optimizer>`
and based on the output of ``wrap_model``, not the original unwrapped model.
* The LR scheduler is wrapped with :meth:`wrap_lr_scheduler <determined.pytorch.PyTorchTrialContext.wrap_lr_scheduler>`
and based on the output of ``wrap_optimizer``, not the original unwrapped optimizer.
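
The expected wrapping order is sketched below; ``MyModel`` and the specific optimizer and
scheduler are illustrative placeholders, not part of this change:

.. code-block:: python

   import torch

   from determined.pytorch import LRScheduler, PyTorchTrial, PyTorchTrialContext


   class MyTrial(PyTorchTrial):
       def __init__(self, context: PyTorchTrialContext) -> None:
           self.context = context

           # Wrap the model first and use the wrapped model from here on.
           self.model = context.wrap_model(MyModel())  # MyModel is a placeholder.

           # Build the optimizer from the wrapped model's parameters, then wrap it.
           self.optimizer = context.wrap_optimizer(
               torch.optim.SGD(self.model.parameters(), lr=0.1)
           )

           # Build the LR scheduler from the wrapped optimizer, then wrap it.
           self.lr_scheduler = context.wrap_lr_scheduler(
               torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=1),
               step_mode=LRScheduler.StepMode.STEP_EVERY_EPOCH,
           )

       # train_batch, evaluate_batch, and the data loader methods are omitted here.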


TensorBoard Support
-------------------

6 changes: 6 additions & 0 deletions docs/release-notes/1114-ensure-proper-wrapping.txt
@@ -0,0 +1,6 @@
:orphan:

**Improvements**

- Warn if a PyTorch LR scheduler does not refer to an optimizer wrapped with
  :meth:`determined.pytorch.PyTorchTrialContext.wrap_optimizer`.
11 changes: 5 additions & 6 deletions harness/determined/pytorch/_pytorch_context.py
@@ -189,12 +189,11 @@ def wrap_lr_scheduler(
The LR scheduler must use an optimizer wrapped by :meth:`wrap_optimizer`. If ``apex.amp``
is in use, the optimizer must also have been configured with :meth:`configure_apex_amp`.
"""

check.is_in(
lr_scheduler.optimizer, # type: ignore
self.optimizers,
"Must use an optimizer that is returned by wrap_optimizer()",
)
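# Some schedulers may not expose an "optimizer" attribute; only validate
# the reference when one is present.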
opt = getattr(lr_scheduler, "optimizer", None)
if opt is not None:
check.is_in(
opt, self.optimizers, "Must use an optimizer that is returned by wrap_optimizer()",
)
wrapped = pytorch.LRScheduler(lr_scheduler, step_mode)
self.lr_schedulers.append(wrapped)

8 changes: 8 additions & 0 deletions harness/determined/pytorch/_pytorch_trial.py
@@ -171,6 +171,14 @@ def _apply_backwards_compatibility(self) -> None:

lr_scheduler = self.trial.create_lr_scheduler(optim)
if lr_scheduler is not None:
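# Check that the user's scheduler was built from the wrapped optimizer
# rather than the original unwrapped one.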
opt = getattr(lr_scheduler._scheduler, "optimizer", None)
if opt is not None:
check.is_in(
opt,
self.context.optimizers,
"Must use a wrapped optimizer that is passed in by the optimizer "
"argument of create_lr_scheduler",
)
self.context.lr_schedulers.append(lr_scheduler)

if det.ExperimentConfig(self.context.get_experiment_config()).mixed_precision_enabled():
