Add initial epoch multiplier as a parameter to the PC script.
ernestum committed Jan 11, 2024
1 parent 55aa6eb commit 78553c9
Showing 2 changed files with 7 additions and 0 deletions.
src/imitation/scripts/config/train_preference_comparisons.py (2 additions, 0 deletions)
@@ -42,6 +42,8 @@ def train_defaults():
     transition_oversampling = 1
     # fraction of total_comparisons that will be sampled right at the beginning
     initial_comparison_frac = 0.1
+    # factor by which to oversample the number of epochs in the first iteration
+    initial_epoch_multiplier = 200.0
     # fraction of sampled trajectories that will include some random actions
     exploration_frac = 0.0
     preference_model_kwargs = {}
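
Since the new default lives in the Sacred train_defaults config, it can be overridden per run like any other config value. Below is a minimal sketch of such an override through Sacred's Python API; the experiment object name train_preference_comparisons_ex and its importability from this config module are assumptions, not something shown in the diff.

    # Minimal sketch: overriding the new default for one run (experiment name assumed).
    from imitation.scripts.config.train_preference_comparisons import (
        train_preference_comparisons_ex,  # assumed name of the Sacred Experiment
    )

    # Sacred merges config_updates on top of the values set in train_defaults(),
    # so this run would use a smaller pretraining multiplier than the 200.0 default.
    # Any other required config (e.g. the environment) is assumed to come from the
    # script's defaults or named configs.
    run = train_preference_comparisons_ex.run(
        config_updates={"initial_epoch_multiplier": 100.0},
    )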
src/imitation/scripts/train_preference_comparisons.py (5 additions, 0 deletions)
@@ -68,6 +68,7 @@ def train_preference_comparisons(
     fragment_length: int,
     transition_oversampling: float,
     initial_comparison_frac: float,
+    initial_epoch_multiplier: float,
     exploration_frac: float,
     trajectory_path: Optional[str],
     trajectory_generator_kwargs: Mapping[str, Any],
@@ -106,6 +107,9 @@ def train_preference_comparisons(
             sampled before the rest of training begins (using the randomly initialized
             agent). This can be used to pretrain the reward model before the agent
             is trained on the learned reward.
+        initial_epoch_multiplier: before agent training begins, train the reward
+            model for this many more epochs than usual (on fragments sampled from a
+            random agent).
         exploration_frac: fraction of trajectory samples that will be created using
             partially random actions, rather than the current policy. Might be helpful
             if the learned policy explores too little and gets stuck with a wrong
@@ -258,6 +262,7 @@ def train_preference_comparisons(
         fragment_length=fragment_length,
         transition_oversampling=transition_oversampling,
         initial_comparison_frac=initial_comparison_frac,
+        initial_epoch_multiplier=initial_epoch_multiplier,
         custom_logger=custom_logger,
         allow_variable_horizon=allow_variable_horizon,
         query_schedule=query_schedule,
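
The docstring addition above describes the intended behaviour: in the very first iteration, before any agent training, the reward model is trained for many more epochs than usual on fragments from a random agent. The sketch below only illustrates that scaling rule; the helper function and the base_epochs parameter are hypothetical and not part of the imitation API.

    # Illustration of the semantics only; this helper is hypothetical.
    def reward_epochs_for_iteration(
        iteration: int,
        base_epochs: int,
        initial_epoch_multiplier: float = 200.0,
    ) -> int:
        """Reward-model epochs to run in a given preference-comparisons iteration.

        In iteration 0 (before agent training, on fragments sampled from a random
        agent) the epoch count is scaled by initial_epoch_multiplier, giving the
        reward model a long pretraining phase; later iterations use base_epochs.
        """
        if iteration == 0:
            return int(base_epochs * initial_epoch_multiplier)
        return base_epochs


    # With 3 base epochs and the default multiplier of 200.0, the first iteration
    # trains the reward model for 600 epochs; later iterations train for 3.
    assert reward_epochs_for_iteration(0, base_epochs=3) == 600
    assert reward_epochs_for_iteration(5, base_epochs=3) == 3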
