diff --git a/test_runner.py b/test_runner.py
index ca9d1320..c5b46b95 100755
--- a/test_runner.py
+++ b/test_runner.py
@@ -96,6 +96,15 @@ def build_test_list(args):
             ],
             "Checkpoint Integration Test - Save Model Weights Only bf16",
         ),
+        OverrideDefinitions(
+            [
+                [
+                    f"--optimizer.name Adam --optimizer.fused --job.dump_folder {args.output_dir}/fused_adamw/",
+                    f"--optimizer.name AdamW --optimizer.fused --job.dump_folder {args.output_dir}/fused_adamw/",
+                ]
+            ],
+            "Fused Optimizer Test",
+        ),
     ]
     return integration_tests_flavors
 
diff --git a/torchtitan/config_manager.py b/torchtitan/config_manager.py
index 1a3e36d4..797d5826 100644
--- a/torchtitan/config_manager.py
+++ b/torchtitan/config_manager.py
@@ -155,6 +155,12 @@ def __init__(self):
         self.parser.add_argument(
             "--optimizer.lr", type=float, default=8e-4, help="Learning rate to use"
         )
+        self.parser.add_argument(
+            "--optimizer.fused",
+            default=False,
+            action="store_true",
+            help="Whether the fused implementation (CUDA only) is used.",
+        )
 
         # training configs
         self.parser.add_argument(
diff --git a/train.py b/train.py
index 318c7174..eb8fc4f0 100644
--- a/train.py
+++ b/train.py
@@ -88,14 +88,16 @@ def build_optimizer(model, job_config: JobConfig):
     # build optimizer
     name = job_config.optimizer.name
     lr = job_config.optimizer.lr
+    fused = job_config.optimizer.fused
+    # when fused = False, foreach = True by default.
     if name == "Adam":
         # TODO: make the optimizer options configurable by toml/cmd args
         optimizer = torch.optim.Adam(
-            model.parameters(), lr=lr, betas=(0.9, 0.95), weight_decay=0.1, foreach=True
+            model.parameters(), lr=lr, betas=(0.9, 0.95), weight_decay=0.1, fused=fused
        )
     elif name == "AdamW":
         optimizer = torch.optim.AdamW(
-            model.parameters(), lr=lr, betas=(0.9, 0.95), weight_decay=0.1, foreach=True
+            model.parameters(), lr=lr, betas=(0.9, 0.95), weight_decay=0.1, fused=fused
         )
     else:
         raise NotImplementedError(f"Optimizer {name} not added.")
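
Note on the `fused` / `foreach` interplay in `build_optimizer`: in recent PyTorch releases the `foreach` (multi-tensor) path is auto-selected only when both `fused` and `foreach` are left unset, so an explicit `fused=False` may fall back to the slower single-tensor loop rather than keeping the previous `foreach=True` behavior. Below is a minimal standalone sketch of one way to preserve that behavior when the new flag is off; the toy `model`, the CUDA guard, and the `foreach=not fused` pattern are illustrative assumptions, not part of this patch.

```python
import torch

# Toy stand-in for the real model; illustration only.
model = torch.nn.Linear(16, 16)

# The fused Adam/AdamW kernels are CUDA-only, so only request them when the
# parameters actually live on a CUDA device (assumption made for this sketch).
fused = torch.cuda.is_available()
if fused:
    model = model.cuda()

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=8e-4,
    betas=(0.9, 0.95),
    weight_decay=0.1,
    fused=fused,
    # Keep the multi-tensor path when the fused kernels are not requested.
    foreach=not fused,
)

# One dummy step to confirm the chosen implementation runs end to end.
model(torch.randn(4, 16, device="cuda" if fused else "cpu")).sum().backward()
optimizer.step()
```

Usage-wise, the new flag is passed exactly as in the integration test above, e.g. `--optimizer.name AdamW --optimizer.fused`.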