diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py
index 238ba366da43..c3c0180e8516 100644
--- a/colossalai/nn/optimizer/cpu_adam.py
+++ b/colossalai/nn/optimizer/cpu_adam.py
@@ -9,7 +9,8 @@
 
 
 class CPUAdam(NVMeOptimizer):
-    """Implements Adam algorithm.
+    """
+    Implements Adam algorithm.
 
     Supports parameters updating on both GPU and CPU, depending on the device of parameters.
     But the parameters and gradients should on the same device:
diff --git a/tests/test_zero/test_low_level/test_zero1_2.py b/tests/test_zero/test_low_level/test_zero1_2.py
index ebda9f6f25c5..e2196cfbf0f2 100644
--- a/tests/test_zero/test_low_level/test_zero1_2.py
+++ b/tests/test_zero/test_low_level/test_zero1_2.py
@@ -106,7 +106,8 @@ def exam_zero_1_2():
 
 
 @parameterize("dtype", [torch.float16, torch.bfloat16])
-def exam_zero_1_torch_ddp(world_size, dtype: torch.dtype):
+@parameterize("master_weights", [True, False])
+def exam_zero_1_torch_ddp(world_size, dtype: torch.dtype, master_weights: bool):
     """
     In this test, two pairs of model and optimizers are created.
     1. zero: use sharded optimizer and fp16 parameters
@@ -131,7 +132,11 @@ def exam_zero_1_torch_ddp(world_size, dtype: torch.dtype):
     # in `check_sharded_param_consistency.py`, we will test whether
     # level 1 and 2 will produce exactly the same results
     zero_optimizer = LowLevelZeroOptimizer(
-        zero_optimizer, overlap_communication=True, initial_scale=1, reduce_bucket_size=1024 * 1024
+        zero_optimizer,
+        overlap_communication=True,
+        initial_scale=1,
+        reduce_bucket_size=1024 * 1024,
+        master_weights=master_weights,
     )
 
     torch_optimizer = torch.optim.SGD(torch_model.parameters(), lr=1)
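
For reference, a minimal sketch of how the new `master_weights` keyword might be used when constructing `LowLevelZeroOptimizer` directly, under the assumption (not confirmed by this diff) that `master_weights=False` skips keeping fp32 master copies and updates the low-precision working parameters in place. The import path, the toy model, and the prior distributed initialization are assumptions for illustration; the diff itself only shows that the constructor accepts a `master_weights` argument.

```python
# Sketch only: assumes a ColossalAI distributed environment has already been
# initialized (e.g. via colossalai.launch) and that LowLevelZeroOptimizer is
# importable from colossalai.zero, as it is in the test file above.
import torch
import torch.nn as nn

from colossalai.zero import LowLevelZeroOptimizer  # assumed import path

model = nn.Linear(128, 128).cuda().half()          # fp16 working parameters
base_optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

zero_optimizer = LowLevelZeroOptimizer(
    base_optimizer,
    overlap_communication=True,
    initial_scale=1,
    reduce_bucket_size=1024 * 1024,
    master_weights=False,  # assumption: update fp16 params directly, no fp32 master copies
)
```

The parameterized test above exercises both settings (`True` and `False`) against a plain `torch.optim.SGD` baseline, so divergence between the two code paths would show up as a mismatch in that comparison.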