From cd05f0e36814f267c714c404fabb4ef5bac3df3f Mon Sep 17 00:00:00 2001
From: Carson Wang
Date: Fri, 5 Jan 2024 12:41:53 +0800
Subject: [PATCH] Add configuration mixed_precision for finetuning (#23)

* Add mixed_precision

* update

* update
---
 docs/finetune_parameters.md                   |  1 +
 finetune/finetune.py                          | 27 ++++++++++---------
 finetune/finetune.yaml                        |  1 +
 finetune/finetune_config.py                   |  1 +
 finetune/models/bloom-560m.yaml               |  1 +
 finetune/models/finetune_config_template.yaml |  1 +
 finetune/models/gpt-j-6b.yaml                 |  1 +
 finetune/models/gpt2.yaml                     |  1 +
 finetune/models/llama-2-7b-chat-hf.yaml       |  1 +
 finetune/models/llama-7b.yaml                 |  1 +
 finetune/models/mistral-7b-v0.1.yaml          |  1 +
 finetune/models/mpt-7b-chat.yaml              |  1 +
 finetune/models/opt-125m.yaml                 |  1 +
 13 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md
index 32980dd33..39cd1239e 100644
--- a/docs/finetune_parameters.md
+++ b/docs/finetune_parameters.md
@@ -32,6 +32,7 @@ The following are the parameters supported in the finetuning workflow.
 |learning_rate|1e-5|Initial learning rate to use.|
 |lr_scheduler|linear|The scheduler type to use, supported value: "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"|
 |weight_decay|0.0|Weight decay is a regularization technique that adds an L2 norm of all model weights to the loss function while increasing the probability of improving the model generalization.|
+|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", "bf16" or "fp8". Default is "no" if not set.|
 |device|CPU|The device type used, can be "CPU", "GPU".|
 |num_training_workers|2|The number of the training process.|
 |resources_per_worker|{"CPU": 32}|A dict to specify the resources for each worker. If `device` is "GPU", please set it like {"CPU": 32, "GPU": 1}.|
diff --git a/finetune/finetune.py b/finetune/finetune.py
index 430c452de..c752de085 100644
--- a/finetune/finetune.py
+++ b/finetune/finetune.py
@@ -26,31 +26,34 @@ from finetune_config import FinetuneConfig
 
 
 
-def get_accelerate_environment_variable(mode: str) -> dict:
+def get_accelerate_environment_variable(mode: str, config: Dict[str, Any]) -> dict:
+    mixed_precision = config["Training"]["mixed_precision"]
     mode_env_vars = {
         "CPU_DDP": {
-            "ACCELERATE_USE_CPU": "True",
-            "ACCELERATE_USE_IPEX": "False",
-            "ACCELERATE_MIXED_PRECISION": "no",
+            "ACCELERATE_USE_CPU": "true",
+            "ACCELERATE_USE_IPEX": "false",
+            "ACCELERATE_MIXED_PRECISION": mixed_precision,
         },
         "GPU_DDP": {
-            "ACCELERATE_USE_CPU": "False",
-            "ACCELERATE_USE_XPU": "True",
-            "ACCELERATE_USE_IPEX": "True",
+            "ACCELERATE_USE_CPU": "false",
+            "ACCELERATE_USE_XPU": "true",
+            "ACCELERATE_USE_IPEX": "true",
+            "ACCELERATE_MIXED_PRECISION": mixed_precision,
         },
         "GPU_FSDP": {
-            "ACCELERATE_USE_CPU": "False",
-            "ACCELERATE_USE_XPU": "True",
-            "ACCELERATE_USE_IPEX": "True",
+            "ACCELERATE_USE_CPU": "false",
+            "ACCELERATE_USE_XPU": "true",
+            "ACCELERATE_USE_IPEX": "true",
             "ACCELERATE_USE_FSDP": "true",
             "FSDP_SHARDING_STRATEGY": "1",
             "FSDP_OFFLOAD_PARAMS": "false",
-            "FSDP_AUTO_WRAP_POLICY": "NO_WRAP ",
+            "FSDP_AUTO_WRAP_POLICY": "NO_WRAP",
             "FSDP_BACKWARD_PREFETCH": "BACKWARD_PRE",
             "FSDP_STATE_DICT_TYPE": "SHARDED_STATE_DICT",
             "FSDP_FORWARD_PREFETCH": "false",
             "FSDP_USE_ORIG_PARAMS": "false",
             "FSDP_SYNC_MODULE_STATES": "true",
+            "ACCELERATE_MIXED_PRECISION": mixed_precision,
         }
     }
     if mode not in mode_env_vars:
@@ -193,7 +196,7 @@ def main(external_config = None):
         }
     }
 
-    accelerate_env_vars = get_accelerate_environment_variable(accelerate_mode)
+    accelerate_env_vars = get_accelerate_environment_variable(accelerate_mode, config)
     runtime_env["env_vars"].update(accelerate_env_vars)
 
     if config["General"]["gpt_base_model"] == True:
diff --git a/finetune/finetune.yaml b/finetune/finetune.yaml
index 44bcadcf4..f0092022d 100644
--- a/finetune/finetune.yaml
+++ b/finetune/finetune.yaml
@@ -22,6 +22,7 @@ Training:
   learning_rate: 1.0e-05
   lr_scheduler: linear
   weight_decay: 0.0
+  mixed_precision: bf16
   device: CPU
   num_training_workers: 2
   resources_per_worker:
diff --git a/finetune/finetune_config.py b/finetune/finetune_config.py
index fad9d751b..fc4fe3872 100644
--- a/finetune/finetune_config.py
+++ b/finetune/finetune_config.py
@@ -53,6 +53,7 @@ class Training(BaseModel):
     num_training_workers: int
     resources_per_worker: RayResourceConfig
     accelerate_mode: str
+    mixed_precision: str = "no"
 
     @validator("device")
     def check_device(cls, v: str):
diff --git a/finetune/models/bloom-560m.yaml b/finetune/models/bloom-560m.yaml
index 4c361fc55..c2999ce7a 100644
--- a/finetune/models/bloom-560m.yaml
+++ b/finetune/models/bloom-560m.yaml
@@ -22,6 +22,7 @@ Training:
   learning_rate: 1.0e-05
   lr_scheduler: linear
   weight_decay: 0.0
+  mixed_precision: bf16
   device: CPU
   num_training_workers: 2
   resources_per_worker:
diff --git a/finetune/models/finetune_config_template.yaml b/finetune/models/finetune_config_template.yaml
index 44bcadcf4..f0092022d 100644
--- a/finetune/models/finetune_config_template.yaml
+++ b/finetune/models/finetune_config_template.yaml
@@ -22,6 +22,7 @@ Training:
   learning_rate: 1.0e-05
   lr_scheduler: linear
   weight_decay: 0.0
+  mixed_precision: bf16
   device: CPU
   num_training_workers: 2
   resources_per_worker:
diff --git a/finetune/models/gpt-j-6b.yaml b/finetune/models/gpt-j-6b.yaml
index 44bcadcf4..f0092022d 100644
--- a/finetune/models/gpt-j-6b.yaml
+++ b/finetune/models/gpt-j-6b.yaml
@@ -22,6 +22,7 @@ Training:
   learning_rate: 1.0e-05
   lr_scheduler: linear
   weight_decay: 0.0
+  mixed_precision: bf16
   device: CPU
   num_training_workers: 2
   resources_per_worker:
diff --git a/finetune/models/gpt2.yaml b/finetune/models/gpt2.yaml
index a0887b324..0f9dbf9a8 100644
--- a/finetune/models/gpt2.yaml
+++ b/finetune/models/gpt2.yaml
@@ -22,6 +22,7 @@ Training:
   learning_rate: 1.0e-05
   lr_scheduler: linear
   weight_decay: 0.0
+  mixed_precision: bf16
   device: CPU
   num_training_workers: 2
   resources_per_worker:
diff --git a/finetune/models/llama-2-7b-chat-hf.yaml b/finetune/models/llama-2-7b-chat-hf.yaml
index c7e7430f4..56348b2d1 100644
--- a/finetune/models/llama-2-7b-chat-hf.yaml
+++ b/finetune/models/llama-2-7b-chat-hf.yaml
@@ -22,6 +22,7 @@ Training:
   learning_rate: 1.0e-05
   lr_scheduler: linear
   weight_decay: 0.0
+  mixed_precision: bf16
   device: CPU
   num_training_workers: 2
   resources_per_worker:
diff --git a/finetune/models/llama-7b.yaml b/finetune/models/llama-7b.yaml
index 3bd823253..0d8e9a408 100644
--- a/finetune/models/llama-7b.yaml
+++ b/finetune/models/llama-7b.yaml
@@ -22,6 +22,7 @@ Training:
   learning_rate: 1.0e-05
   lr_scheduler: linear
   weight_decay: 0.0
+  mixed_precision: bf16
   device: CPU
   num_training_workers: 2
   resources_per_worker:
diff --git a/finetune/models/mistral-7b-v0.1.yaml b/finetune/models/mistral-7b-v0.1.yaml
index 46b05a5a8..8e3eec5ce 100644
--- a/finetune/models/mistral-7b-v0.1.yaml
+++ b/finetune/models/mistral-7b-v0.1.yaml
@@ -31,6 +31,7 @@ Training:
   learning_rate: 1.0e-05
   lr_scheduler: linear
   weight_decay: 0.0
+  mixed_precision: bf16
   device: CPU
   num_training_workers: 2
   resources_per_worker:
diff --git a/finetune/models/mpt-7b-chat.yaml b/finetune/models/mpt-7b-chat.yaml
index 149514c07..e8f04d209 100644
--- a/finetune/models/mpt-7b-chat.yaml
+++ b/finetune/models/mpt-7b-chat.yaml
@@ -22,6 +22,7 @@ Training:
   learning_rate: 1.0e-05
   lr_scheduler: linear
   weight_decay: 0.0
+  mixed_precision: bf16
   device: CPU
   num_training_workers: 2
   resources_per_worker:
diff --git a/finetune/models/opt-125m.yaml b/finetune/models/opt-125m.yaml
index 4d8dc7e13..16810aa30 100644
--- a/finetune/models/opt-125m.yaml
+++ b/finetune/models/opt-125m.yaml
@@ -22,6 +22,7 @@ Training:
   learning_rate: 1.0e-05
   lr_scheduler: linear
   weight_decay: 0.0
+  mixed_precision: bf16
   device: CPU
   num_training_workers: 2
   resources_per_worker:
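
For reference, a minimal standalone sketch of the flow this patch introduces: the `mixed_precision` value under `Training` in the YAML config is forwarded to Accelerate through the `ACCELERATE_MIXED_PRECISION` environment variable. The helper name `build_env_vars` and the trimmed CPU_DDP-only mapping below are illustrative stand-ins, not the repository's actual function (that is `get_accelerate_environment_variable` in finetune/finetune.py).

    # Illustrative sketch only. Mirrors the patched logic: read
    # Training.mixed_precision from the config dict and export it as
    # ACCELERATE_MIXED_PRECISION alongside the per-mode Accelerate flags.
    from typing import Any, Dict


    def build_env_vars(mode: str, config: Dict[str, Any]) -> Dict[str, str]:
        # Default to "no" when the field is omitted, matching the pydantic
        # default added in finetune_config.py.
        mixed_precision = config["Training"].get("mixed_precision", "no")
        env_vars = {"ACCELERATE_MIXED_PRECISION": mixed_precision}
        if mode == "CPU_DDP":
            env_vars.update(
                {"ACCELERATE_USE_CPU": "true", "ACCELERATE_USE_IPEX": "false"}
            )
        return env_vars


    config = {"Training": {"mixed_precision": "bf16"}}
    assert build_env_vars("CPU_DDP", config)["ACCELERATE_MIXED_PRECISION"] == "bf16"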