From 452431f33a64ef1d063c27fbb03370f6f9958646 Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Tue, 12 Sep 2023 21:58:01 +0800
Subject: [PATCH 01/22] [chat] fix gemini strategy

---
 .../coati/trainer/strategies/colossalai.py  | 117 ++++++++----------
 applications/Chat/examples/train_sft.py     |  34 ++---
 applications/Chat/examples/train_sft.sh     |  40 +++---
 applications/Chat/requirements.txt          |   2 +-
 applications/Chat/tests/test_checkpoint.py  |  31 ++---
 5 files changed, 94 insertions(+), 130 deletions(-)

diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index fa55f97ad661..d65491ee8589 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -7,11 +7,12 @@
 import colossalai
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
-from colossalai.booster.plugin.gemini_plugin import GeminiModel
+# from colossalai.booster.plugin.gemini_plugin import GeminiModel
 from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
+# from colossalai.zero import ColoInitContext
+from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.tensor import ProcessGroup, ShardSpec
 from colossalai.utils import get_current_device
-from colossalai.zero import ColoInitContext
 from colossalai.zero.gemini.gemini_ddp import GeminiDDP
 
 from .ddp import DDPStrategy
@@ -42,37 +43,37 @@ class LowLevelZeroStrategy(DDPStrategy):
     """
 
-    def __init__(self,
-                 stage: int = 2,
-                 precision: str = 'fp16',
-                 seed: int = 42,
-                 placement_policy: str = 'cuda',
-                 reduce_bucket_size: int = 12 * 1024**2,    # only for stage 1&2
-                 overlap_communication: bool = True,    # only for stage 1&2
-                 initial_scale: float = 2**16,
-                 growth_factor: float = 2,
-                 backoff_factor: float = 0.5,
-                 growth_interval: int = 1000,
-                 hysteresis: int = 2,
-                 min_scale: float = 1,
-                 max_scale: float = 2**32,
-                 max_norm: float = 0.0,
-                 norm_type: float = 2.0
-                 ) -> None:
+    def __init__(
+            self,
+            stage: int = 2,
+            precision: str = 'fp16',
+            seed: int = 42,
+            placement_policy: str = 'cuda',
+            reduce_bucket_size: int = 12 * 1024**2,    # only for stage 1&2
+            overlap_communication: bool = True,    # only for stage 1&2
+            initial_scale: float = 2**16,
+            growth_factor: float = 2,
+            backoff_factor: float = 0.5,
+            growth_interval: int = 1000,
+            hysteresis: int = 2,
+            min_scale: float = 1,
+            max_scale: float = 2**32,
+            max_norm: float = 0.0,
+            norm_type: float = 2.0) -> None:
         assert stage in (1, 2), f'Unsupported stage "{stage}"'
         assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"'
         assert precision in ('fp32', 'fp16'), f'Unsupported precision "{precision}"'
 
         plugin_initializer = lambda: LowLevelZeroPlugin(
-            # zero_config
+            # zero_config
             stage=stage,
             precision=precision,
-            # zero_optim_config
+            # zero_optim_config
             reduce_bucket_size_in_m=reduce_bucket_size,
             overlap_communication=overlap_communication,
             cpu_offload=(placement_policy == 'cpu'),
-            # optim_config
+            # optim_config
             initial_scale=initial_scale,
             growth_factor=growth_factor,
             backoff_factor=backoff_factor,
@@ -81,8 +82,7 @@ def __init__(self,
             min_scale=min_scale,
             max_scale=max_scale,
             max_norm=max_norm,
-            norm_type=norm_type
-        )
+            norm_type=norm_type)
 
         super().__init__(seed, plugin_initializer)
 
@@ -131,43 +131,39 @@ class GeminiStrategy(DDPStrategy):
     """
 
-    def __init__(self,
-                 seed: int = 42,
-                 shard_init: bool = False,    # only for stage 3
-                 placement_policy: str = 'cuda',
-                 pin_memory: bool = True,    # only for stage 3
-                 force_outputs_fp32: bool = False,    # only for stage 3
-                 search_range_m: int = 32,    # only for stage 3
-                 hidden_dim: Optional[int] = None,    # only for stage 3
-                 min_chunk_size_m: float = 32,    # only for stage 3
-                 gpu_margin_mem_ratio: float = 0.0,    # only for stage 3
-                 initial_scale: float = 2**16,
-                 growth_factor: float = 2,
-                 backoff_factor: float = 0.5,
-                 growth_interval: int = 1000,
-                 hysteresis: int = 2,
-                 min_scale: float = 1,
-                 max_scale: float = 2**32,
-                 max_norm: float = 0.0,
-                 norm_type: float = 2.0
-                 ) -> None:
-
-        assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"'
+    def __init__(
+            self,
+            seed: int = 42,
+            shard_init: bool = False,    # only for stage 3
+            placement_policy: str = 'auto',
+            pin_memory: bool = True,    # only for stage 3
+            force_outputs_fp32: bool = False,    # only for stage 3
+            search_range_m: int = 32,    # only for stage 3
+            hidden_dim: Optional[int] = None,    # only for stage 3
+            min_chunk_size_m: float = 32,    # only for stage 3
+            gpu_margin_mem_ratio: float = 0.0,    # only for stage 3
+            initial_scale: float = 2**16,
+            growth_factor: float = 2,
+            backoff_factor: float = 0.5,
+            growth_interval: int = 1000,
+            hysteresis: int = 2,
+            min_scale: float = 1,
+            max_scale: float = 2**32,
+            max_norm: float = 0.0,
+            norm_type: float = 2.0) -> None:
 
         # TODO(ver217): support shard_init when using from_pretrained()
         if shard_init:
-            warnings.warn(
-                f'Shard init is not supported model.from_pretrained() yet. '
-                'Please load weights after strategy.prepare()'
-            )
+            warnings.warn(f'Shard init is not supported model.from_pretrained() yet. '
                          'Please load weights after strategy.prepare()')
         self.shard_init = shard_init
 
         warnings.warn(f'Stage 3 only supports fp16. Precision is set to fp16.')
 
         # NOTE: dist should be initialized before calling get_current_device()
         plugin_initializer = lambda: GeminiPlugin(
-            # gemini_config
-            device=get_current_device(),
+            # gemini_config
+            chunk_init_device=get_current_device(),
             placement_policy=placement_policy,
             precision='fp16',
             pin_memory=pin_memory,
@@ -176,9 +172,9 @@ def __init__(self,
             search_range_m=search_range_m,
             hidden_dim=hidden_dim,
             min_chunk_size_m=min_chunk_size_m,
-            # zero_optim_config
+            # zero_optim_config
            gpu_margin_mem_ratio=gpu_margin_mem_ratio,
-            # optim_config
+            # optim_config
            initial_scale=initial_scale,
            growth_factor=growth_factor,
            backoff_factor=backoff_factor,
@@ -187,8 +183,7 @@ def __init__(self,
            min_scale=min_scale,
            max_scale=max_scale,
            max_norm=max_norm,
-            norm_type=norm_type
-        )
+            norm_type=norm_type)
 
         super().__init__(seed, plugin_initializer)
 
@@ -200,16 +195,10 @@ def setup_distributed(self) -> None:
         colossalai.launch_from_torch({}, seed=self.seed)
 
     def model_init_context(self):
-        world_size = dist.get_world_size()
-        shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
-        default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
-        return ColoInitContext(device=get_current_device(),
-                               dtype=torch.half,
-                               default_pg=shard_pg,
-                               default_dist_spec=default_dist_spec)
+        return super().model_init_context()
 
     def unwrap_model(self, model: nn.Module) -> nn.Module:
-        assert isinstance(model, GeminiModel)
+        # assert isinstance(model, GeminiModel)
         ddp_model = model.unwrap()
         assert isinstance(ddp_model, GeminiDDP)
         return ddp_model.module

diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index f068ea2bf5de..dcc6b0281082 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -6,18 +6,18 @@
 import torch.distributed as dist
 from coati.dataset import SFTDataset, SupervisedDataset
 from coati.models.bloom import BLOOMActor
+from coati.models.chatglm import ChatGLMActor
+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
 from coati.models.gpt import GPTActor
 from coati.models.llama import LlamaActor
 from coati.models.opt import OPTActor
-from coati.models.chatglm import ChatGLMActor
 from coati.trainer import SFTTrainer
 from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
 from datasets import load_dataset
 from torch.optim import Adam
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
-from transformers import AutoTokenizer, BloomTokenizerFast, LlamaTokenizer, AutoModel
-from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
+from transformers import AutoModel, AutoTokenizer, BloomTokenizerFast, LlamaTokenizer
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
 from transformers.trainer import get_scheduler
 
@@ -31,7 +31,7 @@ def train(args):
     if args.strategy == 'ddp':
         strategy = DDPStrategy()
     elif args.strategy == 'colossalai_gemini':
-        strategy = GeminiStrategy(placement_policy='cuda')
+        strategy = GeminiStrategy(placement_policy='auto')
     elif args.strategy == 'colossalai_zero2':
         strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
     elif args.strategy == 'colossalai_zero2_cpu':
@@ -45,21 +45,13 @@ def train(args):
         args.grad_checkpoint = False
     with strategy.model_init_context():
         if args.model == 'bloom':
-            model = BLOOMActor(pretrained=args.pretrain,
-                               lora_rank=args.lora_rank,
-                               checkpoint=args.grad_checkpoint)
+            model = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
         elif args.model == 'opt':
-            model = OPTActor(pretrained=args.pretrain,
-                             lora_rank=args.lora_rank,
-                             checkpoint=args.grad_checkpoint)
+            model = OPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
         elif args.model == 'gpt2':
-            model = GPTActor(pretrained=args.pretrain,
-                             lora_rank=args.lora_rank,
-                             checkpoint=args.grad_checkpoint)
+            model = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
         elif args.model == 'llama':
-            model = LlamaActor(pretrained=args.pretrain,
-                               lora_rank=args.lora_rank,
-                               checkpoint=args.grad_checkpoint)
+            model = LlamaActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
         elif args.model == 'chatglm':
             model = ChatGLMActor(pretrained=args.pretrain)
         else:
@@ -69,16 +61,14 @@ def train(args):
 
     # configure tokenizer
     if args.model == 'gpt2':
-        tokenizer = GPT2Tokenizer.from_pretrained(
-            'gpt2' if args.tokenizer is None else args.tokenizer)
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2' if args.tokenizer is None else args.tokenizer)
         tokenizer.pad_token = tokenizer.eos_token
     elif args.model == 'bloom':
         tokenizer = BloomTokenizerFast.from_pretrained(
             'bigscience/bloom-560m' if args.tokenizer is None else args.tokenizer)
         tokenizer.pad_token = tokenizer.eos_token
     elif args.model == 'opt':
-        tokenizer = AutoTokenizer.from_pretrained(
-            "facebook/opt-350m" if args.tokenizer is None else args.tokenizer)
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m" if args.tokenizer is None else args.tokenizer)
         tokenizer.pad_token = tokenizer.eos_token
     elif args.model == 'llama':
         tokenizer = LlamaTokenizer.from_pretrained(
@@ -86,8 +76,8 @@ def train(args):
         tokenizer.eos_token = '<\s>'
         tokenizer.pad_token = tokenizer.unk_token
     elif args.model == 'chatglm':
-        tokenizer = ChatGLMTokenizer.from_pretrained(
-            "THUDM/chatglm-6b" if args.tokenizer is None else args.tokenizer, trust_remote_code=True)
+        tokenizer = ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b" if args.tokenizer is None else args.tokenizer,
+                                                     trust_remote_code=True)
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
 
diff --git a/applications/Chat/examples/train_sft.sh b/applications/Chat/examples/train_sft.sh
index 1a5cd069011d..b489ecd48edb 100755
--- a/applications/Chat/examples/train_sft.sh
+++ b/applications/Chat/examples/train_sft.sh
@@ -1,29 +1,29 @@
-set_n_least_used_CUDA_VISIBLE_DEVICES() {
-    local n=${1:-"9999"}
-    echo "GPU Memory Usage:"
-    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
-        tail -n +2 |
-        nl -v 0 |
-        tee /dev/tty |
-        sort -g -k 2 |
-        awk '{print $1}' |
-        head -n $n)
-    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
-    echo "Now CUDA_VISIBLE_DEVICES is set to:"
-    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-}
+# set_n_least_used_CUDA_VISIBLE_DEVICES() {
+#     local n=${1:-"9999"}
+#     echo "GPU Memory Usage:"
+#     local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+#         tail -n +2 |
+#         nl -v 0 |
+#         tee /dev/tty |
+#         sort -g -k 2 |
+#         awk '{print $1}' |
+#         head -n $n)
+#     export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+#     echo "Now CUDA_VISIBLE_DEVICES is set to:"
+#     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+# }
 
-set_n_least_used_CUDA_VISIBLE_DEVICES 4
+# set_n_least_used_CUDA_VISIBLE_DEVICES 4
 
 torchrun --standalone --nproc_per_node=4 train_sft.py \
-    --pretrain "/path/to/LLaMa-7B/" \
+    --pretrain "/home/lcjmy/data3/llama" \
     --model 'llama' \
-    --strategy colossalai_zero2 \
+    --strategy colossalai_gemini \
     --log_interval 10 \
-    --save_path /path/to/Coati-7B \
+    --save_path "/home/lcjmy/data3/output" \
-    --dataset /path/to/data.json \
+    --dataset "yizhongw/self_instruct" \
     --batch_size 4 \
-    --accumulation_steps 8 \
+    --accumulation_steps 1 \
     --lr 2e-5 \
     --max_datasets_size 512 \
     --max_epochs 1
 
diff --git a/applications/Chat/requirements.txt b/applications/Chat/requirements.txt
index e5f5ca0932a8..e78b203029a5 100644
--- a/applications/Chat/requirements.txt
+++ b/applications/Chat/requirements.txt
@@ -2,7 +2,7 @@ transformers>=4.20.1
 tqdm
 datasets
 loralib
-colossalai==0.3.1
+colossalai==0.3.2
 torch<2.0.0, >=1.12.1
 langchain
 tokenizers
 
diff --git a/applications/Chat/tests/test_checkpoint.py b/applications/Chat/tests/test_checkpoint.py
index 3a3bf5b19cb8..c8ac416d4eff 100644
--- a/applications/Chat/tests/test_checkpoint.py
+++ b/applications/Chat/tests/test_checkpoint.py
@@ -22,10 +22,7 @@ def get_data(batch_size: int, seq_len: int = 10) -> dict:
     return dict(input_ids=input_ids, attention_mask=attention_mask)
 
-def train_step(strategy: Strategy,
-               actor: GPTActor,
-               actor_optim: HybridAdam,
-               batch_size: int = 8):
+def train_step(strategy: Strategy, actor: GPTActor, actor_optim: HybridAdam, batch_size: int = 8):
     data = get_data(batch_size)
     action_mask = torch.ones_like(data["attention_mask"], dtype=torch.bool)
     actor_output = actor(data["input_ids"], data["attention_mask"])
@@ -35,12 +32,11 @@ def train_step(strategy: Strategy,
     strategy.optimizer_step(actor_optim)
 
-def run_test_checkpoint(strategy_name: str,
-                        shard: bool):
+def run_test_checkpoint(strategy_name: str, shard: bool):
     if strategy_name == "ddp":
         strategy = DDPStrategy()
     elif strategy_name == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="cuda", initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
     elif strategy_name == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     else:
@@ -60,11 +56,9 @@ def run_test_checkpoint(strategy_name: str,
         dist.broadcast_object_list(rank0_dirname)
         rank0_dirname = rank0_dirname[0]
 
-        model_path = os.path.join(
-            rank0_dirname, "model" if shard else f"model.pt")
+        model_path = os.path.join(rank0_dirname, "model" if shard else f"model.pt")
         strategy.save_model(actor, model_path, only_rank0=not shard)
-        optim_path = os.path.join(
-            rank0_dirname, "optim" if shard else "optim.pt")
+        optim_path = os.path.join(rank0_dirname, "optim" if shard else "optim.pt")
         strategy.save_optimizer(actor_optim, optim_path, only_rank0=not shard)
         dist.barrier()
 
@@ -75,11 +69,7 @@ def run_test_checkpoint(strategy_name: str,
         train_step(strategy, actor, actor_optim)
 
-def run_dist(rank: int,
-             world_size: int,
-             port: int,
-             strategy_name: str,
-             shard: bool):
+def run_dist(rank: int, world_size: int, port: int, strategy_name: str, shard: bool):
     os.environ["RANK"] = str(rank)
     os.environ["LOCAL_RANK"] = str(rank)
     os.environ["WORLD_SIZE"] = str(world_size)
@@ -93,13 +83,8 @@ def run_dist(rank: int,
 @pytest.mark.parametrize("strategy_name", ["ddp", "colossalai_gemini", "colossalai_zero2"])
 @pytest.mark.parametrize("shard", [False, True])
 @rerun_if_address_is_in_use()
-def test_checkpoint(world_size: int,
-                    strategy_name: str,
-                    shard: bool):
-    spawn(run_dist,
-          world_size,
-          strategy_name=strategy_name,
-          shard=shard)
+def test_checkpoint(world_size: int, strategy_name: str, shard: bool):
+    spawn(run_dist, world_size, strategy_name=strategy_name, shard=shard)
 
 
 if __name__ == "__main__":
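[Note: the net effect of PATCH 01 is that the chat examples select Gemini's automatic chunk placement instead of pinning everything to CUDA, against colossalai==0.3.2. A minimal usage sketch, not part of the patch series — `build_strategy` is an illustrative name, and the keyword arguments follow the diff above:

    from coati.trainer.strategies import GeminiStrategy, LowLevelZeroStrategy

    def build_strategy(name: str):
        # 'auto' lets Gemini migrate parameter chunks between CPU and GPU
        # based on runtime memory pressure; the old 'cuda' policy kept every
        # chunk resident on the GPU for the whole run.
        if name == 'colossalai_gemini':
            return GeminiStrategy(placement_policy='auto', initial_scale=2**5)
        if name == 'colossalai_zero2':
            return LowLevelZeroStrategy(stage=2, placement_policy='cuda')
        raise ValueError(f'Unsupported strategy "{name}"')
]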
strategy_name == "colossalai_gemini": - strategy = GeminiStrategy(placement_policy="cuda", initial_scale=2**5) + strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5) elif strategy_name == "colossalai_zero2": strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda") else: @@ -60,11 +56,9 @@ def run_test_checkpoint(strategy_name: str, dist.broadcast_object_list(rank0_dirname) rank0_dirname = rank0_dirname[0] - model_path = os.path.join( - rank0_dirname, "model" if shard else f"model.pt") + model_path = os.path.join(rank0_dirname, "model" if shard else f"model.pt") strategy.save_model(actor, model_path, only_rank0=not shard) - optim_path = os.path.join( - rank0_dirname, "optim" if shard else "optim.pt") + optim_path = os.path.join(rank0_dirname, "optim" if shard else "optim.pt") strategy.save_optimizer(actor_optim, optim_path, only_rank0=not shard) dist.barrier() @@ -75,11 +69,7 @@ def run_test_checkpoint(strategy_name: str, train_step(strategy, actor, actor_optim) -def run_dist(rank: int, - world_size: int, - port: int, - strategy_name: str, - shard: bool): +def run_dist(rank: int, world_size: int, port: int, strategy_name: str, shard: bool): os.environ["RANK"] = str(rank) os.environ["LOCAL_RANK"] = str(rank) os.environ["WORLD_SIZE"] = str(world_size) @@ -93,13 +83,8 @@ def run_dist(rank: int, @pytest.mark.parametrize("strategy_name", ["ddp", "colossalai_gemini", "colossalai_zero2"]) @pytest.mark.parametrize("shard", [False, True]) @rerun_if_address_is_in_use() -def test_checkpoint(world_size: int, - strategy_name: str, - shard: bool): - spawn(run_dist, - world_size, - strategy_name=strategy_name, - shard=shard) +def test_checkpoint(world_size: int, strategy_name: str, shard: bool): + spawn(run_dist, world_size, strategy_name=strategy_name, shard=shard) if __name__ == "__main__": From b0c4f28722395977d9c27a8db6d721f45ccbe6f5 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Tue, 12 Sep 2023 22:31:13 +0800 Subject: [PATCH 02/22] [chat] fix gemini strategy --- applications/Chat/requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/Chat/requirements-test.txt b/applications/Chat/requirements-test.txt index eb1a77875acb..c688935bda31 100644 --- a/applications/Chat/requirements-test.txt +++ b/applications/Chat/requirements-test.txt @@ -1,2 +1,2 @@ pytest -colossalai==0.3.1 \ No newline at end of file +colossalai==0.3.2 From 987d38d4bdb239070d5539b4f86e38a4e2698c5d Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Wed, 13 Sep 2023 14:18:16 +0800 Subject: [PATCH 03/22] [chat] fix gemini strategy --- .../coati/trainer/strategies/colossalai.py | 85 +++++++++---------- applications/Chat/examples/train_prompts.py | 46 +++++----- .../Chat/examples/train_reward_model.py | 8 +- applications/Chat/examples/train_sft.sh | 40 ++++----- colossalai/zero/gemini/colo_init_context.py | 2 +- 5 files changed, 86 insertions(+), 95 deletions(-) diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py index d65491ee8589..744fba2d134e 100644 --- a/applications/Chat/coati/trainer/strategies/colossalai.py +++ b/applications/Chat/coati/trainer/strategies/colossalai.py @@ -9,10 +9,9 @@ from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin # from colossalai.booster.plugin.gemini_plugin import GeminiModel from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel -# from colossalai.zero import 
ColoInitContext -from colossalai.lazy.lazy_init import LazyInitContext from colossalai.tensor import ProcessGroup, ShardSpec from colossalai.utils import get_current_device +from colossalai.zero import ColoInitContext from colossalai.zero.gemini.gemini_ddp import GeminiDDP from .ddp import DDPStrategy @@ -65,24 +64,20 @@ def __init__( assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"' assert precision in ('fp32', 'fp16'), f'Unsupported precision "{precision}"' - plugin_initializer = lambda: LowLevelZeroPlugin( - # zero_config - stage=stage, - precision=precision, - # zero_optim_config - reduce_bucket_size_in_m=reduce_bucket_size, - overlap_communication=overlap_communication, - cpu_offload=(placement_policy == 'cpu'), - # optim_config - initial_scale=initial_scale, - growth_factor=growth_factor, - backoff_factor=backoff_factor, - growth_interval=growth_interval, - hysteresis=hysteresis, - min_scale=min_scale, - max_scale=max_scale, - max_norm=max_norm, - norm_type=norm_type) + plugin_initializer = lambda: LowLevelZeroPlugin(stage=stage, + precision=precision, + reduce_bucket_size_in_m=reduce_bucket_size, + overlap_communication=overlap_communication, + cpu_offload=(placement_policy == 'cpu'), + initial_scale=initial_scale, + growth_factor=growth_factor, + backoff_factor=backoff_factor, + growth_interval=growth_interval, + hysteresis=hysteresis, + min_scale=min_scale, + max_scale=max_scale, + max_norm=max_norm, + norm_type=norm_type) super().__init__(seed, plugin_initializer) @@ -161,29 +156,25 @@ def __init__( warnings.warn(f'Stage 3 only supports fp16. Precision is set to fp16.') # NOTE: dist should be initialized before calling get_current_device() - plugin_initializer = lambda: GeminiPlugin( - # gemini_config - chunk_init_device=get_current_device(), - placement_policy=placement_policy, - precision='fp16', - pin_memory=pin_memory, - force_outputs_fp32=force_outputs_fp32, - strict_ddp_mode=shard_init, - search_range_m=search_range_m, - hidden_dim=hidden_dim, - min_chunk_size_m=min_chunk_size_m, - # zero_optim_config - gpu_margin_mem_ratio=gpu_margin_mem_ratio, - # optim_config - initial_scale=initial_scale, - growth_factor=growth_factor, - backoff_factor=backoff_factor, - growth_interval=growth_interval, - hysteresis=hysteresis, - min_scale=min_scale, - max_scale=max_scale, - max_norm=max_norm, - norm_type=norm_type) + plugin_initializer = lambda: GeminiPlugin(chunk_init_device=get_current_device(), + placement_policy=placement_policy, + precision='fp16', + pin_memory=pin_memory, + force_outputs_fp32=force_outputs_fp32, + strict_ddp_mode=shard_init, + search_range_m=search_range_m, + hidden_dim=hidden_dim, + min_chunk_size_m=min_chunk_size_m, + gpu_margin_mem_ratio=gpu_margin_mem_ratio, + initial_scale=initial_scale, + growth_factor=growth_factor, + backoff_factor=backoff_factor, + growth_interval=growth_interval, + hysteresis=hysteresis, + min_scale=min_scale, + max_scale=max_scale, + max_norm=max_norm, + norm_type=norm_type) super().__init__(seed, plugin_initializer) @@ -195,7 +186,13 @@ def setup_distributed(self) -> None: colossalai.launch_from_torch({}, seed=self.seed) def model_init_context(self): - return super().model_init_context() + world_size = dist.get_world_size() + shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None + default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None + return ColoInitContext(device=get_current_device(), + dtype=torch.half, + default_pg=shard_pg, + 
default_dist_spec=default_dist_spec) def unwrap_model(self, model: nn.Module) -> nn.Module: # assert isinstance(model, GeminiModel) diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py index d27a70a3fef6..ab6590568896 100644 --- a/applications/Chat/examples/train_prompts.py +++ b/applications/Chat/examples/train_prompts.py @@ -23,7 +23,7 @@ def main(args): if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': - strategy = GeminiStrategy(placement_policy='cuda', initial_scale=2**5) + strategy = GeminiStrategy(placement_policy='auto', initial_scale=2**5) elif args.strategy == 'colossalai_zero2': strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda') else: @@ -108,16 +108,14 @@ def main(args): # configure tokenizer if args.model == 'gpt2': - tokenizer = GPT2Tokenizer.from_pretrained( - 'gpt2' if args.tokenizer is None else args.tokenizer) + tokenizer = GPT2Tokenizer.from_pretrained('gpt2' if args.tokenizer is None else args.tokenizer) tokenizer.pad_token = tokenizer.eos_token elif args.model == 'bloom': tokenizer = BloomTokenizerFast.from_pretrained( 'bigscience/bloom-560m' if args.tokenizer is None else args.tokenizer) tokenizer.pad_token = tokenizer.eos_token elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained( - "facebook/opt-350m" if args.tokenizer is None else args.tokenizer) + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m" if args.tokenizer is None else args.tokenizer) tokenizer.pad_token = tokenizer.eos_token elif args.model == 'llama': tokenizer = LlamaTokenizer.from_pretrained( @@ -155,26 +153,24 @@ def main(args): strategy.prepare((actor, actor_optim), (critic, critic_optim), reward_model, initial_model) # configure trainer - trainer = PPOTrainer( - strategy, - actor, - critic, - reward_model, - initial_model, - actor_optim, - critic_optim, - kl_coef=args.kl_coef, - ptx_coef=args.ptx_coef, - train_batch_size=args.train_batch_size, - max_length=args.max_seq_len, - use_cache=True, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - offload_inference_models=args.strategy != 'colossalai_gemini' - ) + trainer = PPOTrainer(strategy, + actor, + critic, + reward_model, + initial_model, + actor_optim, + critic_optim, + kl_coef=args.kl_coef, + ptx_coef=args.ptx_coef, + train_batch_size=args.train_batch_size, + max_length=args.max_seq_len, + use_cache=True, + do_sample=True, + temperature=1.0, + top_k=50, + pad_token_id=tokenizer.pad_token_id, + eos_token_id=tokenizer.eos_token_id, + offload_inference_models=args.strategy != 'colossalai_gemini') trainer.fit(prompt_dataloader=prompt_dataloader, pretrain_dataloader=pretrain_dataloader, diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py index 190460bc20f6..57b3a71fd0a8 100644 --- a/applications/Chat/examples/train_reward_model.py +++ b/applications/Chat/examples/train_reward_model.py @@ -27,7 +27,7 @@ def train(args): if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': - strategy = GeminiStrategy(placement_policy='cuda') + strategy = GeminiStrategy(placement_policy='auto') elif args.strategy == 'colossalai_zero2': strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda') else: @@ -54,16 +54,14 @@ def train(args): # configure tokenizer if args.model == 'gpt2': - tokenizer = GPT2Tokenizer.from_pretrained( - 'gpt2' if 
args.tokenizer is None else args.tokenizer) + tokenizer = GPT2Tokenizer.from_pretrained('gpt2' if args.tokenizer is None else args.tokenizer) tokenizer.pad_token = tokenizer.eos_token elif args.model == 'bloom': tokenizer = BloomTokenizerFast.from_pretrained( 'bigscience/bloom-560m' if args.tokenizer is None else args.tokenizer) tokenizer.pad_token = tokenizer.eos_token elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained( - "facebook/opt-350m" if args.tokenizer is None else args.tokenizer) + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m" if args.tokenizer is None else args.tokenizer) tokenizer.pad_token = tokenizer.eos_token elif args.model == 'llama': tokenizer = LlamaTokenizer.from_pretrained( diff --git a/applications/Chat/examples/train_sft.sh b/applications/Chat/examples/train_sft.sh index b489ecd48edb..cbe3b7f416cf 100755 --- a/applications/Chat/examples/train_sft.sh +++ b/applications/Chat/examples/train_sft.sh @@ -1,29 +1,29 @@ -# set_n_least_used_CUDA_VISIBLE_DEVICES() { -# local n=${1:-"9999"} -# echo "GPU Memory Usage:" -# local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv | -# tail -n +2 | -# nl -v 0 | -# tee /dev/tty | -# sort -g -k 2 | -# awk '{print $1}' | -# head -n $n) -# export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g') -# echo "Now CUDA_VISIBLE_DEVICES is set to:" -# echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" -# } +set_n_least_used_CUDA_VISIBLE_DEVICES() { + local n=${1:-"9999"} + echo "GPU Memory Usage:" + local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv | + tail -n +2 | + nl -v 0 | + tee /dev/tty | + sort -g -k 2 | + awk '{print $1}' | + head -n $n) + export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g') + echo "Now CUDA_VISIBLE_DEVICES is set to:" + echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" +} -# set_n_least_used_CUDA_VISIBLE_DEVICES 4 +set_n_least_used_CUDA_VISIBLE_DEVICES 4 torchrun --standalone --nproc_per_node=4 train_sft.py \ - --pretrain "/home/lcjmy/data3/llama" \ + --pretrain "/path/to/LLaMa-7B/" \ --model 'llama' \ - --strategy colossalai_gemini \ + --strategy colossalai_zero2 \ --log_interval 10 \ - --save_path "/home/lcjmy/data3/output" \ - --dataset "yizhongw/self_instruct" \ + --save_path "/path/to/Coati-7B" \ + --dataset /path/to/data.json \ --batch_size 4 \ - --accumulation_steps 1 \ + --accumulation_steps 8 \ --lr 2e-5 \ --max_datasets_size 512 \ --max_epochs 1 diff --git a/colossalai/zero/gemini/colo_init_context.py b/colossalai/zero/gemini/colo_init_context.py index dad852a34a71..e1e35697c534 100644 --- a/colossalai/zero/gemini/colo_init_context.py +++ b/colossalai/zero/gemini/colo_init_context.py @@ -126,7 +126,7 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs): replaced_tensors[param] = colo_param delattr(submodule, param_name) setattr(submodule, param_name, colo_param) - colo_param.shared_param_modules.append(submodule) + # colo_param.shared_param_modules.append(submodule) param_number = 0 meta_param_number = 0 From 23281b15f62a043066fd8c064e35da99b83c9f1b Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Wed, 13 Sep 2023 14:25:42 +0800 Subject: [PATCH 04/22] [chat] fix gemini strategy --- applications/Chat/benchmarks/benchmark_opt_lora_dummy.py | 4 +--- applications/Chat/coati/ray/utils.py | 4 +--- .../Chat/examples/community/peft/train_peft_prompts.py | 2 +- applications/Chat/examples/community/peft/train_peft_sft.py | 6 ++---- 
.../Chat/examples/community/ray/train_prompts_on_ray.py | 6 ++---- applications/Chat/tests/test_experience.py | 2 +- 6 files changed, 8 insertions(+), 16 deletions(-) diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py index 90471ed727b0..0e5d747c4dbe 100644 --- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py +++ b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py @@ -76,9 +76,7 @@ def main(args): if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': - strategy = GeminiStrategy(placement_policy='cuda', initial_scale=2**5) - elif args.strategy == 'colossalai_gemini_cpu': - strategy = GeminiStrategy(placement_policy='cpu', initial_scale=2**5) + strategy = GeminiStrategy(placement_policy='auto', initial_scale=2**5) elif args.strategy == 'colossalai_zero2': strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda') elif args.strategy == 'colossalai_zero2_cpu': diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py index 761186b95ee5..1e661164011d 100644 --- a/applications/Chat/coati/ray/utils.py +++ b/applications/Chat/coati/ray/utils.py @@ -71,11 +71,9 @@ def get_strategy_from_args(strategy: str): if strategy == 'ddp': strategy_ = DDPStrategy() elif strategy == 'colossalai_gemini': - strategy_ = GeminiStrategy(placement_policy='cuda', initial_scale=2**5) + strategy_ = GeminiStrategy(placement_policy='auto', initial_scale=2**5) elif strategy == 'colossalai_zero2': strategy_ = LowLevelZeroStrategy(stage=2, placement_policy='cuda') - elif strategy == 'colossalai_gemini_cpu': - strategy_ = GeminiStrategy(placement_policy='cpu', initial_scale=2**5) elif strategy == 'colossalai_zero2_cpu': strategy_ = LowLevelZeroStrategy(stage=2, placement_policy='cpu') else: diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py index 9385e457d852..0021664b88a8 100644 --- a/applications/Chat/examples/community/peft/train_peft_prompts.py +++ b/applications/Chat/examples/community/peft/train_peft_prompts.py @@ -26,7 +26,7 @@ def main(args): if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': - strategy = GeminiStrategy(placement_policy='cpu', initial_scale=2**5) + strategy = GeminiStrategy(placement_policy='auto', initial_scale=2**5) elif args.strategy == 'colossalai_zero2': strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu') else: diff --git a/applications/Chat/examples/community/peft/train_peft_sft.py b/applications/Chat/examples/community/peft/train_peft_sft.py index 4af08e6d0141..8c1a315439cd 100644 --- a/applications/Chat/examples/community/peft/train_peft_sft.py +++ b/applications/Chat/examples/community/peft/train_peft_sft.py @@ -32,7 +32,7 @@ def train(args): if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': - strategy = GeminiStrategy(placement_policy='cuda') + strategy = GeminiStrategy(placement_policy='auto') elif args.strategy == 'colossalai_zero2': strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda') else: @@ -163,9 +163,7 @@ def train(args): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--strategy', - choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='ddp') + parser.add_argument('--strategy', choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='ddp') 
parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom') parser.add_argument('--pretrain', type=str, default=None) parser.add_argument('--dataset', type=str, default=None) diff --git a/applications/Chat/examples/community/ray/train_prompts_on_ray.py b/applications/Chat/examples/community/ray/train_prompts_on_ray.py index 1bba9ad66fbc..155721b83f56 100644 --- a/applications/Chat/examples/community/ray/train_prompts_on_ray.py +++ b/applications/Chat/examples/community/ray/train_prompts_on_ray.py @@ -102,7 +102,7 @@ def _init_strategy(self, strategy: str): if strategy == 'ddp': self._strategy = DDPStrategy() elif strategy == 'colossalai_gemini': - self._strategy = GeminiStrategy(placement_policy='cuda', initial_scale=2**5) + self._strategy = GeminiStrategy(placement_policy='auto', initial_scale=2**5) elif strategy == 'colossalai_zero2': self._strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda') else: @@ -531,9 +531,7 @@ def main(args): if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--prompt_csv_url', type=str) - parser.add_argument('--strategy', - choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='ddp') + parser.add_argument('--strategy', choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='ddp') parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt']) parser.add_argument('--pretrain', type=str, default='gpt2') parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt') diff --git a/applications/Chat/tests/test_experience.py b/applications/Chat/tests/test_experience.py index 071e50b90e8e..9dd8edec479f 100644 --- a/applications/Chat/tests/test_experience.py +++ b/applications/Chat/tests/test_experience.py @@ -42,7 +42,7 @@ def make_and_consume_experience(strategy): elif strategy == 'colossalai-zero2': strategy = LowLevelZeroStrategy() elif strategy == 'colossalai-gemini': - strategy = GeminiStrategy(placement_policy='cuda') + strategy = GeminiStrategy(placement_policy='auto') else: raise ValueError(f'Unsupported strategy "{strategy}"') From 81c662ad6bde564e57a7e4e3569d939055740e8d Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Wed, 13 Sep 2023 16:03:44 +0800 Subject: [PATCH 05/22] g# This is a combination of 2 commits. [chat] fix gemini strategy fox --- applications/Chat/coati/trainer/strategies/colossalai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py index 744fba2d134e..a8931175cb4a 100644 --- a/applications/Chat/coati/trainer/strategies/colossalai.py +++ b/applications/Chat/coati/trainer/strategies/colossalai.py @@ -156,7 +156,7 @@ def __init__( warnings.warn(f'Stage 3 only supports fp16. 
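[Note: both strategy classes hand the booster a zero-argument callable instead of a constructed plugin. A hedged sketch of why, not part of the patch series — keyword names follow the diff above for colossalai==0.3.2, and `device=` is the spelling this commit settles on (a later commit in the series renames it again):

    from colossalai.booster.plugin import GeminiPlugin
    from colossalai.utils import get_current_device

    # get_current_device() is only meaningful once torch.distributed has been
    # initialized, so plugin construction is deferred behind a lambda and only
    # invoked after setup_distributed() has run.
    plugin_initializer = lambda: GeminiPlugin(device=get_current_device(),
                                              placement_policy='auto',
                                              precision='fp16')
]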
From 98e891bf8950a3754217748523db809259b156a3 Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Thu, 14 Sep 2023 12:06:35 +0800
Subject: [PATCH 06/22] [chat] fix gemini strategy

update llama2 example

[chat] fix gemini strategy
---
 .../Chat/coati/trainer/strategies/colossalai.py | 10 ++--------
 applications/Chat/examples/requirements.txt     |  2 +-
 colossalai/zero/gemini/colo_init_context.py     |  2 +-
 3 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index a8931175cb4a..88909bc8e42d 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -156,7 +156,7 @@ def __init__(
         warnings.warn(f'Stage 3 only supports fp16. Precision is set to fp16.')
 
         # NOTE: dist should be initialized before calling get_current_device()
-        plugin_initializer = lambda: GeminiPlugin(device=get_current_device(),
+        plugin_initializer = lambda: GeminiPlugin(chunk_init_device=get_current_device(),
                                                   placement_policy=placement_policy,
                                                   precision='fp16',
                                                   pin_memory=pin_memory,
@@ -186,13 +186,7 @@ def setup_distributed(self) -> None:
         colossalai.launch_from_torch({}, seed=self.seed)
 
     def model_init_context(self):
-        world_size = dist.get_world_size()
-        shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
-        default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
-        return ColoInitContext(device=get_current_device(),
-                               dtype=torch.half,
-                               default_pg=shard_pg,
-                               default_dist_spec=default_dist_spec)
+        return super().model_init_context()
 
     def unwrap_model(self, model: nn.Module) -> nn.Module:
         # assert isinstance(model, GeminiModel)

diff --git a/applications/Chat/examples/requirements.txt b/applications/Chat/examples/requirements.txt
index 5d0f9f927d17..0890917048d2 100644
--- a/applications/Chat/examples/requirements.txt
+++ b/applications/Chat/examples/requirements.txt
@@ -1,3 +1,3 @@
 pandas>=1.4.1
 sentencepiece
-colossalai==0.3.1
\ No newline at end of file
+colossalai==0.3.2

diff --git a/colossalai/zero/gemini/colo_init_context.py b/colossalai/zero/gemini/colo_init_context.py
index e1e35697c534..dad852a34a71 100644
--- a/colossalai/zero/gemini/colo_init_context.py
+++ b/colossalai/zero/gemini/colo_init_context.py
@@ -126,7 +126,7 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs):
             replaced_tensors[param] = colo_param
         delattr(submodule, param_name)
         setattr(submodule, param_name, colo_param)
-        # colo_param.shared_param_modules.append(submodule)
+        colo_param.shared_param_modules.append(submodule)
 
     param_number = 0
     meta_param_number = 0

From c13ac914f7733e84fbe2c5c1dd2bc57a39170a74 Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Mon, 18 Sep 2023 21:09:16 +0800
Subject: [PATCH 07/22] [fix] fix gemini strategy

---
 applications/Chat/coati/models/generation.py              | 4 ++++
 applications/Chat/coati/trainer/strategies/base.py        | 1 +
 applications/Chat/coati/trainer/strategies/colossalai.py  | 7 +++++++
 applications/Chat/examples/train_prompts.py               | 8 ++++----
 applications/Chat/examples/train_reward_model.py          | 2 +-
 applications/Chat/examples/train_sft.py                   | 2 +-
 colossalai/tensor/param_op_hook.py                        | 2 +-
 colossalai/zero/gemini/colo_init_context.py               | 2 +-
 colossalai/zero/gemini/gemini_ddp.py                      | 6 +++---
 9 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/applications/Chat/coati/models/generation.py b/applications/Chat/coati/models/generation.py
index de0d63f95f50..c3d97ffdea3f 100644
--- a/applications/Chat/coati/models/generation.py
+++ b/applications/Chat/coati/models/generation.py
@@ -58,13 +58,17 @@ def _sample(model: Actor,
     for _ in range(input_ids.size(1), max_length):
         model_inputs = prepare_inputs_fn(input_ids, **model_kwargs) \
             if prepare_inputs_fn is not None else {'input_ids': input_ids}
+        print(model_inputs)
         outputs = model(**model_inputs)
 
         next_token_logits = outputs['logits'][:, -1, :]
         # pre-process distribution
+        print("input_ids" + str(input_ids))
+        print("next_token_logits" + str(next_token_logits))
         next_token_logits = logits_processor(input_ids, next_token_logits)
         # sample
         probs = torch.softmax(next_token_logits, dim=-1, dtype=torch.float)
+        print(probs)
         next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
 
         # finished sentences should have their next token be a padding token

diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py
index c20b2b16e396..8a55143ee90f 100644
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ b/applications/Chat/coati/trainer/strategies/base.py
@@ -49,6 +49,7 @@ def setup_dataloader(self, data_buffer: ExperienceBuffer, pin_memory: bool = Fal
         pass
 
     def model_init_context(self):
+        print("aaaaaaa nullcontext")
         return nullcontext()
 
     def prepare(self, *boost_args: _BoostArgSpec) -> Union[List[_BoostArgSpec], _BoostArgSpec]:

diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index 88909bc8e42d..016b6e7ea3b0 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -186,6 +186,13 @@ def setup_distributed(self) -> None:
         colossalai.launch_from_torch({}, seed=self.seed)
 
     def model_init_context(self):
+        # world_size = dist.get_world_size()
+        # shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
+        # default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
+        # return ColoInitContext(device=get_current_device(),
+        #                        dtype=torch.half,
+        #                        default_pg=shard_pg,
+        #                        default_dist_spec=default_dist_spec)
         return super().model_init_context()
 
     def unwrap_model(self, model: nn.Module) -> nn.Module:

diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index ab6590568896..7b5a03285b7a 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -65,8 +65,8 @@ def main(args):
     if args.rm_path is not None:
         reward_model.load_state_dict(state_dict, strict=False)
 
-    initial_model.to(torch.float16).to(torch.cuda.current_device())
-    reward_model.to(torch.float16).to(torch.cuda.current_device())
+    initial_model.to(torch.bfloat16).to(torch.cuda.current_device())
+    reward_model.to(torch.bfloat16).to(torch.cuda.current_device())
 
     if args.model == 'gpt2':
         actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
@@ -95,8 +95,8 @@ def main(args):
         del state_dict
 
     if args.strategy != 'colossalai_gemini':
-        critic.to(torch.float16).to(torch.cuda.current_device())
-        actor.to(torch.float16).to(torch.cuda.current_device())
+        critic.to(torch.bfloat16).to(torch.cuda.current_device())
+        actor.to(torch.bfloat16).to(torch.cuda.current_device())
 
     # configure optimizer
     if args.strategy.startswith('colossalai'):

diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py
index 57b3a71fd0a8..48ed841d7103 100644
--- a/applications/Chat/examples/train_reward_model.py
+++ b/applications/Chat/examples/train_reward_model.py
@@ -46,7 +46,7 @@ def train(args):
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
 
-    model.to(torch.float16).to(torch.cuda.current_device())
+    model.to(torch.bfloat16).to(torch.cuda.current_device())
 
     if args.model_path is not None:
         state_dict = torch.load(args.model_path)

diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index dcc6b0281082..d0a6becc94e0 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -57,7 +57,7 @@ def train(args):
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
 
-    model.to(torch.float16).to(torch.cuda.current_device())
+    model.to(torch.bfloat16).to(torch.cuda.current_device())
 
     # configure tokenizer
     if args.model == 'gpt2':

diff --git a/colossalai/tensor/param_op_hook.py b/colossalai/tensor/param_op_hook.py
index e37859bac0c3..e2b9926f2585 100644
--- a/colossalai/tensor/param_op_hook.py
+++ b/colossalai/tensor/param_op_hook.py
@@ -143,7 +143,7 @@ def _flatten_grad_args(args) -> Tuple[list, list, List[bool], TreeSpec]:
             grad_args.append(arg)
         else:
             other_args.append(arg)
-    assert len(grad_args) > 0
+    # assert len(grad_args) > 0
     return grad_args, other_args, grad_flags, spec

diff --git a/colossalai/zero/gemini/colo_init_context.py b/colossalai/zero/gemini/colo_init_context.py
index dad852a34a71..e1e35697c534 100644
--- a/colossalai/zero/gemini/colo_init_context.py
+++ b/colossalai/zero/gemini/colo_init_context.py
@@ -126,7 +126,7 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs):
             replaced_tensors[param] = colo_param
         delattr(submodule, param_name)
         setattr(submodule, param_name, colo_param)
-        colo_param.shared_param_modules.append(submodule)
+        # colo_param.shared_param_modules.append(submodule)
 
     param_number = 0
     meta_param_number = 0

diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py
index 918b08cd3150..f80ee7b88a4f 100644
--- a/colossalai/zero/gemini/gemini_ddp.py
+++ b/colossalai/zero/gemini/gemini_ddp.py
@@ -232,9 +232,9 @@ def _post_forward(self):
     def forward(self, *args, **kwargs):
         # check whether we are in a inference mode
        grad_flag = torch.is_grad_enabled()
-        if not grad_flag:
-            assert not self.gemini_manager.need_warmup or not self.gemini_manager.is_warmup(
-            ), "You should run a completed iteration as your warmup iter"
+        # if not grad_flag:
+        #     assert not self.gemini_manager.need_warmup or not self.gemini_manager.is_warmup(
+        #     ), "You should run a completed iteration as your warmup iter"
 
         args, kwargs = _cast_float(args, self.mixed_precision), _cast_float(kwargs, self.mixed_precision)
         self.module.zero_grad(set_to_none=True)

From 88b7b27ea098fe752f35d8af75a9c4238af91791 Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Wed, 20 Sep 2023 16:24:59 +0800
Subject: [PATCH 08/22] [fix] fix gemini strategy

---
 .../Chat/coati/models/bloom/bloom_actor.py |  3 ++
 applications/Chat/coati/models/lora.py     | 31 +++++++++++-------
 .../Chat/coati/trainer/strategies/base.py  |  4 +--
 .../Chat/coati/trainer/strategies/ddp.py   | 32 +++++++++++--------
 applications/Chat/examples/train_prompts.py |  6 ++++
 .../Chat/examples/train_reward_model.py    |  7 ++++
 applications/Chat/examples/train_sft.py    |  8 +++--
 7 files changed, 63 insertions(+), 28 deletions(-)

diff --git a/applications/Chat/coati/models/bloom/bloom_actor.py b/applications/Chat/coati/models/bloom/bloom_actor.py
index d7577f096493..e83f8a906202 100644
--- a/applications/Chat/coati/models/bloom/bloom_actor.py
+++ b/applications/Chat/coati/models/bloom/bloom_actor.py
@@ -3,6 +3,8 @@
 import torch
 from transformers import BloomConfig, BloomForCausalLM, BloomModel
 
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
+
 from ..base import Actor
 
 
@@ -25,6 +27,7 @@ def __init__(self,
                  lora_rank: int = 0,
                  lora_train_bias: str = 'none') -> None:
         if pretrained is not None:
+            # model = BloomForCausalLM(BloomConfig())
             model = BloomForCausalLM.from_pretrained(pretrained)
         elif config is not None:
             model = BloomForCausalLM(config)

diff --git a/applications/Chat/coati/models/lora.py b/applications/Chat/coati/models/lora.py
index 546f675d7d37..f3224e2e92a0 100644
--- a/applications/Chat/coati/models/lora.py
+++ b/applications/Chat/coati/models/lora.py
@@ -70,18 +70,24 @@ def T(w):
             self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
             self.merged = False
 
-    def eval(self):
+    # def eval(self):
+    def merge(self):
 
         def T(w):
             return w.T if self.fan_in_fan_out else w
 
+        print("self.merge_weights and not self.merged" + str(self.merge_weights) + str(not self.merged))
         nn.Module.eval(self)
+        self.merge_weights = True
         if self.merge_weights and not self.merged:
             # Merge the weights and mark it
             if self.r > 0:
-                self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling
+                print(type(self.lora_A), type(self.lora_B))
+                weight = T(self.lora_B @ self.lora_A) * self.scaling
+                self.weight.data = self.weight.data + weight
                 delattr(self, 'lora_A')
                 delattr(self, 'lora_B')
+            print("eval eval eval eval" + str(self.merged))
             self.merged = True
 
     def forward(self, x: torch.Tensor):
@@ -98,21 +104,24 @@ def T(w):
             return F.linear(x, T(self.weight), bias=self.bias)
 
 
-def _lora_linear_wrapper(linear: nn.Linear, lora_rank: int) -> LoraLinear:
+def _lora_linear_wrapper(linear: nn.Linear, lora_rank: int, merge_weights: bool = False) -> LoraLinear:
     assert lora_rank <= linear.in_features, f'LoRA rank ({lora_rank}) must be less than or equal to in features ({linear.in_features})'
-    lora_linear = LoraLinear(linear.weight, linear.bias, r=lora_rank, merge_weights=False)
+    lora_linear = LoraLinear(linear.weight, linear.bias, r=lora_rank, merge_weights=merge_weights)
     return lora_linear
 
 
-def _convert_to_lora_recursively(module: nn.Module, lora_rank: int) -> None:
+def _convert_to_lora_recursively(module: nn.Module, lora_rank: int, merge_weights: bool = False) -> None:
     for name, child in module.named_children():
         if isinstance(child, nn.Linear):
-            setattr(module, name, _lora_linear_wrapper(child, lora_rank))
+            setattr(module, name, _lora_linear_wrapper(child, lora_rank, merge_weights))
         else:
-            _convert_to_lora_recursively(child, lora_rank)
+            _convert_to_lora_recursively(child, lora_rank, merge_weights)
 
 
-def convert_to_lora_module(module: nn.Module, lora_rank: int, lora_train_bias: str = 'none') -> nn.Module:
+def convert_to_lora_module(module: nn.Module,
+                           lora_rank: int,
+                           lora_train_bias: str = 'none',
+                           merge_weights: bool = False) -> nn.Module:
     """Convert a torch.nn.Module to a LoRA module.
 
     Args:
@@ -124,7 +133,7 @@ def convert_to_lora_module(module: nn.Module, lora_rank: int, lora_train_bias: s
     """
     if lora_rank <= 0:
         return module
-    _convert_to_lora_recursively(module, lora_rank)
+    _convert_to_lora_recursively(module, lora_rank, merge_weights)
     lora.mark_only_lora_as_trainable(module, lora_train_bias)
     return module
@@ -145,5 +154,5 @@ def __init__(self, lora_rank: int = 0, lora_train_bias: str = 'none') -> None:
         self.lora_rank = lora_rank
         self.lora_train_bias = lora_train_bias
 
-    def convert_to_lora(self) -> None:
-        convert_to_lora_module(self, self.lora_rank, self.lora_train_bias)
+    def convert_to_lora(self, merge_weights: bool = False) -> None:
+        convert_to_lora_module(self, self.lora_rank, self.lora_train_bias, merge_weights)

diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py
index 8a55143ee90f..18b2e9821a42 100644
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ b/applications/Chat/coati/trainer/strategies/base.py
@@ -49,7 +49,7 @@ def setup_dataloader(self, data_buffer: ExperienceBuffer, pin_memory: bool = Fal
         pass
 
     def model_init_context(self):
-        print("aaaaaaa nullcontext")
+        # print("aaaaaaa nullcontext")
         return nullcontext()
 
     def prepare(self, *boost_args: _BoostArgSpec) -> Union[List[_BoostArgSpec], _BoostArgSpec]:
@@ -110,7 +110,7 @@ def unwrap_model(model: nn.Module) -> nn.Module:
         return model
 
     def save_model(self, model: nn.Module, path: str, only_rank0: bool = True, **kwargs) -> None:
-        self.booster.save_model(model, path, shard=not only_rank0, **kwargs)
+        self.booster.save_model(model, path, shard=False, **kwargs)
 
     def load_model(self, model: nn.Module, path: str, strict: bool = True) -> None:
         self.booster.load_model(model, path, strict)

diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index a52b0460daa8..0ae532d1eeab 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -16,6 +16,7 @@
 from colossalai.booster.plugin import TorchDDPPlugin
 from colossalai.booster.plugin.torch_ddp_plugin import TorchDDPModel
 
+from ...models.lora import LoraLinear
 from .base import Strategy
 from .sampler import DistributedSampler
 
@@ -34,10 +35,7 @@ class DDPStrategy(Strategy):
     Strategy for distributed training using torch.distributed.
     """
 
-    def __init__(self,
-                 seed: int = 42,
-                 plugin_initializer: Callable = TorchDDPPlugin
-                 ) -> None:
+    def __init__(self, seed: int = 42, plugin_initializer: Callable = TorchDDPPlugin) -> None:
         self.seed = seed
         super().__init__(plugin_initializer)
 
@@ -88,6 +86,13 @@ def unwrap_model(self, model: nn.Module) -> nn.Module:
         assert isinstance(model, TorchDDPModel), "model is not wrapped by TorchDDPModel."
         return model.unwrap()
 
+    def eval(self, model):
+        for module in model.children():
+            if isinstance(module, LoraLinear):
+                module.merge()
+            else:
+                self.eval(module)
+
     def save_pretrained(self,
                         model: nn.Module,
                         path: str,
@@ -103,17 +108,15 @@ def save_pretrained(self,
         if tokenizer is not None:
             tokenizer.save_pretrained(path)
         model_path = os.path.join(path, "pytorch_model.bin")
-        self.save_model(model,
-                        model_path,
-                        only_rank0=only_rank0)
+        # print(model)
+        self.eval(model)
+        # print(model)
+        # print([module for module in model.named_children()])
+        self.save_model(model, model_path, only_rank0=only_rank0)
 
-        def _replace_keys(model_path: str,
-                          replace_fn: Callable):
+        def _replace_keys(model_path: str, replace_fn: Callable):
             state_dict = torch.load(model_path, map_location="cpu")
-            state_dict = {
-                replace_fn(k): v
-                for k, v in state_dict.items()
-            }
+            state_dict = {replace_fn(k): v for k, v in state_dict.items()}
             torch.save(state_dict, model_path)
 
         # FIXME: save_model would add "model." prefix to keys of pytorch_model.bin
@@ -121,6 +124,9 @@ def _replace_keys(model_path: str,
         if dist.get_rank() == 0:
             _replace_keys(model_path, lambda k: k.replace("model.", "", 1))
 
+    def load_pretrained(self, model, path):
+        self.load_model(model, path, strict=False)
+
     def get_model_state_dict_shard(self, model: nn.Module, **config):
         # TODO: implement sharding on naive strategy
         model = self.unwrap_model(model)

diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index 7b5a03285b7a..1dd65966c8a2 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -33,12 +33,18 @@ def main(args):
         warnings.warn('LoRA weights should be merged with the model weights')
         state_dict = torch.load(args.rm_path, map_location='cpu')
 
+    if args.lora_rank > 0:
+        warnings.warn("Lora is not supported yet.")
+        args.lora_rank = 0
+
     with strategy.model_init_context():
         # configure model
         if args.model == 'gpt2':
             initial_model = GPTActor(pretrained=args.pretrain)
         elif args.model == 'bloom':
             initial_model = BLOOMActor(pretrained=args.pretrain)
+            # strategy.load_pretrained(initial_model, args.pretrain+"/pytorch_model.bin")
+            # print(initial_model.named_parameters())
         elif args.model == 'opt':
             initial_model = OPTActor(pretrained=args.pretrain)
         elif args.model == 'llama':

diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py
index 48ed841d7103..cab1293d50af 100644
--- a/applications/Chat/examples/train_reward_model.py
+++ b/applications/Chat/examples/train_reward_model.py
@@ -1,4 +1,5 @@
 import argparse
+import warnings
 from random import randint
 
 import torch
@@ -34,6 +35,10 @@ def train(args):
         raise ValueError(f'Unsupported strategy "{args.strategy}"')
 
     # configure model
+    if args.lora_rank > 0:
+        warnings.warn("Lora is not supported yet.")
+        args.lora_rank = 0
+
     with strategy.model_init_context():
         if args.model == 'bloom':
             model = BLOOMRM(pretrained=args.pretrain, lora_rank=args.lora_rank)
@@ -166,6 +171,8 @@ def train(args):
     trainer.fit(train_dataloader=train_dataloader, valid_dataloader=valid_dataloader, eval_dataloader=eval_dataloader)
 
     # save model checkpoint after fitting on only rank0
+    strategy.eval(model)
+    print(args.save_path)
     strategy.save_model(model, args.save_path, only_rank0=True)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:

diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index d0a6becc94e0..f575d35c4d38 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -41,8 +41,9 @@ def train(args):
 
     # configure model
     if args.lora_rank > 0:
-        warnings.warn("Gradient checkpoint is disabled when using LoRA")
-        args.grad_checkpoint = False
+        warnings.warn("Lora is not supported yet.")
+        args.lora_rank = 0
+
     with strategy.model_init_context():
         if args.model == 'bloom':
             model = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
@@ -168,6 +169,9 @@ def train(args):
                 use_wandb=args.use_wandb)
 
     # save model checkpoint after fitting on only rank0
+    # model.eval()
+    # print(type(model))
+    # print("eval eval")
     strategy.save_pretrained(model, path=args.save_path, only_rank0=True, tokenizer=tokenizer)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:

From df9bd2808d6b0fc5ceee91e54b4f5e87e4b1ec90 Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Wed, 20 Sep 2023 17:11:29 +0800
Subject: [PATCH 09/22] [fix] fix gemini strategy

---
 .../Chat/coati/trainer/strategies/colossalai.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index 1e7caa6012c5..e5dcfd66f5f5 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -1,13 +1,16 @@
 import warnings
 from typing import Optional
 
+import torch
+import torch.distributed as dist
 import torch.nn as nn
 
 import colossalai
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
 from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.lazy.lazy_init import LazyInitContext
+from colossalai.tensor import ProcessGroup, ShardSpec
 from colossalai.utils import get_current_device
+from colossalai.zero import ColoInitContext
 from colossalai.zero.gemini.gemini_ddp import GeminiDDP
 
 from .ddp import DDPStrategy
@@ -59,7 +62,7 @@ def __init__(
         assert stage in (1, 2), f'Unsupported stage "{stage}"'
         assert placement_policy in ("cpu", "cuda"), f'Unsupported placement policy "{placement_policy}"'
         assert precision in ("fp32", "fp16"), f'Unsupported precision "{precision}"'
-
+        
         plugin_initializer = lambda: LowLevelZeroPlugin(
             stage=stage,
             precision=precision,
@@ -188,10 +191,15 @@ def setup_distributed(self) -> None:
         colossalai.launch_from_torch({}, seed=self.seed)
 
     def model_init_context(self):
-        return super().model_init_context()
+        world_size = dist.get_world_size()
+        shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
+        default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
+        return ColoInitContext(
+            device=get_current_device(), dtype=torch.half, default_pg=shard_pg, default_dist_spec=default_dist_spec
+        )
+        # return super().model_init_context()
 
     def unwrap_model(self, model: nn.Module) -> nn.Module:
         ddp_model = model.unwrap()
         assert isinstance(ddp_model, GeminiDDP)
         return ddp_model.module
-
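[Note: PATCH 09 restores ColoInitContext so that, when shard_init is enabled, parameters are created directly in a tensor-parallel layout rather than as full replicas. A hedged sketch of the idea, not part of the patch series, assuming the ColossalAI 0.3.x API used in the diff above:

    import torch
    import torch.distributed as dist
    from colossalai.tensor import ProcessGroup, ShardSpec
    from colossalai.utils import get_current_device
    from colossalai.zero import ColoInitContext

    # Build the model inside the context so each rank materializes only its
    # shard (last dim split across the whole world) instead of a full copy.
    world_size = dist.get_world_size()
    with ColoInitContext(device=get_current_device(),
                         dtype=torch.half,
                         default_pg=ProcessGroup(tp_degree=world_size),
                         default_dist_spec=ShardSpec([-1], [world_size])):
        ...  # model construction, e.g. BLOOMActor(pretrained=...)
]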
 .../Chat/coati/trainer/strategies/colossalai.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index 1e7caa6012c5..e5dcfd66f5f5 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -1,13 +1,16 @@
 import warnings
 from typing import Optional

+import torch
+import torch.distributed as dist
 import torch.nn as nn

 import colossalai
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
 from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.lazy.lazy_init import LazyInitContext
+from colossalai.tensor import ProcessGroup, ShardSpec
 from colossalai.utils import get_current_device
+from colossalai.zero import ColoInitContext
 from colossalai.zero.gemini.gemini_ddp import GeminiDDP

 from .ddp import DDPStrategy
@@ -59,7 +62,7 @@ def __init__(
         assert stage in (1, 2), f'Unsupported stage "{stage}"'
         assert placement_policy in ("cpu", "cuda"), f'Unsupported placement policy "{placement_policy}"'
         assert precision in ("fp32", "fp16"), f'Unsupported precision "{precision}"'
-
+
         plugin_initializer = lambda: LowLevelZeroPlugin(
             stage=stage,
             precision=precision,
@@ -188,10 +191,15 @@ def setup_distributed(self) -> None:
         colossalai.launch_from_torch({}, seed=self.seed)

     def model_init_context(self):
-        return super().model_init_context()
+        world_size = dist.get_world_size()
+        shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
+        default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
+        return ColoInitContext(
+            device=get_current_device(), dtype=torch.half, default_pg=shard_pg, default_dist_spec=default_dist_spec
+        )
+        # return super().model_init_context()

     def unwrap_model(self, model: nn.Module) -> nn.Module:
         ddp_model = model.unwrap()
         assert isinstance(ddp_model, GeminiDDP)
         return ddp_model.module
-

From 5443763f3e35d66ba319b1643beff356964dc921 Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Wed, 20 Sep 2023 17:31:16 +0800
Subject: [PATCH 10/22] [fix] fix gemini strategy

---
 .../benchmarks/benchmark_opt_lora_dummy.py     | 20 +++++++++----------
 .../Chat/coati/models/bloom/bloom_actor.py     |  3 ---
 applications/Chat/coati/models/generation.py   |  1 -
 applications/Chat/coati/models/lora.py         | 14 ++++++-------
 applications/Chat/coati/ray/utils.py           | 12 +++++------
 .../Chat/coati/trainer/strategies/base.py      |  1 -
 .../Chat/coati/trainer/strategies/ddp.py       |  4 ----
 .../community/peft/train_peft_prompts.py       |  8 ++++----
 .../examples/community/peft/train_peft_sft.py  |  8 ++++----
 applications/Chat/examples/train_prompts.py    |  8 ++++----
 .../Chat/examples/train_reward_model.py        | 11 ++++------
 applications/Chat/examples/train_sft.py        | 15 ++++++--------
 applications/Chat/tests/test_experience.py     |  4 ++--
 13 files changed, 46 insertions(+), 63 deletions(-)

diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
index 5647d8bc03d3..14a313d886d8 100644
--- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
+++ b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
@@ -75,16 +75,16 @@ def get_gpt_config(model_name: str) -> OPTConfig:
 def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
-    elif args.strategy == 'colossalai_gemini':
-        strategy = GeminiStrategy(placement_policy='auto', initial_scale=2**5)
-    elif args.strategy == 'colossalai_zero2':
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
-    elif args.strategy == 'colossalai_zero2_cpu':
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
-    elif args.strategy == 'colossalai_zero1':
-        strategy = LowLevelZeroStrategy(stage=1, placement_policy='cuda')
-    elif args.strategy == 'colossalai_zero1_cpu':
-        strategy = LowLevelZeroStrategy(stage=1, placement_policy='cpu')
+    elif args.strategy == "colossalai_gemini":
+        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+    elif args.strategy == "colossalai_zero2":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
+    elif args.strategy == "colossalai_zero2_cpu":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
+    elif args.strategy == "colossalai_zero1":
+        strategy = LowLevelZeroStrategy(stage=1, placement_policy="cuda")
+    elif args.strategy == "colossalai_zero1_cpu":
+        strategy = LowLevelZeroStrategy(stage=1, placement_policy="cpu")
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')

diff --git a/applications/Chat/coati/models/bloom/bloom_actor.py b/applications/Chat/coati/models/bloom/bloom_actor.py
index 8f37cc1a5396..73855a2245e7 100644
--- a/applications/Chat/coati/models/bloom/bloom_actor.py
+++ b/applications/Chat/coati/models/bloom/bloom_actor.py
@@ -2,8 +2,6 @@

 from transformers import BloomConfig, BloomForCausalLM

-from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
-
 from ..base import Actor

@@ -28,7 +26,6 @@ def __init__(
         lora_train_bias: str = "none",
     ) -> None:
         if pretrained is not None:
-            # model = BloomForCausalLM(BloomConfig())
             model = BloomForCausalLM.from_pretrained(pretrained)
         elif config is not None:
             model = BloomForCausalLM(config)
diff --git a/applications/Chat/coati/models/generation.py b/applications/Chat/coati/models/generation.py
index 1063edad5f26..4ab0cdc8a3ea 100644
--- a/applications/Chat/coati/models/generation.py
+++ b/applications/Chat/coati/models/generation.py
@@ -69,7 +69,6 @@ def _sample(
         next_token_logits = logits_processor(input_ids, next_token_logits)
         # sample
         probs = torch.softmax(next_token_logits, dim=-1, dtype=torch.float)
-        print(probs)
         next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)

         # finished sentences should have their next token be a padding token
diff --git a/applications/Chat/coati/models/lora.py b/applications/Chat/coati/models/lora.py
index 35613cd81b45..2114913e107b 100644
--- a/applications/Chat/coati/models/lora.py
+++ b/applications/Chat/coati/models/lora.py
@@ -70,9 +70,7 @@ def eval(self):
         def T(w):
             return w.T if self.fan_in_fan_out else w

-        print("self.merge_weights and not self.merged" + str(self.merge_weights) + str(not self.merged))
         nn.Module.eval(self)
-        self.merge_weights = True
         if self.merge_weights and not self.merged:
             # Merge the weights and mark it
             if self.r > 0:
@@ -102,12 +100,12 @@ def _lora_linear_wrapper(linear: nn.Linear, lora_rank: int) -> LoraLinear:
     return lora_linear


-def _convert_to_lora_recursively(module: nn.Module, lora_rank: int, merge_weights: bool = False) -> None:
+def _convert_to_lora_recursively(module: nn.Module, lora_rank: int) -> None:
     for name, child in module.named_children():
         if isinstance(child, nn.Linear):
-            setattr(module, name, _lora_linear_wrapper(child, lora_rank, merge_weights))
+            setattr(module, name, _lora_linear_wrapper(child, lora_rank))
         else:
-            _convert_to_lora_recursively(child, lora_rank, merge_weights)
+            _convert_to_lora_recursively(child, lora_rank)


 def convert_to_lora_module(module: nn.Module, lora_rank: int, lora_train_bias: str = "none") -> nn.Module:
@@ -122,7 +120,7 @@ def convert_to_lora_module(module: nn.Module, lora_rank: int, lora_train_bias: s
     """
     if lora_rank <= 0:
         return module
-    _convert_to_lora_recursively(module, lora_rank, merge_weights)
+    _convert_to_lora_recursively(module, lora_rank)
     lora.mark_only_lora_as_trainable(module, lora_train_bias)
     return module

@@ -143,5 +141,5 @@ def __init__(self, lora_rank: int = 0, lora_train_bias: str = "none") -> None:
         self.lora_rank = lora_rank
         self.lora_train_bias = lora_train_bias

-    def convert_to_lora(self, merge_weights: bool = False) -> None:
-        convert_to_lora_module(self, self.lora_rank, self.lora_train_bias, merge_weights)
+    def convert_to_lora(self) -> None:
+        convert_to_lora_module(self, self.lora_rank, self.lora_train_bias)
diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py
index caf22435fdb2..33a7cdcae26a 100644
--- a/applications/Chat/coati/ray/utils.py
+++ b/applications/Chat/coati/ray/utils.py
@@ -70,12 +70,12 @@ def get_reward_model_from_args(model: str, pretrained: str = None, config=None):
 def get_strategy_from_args(strategy: str):
     if strategy == "ddp":
         strategy_ = DDPStrategy()
-    elif strategy == 'colossalai_gemini':
-        strategy_ = GeminiStrategy(placement_policy='auto', initial_scale=2**5)
-    elif strategy == 'colossalai_zero2':
-        strategy_ = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
-    elif strategy == 'colossalai_zero2_cpu':
-        strategy_ = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
+    elif strategy == "colossalai_gemini":
+        strategy_ = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+    elif strategy == "colossalai_zero2":
+        strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
+    elif strategy == "colossalai_zero2_cpu":
+        strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:
         raise ValueError(f'Unsupported strategy "{strategy}"')
     return strategy_
diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py
index 2cf7e29a2135..f25cd725a633 100644
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ b/applications/Chat/coati/trainer/strategies/base.py
@@ -49,7 +49,6 @@ def setup_dataloader(self, data_buffer: ExperienceBuffer, pin_memory: bool = Fal
         pass

     def model_init_context(self):
-        # print("aaaaaaa nullcontext")
         return nullcontext()

     def prepare(self, *boost_args: _BoostArgSpec) -> Union[List[_BoostArgSpec], _BoostArgSpec]:
diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index 9187b74c987e..66ff6703da4d 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -16,7 +16,6 @@
 from colossalai.booster.plugin import TorchDDPPlugin
 from colossalai.booster.plugin.torch_ddp_plugin import TorchDDPModel

-from ...models.lora import LoraLinear
 from .base import Strategy
 from .sampler import DistributedSampler

@@ -112,9 +111,6 @@ def _replace_keys(model_path: str, replace_fn: Callable):
         if dist.get_rank() == 0:
             _replace_keys(model_path, lambda k: k.replace("model.", "", 1))

-    def load_pretrained(self, model, path):
-        self.load_model(model, path, strict=False)
-
     def get_model_state_dict_shard(self, model: nn.Module, **config):
         # TODO: implement sharding on naive strategy
         model = self.unwrap_model(model)
diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py
index accbdd28500e..dc6e0f504be7 100644
--- a/applications/Chat/examples/community/peft/train_peft_prompts.py
+++ b/applications/Chat/examples/community/peft/train_peft_prompts.py
@@ -23,10 +23,10 @@ def main(args):
     # configure strategy
     if args.strategy == "ddp":
         strategy = DDPStrategy()
-    elif args.strategy == 'colossalai_gemini':
-        strategy = GeminiStrategy(placement_policy='auto', initial_scale=2**5)
-    elif args.strategy == 'colossalai_zero2':
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
+    elif args.strategy == "colossalai_gemini":
+        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+    elif args.strategy == "colossalai_zero2":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')

diff --git a/applications/Chat/examples/community/peft/train_peft_sft.py b/applications/Chat/examples/community/peft/train_peft_sft.py
index 032d83d93c8e..091e85c1378b 100644
--- a/applications/Chat/examples/community/peft/train_peft_sft.py
+++ b/applications/Chat/examples/community/peft/train_peft_sft.py
@@ -23,10 +23,10 @@ def train(args):
     # configure strategy
     if args.strategy == "ddp":
         strategy = DDPStrategy()
-    elif args.strategy == 'colossalai_gemini':
-        strategy = GeminiStrategy(placement_policy='auto')
-    elif args.strategy == 'colossalai_zero2':
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
+    elif args.strategy == "colossalai_gemini":
+        strategy = GeminiStrategy(placement_policy="auto")
+    elif args.strategy == "colossalai_zero2":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')

diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index 8ccf73b96e4f..f04edc030ce0 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -22,10 +22,10 @@ def main(args):
     # configure strategy
     if args.strategy == "ddp":
         strategy = DDPStrategy()
-    elif args.strategy == 'colossalai_gemini':
-        strategy = GeminiStrategy(placement_policy='auto', initial_scale=2**5)
-    elif args.strategy == 'colossalai_zero2':
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
+    elif args.strategy == "colossalai_gemini":
+        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+    elif args.strategy == "colossalai_zero2":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')

diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py
index eb2ff5499e0f..6d14e42a293f 100644
--- a/applications/Chat/examples/train_reward_model.py
+++ b/applications/Chat/examples/train_reward_model.py
@@ -1,6 +1,5 @@
 import argparse
 import warnings
-from random import randint

 import torch
 import torch.distributed as dist
@@ -27,10 +26,10 @@ def train(args):
     # configure strategy
     if args.strategy == "ddp":
         strategy = DDPStrategy()
-    elif args.strategy == 'colossalai_gemini':
-        strategy = GeminiStrategy(placement_policy='auto')
-    elif args.strategy == 'colossalai_zero2':
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
+    elif args.strategy == "colossalai_gemini":
+        strategy = GeminiStrategy(placement_policy="auto")
+    elif args.strategy == "colossalai_zero2":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')

@@ -164,8 +163,6 @@ def train(args):
         use_wandb=args.use_wandb,
     )
     # save model checkpoint after fitting on only rank0
-    strategy.eval(model)
-    print(args.save_path)
     strategy.save_model(model, args.save_path, only_rank0=True)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:
diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index 3456c09b2d31..bd060301998c 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -29,12 +29,12 @@ def train(args):
     # configure strategy
     if args.strategy == "ddp":
         strategy = DDPStrategy()
-    elif args.strategy == 'colossalai_gemini':
-        strategy = GeminiStrategy(placement_policy='auto')
-    elif args.strategy == 'colossalai_zero2':
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
-    elif args.strategy == 'colossalai_zero2_cpu':
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
+    elif args.strategy == "colossalai_gemini":
+        strategy = GeminiStrategy(placement_policy="auto")
+    elif args.strategy == "colossalai_zero2":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
+    elif args.strategy == "colossalai_zero2_cpu":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')

@@ -179,9 +179,6 @@ def train(args):
     )

     # save model checkpoint after fitting on only rank0
-    # model.eval()
-    # print(type(model))
-    # print("eval eval")
     strategy.save_pretrained(model, path=args.save_path, only_rank0=True, tokenizer=tokenizer)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:
diff --git a/applications/Chat/tests/test_experience.py b/applications/Chat/tests/test_experience.py
index 9ec145b7b62e..70287a35cea0 100644
--- a/applications/Chat/tests/test_experience.py
+++ b/applications/Chat/tests/test_experience.py
@@ -42,8 +42,8 @@ def make_and_consume_experience(strategy):
         strategy = DDPStrategy()
     elif strategy == "colossalai-zero2":
         strategy = LowLevelZeroStrategy()
-    elif strategy == 'colossalai-gemini':
-        strategy = GeminiStrategy(placement_policy='auto')
+    elif strategy == "colossalai-gemini":
+        strategy = GeminiStrategy(placement_policy="auto")
     else:
         raise ValueError(f'Unsupported strategy "{strategy}"')
From 48f39679cbfb99a37f51c0defd8d1676e67fc2d4 Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Thu, 21 Sep 2023 15:04:33 +0800
Subject: [PATCH 11/22] [fix] fix gemini strategy

---
 .../coati/trainer/strategies/colossalai.py | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index e5dcfd66f5f5..1c0d1a0d2fb8 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -1,16 +1,12 @@
 import warnings
 from typing import Optional

-import torch
-import torch.distributed as dist
 import torch.nn as nn

 import colossalai
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
 from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.tensor import ProcessGroup, ShardSpec
 from colossalai.utils import get_current_device
-from colossalai.zero import ColoInitContext
 from colossalai.zero.gemini.gemini_ddp import GeminiDDP

 from .ddp import DDPStrategy
@@ -191,13 +187,13 @@ def setup_distributed(self) -> None:
         colossalai.launch_from_torch({}, seed=self.seed)

     def model_init_context(self):
-        world_size = dist.get_world_size()
-        shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
-        default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
-        return ColoInitContext(
-            device=get_current_device(), dtype=torch.half, default_pg=shard_pg, default_dist_spec=default_dist_spec
-        )
-        # return super().model_init_context()
+        return super().model_init_context()

     def unwrap_model(self, model: nn.Module) -> nn.Module:
         ddp_model = model.unwrap()

From 8fcbcb27aa2529d4be0dd6a1f1bd919a31a7885d Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 22 Sep 2023 17:57:27 +0800
Subject: [PATCH 12/22] [fix] fix gemini strategy

---
 applications/Chat/benchmarks/benchmark_opt_lora_dummy.py | 2 ++
 applications/Chat/coati/ray/utils.py                     | 2 ++
 applications/Chat/coati/trainer/strategies/colossalai.py | 9 +++------
 applications/Chat/examples/train_prompts.py              | 2 +-
 applications/Chat/tests/test_experience.py               | 2 +-
 colossalai/tensor/param_op_hook.py                       | 2 +-
 colossalai/zero/gemini/colo_init_context.py              | 2 +-
 colossalai/zero/gemini/gemini_ddp.py                     | 6 +++---
 8 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
index 14a313d886d8..583ce860e608 100644
--- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
+++ b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
@@ -77,6 +77,8 @@ def main(args):
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
         strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+    elif args.strategy == "colossalai_gemini_cpu":
+        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     elif args.strategy == "colossalai_zero2_cpu":
diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py
index 33a7cdcae26a..6f8e2d94ce8e 100644
--- a/applications/Chat/coati/ray/utils.py
+++ b/applications/Chat/coati/ray/utils.py
@@ -76,6 +76,8 @@ def get_strategy_from_args(strategy: str):
         strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     elif strategy == "colossalai_zero2_cpu":
         strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
+    elif strategy == "colossalai_gemini_cpu":
+        strategy_ = GeminiStrategy(placement_policy="static", initial_scale=2**5)
     else:
         raise ValueError(f'Unsupported strategy "{strategy}"')
     return strategy_
diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index 1c0d1a0d2fb8..0e3258304107 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -159,6 +159,9 @@ def __init__(
         plugin_initializer = lambda: GeminiPlugin(
             chunk_init_device=get_current_device(),
             placement_policy=placement_policy,
+            shard_param_frac=1.0,
+            offload_optim_frac=1.0,
+            offload_param_frac=1.0,
             precision="fp16",
             pin_memory=pin_memory,
             force_outputs_fp32=force_outputs_fp32,
@@ -187,12 +190,6 @@ def setup_distributed(self) -> None:
         colossalai.launch_from_torch({}, seed=self.seed)

     def model_init_context(self):
-        # world_size = dist.get_world_size()
-        # shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
-        # default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
-        # return ColoInitContext(
-        #     device=get_current_device(), dtype=torch.half, default_pg=shard_pg, default_dist_spec=default_dist_spec
-        # )
         return super().model_init_context()

     def unwrap_model(self, model: nn.Module) -> nn.Module:
diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index f04edc030ce0..29063295a0c2 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -23,7 +23,7 @@ def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     else:
diff --git a/applications/Chat/tests/test_experience.py b/applications/Chat/tests/test_experience.py
index 70287a35cea0..a9591259800d 100644
--- a/applications/Chat/tests/test_experience.py
+++ b/applications/Chat/tests/test_experience.py
@@ -43,7 +43,7 @@ def make_and_consume_experience(strategy):
     elif strategy == "colossalai-zero2":
         strategy = LowLevelZeroStrategy()
     elif strategy == "colossalai-gemini":
-        strategy = GeminiStrategy(placement_policy="auto")
+        strategy = GeminiStrategy(placement_policy="static")
     else:
         raise ValueError(f'Unsupported strategy "{strategy}"')

diff --git a/colossalai/tensor/param_op_hook.py b/colossalai/tensor/param_op_hook.py
index a92a0d3fc40d..1fe99cd89a4e 100644
--- a/colossalai/tensor/param_op_hook.py
+++ b/colossalai/tensor/param_op_hook.py
@@ -142,7 +142,7 @@ def _flatten_grad_args(args) -> Tuple[list, list, List[bool], TreeSpec]:
             grad_args.append(arg)
         else:
             other_args.append(arg)
-    # assert len(grad_args) > 0
+    assert len(grad_args) > 0
     return grad_args, other_args, grad_flags, spec
diff --git a/colossalai/zero/gemini/colo_init_context.py b/colossalai/zero/gemini/colo_init_context.py
index 711b0d2196db..ab2ff8f920aa 100644
--- a/colossalai/zero/gemini/colo_init_context.py
+++ b/colossalai/zero/gemini/colo_init_context.py
@@ -130,7 +130,7 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs):
                 replaced_tensors[param] = colo_param
             delattr(submodule, param_name)
             setattr(submodule, param_name, colo_param)
-            # colo_param.shared_param_modules.append(submodule)
+            colo_param.shared_param_modules.append(submodule)

         param_number = 0
         meta_param_number = 0
diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py
index c3ecd2578468..8b149a65497a 100644
--- a/colossalai/zero/gemini/gemini_ddp.py
+++ b/colossalai/zero/gemini/gemini_ddp.py
@@ -237,9 +237,9 @@ def _post_forward(self):
     def forward(self, *args, **kwargs):
         # check whether we are in a inference mode
         grad_flag = torch.is_grad_enabled()
-        # if not grad_flag:
-        #     assert not self.gemini_manager.need_warmup or not self.gemini_manager.is_warmup(
-        #     ), "You should run a completed iteration as your warmup iter"
+        if not grad_flag:
+            assert not self.gemini_manager.need_warmup or not self.gemini_manager.is_warmup(
+            ), "You should run a completed iteration as your warmup iter"

         args, kwargs = _cast_float(args, self.mixed_precision), _cast_float(kwargs, self.mixed_precision)
         self.module.zero_grad(set_to_none=True)

From 16c5d3a7672e470efd3a0b151a532b32e5f27dd8 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 22 Sep 2023 18:02:48 +0800
Subject: [PATCH 13/22] [fix] fix gemini strategy

---
 applications/Chat/examples/community/peft/train_peft_prompts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py
index dc6e0f504be7..bbac91477085 100644
--- a/applications/Chat/examples/community/peft/train_peft_prompts.py
+++ b/applications/Chat/examples/community/peft/train_peft_prompts.py
@@ -24,7 +24,7 @@ def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:

From a11f72865ae20cfc92adb2c10e5e567904e9ac5c Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 22 Sep 2023 18:38:42 +0800
Subject: [PATCH 14/22] [fix] fix gemini strategy

---
 applications/Chat/tests/test_train.sh | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/applications/Chat/tests/test_train.sh b/applications/Chat/tests/test_train.sh
index 55de269005ed..e8ea9281c22d 100755
--- a/applications/Chat/tests/test_train.sh
+++ b/applications/Chat/tests/test_train.sh
@@ -34,7 +34,7 @@ if [ -z "$PRETRAIN_DATASET" ]; then
     exit 1
 fi

-NUM_RETRY=3
+# NUM_RETRY=3
 BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
 EXAMPLES_DIR=$BASE_DIR/examples
 MODELS_DIR=$BASE_DIR/examples/models_config
@@ -80,9 +80,7 @@ SKIPPED_TESTS=(
     "llama-ddp"
     "llama-colossalai_gemini"
     "llama-colossalai_zero2"
-    "gpt2-colossalai_gemini"
-    "opt-colossalai_gemini"
-    "bloom-colossalai_gemini"
+    "bloom-colossalai_zero2-4"
 )

 GRAD_CKPTS=('' '--grad_checkpoint')

From 6903f9662ea618eda74aaac04ac5989982391c8c Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 22 Sep 2023 18:42:01 +0800
Subject: [PATCH 15/22] [fix] fix gemini strategy

---
 applications/Chat/tests/test_train.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/applications/Chat/tests/test_train.sh b/applications/Chat/tests/test_train.sh
index e8ea9281c22d..1a470a7fcfcb 100755
--- a/applications/Chat/tests/test_train.sh
+++ b/applications/Chat/tests/test_train.sh
@@ -34,7 +34,7 @@ if [ -z "$PRETRAIN_DATASET" ]; then
     exit 1
 fi

-# NUM_RETRY=3
+NUM_RETRY=3
 BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
 EXAMPLES_DIR=$BASE_DIR/examples
 MODELS_DIR=$BASE_DIR/examples/models_config

From 057b74b043b25703e9b3c9a64c8049457f05b69b Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 22 Sep 2023 18:42:56 +0800
Subject: [PATCH 16/22] [fix] fix gemini strategy

---
 applications/Chat/coati/ray/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py
index 6f8e2d94ce8e..49ab96b6720d 100644
--- a/applications/Chat/coati/ray/utils.py
+++ b/applications/Chat/coati/ray/utils.py
@@ -74,10 +74,10 @@ def get_strategy_from_args(strategy: str):
         strategy_ = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
     elif strategy == "colossalai_zero2":
         strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
-    elif strategy == "colossalai_zero2_cpu":
-        strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     elif strategy == "colossalai_gemini_cpu":
         strategy_ = GeminiStrategy(placement_policy="static", initial_scale=2**5)
+    elif strategy == "colossalai_zero2_cpu":
+        strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:
         raise ValueError(f'Unsupported strategy "{strategy}"')
     return strategy_

From 5f42f1c73143021bbda4980ab93175d010b62232 Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Mon, 25 Sep 2023 16:58:08 +0800
Subject: [PATCH 17/22] fix

---
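Note on the placement arguments this patch threads through GeminiStrategy: with placement_policy="static", the fractions shard_param_frac, offload_optim_frac and offload_param_frac control how much of the parameters and optimizer state stay on GPU versus get offloaded to CPU. A minimal sketch of the two presets used in this series, based only on the constructor arguments visible in the diffs below (the import path assumes the usual coati example layout):

    from coati.trainer.strategies import GeminiStrategy

    # "colossalai_gemini": static placement, parameters kept sharded on GPU
    strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)

    # "colossalai_gemini_cpu": same policy, but optimizer state and parameters
    # fully offloaded to CPU
    strategy_cpu = GeminiStrategy(
        placement_policy="static",
        offload_optim_frac=1.0,
        offload_param_frac=1.0,
        initial_scale=2**5,
    )
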
 .../Chat/benchmarks/benchmark_opt_lora_dummy.py |  4 ++--
 applications/Chat/coati/ray/utils.py            |  4 ++--
 .../Chat/coati/trainer/strategies/base.py       |  2 +-
 .../Chat/coati/trainer/strategies/colossalai.py | 14 ++++++++------
 .../Chat/coati/trainer/strategies/ddp.py        | 16 ++++++++--------
 .../community/peft/train_peft_prompts.py        |  2 +-
 .../examples/community/peft/train_peft_sft.py   |  2 +-
 applications/Chat/examples/train_prompts.py     |  2 +-
 8 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
index 583ce860e608..0d0e2a7d34f5 100644
--- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
+++ b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
@@ -76,9 +76,9 @@ def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="static",initial_scale=2**5)
     elif args.strategy == "colossalai_gemini_cpu":
-        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5)
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     elif args.strategy == "colossalai_zero2_cpu":
diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py
index 49ab96b6720d..b88140c0e036 100644
--- a/applications/Chat/coati/ray/utils.py
+++ b/applications/Chat/coati/ray/utils.py
@@ -71,11 +71,11 @@ def get_strategy_from_args(strategy: str):
     if strategy == "ddp":
         strategy_ = DDPStrategy()
     elif strategy == "colossalai_gemini":
-        strategy_ = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+        strategy_ = GeminiStrategy(placement_policy="static", initial_scale=2**5)
     elif strategy == "colossalai_zero2":
         strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     elif strategy == "colossalai_gemini_cpu":
-        strategy_ = GeminiStrategy(placement_policy="static", initial_scale=2**5)
+        strategy_ = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5)
     elif strategy == "colossalai_zero2_cpu":
         strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:
diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py
index f25cd725a633..c1ec1a02b6a9 100644
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ b/applications/Chat/coati/trainer/strategies/base.py
@@ -111,7 +111,7 @@ def unwrap_model(model: nn.Module) -> nn.Module:
         return model

     def save_model(self, model: nn.Module, path: str, only_rank0: bool = True, **kwargs) -> None:
-        self.booster.save_model(model, path, shard=False, **kwargs)
+        self.booster.save_model(model, path, shard=True, **kwargs)

     def load_model(self, model: nn.Module, path: str, strict: bool = True) -> None:
         self.booster.load_model(model, path, strict)
diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index 0e3258304107..7129edb060ef 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -129,6 +129,9 @@ def __init__(
         seed: int = 42,
         shard_init: bool = False,  # only for stage 3
         placement_policy: str = "auto",
+        shard_param_frac: float = 1.0,  # only for static placement
+        offload_optim_frac: float = 0.0,  # only for static placement
+        offload_param_frac: float = 0.0,  # only for static placement
         pin_memory: bool = True,  # only for stage 3
         force_outputs_fp32: bool = False,  # only for stage 3
         search_range_m: int = 32,  # only for stage 3
@@ -159,9 +162,9 @@ def __init__(
         plugin_initializer = lambda: GeminiPlugin(
             chunk_init_device=get_current_device(),
             placement_policy=placement_policy,
-            shard_param_frac=1.0,
-            offload_optim_frac=1.0,
-            offload_param_frac=1.0,
+            shard_param_frac=shard_param_frac,
+            offload_optim_frac=offload_optim_frac,
+            offload_param_frac=offload_param_frac,
             precision="fp16",
             pin_memory=pin_memory,
             force_outputs_fp32=force_outputs_fp32,
@@ -193,6 +196,5 @@ def model_init_context(self):
         return super().model_init_context()

     def unwrap_model(self, model: nn.Module) -> nn.Module:
-        ddp_model = model.unwrap()
-        assert isinstance(ddp_model, GeminiDDP)
-        return ddp_model.module
+        assert isinstance(model, GeminiDDP)
+        return model.module
diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index 66ff6703da4d..b9be24b3e7b7 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -98,18 +98,18 @@ def save_pretrained(
             pretrained_model.save_pretrained(path, save_function=lambda *args, **kwargs: None)
             if tokenizer is not None:
                 tokenizer.save_pretrained(path)
-        model_path = os.path.join(path, "pytorch_model.bin")
-        self.save_model(model, model_path, only_rank0=only_rank0)
+        # model_path = os.path.join(path, "pytorch_model.bin")
+        self.save_model(model, path, only_rank0=only_rank0)

-        def _replace_keys(model_path: str, replace_fn: Callable):
-            state_dict = torch.load(model_path, map_location="cpu")
-            state_dict = {replace_fn(k): v for k, v in state_dict.items()}
-            torch.save(state_dict, model_path)
+        # def _replace_keys(model_path: str, replace_fn: Callable):
+        #     state_dict = torch.load(model_path, map_location="cpu")
+        #     state_dict = {replace_fn(k): v for k, v in state_dict.items()}
+        #     torch.save(state_dict, model_path)

         # FIXME: save_model would add "model." prefix to keys of pytorch_model.bin
         # HACK: rename keys of pytorch_model.bin
-        if dist.get_rank() == 0:
-            _replace_keys(model_path, lambda k: k.replace("model.", "", 1))
+        # if dist.get_rank() == 0:
+        #     _replace_keys(model_path, lambda k: k.replace("model.", "", 1))

     def get_model_state_dict_shard(self, model: nn.Module, **config):
         # TODO: implement sharding on naive strategy
diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py
index bbac91477085..99a024f1463c 100644
--- a/applications/Chat/examples/community/peft/train_peft_prompts.py
+++ b/applications/Chat/examples/community/peft/train_peft_prompts.py
@@ -24,7 +24,7 @@ def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5)
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:
diff --git a/applications/Chat/examples/community/peft/train_peft_sft.py b/applications/Chat/examples/community/peft/train_peft_sft.py
index 091e85c1378b..3bbef7208374 100644
--- a/applications/Chat/examples/community/peft/train_peft_sft.py
+++ b/applications/Chat/examples/community/peft/train_peft_sft.py
@@ -24,7 +24,7 @@ def train(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="auto")
+        strategy = GeminiStrategy(placement_policy="static")
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     else:
diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index 29063295a0c2..f04edc030ce0 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -23,7 +23,7 @@ def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     else:

From 3899f734b60e55957fa6ee0407b7854424b265bd Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 25 Sep 2023 23:50:38 +0800
Subject: [PATCH 18/22] fix

---
 applications/Chat/coati/models/base/actor.py |  1 +
 .../Chat/coati/trainer/strategies/base.py    |  4 ++--
 .../coati/trainer/strategies/colossalai.py   |  1 +
 .../Chat/coati/trainer/strategies/ddp.py     | 19 ++++-----------
 applications/Chat/examples/train_prompts.py  | 23 +++++++++++--------
 .../Chat/examples/train_reward_model.py      |  3 ++-
 applications/Chat/examples/train_sft.py      |  4 +++-
 applications/Chat/requirements-test.txt      |  2 +-
 applications/Chat/requirements.txt           |  2 +-
 applications/Chat/tests/test_inference.sh    |  3 ++-
 applications/Chat/tests/test_train.sh        | 17 ++++----------
 11 files changed, 36 insertions(+), 43 deletions(-)

diff --git a/applications/Chat/coati/models/base/actor.py b/applications/Chat/coati/models/base/actor.py
index 0634631df7a3..8b2b81ed071c 100644
--- a/applications/Chat/coati/models/base/actor.py
+++ b/applications/Chat/coati/models/base/actor.py
@@ -30,3 +30,4 @@ def forward(
         """Returns model output."""
         output = self.model(input_ids, attention_mask=attention_mask, **model_kwargs)
         return output
+
diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py
index c1ec1a02b6a9..51c0b5fa6526 100644
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ b/applications/Chat/coati/trainer/strategies/base.py
@@ -110,8 +110,8 @@ def unwrap_model(model: nn.Module) -> nn.Module:
         """
         return model

-    def save_model(self, model: nn.Module, path: str, only_rank0: bool = True, **kwargs) -> None:
-        self.booster.save_model(model, path, shard=True, **kwargs)
+    def save_model(self, model: nn.Module, path: str, shard: bool = True, **kwargs) -> None:
+        self.booster.save_model(model, path, shard=shard, **kwargs)

     def load_model(self, model: nn.Module, path: str, strict: bool = True) -> None:
         self.booster.load_model(model, path, strict)
diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index 7129edb060ef..b1257a07cc88 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -196,5 +196,6 @@ def model_init_context(self):
         return super().model_init_context()

     def unwrap_model(self, model: nn.Module) -> nn.Module:
+        model = model.unwrap()
         assert isinstance(model, GeminiDDP)
         return model.module
diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index b9be24b3e7b7..4ce0d79fc124 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -87,9 +87,9 @@ def unwrap_model(self, model: nn.Module) -> nn.Module:
         return model.unwrap()

     def save_pretrained(
-        self, model: nn.Module, path: str, only_rank0: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
+        self, model: nn.Module, path: str, shard: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
     ) -> None:
-        if not only_rank0 or dist.get_rank() == 0:
+        if dist.get_rank() == 0:
             unwrapped_model = self.unwrap_model(model)
             assert isinstance(unwrapped_model, (Actor, Critic, RewardModel))
             pretrained_model = unwrapped_model.model
@@ -98,18 +98,9 @@ def save_pretrained(
             pretrained_model.save_pretrained(path, save_function=lambda *args, **kwargs: None)
             if tokenizer is not None:
                 tokenizer.save_pretrained(path)
-        # model_path = os.path.join(path, "pytorch_model.bin")
-        self.save_model(model, path, only_rank0=only_rank0)
-
-        # def _replace_keys(model_path: str, replace_fn: Callable):
-        #     state_dict = torch.load(model_path, map_location="cpu")
-        #     state_dict = {replace_fn(k): v for k, v in state_dict.items()}
-        #     torch.save(state_dict, model_path)
-
-        # FIXME: save_model would add "model." prefix to keys of pytorch_model.bin
-        # HACK: rename keys of pytorch_model.bin
-        # if dist.get_rank() == 0:
-        #     _replace_keys(model_path, lambda k: k.replace("model.", "", 1))
+
+        self.save_model(model, path, shard=shard)
+

     def get_model_state_dict_shard(self, model: nn.Module, **config):
         # TODO: implement sharding on naive strategy
diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index f04edc030ce0..81343ad8f6f4 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -23,7 +23,7 @@ def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     else:
@@ -40,13 +40,13 @@ def main(args):
     with strategy.model_init_context():
         # configure model
         if args.model == "gpt2":
-            initial_model = GPTActor(pretrained=args.pretrain)
+            initial_model = GPTActor()
         elif args.model == "bloom":
-            initial_model = BLOOMActor(pretrained=args.pretrain)
+            initial_model = BLOOMActor()
         elif args.model == "opt":
-            initial_model = OPTActor(pretrained=args.pretrain)
+            initial_model = OPTActor()
         elif args.model == "llama":
-            initial_model = LlamaActor(pretrained=args.pretrain)
+            initial_model = LlamaActor()
         else:
             raise ValueError(f'Unsupported actor model "{args.model}"')

@@ -73,13 +73,13 @@ def main(args):
         reward_model.to(torch.bfloat16).to(torch.cuda.current_device())

         if args.model == "gpt2":
-            actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+            actor = GPTActor(lora_rank=args.lora_rank)
         elif args.model == "bloom":
-            actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+            actor = BLOOMActor(lora_rank=args.lora_rank)
         elif args.model == "opt":
-            actor = OPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+            actor = OPTActor(lora_rank=args.lora_rank)
         elif args.model == "llama":
-            actor = LlamaActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+            actor = LlamaActor(lora_rank=args.lora_rank)
         else:
             raise ValueError(f'Unsupported actor model "{args.model}"')

@@ -165,6 +165,9 @@ def main(args):
         (actor, actor_optim), (critic, critic_optim), reward_model, initial_model
     )

+    strategy.load_model(initial_model, args.pretrain)
+    strategy.load_model(actor, args.pretrain)
+
     # configure trainer
     trainer = PPOTrainer(
         strategy,
@@ -197,7 +200,7 @@ def main(args):
     )

     # save model checkpoint after fitting
-    strategy.save_model(actor, args.save_path, only_rank0=True)
+    strategy.save_model(actor, args.save_path)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:
         strategy.save_optimizer(
diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py
index 6d14e42a293f..4f2a68905c5c 100644
--- a/applications/Chat/examples/train_reward_model.py
+++ b/applications/Chat/examples/train_reward_model.py
@@ -163,7 +163,8 @@ def train(args):
         use_wandb=args.use_wandb,
     )
     # save model checkpoint after fitting on only rank0
-    strategy.save_model(model, args.save_path, only_rank0=True)
+    state_dict = model.state_dict()
+    torch.save(state_dict, args.save_path)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:
         strategy.save_optimizer(
diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index bd060301998c..e93a3e523d47 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -179,7 +179,9 @@ def train(args):
     )

     # save model checkpoint after fitting on only rank0
-    strategy.save_pretrained(model, path=args.save_path, only_rank0=True, tokenizer=tokenizer)
+    print(type(model))
+    # print(model)
+    strategy.save_pretrained(model, path=args.save_path, tokenizer=tokenizer)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:
         strategy.save_optimizer(
diff --git a/applications/Chat/requirements-test.txt b/applications/Chat/requirements-test.txt
index c688935bda31..93d48bcb6f79 100644
--- a/applications/Chat/requirements-test.txt
+++ b/applications/Chat/requirements-test.txt
@@ -1,2 +1,2 @@
 pytest
-colossalai==0.3.2
+colossalai==0.3.3
diff --git a/applications/Chat/requirements.txt b/applications/Chat/requirements.txt
index 0b1ee1785fa1..e56aaca0e7cb 100644
--- a/applications/Chat/requirements.txt
+++ b/applications/Chat/requirements.txt
@@ -2,7 +2,7 @@ transformers>=4.20.1
 tqdm
 datasets
 loralib
-colossalai==0.3.2
+colossalai==0.3.3
 torch<2.0.0, >=1.12.1
 langchain
 tokenizers
diff --git a/applications/Chat/tests/test_inference.sh b/applications/Chat/tests/test_inference.sh
index 849db06e58ab..7f4a475320ff 100755
--- a/applications/Chat/tests/test_inference.sh
+++ b/applications/Chat/tests/test_inference.sh
@@ -1,6 +1,7 @@
 set -xue

-BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
+#BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
+BASE_DIR="/home/lcjmy/vepfs/ColossalAI/applications/Chat"
 EXAMPLES_DIR=$BASE_DIR/examples

 echo "[Test]: testing inference ..."
diff --git a/applications/Chat/tests/test_train.sh b/applications/Chat/tests/test_train.sh
index 1a470a7fcfcb..7a01acd1755c 100755
--- a/applications/Chat/tests/test_train.sh
+++ b/applications/Chat/tests/test_train.sh
@@ -80,11 +80,10 @@ SKIPPED_TESTS=(
     "llama-ddp"
     "llama-colossalai_gemini"
     "llama-colossalai_zero2"
-    "bloom-colossalai_zero2-4"
 )

 GRAD_CKPTS=('' '--grad_checkpoint')
-for lora_rank in '0' '4'; do
+for lora_rank in '0'; do
     for model in ${MODELS[@]}; do
         strategies=($(shuf -e "${STRATEGIES[@]}"))
         for strategy in ${strategies[@]}; do
@@ -133,14 +132,11 @@ SKIPPED_TESTS=(
     "llama-ddp"
     "llama-colossalai_gemini"
    "llama-colossalai_zero2"
-    "gpt2-colossalai_gemini"
-    "opt-colossalai_gemini"
-    "bloom-colossalai_gemini"
 )

 LOSS_FNS=('log_sig' 'log_exp')
 DATASETS=('Anthropic/hh-rlhf' 'Dahoas/rm-static')
-for lora_rank in '0' '4'; do
+for lora_rank in '0'; do
     for model in ${MODELS[@]}; do
         strategies=($(shuf -e "${STRATEGIES[@]}"))
         for strategy in ${strategies[@]}; do
@@ -191,13 +187,10 @@ SKIPPED_TESTS=(
     "llama-ddp"
     "llama-colossalai_gemini"
     "llama-colossalai_zero2"
-    "gpt2-colossalai_gemini"
-    "opt-colossalai_gemini"
-    "bloom-colossalai_gemini"
 )

 for model in ${MODELS[@]}; do
-    for lora_rank in '0' '4'; do
+    for lora_rank in '0'; do
         strategies=($(shuf -e "${STRATEGIES[@]}"))
         for strategy in ${strategies[@]}; do
             if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$strategy-$lora_rank " ]]; then
@@ -221,7 +214,7 @@ for model in ${MODELS[@]}; do
             --experience_batch_size 2 --train_batch_size 1 --lora_rank $lora_rank \
             --pretrain $EXAMPLES_DIR/rlhf_models/sft_ckpt_${model}_${lora_rank} \
             $rm_pretrain_model --rm_path $EXAMPLES_DIR/rlhf_models/rm_ckpt_${model}_${lora_rank}.pt \
-            --save_path $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts.pt
+            --save_path $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts
             passed=$?
             if [ $passed -eq 0 ]; then
                 break
@@ -236,4 +229,4 @@ for model in ${MODELS[@]}; do
         rm $EXAMPLES_DIR/rlhf_models/rm_ckpt_${model}_${lora_rank}.pt
     done
 done
-rm $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts.pt
+rm -rf $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts

From ecb821da65929d4552e19fb3f38a4f917a31a123 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Tue, 26 Sep 2023 00:08:31 +0800
Subject: [PATCH 19/22] fix

---
 .../coati/trainer/strategies/colossalai.py  |  1 -
 .../Chat/coati/trainer/strategies/ddp.py    | 18 +++++++++---------
 applications/Chat/examples/requirements.txt |  2 +-
 applications/Chat/examples/train_sft.py     |  2 --
 applications/Chat/tests/test_inference.sh   |  3 +--
 5 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index b1257a07cc88..7129edb060ef 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -196,6 +196,5 @@ def model_init_context(self):
         return super().model_init_context()

     def unwrap_model(self, model: nn.Module) -> nn.Module:
-        model = model.unwrap()
         assert isinstance(model, GeminiDDP)
         return model.module
diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index 4ce0d79fc124..2fb2a4fa7501 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -89,15 +89,15 @@ def unwrap_model(self, model: nn.Module) -> nn.Module:
     def save_pretrained(
         self, model: nn.Module, path: str, shard: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
     ) -> None:
-        if dist.get_rank() == 0:
-            unwrapped_model = self.unwrap_model(model)
-            assert isinstance(unwrapped_model, (Actor, Critic, RewardModel))
-            pretrained_model = unwrapped_model.model
-            assert isinstance(pretrained_model, PreTrainedModel)
-            # HACK: only use hf save_pretrained to save config
-            pretrained_model.save_pretrained(path, save_function=lambda *args, **kwargs: None)
-            if tokenizer is not None:
-                tokenizer.save_pretrained(path)
+        # if dist.get_rank() == 0:
+        #     unwrapped_model = self.unwrap_model(model)
+        #     assert isinstance(unwrapped_model, (Actor, Critic, RewardModel))
+        #     pretrained_model = unwrapped_model.model
+        #     assert isinstance(pretrained_model, PreTrainedModel)
+        #     # HACK: only use hf save_pretrained to save config
+        #     pretrained_model.save_pretrained(path, save_function=lambda *args, **kwargs: None)
+        #     if tokenizer is not None:
+        #         tokenizer.save_pretrained(path)

         self.save_model(model, path, shard=shard)

diff --git a/applications/Chat/examples/requirements.txt b/applications/Chat/examples/requirements.txt
index 0890917048d2..5474dfa16b3e 100644
--- a/applications/Chat/examples/requirements.txt
+++ b/applications/Chat/examples/requirements.txt
@@ -1,3 +1,3 @@
 pandas>=1.4.1
 sentencepiece
-colossalai==0.3.2
+colossalai==0.3.3
diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index e93a3e523d47..137a8267fde4 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -179,8 +179,6 @@ def train(args):
     )

     # save model checkpoint after fitting on only rank0
-    print(type(model))
-    # print(model)
     strategy.save_pretrained(model, path=args.save_path, tokenizer=tokenizer)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:
diff --git a/applications/Chat/tests/test_inference.sh b/applications/Chat/tests/test_inference.sh
index 7f4a475320ff..849db06e58ab 100755
--- a/applications/Chat/tests/test_inference.sh
+++ b/applications/Chat/tests/test_inference.sh
@@ -1,7 +1,6 @@
 set -xue

-#BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
-BASE_DIR="/home/lcjmy/vepfs/ColossalAI/applications/Chat"
+BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
 EXAMPLES_DIR=$BASE_DIR/examples

 echo "[Test]: testing inference ..."

From 620c44865d107e88d6e02a5ceb56a4dd113c498e Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Tue, 26 Sep 2023 00:16:14 +0800
Subject: [PATCH 20/22] fix

---
 .../Chat/coati/trainer/strategies/ddp.py    | 18 +++++++++---------
 applications/Chat/examples/train_prompts.py | 16 ++++++++--------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index 2fb2a4fa7501..4ce0d79fc124 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -89,15 +89,15 @@ def unwrap_model(self, model: nn.Module) -> nn.Module:
     def save_pretrained(
         self, model: nn.Module, path: str, shard: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
     ) -> None:
-        # if dist.get_rank() == 0:
-        #     unwrapped_model = self.unwrap_model(model)
-        #     assert isinstance(unwrapped_model, (Actor, Critic, RewardModel))
-        #     pretrained_model = unwrapped_model.model
-        #     assert isinstance(pretrained_model, PreTrainedModel)
-        #     # HACK: only use hf save_pretrained to save config
-        #     pretrained_model.save_pretrained(path, save_function=lambda *args, **kwargs: None)
-        #     if tokenizer is not None:
-        #         tokenizer.save_pretrained(path)
+        if dist.get_rank() == 0:
+            unwrapped_model = self.unwrap_model(model)
+            assert isinstance(unwrapped_model, (Actor, Critic, RewardModel))
+            pretrained_model = unwrapped_model.model
+            assert isinstance(pretrained_model, PreTrainedModel)
+            # HACK: only use hf save_pretrained to save config
+            pretrained_model.save_pretrained(path, save_function=lambda *args, **kwargs: None)
+            if tokenizer is not None:
+                tokenizer.save_pretrained(path)

         self.save_model(model, path, shard=shard)

diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index 81343ad8f6f4..6624f9cebffd 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -40,13 +40,13 @@ def main(args):
     with strategy.model_init_context():
         # configure model
         if args.model == "gpt2":
-            initial_model = GPTActor()
+            initial_model = GPTActor(pretrained=args.pretrain)
         elif args.model == "bloom":
-            initial_model = BLOOMActor()
+            initial_model = BLOOMActor(pretrained=args.pretrain)
         elif args.model == "opt":
-            initial_model = OPTActor()
+            initial_model = OPTActor(pretrained=args.pretrain)
         elif args.model == "llama":
-            initial_model = LlamaActor()
+            initial_model = LlamaActor(pretrained=args.pretrain)
         else:
             raise ValueError(f'Unsupported actor model "{args.model}"')

@@ -73,13 +73,13 @@ def main(args):
         reward_model.to(torch.bfloat16).to(torch.cuda.current_device())

         if args.model == "gpt2":
-            actor = GPTActor(lora_rank=args.lora_rank)
+            actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
         elif args.model == "bloom":
-            actor = BLOOMActor(lora_rank=args.lora_rank)
+            actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
         elif args.model == "opt":
-            actor = OPTActor(lora_rank=args.lora_rank)
+            actor = OPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
         elif args.model == "llama":
-            actor = LlamaActor(lora_rank=args.lora_rank)
+            actor = LlamaActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
         else:
             raise ValueError(f'Unsupported actor model "{args.model}"')

From 3c6c7172de7bb1156bf81bb276fb4491d4322c41 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Tue, 26 Sep 2023 18:52:03 +0800
Subject: [PATCH 21/22] fix

---
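Note on the _replace_keys hack this patch restores below: booster.save_model() serializes the wrapped Actor, so every key in pytorch_model.bin gains a leading "model." prefix that from_pretrained() would not expect. A toy illustration of the rename the hook performs (illustrative only, using a made-up state dict; the lambda is the one from the diff):

    import torch

    state_dict = {"model.transformer.wte.weight": torch.zeros(1)}  # toy checkpoint
    # Same transform as _replace_keys: strip only the first "model." occurrence.
    fixed = {k.replace("model.", "", 1): v for k, v in state_dict.items()}
    assert "transformer.wte.weight" in fixed
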
 applications/Chat/coati/trainer/strategies/base.py |  2 +-
 applications/Chat/coati/trainer/strategies/ddp.py  | 13 +++++++++++--
 applications/Chat/examples/train_prompts.py        |  7 ++++---
 applications/Chat/tests/test_checkpoint.py         |  4 ++--
 applications/Chat/tests/test_train.sh              |  1 +
 colossalai/zero/gemini/gemini_ddp.py               |  3 ++-
 6 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py
index 51c0b5fa6526..a78716216ae0 100644
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ b/applications/Chat/coati/trainer/strategies/base.py
@@ -110,7 +110,7 @@ def unwrap_model(model: nn.Module) -> nn.Module:
         """
         return model

-    def save_model(self, model: nn.Module, path: str, shard: bool = True, **kwargs) -> None:
+    def save_model(self, model: nn.Module, path: str, shard: bool = False, **kwargs) -> None:
         self.booster.save_model(model, path, shard=shard, **kwargs)

     def load_model(self, model: nn.Module, path: str, strict: bool = True) -> None:
         self.booster.load_model(model, path, strict)
diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index 4ce0d79fc124..f2a44aeb0961 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -87,7 +87,7 @@ def unwrap_model(self, model: nn.Module) -> nn.Module:
         return model.unwrap()

     def save_pretrained(
-        self, model: nn.Module, path: str, shard: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
+        self, model: nn.Module, path: str, shard: bool = False, tokenizer: Optional[PreTrainedTokenizerBase] = None
     ) -> None:
         if dist.get_rank() == 0:
             unwrapped_model = self.unwrap_model(model)
@@ -99,7 +99,16 @@ def save_pretrained(
             if tokenizer is not None:
                 tokenizer.save_pretrained(path)

-        self.save_model(model, path, shard=shard)
+        model_path = os.path.join(path, "pytorch_model.bin")
+        self.save_model(model, model_path, shard=shard)
+        def _replace_keys(model_path: str, replace_fn: Callable):
+            state_dict = torch.load(model_path, map_location="cpu")
+            state_dict = {replace_fn(k): v for k, v in state_dict.items()}
+            torch.save(state_dict, model_path)
+        # FIXME: save_model would add "model." prefix to keys of pytorch_model.bin
+        # HACK: rename keys of pytorch_model.bin
+        if dist.get_rank() == 0:
+            _replace_keys(model_path, lambda k: k.replace("model.", "", 1))

     def get_model_state_dict_shard(self, model: nn.Module, **config):
diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index 6624f9cebffd..37146a7c5f68 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -74,6 +74,7 @@ def main(args):
         if args.model == "gpt2":
             actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+            # actor = GPTActor.from_pretrained(args.pretrain)
         elif args.model == "bloom":
             actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
         elif args.model == "opt":
@@ -165,8 +166,8 @@ def main(args):
         (actor, actor_optim), (critic, critic_optim), reward_model, initial_model
     )

-    strategy.load_model(initial_model, args.pretrain)
-    strategy.load_model(actor, args.pretrain)
+    # strategy.load_model(initial_model, args.pretrain)
+    # strategy.load_model(actor, args.pretrain)

     # configure trainer
     trainer = PPOTrainer(
         strategy,
@@ -197,7 +200,7 @@ def main(args):
     )

     # save model checkpoint after fitting
-    strategy.save_model(actor, args.save_path)
+    strategy.save_pretrained(actor, path=args.save_path)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:
         strategy.save_optimizer(
diff --git a/applications/Chat/tests/test_checkpoint.py b/applications/Chat/tests/test_checkpoint.py
index 9dfaa7c88206..9c08aa36c9b4 100644
--- a/applications/Chat/tests/test_checkpoint.py
+++ b/applications/Chat/tests/test_checkpoint.py
@@ -57,9 +57,9 @@ def run_test_checkpoint(strategy_name: str, shard: bool):
         rank0_dirname = rank0_dirname[0]

         model_path = os.path.join(rank0_dirname, "model" if shard else f"model.pt")
-        strategy.save_model(actor, model_path, only_rank0=not shard)
+        strategy.save_model(actor, model_path)
         optim_path = os.path.join(rank0_dirname, "optim" if shard else "optim.pt")
-        strategy.save_optimizer(actor_optim, optim_path, only_rank0=not shard)
+        strategy.save_optimizer(actor_optim, optim_path)
         dist.barrier()

         strategy.load_model(actor, model_path, strict=False)
diff --git a/applications/Chat/tests/test_train.sh b/applications/Chat/tests/test_train.sh
index 7a01acd1755c..68fca7fbf8c0 100755
--- a/applications/Chat/tests/test_train.sh
+++ b/applications/Chat/tests/test_train.sh
@@ -41,6 +41,7 @@ MODELS_DIR=$BASE_DIR/examples/models_config
 MODELS=('gpt2' 'bloom' 'opt' 'llama')
 STRATEGIES=('ddp' 'colossalai_gemini' 'colossalai_zero2')

+
 export OMP_NUM_THREADS=8

 # install requirements
diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py
index 8b149a65497a..580b497ce719 100644
--- a/colossalai/zero/gemini/gemini_ddp.py
+++ b/colossalai/zero/gemini/gemini_ddp.py
@@ -238,7 +238,8 @@ def forward(self, *args, **kwargs):
         # check whether we are in a inference mode
         grad_flag = torch.is_grad_enabled()
         if not grad_flag:
-            assert not self.gemini_manager.need_warmup or not self.gemini_manager.is_warmup(
+            assert (
+                not self.gemini_manager.need_warmup or not self.gemini_manager.is_warmup()
             ), "You should run a completed iteration as your warmup iter"

         args, kwargs = _cast_float(args, self.mixed_precision), _cast_float(kwargs, self.mixed_precision)

From f3201757c4e0a2661cfb297f9df30ea76e07e755 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Wed, 27 Sep 2023 10:52:54 +0800
Subject: [PATCH 22/22] Update train_prompts.py

---
 applications/Chat/examples/train_prompts.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index 37146a7c5f68..ecaf794f6333 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -74,7 +74,6 @@ def main(args):
         if args.model == "gpt2":
             actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
-            # actor = GPTActor.from_pretrained(args.pretrain)
         elif args.model == "bloom":
             actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
         elif args.model == "opt":
@@ -165,9 +165,6 @@ def main(args):
         (actor, actor_optim), (critic, critic_optim), reward_model, initial_model
     )

-    # strategy.load_model(initial_model, args.pretrain)
-    # strategy.load_model(actor, args.pretrain)
-
     # configure trainer
     trainer = PPOTrainer(
         strategy,