From 452431f33a64ef1d063c27fbb03370f6f9958646 Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Tue, 12 Sep 2023 21:58:01 +0800
Subject: [PATCH 01/22] [chat] fix gemini strategy

---
 .../coati/trainer/strategies/colossalai.py  | 117 ++++++++----------
 applications/Chat/examples/train_sft.py     |  34 ++---
 applications/Chat/examples/train_sft.sh     |  40 +++---
 applications/Chat/requirements.txt          |   2 +-
 applications/Chat/tests/test_checkpoint.py  |  31 ++---
 5 files changed, 94 insertions(+), 130 deletions(-)

diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index fa55f97ad661..d65491ee8589 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -7,11 +7,12 @@
 import colossalai
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
-from colossalai.booster.plugin.gemini_plugin import GeminiModel
+# from colossalai.booster.plugin.gemini_plugin import GeminiModel
 from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
+# from colossalai.zero import ColoInitContext
+from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.tensor import ProcessGroup, ShardSpec
 from colossalai.utils import get_current_device
-from colossalai.zero import ColoInitContext
 from colossalai.zero.gemini.gemini_ddp import GeminiDDP
 
 from .ddp import DDPStrategy
@@ -42,37 +43,37 @@ class LowLevelZeroStrategy(DDPStrategy):
     """
 
-    def __init__(self,
-                 stage: int = 2,
-                 precision: str = 'fp16',
-                 seed: int = 42,
-                 placement_policy: str = 'cuda',
-                 reduce_bucket_size: int = 12 * 1024**2,    # only for stage 1&2
-                 overlap_communication: bool = True,    # only for stage 1&2
-                 initial_scale: float = 2**16,
-                 growth_factor: float = 2,
-                 backoff_factor: float = 0.5,
-                 growth_interval: int = 1000,
-                 hysteresis: int = 2,
-                 min_scale: float = 1,
-                 max_scale: float = 2**32,
-                 max_norm: float = 0.0,
-                 norm_type: float = 2.0
-                 ) -> None:
+    def __init__(
+            self,
+            stage: int = 2,
+            precision: str = 'fp16',
+            seed: int = 42,
+            placement_policy: str = 'cuda',
+            reduce_bucket_size: int = 12 * 1024**2,    # only for stage 1&2
+            overlap_communication: bool = True,    # only for stage 1&2
+            initial_scale: float = 2**16,
+            growth_factor: float = 2,
+            backoff_factor: float = 0.5,
+            growth_interval: int = 1000,
+            hysteresis: int = 2,
+            min_scale: float = 1,
+            max_scale: float = 2**32,
+            max_norm: float = 0.0,
+            norm_type: float = 2.0) -> None:
         assert stage in (1, 2), f'Unsupported stage "{stage}"'
         assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"'
         assert precision in ('fp32', 'fp16'), f'Unsupported precision "{precision}"'
 
         plugin_initializer = lambda: LowLevelZeroPlugin(
-            # zero_config
+            # zero_config
             stage=stage,
             precision=precision,
-            # zero_optim_config
+            # zero_optim_config
             reduce_bucket_size_in_m=reduce_bucket_size,
             overlap_communication=overlap_communication,
             cpu_offload=(placement_policy == 'cpu'),
-            # optim_config
+            # optim_config
             initial_scale=initial_scale,
             growth_factor=growth_factor,
             backoff_factor=backoff_factor,
@@ -81,8 +82,7 @@ def __init__(self,
             min_scale=min_scale,
             max_scale=max_scale,
             max_norm=max_norm,
-            norm_type=norm_type
-        )
+            norm_type=norm_type)
 
         super().__init__(seed, plugin_initializer)
 
@@ -131,43 +131,39 @@ class GeminiStrategy(DDPStrategy):
     """
 
-    def __init__(self,
-                 seed: int = 42,
-                 shard_init: bool = False,    # only for stage 3
-                 placement_policy: str = 'cuda',
-                 pin_memory: bool = True,    # only for stage 3
-                 force_outputs_fp32: bool = False,    # only for stage 3
-                 search_range_m: int = 32,    # only for stage 3
-                 hidden_dim: Optional[int] = None,    # only for stage 3
-                 min_chunk_size_m: float = 32,    # only for stage 3
-                 gpu_margin_mem_ratio: float = 0.0,    # only for stage 3
-                 initial_scale: float = 2**16,
-                 growth_factor: float = 2,
-                 backoff_factor: float = 0.5,
-                 growth_interval: int = 1000,
-                 hysteresis: int = 2,
-                 min_scale: float = 1,
-                 max_scale: float = 2**32,
-                 max_norm: float = 0.0,
-                 norm_type: float = 2.0
-                 ) -> None:
-
-        assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"'
+    def __init__(
+            self,
+            seed: int = 42,
+            shard_init: bool = False,    # only for stage 3
+            placement_policy: str = 'auto',
+            pin_memory: bool = True,    # only for stage 3
+            force_outputs_fp32: bool = False,    # only for stage 3
+            search_range_m: int = 32,    # only for stage 3
+            hidden_dim: Optional[int] = None,    # only for stage 3
+            min_chunk_size_m: float = 32,    # only for stage 3
+            gpu_margin_mem_ratio: float = 0.0,    # only for stage 3
+            initial_scale: float = 2**16,
+            growth_factor: float = 2,
+            backoff_factor: float = 0.5,
+            growth_interval: int = 1000,
+            hysteresis: int = 2,
+            min_scale: float = 1,
+            max_scale: float = 2**32,
+            max_norm: float = 0.0,
+            norm_type: float = 2.0) -> None:
 
         # TODO(ver217): support shard_init when using from_pretrained()
         if shard_init:
-            warnings.warn(
-                f'Shard init is not supported model.from_pretrained() yet. '
-                'Please load weights after strategy.prepare()'
-            )
+            warnings.warn(f'Shard init is not supported model.from_pretrained() yet. '
                          'Please load weights after strategy.prepare()')
         self.shard_init = shard_init
 
         warnings.warn(f'Stage 3 only supports fp16. Precision is set to fp16.')
 
         # NOTE: dist should be initialized before calling get_current_device()
         plugin_initializer = lambda: GeminiPlugin(
-            # gemini_config
-            device=get_current_device(),
+            # gemini_config
+            chunk_init_device=get_current_device(),
             placement_policy=placement_policy,
             precision='fp16',
             pin_memory=pin_memory,
@@ -176,9 +172,9 @@ def __init__(self,
             search_range_m=search_range_m,
             hidden_dim=hidden_dim,
             min_chunk_size_m=min_chunk_size_m,
-            # zero_optim_config
+            # zero_optim_config
            gpu_margin_mem_ratio=gpu_margin_mem_ratio,
-            # optim_config
+            # optim_config
            initial_scale=initial_scale,
            growth_factor=growth_factor,
            backoff_factor=backoff_factor,
@@ -187,8 +183,7 @@ def __init__(self,
            min_scale=min_scale,
            max_scale=max_scale,
            max_norm=max_norm,
-            norm_type=norm_type
-        )
+            norm_type=norm_type)
 
         super().__init__(seed, plugin_initializer)
 
@@ -200,16 +195,10 @@ def setup_distributed(self) -> None:
         colossalai.launch_from_torch({}, seed=self.seed)
 
     def model_init_context(self):
-        world_size = dist.get_world_size()
-        shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
-        default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
-        return ColoInitContext(device=get_current_device(),
-                               dtype=torch.half,
-                               default_pg=shard_pg,
-                               default_dist_spec=default_dist_spec)
+        return super().model_init_context()
 
     def unwrap_model(self, model: nn.Module) -> nn.Module:
-        assert isinstance(model, GeminiModel)
+        # assert isinstance(model, GeminiModel)
         ddp_model = model.unwrap()
         assert isinstance(ddp_model, GeminiDDP)
         return ddp_model.module

diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index f068ea2bf5de..dcc6b0281082 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -6,18 +6,18 @@
 import torch.distributed as dist
 from coati.dataset import SFTDataset, SupervisedDataset
 from coati.models.bloom import BLOOMActor
+from coati.models.chatglm import ChatGLMActor
+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
 from coati.models.gpt import GPTActor
 from coati.models.llama import LlamaActor
 from coati.models.opt import OPTActor
-from coati.models.chatglm import ChatGLMActor
 from coati.trainer import SFTTrainer
 from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
 from datasets import load_dataset
 from torch.optim import Adam
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
-from transformers import AutoTokenizer, BloomTokenizerFast, LlamaTokenizer, AutoModel
-from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
+from transformers import AutoModel, AutoTokenizer, BloomTokenizerFast, LlamaTokenizer
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
 from transformers.trainer import get_scheduler
 
@@ -31,7 +31,7 @@ def train(args):
     if args.strategy == 'ddp':
         strategy = DDPStrategy()
     elif args.strategy == 'colossalai_gemini':
-        strategy = GeminiStrategy(placement_policy='cuda')
+        strategy = GeminiStrategy(placement_policy='auto')
     elif args.strategy == 'colossalai_zero2':
         strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
     elif args.strategy == 'colossalai_zero2_cpu':
@@ -45,21 +45,13 @@ def train(args):
         args.grad_checkpoint = False
     with strategy.model_init_context():
         if args.model == 'bloom':
-            model = BLOOMActor(pretrained=args.pretrain,
-                               lora_rank=args.lora_rank,
-                               checkpoint=args.grad_checkpoint)
+            model = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
         elif args.model == 'opt':
-            model = OPTActor(pretrained=args.pretrain,
-                             lora_rank=args.lora_rank,
-                             checkpoint=args.grad_checkpoint)
+            model = OPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
         elif args.model == 'gpt2':
-            model = GPTActor(pretrained=args.pretrain,
-                             lora_rank=args.lora_rank,
-                             checkpoint=args.grad_checkpoint)
+            model = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
         elif args.model == 'llama':
-            model = LlamaActor(pretrained=args.pretrain,
-                               lora_rank=args.lora_rank,
-                               checkpoint=args.grad_checkpoint)
+            model = LlamaActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
         elif args.model == 'chatglm':
             model = ChatGLMActor(pretrained=args.pretrain)
         else:
@@ -69,16 +61,14 @@ def train(args):
 
     # configure tokenizer
     if args.model == 'gpt2':
-        tokenizer = GPT2Tokenizer.from_pretrained(
-            'gpt2' if args.tokenizer is None else args.tokenizer)
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2' if args.tokenizer is None else args.tokenizer)
         tokenizer.pad_token = tokenizer.eos_token
     elif args.model == 'bloom':
         tokenizer = BloomTokenizerFast.from_pretrained(
             'bigscience/bloom-560m' if args.tokenizer is None else args.tokenizer)
         tokenizer.pad_token = tokenizer.eos_token
     elif args.model == 'opt':
-        tokenizer = AutoTokenizer.from_pretrained(
-            "facebook/opt-350m" if args.tokenizer is None else args.tokenizer)
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m" if args.tokenizer is None else args.tokenizer)
         tokenizer.pad_token = tokenizer.eos_token
     elif args.model == 'llama':
         tokenizer = LlamaTokenizer.from_pretrained(
@@ -86,8 +76,8 @@ def train(args):
         tokenizer.eos_token = '<\s>'
         tokenizer.pad_token = tokenizer.unk_token
     elif args.model == 'chatglm':
-        tokenizer = ChatGLMTokenizer.from_pretrained(
-            "THUDM/chatglm-6b" if args.tokenizer is None else args.tokenizer, trust_remote_code=True)
+        tokenizer = ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b" if args.tokenizer is None else args.tokenizer,
+                                                     trust_remote_code=True)
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
 
diff --git a/applications/Chat/examples/train_sft.sh b/applications/Chat/examples/train_sft.sh
index 1a5cd069011d..b489ecd48edb 100755
--- a/applications/Chat/examples/train_sft.sh
+++ b/applications/Chat/examples/train_sft.sh
@@ -1,29 +1,29 @@
-set_n_least_used_CUDA_VISIBLE_DEVICES() {
-    local n=${1:-"9999"}
-    echo "GPU Memory Usage:"
-    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
-        tail -n +2 |
-        nl -v 0 |
-        tee /dev/tty |
-        sort -g -k 2 |
-        awk '{print $1}' |
-        head -n $n)
-    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
-    echo "Now CUDA_VISIBLE_DEVICES is set to:"
-    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-}
+# set_n_least_used_CUDA_VISIBLE_DEVICES() {
+#     local n=${1:-"9999"}
+#     echo "GPU Memory Usage:"
+#     local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+#         tail -n +2 |
+#         nl -v 0 |
+#         tee /dev/tty |
+#         sort -g -k 2 |
+#         awk '{print $1}' |
+#         head -n $n)
+#     export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+#     echo "Now CUDA_VISIBLE_DEVICES is set to:"
+#     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+# }
 
-set_n_least_used_CUDA_VISIBLE_DEVICES 4
+# set_n_least_used_CUDA_VISIBLE_DEVICES 4
 
 torchrun --standalone --nproc_per_node=4 train_sft.py \
-    --pretrain "/path/to/LLaMa-7B/" \
+    --pretrain "/home/lcjmy/data3/llama" \
     --model 'llama' \
-    --strategy colossalai_zero2 \
+    --strategy colossalai_gemini \
     --log_interval 10 \
-    --save_path /path/to/Coati-7B \
+    --save_path "/home/lcjmy/data3/output" \
-    --dataset /path/to/data.json \
+    --dataset "yizhongw/self_instruct" \
     --batch_size 4 \
-    --accumulation_steps 8 \
+    --accumulation_steps 1 \
     --lr 2e-5 \
     --max_datasets_size 512 \
     --max_epochs 1
 
diff --git a/applications/Chat/requirements.txt b/applications/Chat/requirements.txt
index e5f5ca0932a8..e78b203029a5 100644
--- a/applications/Chat/requirements.txt
+++ b/applications/Chat/requirements.txt
@@ -2,7 +2,7 @@ transformers>=4.20.1
 tqdm
 datasets
 loralib
-colossalai==0.3.1
+colossalai==0.3.2
 torch<2.0.0, >=1.12.1
 langchain
 tokenizers
 
diff --git a/applications/Chat/tests/test_checkpoint.py b/applications/Chat/tests/test_checkpoint.py
index 3a3bf5b19cb8..c8ac416d4eff 100644
--- a/applications/Chat/tests/test_checkpoint.py
+++ b/applications/Chat/tests/test_checkpoint.py
@@ -22,10 +22,7 @@ def get_data(batch_size: int, seq_len: int = 10) -> dict:
     return dict(input_ids=input_ids, attention_mask=attention_mask)
 
-def train_step(strategy: Strategy,
-               actor: GPTActor,
-               actor_optim: HybridAdam,
-               batch_size: int = 8):
+def train_step(strategy: Strategy, actor: GPTActor, actor_optim: HybridAdam, batch_size: int = 8):
     data = get_data(batch_size)
     action_mask = torch.ones_like(data["attention_mask"], dtype=torch.bool)
     actor_output = actor(data["input_ids"], data["attention_mask"])
@@ -35,12 +32,11 @@ def train_step(strategy: Strategy,
     strategy.optimizer_step(actor_optim)
 
-def run_test_checkpoint(strategy_name: str,
-                        shard: bool):
+def run_test_checkpoint(strategy_name: str, shard: bool):
     if strategy_name == "ddp":
         strategy = DDPStrategy()
     elif strategy_name == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="cuda", initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
     elif strategy_name == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     else:
@@ -60,11 +56,9 @@ def run_test_checkpoint(strategy_name: str,
         dist.broadcast_object_list(rank0_dirname)
         rank0_dirname = rank0_dirname[0]
 
-        model_path = os.path.join(
-            rank0_dirname, "model" if shard else f"model.pt")
+        model_path = os.path.join(rank0_dirname, "model" if shard else f"model.pt")
         strategy.save_model(actor, model_path, only_rank0=not shard)
-        optim_path = os.path.join(
-            rank0_dirname, "optim" if shard else "optim.pt")
+        optim_path = os.path.join(rank0_dirname, "optim" if shard else "optim.pt")
         strategy.save_optimizer(actor_optim, optim_path, only_rank0=not shard)
         dist.barrier()
 
@@ -75,11 +69,7 @@ def run_test_checkpoint(strategy_name: str,
         train_step(strategy, actor, actor_optim)
 
-def run_dist(rank: int,
-             world_size: int,
-             port: int,
-             strategy_name: str,
-             shard: bool):
+def run_dist(rank: int, world_size: int, port: int, strategy_name: str, shard: bool):
     os.environ["RANK"] = str(rank)
     os.environ["LOCAL_RANK"] = str(rank)
     os.environ["WORLD_SIZE"] = str(world_size)
@@ -93,13 +83,8 @@ def run_dist(rank: int,
 @pytest.mark.parametrize("strategy_name", ["ddp", "colossalai_gemini", "colossalai_zero2"])
 @pytest.mark.parametrize("shard", [False, True])
 @rerun_if_address_is_in_use()
-def test_checkpoint(world_size: int,
-                    strategy_name: str,
-                    shard: bool):
-    spawn(run_dist,
-          world_size,
-          strategy_name=strategy_name,
-          shard=shard)
+def test_checkpoint(world_size: int, strategy_name: str, shard: bool):
+    spawn(run_dist, world_size, strategy_name=strategy_name, shard=shard)
 
 
 if __name__ == "__main__":
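[Note: the net effect of PATCH 01 is that the chat examples select Gemini's automatic chunk placement instead of pinning everything to CUDA, against colossalai==0.3.2. A minimal usage sketch, not part of the patch series — `build_strategy` is an illustrative name, and the keyword arguments follow the diff above:

    from coati.trainer.strategies import GeminiStrategy, LowLevelZeroStrategy

    def build_strategy(name: str):
        # 'auto' lets Gemini migrate parameter chunks between CPU and GPU
        # based on runtime memory pressure; the old 'cuda' policy kept every
        # chunk resident on the GPU for the whole run.
        if name == 'colossalai_gemini':
            return GeminiStrategy(placement_policy='auto', initial_scale=2**5)
        if name == 'colossalai_zero2':
            return LowLevelZeroStrategy(stage=2, placement_policy='cuda')
        raise ValueError(f'Unsupported strategy "{name}"')
]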
strategy_name == "colossalai_gemini": - strategy = GeminiStrategy(placement_policy="cuda", initial_scale=2**5) + strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5) elif strategy_name == "colossalai_zero2": strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda") else: @@ -60,11 +56,9 @@ def run_test_checkpoint(strategy_name: str, dist.broadcast_object_list(rank0_dirname) rank0_dirname = rank0_dirname[0] - model_path = os.path.join( - rank0_dirname, "model" if shard else f"model.pt") + model_path = os.path.join(rank0_dirname, "model" if shard else f"model.pt") strategy.save_model(actor, model_path, only_rank0=not shard) - optim_path = os.path.join( - rank0_dirname, "optim" if shard else "optim.pt") + optim_path = os.path.join(rank0_dirname, "optim" if shard else "optim.pt") strategy.save_optimizer(actor_optim, optim_path, only_rank0=not shard) dist.barrier() @@ -75,11 +69,7 @@ def run_test_checkpoint(strategy_name: str, train_step(strategy, actor, actor_optim) -def run_dist(rank: int, - world_size: int, - port: int, - strategy_name: str, - shard: bool): +def run_dist(rank: int, world_size: int, port: int, strategy_name: str, shard: bool): os.environ["RANK"] = str(rank) os.environ["LOCAL_RANK"] = str(rank) os.environ["WORLD_SIZE"] = str(world_size) @@ -93,13 +83,8 @@ def run_dist(rank: int, @pytest.mark.parametrize("strategy_name", ["ddp", "colossalai_gemini", "colossalai_zero2"]) @pytest.mark.parametrize("shard", [False, True]) @rerun_if_address_is_in_use() -def test_checkpoint(world_size: int, - strategy_name: str, - shard: bool): - spawn(run_dist, - world_size, - strategy_name=strategy_name, - shard=shard) +def test_checkpoint(world_size: int, strategy_name: str, shard: bool): + spawn(run_dist, world_size, strategy_name=strategy_name, shard=shard) if __name__ == "__main__": From b0c4f28722395977d9c27a8db6d721f45ccbe6f5 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Tue, 12 Sep 2023 22:31:13 +0800 Subject: [PATCH 02/22] [chat] fix gemini strategy --- applications/Chat/requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/Chat/requirements-test.txt b/applications/Chat/requirements-test.txt index eb1a77875acb..c688935bda31 100644 --- a/applications/Chat/requirements-test.txt +++ b/applications/Chat/requirements-test.txt @@ -1,2 +1,2 @@ pytest -colossalai==0.3.1 \ No newline at end of file +colossalai==0.3.2 From 987d38d4bdb239070d5539b4f86e38a4e2698c5d Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Wed, 13 Sep 2023 14:18:16 +0800 Subject: [PATCH 03/22] [chat] fix gemini strategy --- .../coati/trainer/strategies/colossalai.py | 85 +++++++++---------- applications/Chat/examples/train_prompts.py | 46 +++++----- .../Chat/examples/train_reward_model.py | 8 +- applications/Chat/examples/train_sft.sh | 40 ++++----- colossalai/zero/gemini/colo_init_context.py | 2 +- 5 files changed, 86 insertions(+), 95 deletions(-) diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py index d65491ee8589..744fba2d134e 100644 --- a/applications/Chat/coati/trainer/strategies/colossalai.py +++ b/applications/Chat/coati/trainer/strategies/colossalai.py @@ -9,10 +9,9 @@ from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin # from colossalai.booster.plugin.gemini_plugin import GeminiModel from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel -# from colossalai.zero import 
ColoInitContext -from colossalai.lazy.lazy_init import LazyInitContext from colossalai.tensor import ProcessGroup, ShardSpec from colossalai.utils import get_current_device +from colossalai.zero import ColoInitContext from colossalai.zero.gemini.gemini_ddp import GeminiDDP from .ddp import DDPStrategy @@ -65,24 +64,20 @@ def __init__( assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"' assert precision in ('fp32', 'fp16'), f'Unsupported precision "{precision}"' - plugin_initializer = lambda: LowLevelZeroPlugin( - # zero_config - stage=stage, - precision=precision, - # zero_optim_config - reduce_bucket_size_in_m=reduce_bucket_size, - overlap_communication=overlap_communication, - cpu_offload=(placement_policy == 'cpu'), - # optim_config - initial_scale=initial_scale, - growth_factor=growth_factor, - backoff_factor=backoff_factor, - growth_interval=growth_interval, - hysteresis=hysteresis, - min_scale=min_scale, - max_scale=max_scale, - max_norm=max_norm, - norm_type=norm_type) + plugin_initializer = lambda: LowLevelZeroPlugin(stage=stage, + precision=precision, + reduce_bucket_size_in_m=reduce_bucket_size, + overlap_communication=overlap_communication, + cpu_offload=(placement_policy == 'cpu'), + initial_scale=initial_scale, + growth_factor=growth_factor, + backoff_factor=backoff_factor, + growth_interval=growth_interval, + hysteresis=hysteresis, + min_scale=min_scale, + max_scale=max_scale, + max_norm=max_norm, + norm_type=norm_type) super().__init__(seed, plugin_initializer) @@ -161,29 +156,25 @@ def __init__( warnings.warn(f'Stage 3 only supports fp16. Precision is set to fp16.') # NOTE: dist should be initialized before calling get_current_device() - plugin_initializer = lambda: GeminiPlugin( - # gemini_config - chunk_init_device=get_current_device(), - placement_policy=placement_policy, - precision='fp16', - pin_memory=pin_memory, - force_outputs_fp32=force_outputs_fp32, - strict_ddp_mode=shard_init, - search_range_m=search_range_m, - hidden_dim=hidden_dim, - min_chunk_size_m=min_chunk_size_m, - # zero_optim_config - gpu_margin_mem_ratio=gpu_margin_mem_ratio, - # optim_config - initial_scale=initial_scale, - growth_factor=growth_factor, - backoff_factor=backoff_factor, - growth_interval=growth_interval, - hysteresis=hysteresis, - min_scale=min_scale, - max_scale=max_scale, - max_norm=max_norm, - norm_type=norm_type) + plugin_initializer = lambda: GeminiPlugin(chunk_init_device=get_current_device(), + placement_policy=placement_policy, + precision='fp16', + pin_memory=pin_memory, + force_outputs_fp32=force_outputs_fp32, + strict_ddp_mode=shard_init, + search_range_m=search_range_m, + hidden_dim=hidden_dim, + min_chunk_size_m=min_chunk_size_m, + gpu_margin_mem_ratio=gpu_margin_mem_ratio, + initial_scale=initial_scale, + growth_factor=growth_factor, + backoff_factor=backoff_factor, + growth_interval=growth_interval, + hysteresis=hysteresis, + min_scale=min_scale, + max_scale=max_scale, + max_norm=max_norm, + norm_type=norm_type) super().__init__(seed, plugin_initializer) @@ -195,7 +186,13 @@ def setup_distributed(self) -> None: colossalai.launch_from_torch({}, seed=self.seed) def model_init_context(self): - return super().model_init_context() + world_size = dist.get_world_size() + shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None + default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None + return ColoInitContext(device=get_current_device(), + dtype=torch.half, + default_pg=shard_pg, + 
default_dist_spec=default_dist_spec) def unwrap_model(self, model: nn.Module) -> nn.Module: # assert isinstance(model, GeminiModel) diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py index d27a70a3fef6..ab6590568896 100644 --- a/applications/Chat/examples/train_prompts.py +++ b/applications/Chat/examples/train_prompts.py @@ -23,7 +23,7 @@ def main(args): if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': - strategy = GeminiStrategy(placement_policy='cuda', initial_scale=2**5) + strategy = GeminiStrategy(placement_policy='auto', initial_scale=2**5) elif args.strategy == 'colossalai_zero2': strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda') else: @@ -108,16 +108,14 @@ def main(args): # configure tokenizer if args.model == 'gpt2': - tokenizer = GPT2Tokenizer.from_pretrained( - 'gpt2' if args.tokenizer is None else args.tokenizer) + tokenizer = GPT2Tokenizer.from_pretrained('gpt2' if args.tokenizer is None else args.tokenizer) tokenizer.pad_token = tokenizer.eos_token elif args.model == 'bloom': tokenizer = BloomTokenizerFast.from_pretrained( 'bigscience/bloom-560m' if args.tokenizer is None else args.tokenizer) tokenizer.pad_token = tokenizer.eos_token elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained( - "facebook/opt-350m" if args.tokenizer is None else args.tokenizer) + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m" if args.tokenizer is None else args.tokenizer) tokenizer.pad_token = tokenizer.eos_token elif args.model == 'llama': tokenizer = LlamaTokenizer.from_pretrained( @@ -155,26 +153,24 @@ def main(args): strategy.prepare((actor, actor_optim), (critic, critic_optim), reward_model, initial_model) # configure trainer - trainer = PPOTrainer( - strategy, - actor, - critic, - reward_model, - initial_model, - actor_optim, - critic_optim, - kl_coef=args.kl_coef, - ptx_coef=args.ptx_coef, - train_batch_size=args.train_batch_size, - max_length=args.max_seq_len, - use_cache=True, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - offload_inference_models=args.strategy != 'colossalai_gemini' - ) + trainer = PPOTrainer(strategy, + actor, + critic, + reward_model, + initial_model, + actor_optim, + critic_optim, + kl_coef=args.kl_coef, + ptx_coef=args.ptx_coef, + train_batch_size=args.train_batch_size, + max_length=args.max_seq_len, + use_cache=True, + do_sample=True, + temperature=1.0, + top_k=50, + pad_token_id=tokenizer.pad_token_id, + eos_token_id=tokenizer.eos_token_id, + offload_inference_models=args.strategy != 'colossalai_gemini') trainer.fit(prompt_dataloader=prompt_dataloader, pretrain_dataloader=pretrain_dataloader, diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py index 190460bc20f6..57b3a71fd0a8 100644 --- a/applications/Chat/examples/train_reward_model.py +++ b/applications/Chat/examples/train_reward_model.py @@ -27,7 +27,7 @@ def train(args): if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': - strategy = GeminiStrategy(placement_policy='cuda') + strategy = GeminiStrategy(placement_policy='auto') elif args.strategy == 'colossalai_zero2': strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda') else: @@ -54,16 +54,14 @@ def train(args): # configure tokenizer if args.model == 'gpt2': - tokenizer = GPT2Tokenizer.from_pretrained( - 'gpt2' if 
args.tokenizer is None else args.tokenizer) + tokenizer = GPT2Tokenizer.from_pretrained('gpt2' if args.tokenizer is None else args.tokenizer) tokenizer.pad_token = tokenizer.eos_token elif args.model == 'bloom': tokenizer = BloomTokenizerFast.from_pretrained( 'bigscience/bloom-560m' if args.tokenizer is None else args.tokenizer) tokenizer.pad_token = tokenizer.eos_token elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained( - "facebook/opt-350m" if args.tokenizer is None else args.tokenizer) + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m" if args.tokenizer is None else args.tokenizer) tokenizer.pad_token = tokenizer.eos_token elif args.model == 'llama': tokenizer = LlamaTokenizer.from_pretrained( diff --git a/applications/Chat/examples/train_sft.sh b/applications/Chat/examples/train_sft.sh index b489ecd48edb..cbe3b7f416cf 100755 --- a/applications/Chat/examples/train_sft.sh +++ b/applications/Chat/examples/train_sft.sh @@ -1,29 +1,29 @@ -# set_n_least_used_CUDA_VISIBLE_DEVICES() { -# local n=${1:-"9999"} -# echo "GPU Memory Usage:" -# local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv | -# tail -n +2 | -# nl -v 0 | -# tee /dev/tty | -# sort -g -k 2 | -# awk '{print $1}' | -# head -n $n) -# export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g') -# echo "Now CUDA_VISIBLE_DEVICES is set to:" -# echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" -# } +set_n_least_used_CUDA_VISIBLE_DEVICES() { + local n=${1:-"9999"} + echo "GPU Memory Usage:" + local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv | + tail -n +2 | + nl -v 0 | + tee /dev/tty | + sort -g -k 2 | + awk '{print $1}' | + head -n $n) + export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g') + echo "Now CUDA_VISIBLE_DEVICES is set to:" + echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" +} -# set_n_least_used_CUDA_VISIBLE_DEVICES 4 +set_n_least_used_CUDA_VISIBLE_DEVICES 4 torchrun --standalone --nproc_per_node=4 train_sft.py \ - --pretrain "/home/lcjmy/data3/llama" \ + --pretrain "/path/to/LLaMa-7B/" \ --model 'llama' \ - --strategy colossalai_gemini \ + --strategy colossalai_zero2 \ --log_interval 10 \ - --save_path "/home/lcjmy/data3/output" \ - --dataset "yizhongw/self_instruct" \ + --save_path "/path/to/Coati-7B" \ + --dataset /path/to/data.json \ --batch_size 4 \ - --accumulation_steps 1 \ + --accumulation_steps 8 \ --lr 2e-5 \ --max_datasets_size 512 \ --max_epochs 1 diff --git a/colossalai/zero/gemini/colo_init_context.py b/colossalai/zero/gemini/colo_init_context.py index dad852a34a71..e1e35697c534 100644 --- a/colossalai/zero/gemini/colo_init_context.py +++ b/colossalai/zero/gemini/colo_init_context.py @@ -126,7 +126,7 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs): replaced_tensors[param] = colo_param delattr(submodule, param_name) setattr(submodule, param_name, colo_param) - colo_param.shared_param_modules.append(submodule) + # colo_param.shared_param_modules.append(submodule) param_number = 0 meta_param_number = 0 From 23281b15f62a043066fd8c064e35da99b83c9f1b Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Wed, 13 Sep 2023 14:25:42 +0800 Subject: [PATCH 04/22] [chat] fix gemini strategy --- applications/Chat/benchmarks/benchmark_opt_lora_dummy.py | 4 +--- applications/Chat/coati/ray/utils.py | 4 +--- .../Chat/examples/community/peft/train_peft_prompts.py | 2 +- applications/Chat/examples/community/peft/train_peft_sft.py | 6 ++---- 
.../Chat/examples/community/ray/train_prompts_on_ray.py | 6 ++---- applications/Chat/tests/test_experience.py | 2 +- 6 files changed, 8 insertions(+), 16 deletions(-) diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py index 90471ed727b0..0e5d747c4dbe 100644 --- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py +++ b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py @@ -76,9 +76,7 @@ def main(args): if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': - strategy = GeminiStrategy(placement_policy='cuda', initial_scale=2**5) - elif args.strategy == 'colossalai_gemini_cpu': - strategy = GeminiStrategy(placement_policy='cpu', initial_scale=2**5) + strategy = GeminiStrategy(placement_policy='auto', initial_scale=2**5) elif args.strategy == 'colossalai_zero2': strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda') elif args.strategy == 'colossalai_zero2_cpu': diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py index 761186b95ee5..1e661164011d 100644 --- a/applications/Chat/coati/ray/utils.py +++ b/applications/Chat/coati/ray/utils.py @@ -71,11 +71,9 @@ def get_strategy_from_args(strategy: str): if strategy == 'ddp': strategy_ = DDPStrategy() elif strategy == 'colossalai_gemini': - strategy_ = GeminiStrategy(placement_policy='cuda', initial_scale=2**5) + strategy_ = GeminiStrategy(placement_policy='auto', initial_scale=2**5) elif strategy == 'colossalai_zero2': strategy_ = LowLevelZeroStrategy(stage=2, placement_policy='cuda') - elif strategy == 'colossalai_gemini_cpu': - strategy_ = GeminiStrategy(placement_policy='cpu', initial_scale=2**5) elif strategy == 'colossalai_zero2_cpu': strategy_ = LowLevelZeroStrategy(stage=2, placement_policy='cpu') else: diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py index 9385e457d852..0021664b88a8 100644 --- a/applications/Chat/examples/community/peft/train_peft_prompts.py +++ b/applications/Chat/examples/community/peft/train_peft_prompts.py @@ -26,7 +26,7 @@ def main(args): if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': - strategy = GeminiStrategy(placement_policy='cpu', initial_scale=2**5) + strategy = GeminiStrategy(placement_policy='auto', initial_scale=2**5) elif args.strategy == 'colossalai_zero2': strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu') else: diff --git a/applications/Chat/examples/community/peft/train_peft_sft.py b/applications/Chat/examples/community/peft/train_peft_sft.py index 4af08e6d0141..8c1a315439cd 100644 --- a/applications/Chat/examples/community/peft/train_peft_sft.py +++ b/applications/Chat/examples/community/peft/train_peft_sft.py @@ -32,7 +32,7 @@ def train(args): if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': - strategy = GeminiStrategy(placement_policy='cuda') + strategy = GeminiStrategy(placement_policy='auto') elif args.strategy == 'colossalai_zero2': strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda') else: @@ -163,9 +163,7 @@ def train(args): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--strategy', - choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='ddp') + parser.add_argument('--strategy', choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='ddp') 
parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom') parser.add_argument('--pretrain', type=str, default=None) parser.add_argument('--dataset', type=str, default=None) diff --git a/applications/Chat/examples/community/ray/train_prompts_on_ray.py b/applications/Chat/examples/community/ray/train_prompts_on_ray.py index 1bba9ad66fbc..155721b83f56 100644 --- a/applications/Chat/examples/community/ray/train_prompts_on_ray.py +++ b/applications/Chat/examples/community/ray/train_prompts_on_ray.py @@ -102,7 +102,7 @@ def _init_strategy(self, strategy: str): if strategy == 'ddp': self._strategy = DDPStrategy() elif strategy == 'colossalai_gemini': - self._strategy = GeminiStrategy(placement_policy='cuda', initial_scale=2**5) + self._strategy = GeminiStrategy(placement_policy='auto', initial_scale=2**5) elif strategy == 'colossalai_zero2': self._strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda') else: @@ -531,9 +531,7 @@ def main(args): if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--prompt_csv_url', type=str) - parser.add_argument('--strategy', - choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='ddp') + parser.add_argument('--strategy', choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='ddp') parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt']) parser.add_argument('--pretrain', type=str, default='gpt2') parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt') diff --git a/applications/Chat/tests/test_experience.py b/applications/Chat/tests/test_experience.py index 071e50b90e8e..9dd8edec479f 100644 --- a/applications/Chat/tests/test_experience.py +++ b/applications/Chat/tests/test_experience.py @@ -42,7 +42,7 @@ def make_and_consume_experience(strategy): elif strategy == 'colossalai-zero2': strategy = LowLevelZeroStrategy() elif strategy == 'colossalai-gemini': - strategy = GeminiStrategy(placement_policy='cuda') + strategy = GeminiStrategy(placement_policy='auto') else: raise ValueError(f'Unsupported strategy "{strategy}"') From 81c662ad6bde564e57a7e4e3569d939055740e8d Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Wed, 13 Sep 2023 16:03:44 +0800 Subject: [PATCH 05/22] g# This is a combination of 2 commits. [chat] fix gemini strategy fox --- applications/Chat/coati/trainer/strategies/colossalai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py index 744fba2d134e..a8931175cb4a 100644 --- a/applications/Chat/coati/trainer/strategies/colossalai.py +++ b/applications/Chat/coati/trainer/strategies/colossalai.py @@ -156,7 +156,7 @@ def __init__( warnings.warn(f'Stage 3 only supports fp16. 
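[Note: both strategy classes hand the booster a zero-argument callable instead of a constructed plugin. A hedged sketch of why, not part of the patch series — keyword names follow the diff above for colossalai==0.3.2, and `device=` is the spelling this commit settles on (a later commit in the series renames it again):

    from colossalai.booster.plugin import GeminiPlugin
    from colossalai.utils import get_current_device

    # get_current_device() is only meaningful once torch.distributed has been
    # initialized, so plugin construction is deferred behind a lambda and only
    # invoked after setup_distributed() has run.
    plugin_initializer = lambda: GeminiPlugin(device=get_current_device(),
                                              placement_policy='auto',
                                              precision='fp16')
]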
From 98e891bf8950a3754217748523db809259b156a3 Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Thu, 14 Sep 2023 12:06:35 +0800
Subject: [PATCH 06/22] [chat] fix gemini strategy

update llama2 example

[chat] fix gemini strategy
---
 .../Chat/coati/trainer/strategies/colossalai.py | 10 ++--------
 applications/Chat/examples/requirements.txt     |  2 +-
 colossalai/zero/gemini/colo_init_context.py     |  2 +-
 3 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index a8931175cb4a..88909bc8e42d 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -156,7 +156,7 @@ def __init__(
         warnings.warn(f'Stage 3 only supports fp16. Precision is set to fp16.')
 
         # NOTE: dist should be initialized before calling get_current_device()
-        plugin_initializer = lambda: GeminiPlugin(device=get_current_device(),
+        plugin_initializer = lambda: GeminiPlugin(chunk_init_device=get_current_device(),
                                                   placement_policy=placement_policy,
                                                   precision='fp16',
                                                   pin_memory=pin_memory,
@@ -186,13 +186,7 @@ def setup_distributed(self) -> None:
         colossalai.launch_from_torch({}, seed=self.seed)
 
     def model_init_context(self):
-        world_size = dist.get_world_size()
-        shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
-        default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
-        return ColoInitContext(device=get_current_device(),
-                               dtype=torch.half,
-                               default_pg=shard_pg,
-                               default_dist_spec=default_dist_spec)
+        return super().model_init_context()
 
     def unwrap_model(self, model: nn.Module) -> nn.Module:
         # assert isinstance(model, GeminiModel)

diff --git a/applications/Chat/examples/requirements.txt b/applications/Chat/examples/requirements.txt
index 5d0f9f927d17..0890917048d2 100644
--- a/applications/Chat/examples/requirements.txt
+++ b/applications/Chat/examples/requirements.txt
@@ -1,3 +1,3 @@
 pandas>=1.4.1
 sentencepiece
-colossalai==0.3.1
\ No newline at end of file
+colossalai==0.3.2

diff --git a/colossalai/zero/gemini/colo_init_context.py b/colossalai/zero/gemini/colo_init_context.py
index e1e35697c534..dad852a34a71 100644
--- a/colossalai/zero/gemini/colo_init_context.py
+++ b/colossalai/zero/gemini/colo_init_context.py
@@ -126,7 +126,7 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs):
             replaced_tensors[param] = colo_param
         delattr(submodule, param_name)
         setattr(submodule, param_name, colo_param)
-        # colo_param.shared_param_modules.append(submodule)
+        colo_param.shared_param_modules.append(submodule)
 
     param_number = 0
     meta_param_number = 0

From c13ac914f7733e84fbe2c5c1dd2bc57a39170a74 Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Mon, 18 Sep 2023 21:09:16 +0800
Subject: [PATCH 07/22] [fix] fix gemini strategy

---
 applications/Chat/coati/models/generation.py              | 4 ++++
 applications/Chat/coati/trainer/strategies/base.py        | 1 +
 applications/Chat/coati/trainer/strategies/colossalai.py  | 7 +++++++
 applications/Chat/examples/train_prompts.py               | 8 ++++----
 applications/Chat/examples/train_reward_model.py          | 2 +-
 applications/Chat/examples/train_sft.py                   | 2 +-
 colossalai/tensor/param_op_hook.py                        | 2 +-
 colossalai/zero/gemini/colo_init_context.py               | 2 +-
 colossalai/zero/gemini/gemini_ddp.py                      | 6 +++---
 9 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/applications/Chat/coati/models/generation.py b/applications/Chat/coati/models/generation.py
index de0d63f95f50..c3d97ffdea3f 100644
--- a/applications/Chat/coati/models/generation.py
+++ b/applications/Chat/coati/models/generation.py
@@ -58,13 +58,17 @@ def _sample(model: Actor,
     for _ in range(input_ids.size(1), max_length):
         model_inputs = prepare_inputs_fn(input_ids, **model_kwargs) \
             if prepare_inputs_fn is not None else {'input_ids': input_ids}
+        print(model_inputs)
         outputs = model(**model_inputs)
 
         next_token_logits = outputs['logits'][:, -1, :]
         # pre-process distribution
+        print("input_ids" + str(input_ids))
+        print("next_token_logits" + str(next_token_logits))
         next_token_logits = logits_processor(input_ids, next_token_logits)
         # sample
         probs = torch.softmax(next_token_logits, dim=-1, dtype=torch.float)
+        print(probs)
         next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
 
         # finished sentences should have their next token be a padding token

diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py
index c20b2b16e396..8a55143ee90f 100644
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ b/applications/Chat/coati/trainer/strategies/base.py
@@ -49,6 +49,7 @@ def setup_dataloader(self, data_buffer: ExperienceBuffer, pin_memory: bool = Fal
         pass
 
     def model_init_context(self):
+        print("aaaaaaa nullcontext")
         return nullcontext()
 
     def prepare(self, *boost_args: _BoostArgSpec) -> Union[List[_BoostArgSpec], _BoostArgSpec]:

diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index 88909bc8e42d..016b6e7ea3b0 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -186,6 +186,13 @@ def setup_distributed(self) -> None:
         colossalai.launch_from_torch({}, seed=self.seed)
 
     def model_init_context(self):
+        # world_size = dist.get_world_size()
+        # shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
+        # default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
+        # return ColoInitContext(device=get_current_device(),
+        #                        dtype=torch.half,
+        #                        default_pg=shard_pg,
+        #                        default_dist_spec=default_dist_spec)
         return super().model_init_context()
 
     def unwrap_model(self, model: nn.Module) -> nn.Module:

diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index ab6590568896..7b5a03285b7a 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -65,8 +65,8 @@ def main(args):
     if args.rm_path is not None:
         reward_model.load_state_dict(state_dict, strict=False)
 
-    initial_model.to(torch.float16).to(torch.cuda.current_device())
-    reward_model.to(torch.float16).to(torch.cuda.current_device())
+    initial_model.to(torch.bfloat16).to(torch.cuda.current_device())
+    reward_model.to(torch.bfloat16).to(torch.cuda.current_device())
 
     if args.model == 'gpt2':
         actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
@@ -95,8 +95,8 @@ def main(args):
         del state_dict
 
     if args.strategy != 'colossalai_gemini':
-        critic.to(torch.float16).to(torch.cuda.current_device())
-        actor.to(torch.float16).to(torch.cuda.current_device())
+        critic.to(torch.bfloat16).to(torch.cuda.current_device())
+        actor.to(torch.bfloat16).to(torch.cuda.current_device())
 
     # configure optimizer
     if args.strategy.startswith('colossalai'):

diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py
index 57b3a71fd0a8..48ed841d7103 100644
--- a/applications/Chat/examples/train_reward_model.py
+++ b/applications/Chat/examples/train_reward_model.py
@@ -46,7 +46,7 @@ def train(args):
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
 
-    model.to(torch.float16).to(torch.cuda.current_device())
+    model.to(torch.bfloat16).to(torch.cuda.current_device())
 
     if args.model_path is not None:
         state_dict = torch.load(args.model_path)

diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index dcc6b0281082..d0a6becc94e0 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -57,7 +57,7 @@ def train(args):
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
 
-    model.to(torch.float16).to(torch.cuda.current_device())
+    model.to(torch.bfloat16).to(torch.cuda.current_device())
 
     # configure tokenizer
     if args.model == 'gpt2':

diff --git a/colossalai/tensor/param_op_hook.py b/colossalai/tensor/param_op_hook.py
index e37859bac0c3..e2b9926f2585 100644
--- a/colossalai/tensor/param_op_hook.py
+++ b/colossalai/tensor/param_op_hook.py
@@ -143,7 +143,7 @@ def _flatten_grad_args(args) -> Tuple[list, list, List[bool], TreeSpec]:
             grad_args.append(arg)
         else:
             other_args.append(arg)
-    assert len(grad_args) > 0
+    # assert len(grad_args) > 0
     return grad_args, other_args, grad_flags, spec

diff --git a/colossalai/zero/gemini/colo_init_context.py b/colossalai/zero/gemini/colo_init_context.py
index dad852a34a71..e1e35697c534 100644
--- a/colossalai/zero/gemini/colo_init_context.py
+++ b/colossalai/zero/gemini/colo_init_context.py
@@ -126,7 +126,7 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs):
             replaced_tensors[param] = colo_param
         delattr(submodule, param_name)
         setattr(submodule, param_name, colo_param)
-        colo_param.shared_param_modules.append(submodule)
+        # colo_param.shared_param_modules.append(submodule)
 
     param_number = 0
     meta_param_number = 0

diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py
index 918b08cd3150..f80ee7b88a4f 100644
--- a/colossalai/zero/gemini/gemini_ddp.py
+++ b/colossalai/zero/gemini/gemini_ddp.py
@@ -232,9 +232,9 @@ def _post_forward(self):
     def forward(self, *args, **kwargs):
         # check whether we are in a inference mode
        grad_flag = torch.is_grad_enabled()
-        if not grad_flag:
-            assert not self.gemini_manager.need_warmup or not self.gemini_manager.is_warmup(
-            ), "You should run a completed iteration as your warmup iter"
+        # if not grad_flag:
+        #     assert not self.gemini_manager.need_warmup or not self.gemini_manager.is_warmup(
+        #     ), "You should run a completed iteration as your warmup iter"
 
         args, kwargs = _cast_float(args, self.mixed_precision), _cast_float(kwargs, self.mixed_precision)
         self.module.zero_grad(set_to_none=True)

From 88b7b27ea098fe752f35d8af75a9c4238af91791 Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Wed, 20 Sep 2023 16:24:59 +0800
Subject: [PATCH 08/22] [fix] fix gemini strategy

---
 .../Chat/coati/models/bloom/bloom_actor.py |  3 ++
 applications/Chat/coati/models/lora.py     | 31 +++++++++++-------
 .../Chat/coati/trainer/strategies/base.py  |  4 +--
 .../Chat/coati/trainer/strategies/ddp.py   | 32 +++++++++++--------
 applications/Chat/examples/train_prompts.py |  6 ++++
 .../Chat/examples/train_reward_model.py    |  7 ++++
 applications/Chat/examples/train_sft.py    |  8 +++--
 7 files changed, 63 insertions(+), 28 deletions(-)

diff --git a/applications/Chat/coati/models/bloom/bloom_actor.py b/applications/Chat/coati/models/bloom/bloom_actor.py
index d7577f096493..e83f8a906202 100644
--- a/applications/Chat/coati/models/bloom/bloom_actor.py
+++ b/applications/Chat/coati/models/bloom/bloom_actor.py
@@ -3,6 +3,8 @@
 import torch
 from transformers import BloomConfig, BloomForCausalLM, BloomModel
 
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
+
 from ..base import Actor
 
 
@@ -25,6 +27,7 @@ def __init__(self,
                  lora_rank: int = 0,
                  lora_train_bias: str = 'none') -> None:
         if pretrained is not None:
+            # model = BloomForCausalLM(BloomConfig())
             model = BloomForCausalLM.from_pretrained(pretrained)
         elif config is not None:
             model = BloomForCausalLM(config)

diff --git a/applications/Chat/coati/models/lora.py b/applications/Chat/coati/models/lora.py
index 546f675d7d37..f3224e2e92a0 100644
--- a/applications/Chat/coati/models/lora.py
+++ b/applications/Chat/coati/models/lora.py
@@ -70,18 +70,24 @@ def T(w):
             self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
             self.merged = False
 
-    def eval(self):
+    # def eval(self):
+    def merge(self):
 
         def T(w):
             return w.T if self.fan_in_fan_out else w
 
+        print("self.merge_weights and not self.merged" + str(self.merge_weights) + str(not self.merged))
         nn.Module.eval(self)
+        self.merge_weights = True
         if self.merge_weights and not self.merged:
             # Merge the weights and mark it
             if self.r > 0:
-                self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling
+                print(type(self.lora_A), type(self.lora_B))
+                weight = T(self.lora_B @ self.lora_A) * self.scaling
+                self.weight.data = self.weight.data + weight
                 delattr(self, 'lora_A')
                 delattr(self, 'lora_B')
+            print("eval eval eval eval" + str(self.merged))
             self.merged = True
 
     def forward(self, x: torch.Tensor):
@@ -98,21 +104,24 @@ def T(w):
             return F.linear(x, T(self.weight), bias=self.bias)
 
 
-def _lora_linear_wrapper(linear: nn.Linear, lora_rank: int) -> LoraLinear:
+def _lora_linear_wrapper(linear: nn.Linear, lora_rank: int, merge_weights: bool = False) -> LoraLinear:
     assert lora_rank <= linear.in_features, f'LoRA rank ({lora_rank}) must be less than or equal to in features ({linear.in_features})'
-    lora_linear = LoraLinear(linear.weight, linear.bias, r=lora_rank, merge_weights=False)
+    lora_linear = LoraLinear(linear.weight, linear.bias, r=lora_rank, merge_weights=merge_weights)
     return lora_linear
 
 
-def _convert_to_lora_recursively(module: nn.Module, lora_rank: int) -> None:
+def _convert_to_lora_recursively(module: nn.Module, lora_rank: int, merge_weights: bool = False) -> None:
     for name, child in module.named_children():
         if isinstance(child, nn.Linear):
-            setattr(module, name, _lora_linear_wrapper(child, lora_rank))
+            setattr(module, name, _lora_linear_wrapper(child, lora_rank, merge_weights))
         else:
-            _convert_to_lora_recursively(child, lora_rank)
+            _convert_to_lora_recursively(child, lora_rank, merge_weights)
 
 
-def convert_to_lora_module(module: nn.Module, lora_rank: int, lora_train_bias: str = 'none') -> nn.Module:
+def convert_to_lora_module(module: nn.Module,
+                           lora_rank: int,
+                           lora_train_bias: str = 'none',
+                           merge_weights: bool = False) -> nn.Module:
     """Convert a torch.nn.Module to a LoRA module.
 
     Args:
@@ -124,7 +133,7 @@ def convert_to_lora_module(module: nn.Module, lora_rank: int, lora_train_bias: s
     """
     if lora_rank <= 0:
         return module
-    _convert_to_lora_recursively(module, lora_rank)
+    _convert_to_lora_recursively(module, lora_rank, merge_weights)
     lora.mark_only_lora_as_trainable(module, lora_train_bias)
     return module
@@ -145,5 +154,5 @@ def __init__(self, lora_rank: int = 0, lora_train_bias: str = 'none') -> None:
         self.lora_rank = lora_rank
         self.lora_train_bias = lora_train_bias
 
-    def convert_to_lora(self) -> None:
-        convert_to_lora_module(self, self.lora_rank, self.lora_train_bias)
+    def convert_to_lora(self, merge_weights: bool = False) -> None:
+        convert_to_lora_module(self, self.lora_rank, self.lora_train_bias, merge_weights)

diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py
index 8a55143ee90f..18b2e9821a42 100644
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ b/applications/Chat/coati/trainer/strategies/base.py
@@ -49,7 +49,7 @@ def setup_dataloader(self, data_buffer: ExperienceBuffer, pin_memory: bool = Fal
         pass
 
     def model_init_context(self):
-        print("aaaaaaa nullcontext")
+        # print("aaaaaaa nullcontext")
         return nullcontext()
 
     def prepare(self, *boost_args: _BoostArgSpec) -> Union[List[_BoostArgSpec], _BoostArgSpec]:
@@ -110,7 +110,7 @@ def unwrap_model(model: nn.Module) -> nn.Module:
         return model
 
     def save_model(self, model: nn.Module, path: str, only_rank0: bool = True, **kwargs) -> None:
-        self.booster.save_model(model, path, shard=not only_rank0, **kwargs)
+        self.booster.save_model(model, path, shard=False, **kwargs)
 
     def load_model(self, model: nn.Module, path: str, strict: bool = True) -> None:
         self.booster.load_model(model, path, strict)

diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index a52b0460daa8..0ae532d1eeab 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -16,6 +16,7 @@
 from colossalai.booster.plugin import TorchDDPPlugin
 from colossalai.booster.plugin.torch_ddp_plugin import TorchDDPModel
 
+from ...models.lora import LoraLinear
 from .base import Strategy
 from .sampler import DistributedSampler
 
@@ -34,10 +35,7 @@ class DDPStrategy(Strategy):
     Strategy for distributed training using torch.distributed.
     """
 
-    def __init__(self,
-                 seed: int = 42,
-                 plugin_initializer: Callable = TorchDDPPlugin
-                 ) -> None:
+    def __init__(self, seed: int = 42, plugin_initializer: Callable = TorchDDPPlugin) -> None:
         self.seed = seed
         super().__init__(plugin_initializer)
 
@@ -88,6 +86,13 @@ def unwrap_model(self, model: nn.Module) -> nn.Module:
         assert isinstance(model, TorchDDPModel), "model is not wrapped by TorchDDPModel."
         return model.unwrap()
 
+    def eval(self, model):
+        for module in model.children():
+            if isinstance(module, LoraLinear):
+                module.merge()
+            else:
+                self.eval(module)
+
     def save_pretrained(self,
                         model: nn.Module,
                         path: str,
@@ -103,17 +108,15 @@ def save_pretrained(self,
         if tokenizer is not None:
             tokenizer.save_pretrained(path)
         model_path = os.path.join(path, "pytorch_model.bin")
-        self.save_model(model,
-                        model_path,
-                        only_rank0=only_rank0)
+        # print(model)
+        self.eval(model)
+        # print(model)
+        # print([module for module in model.named_children()])
+        self.save_model(model, model_path, only_rank0=only_rank0)
 
-        def _replace_keys(model_path: str,
-                          replace_fn: Callable):
+        def _replace_keys(model_path: str, replace_fn: Callable):
             state_dict = torch.load(model_path, map_location="cpu")
-            state_dict = {
-                replace_fn(k): v
-                for k, v in state_dict.items()
-            }
+            state_dict = {replace_fn(k): v for k, v in state_dict.items()}
             torch.save(state_dict, model_path)
 
         # FIXME: save_model would add "model." prefix to keys of pytorch_model.bin
@@ -121,6 +124,9 @@ def _replace_keys(model_path: str,
         if dist.get_rank() == 0:
             _replace_keys(model_path, lambda k: k.replace("model.", "", 1))
 
+    def load_pretrained(self, model, path):
+        self.load_model(model, path, strict=False)
+
     def get_model_state_dict_shard(self, model: nn.Module, **config):
         # TODO: implement sharding on naive strategy
         model = self.unwrap_model(model)

diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index 7b5a03285b7a..1dd65966c8a2 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -33,12 +33,18 @@ def main(args):
         warnings.warn('LoRA weights should be merged with the model weights')
         state_dict = torch.load(args.rm_path, map_location='cpu')
 
+    if args.lora_rank > 0:
+        warnings.warn("Lora is not supported yet.")
+        args.lora_rank = 0
+
     with strategy.model_init_context():
         # configure model
         if args.model == 'gpt2':
             initial_model = GPTActor(pretrained=args.pretrain)
         elif args.model == 'bloom':
             initial_model = BLOOMActor(pretrained=args.pretrain)
+            # strategy.load_pretrained(initial_model, args.pretrain+"/pytorch_model.bin")
+            # print(initial_model.named_parameters())
         elif args.model == 'opt':
             initial_model = OPTActor(pretrained=args.pretrain)
         elif args.model == 'llama':

diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py
index 48ed841d7103..cab1293d50af 100644
--- a/applications/Chat/examples/train_reward_model.py
+++ b/applications/Chat/examples/train_reward_model.py
@@ -1,4 +1,5 @@
 import argparse
+import warnings
 from random import randint
 
 import torch
@@ -34,6 +35,10 @@ def train(args):
         raise ValueError(f'Unsupported strategy "{args.strategy}"')
 
     # configure model
+    if args.lora_rank > 0:
+        warnings.warn("Lora is not supported yet.")
+        args.lora_rank = 0
+
     with strategy.model_init_context():
         if args.model == 'bloom':
             model = BLOOMRM(pretrained=args.pretrain, lora_rank=args.lora_rank)
@@ -166,6 +171,8 @@ def train(args):
     trainer.fit(train_dataloader=train_dataloader, valid_dataloader=valid_dataloader, eval_dataloader=eval_dataloader)
 
     # save model checkpoint after fitting on only rank0
+    strategy.eval(model)
+    print(args.save_path)
     strategy.save_model(model, args.save_path, only_rank0=True)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:

diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index d0a6becc94e0..f575d35c4d38 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -41,8 +41,9 @@ def train(args):
 
     # configure model
     if args.lora_rank > 0:
-        warnings.warn("Gradient checkpoint is disabled when using LoRA")
-        args.grad_checkpoint = False
+        warnings.warn("Lora is not supported yet.")
+        args.lora_rank = 0
+
     with strategy.model_init_context():
         if args.model == 'bloom':
             model = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
@@ -168,6 +169,9 @@ def train(args):
                 use_wandb=args.use_wandb)
 
     # save model checkpoint after fitting on only rank0
+    # model.eval()
+    # print(type(model))
+    # print("eval eval")
     strategy.save_pretrained(model, path=args.save_path, only_rank0=True, tokenizer=tokenizer)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:

From df9bd2808d6b0fc5ceee91e54b4f5e87e4b1ec90 Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Wed, 20 Sep 2023 17:11:29 +0800
Subject: [PATCH 09/22] [fix] fix gemini strategy

---
 .../Chat/coati/trainer/strategies/colossalai.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index 1e7caa6012c5..e5dcfd66f5f5 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -1,13 +1,16 @@
 import warnings
 from typing import Optional
 
+import torch
+import torch.distributed as dist
 import torch.nn as nn
 
 import colossalai
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
 from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.lazy.lazy_init import LazyInitContext
+from colossalai.tensor import ProcessGroup, ShardSpec
 from colossalai.utils import get_current_device
+from colossalai.zero import ColoInitContext
 from colossalai.zero.gemini.gemini_ddp import GeminiDDP
 
 from .ddp import DDPStrategy
@@ -59,7 +62,7 @@ def __init__(
         assert stage in (1, 2), f'Unsupported stage "{stage}"'
         assert placement_policy in ("cpu", "cuda"), f'Unsupported placement policy "{placement_policy}"'
         assert precision in ("fp32", "fp16"), f'Unsupported precision "{precision}"'
-
+        
         plugin_initializer = lambda: LowLevelZeroPlugin(
             stage=stage,
             precision=precision,
@@ -188,10 +191,15 @@ def setup_distributed(self) -> None:
         colossalai.launch_from_torch({}, seed=self.seed)
 
     def model_init_context(self):
-        return super().model_init_context()
+        world_size = dist.get_world_size()
+        shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
+        default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
+        return ColoInitContext(
+            device=get_current_device(), dtype=torch.half, default_pg=shard_pg, default_dist_spec=default_dist_spec
+        )
+        # return super().model_init_context()
 
     def unwrap_model(self, model: nn.Module) -> nn.Module:
         ddp_model = model.unwrap()
         assert isinstance(ddp_model, GeminiDDP)
         return ddp_model.module
-
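[Note: PATCH 09 restores ColoInitContext so that, when shard_init is enabled, parameters are created directly in a tensor-parallel layout rather than as full replicas. A hedged sketch of the idea, not part of the patch series, assuming the ColossalAI 0.3.x API used in the diff above:

    import torch
    import torch.distributed as dist
    from colossalai.tensor import ProcessGroup, ShardSpec
    from colossalai.utils import get_current_device
    from colossalai.zero import ColoInitContext

    # Build the model inside the context so each rank materializes only its
    # shard (last dim split across the whole world) instead of a full copy.
    world_size = dist.get_world_size()
    with ColoInitContext(device=get_current_device(),
                         dtype=torch.half,
                         default_pg=ProcessGroup(tp_degree=world_size),
                         default_dist_spec=ShardSpec([-1], [world_size])):
        ...  # model construction, e.g. BLOOMActor(pretrained=...)
]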
 .../Chat/coati/trainer/strategies/colossalai.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index 1e7caa6012c5..e5dcfd66f5f5 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -1,13 +1,16 @@
 import warnings
 from typing import Optional

+import torch
+import torch.distributed as dist
 import torch.nn as nn

 import colossalai
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
 from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.lazy.lazy_init import LazyInitContext
+from colossalai.tensor import ProcessGroup, ShardSpec
 from colossalai.utils import get_current_device
+from colossalai.zero import ColoInitContext
 from colossalai.zero.gemini.gemini_ddp import GeminiDDP

 from .ddp import DDPStrategy
@@ -59,7 +62,7 @@ def __init__(
         assert stage in (1, 2), f'Unsupported stage "{stage}"'
         assert placement_policy in ("cpu", "cuda"), f'Unsupported placement policy "{placement_policy}"'
         assert precision in ("fp32", "fp16"), f'Unsupported precision "{precision}"'
-
+
         plugin_initializer = lambda: LowLevelZeroPlugin(
             stage=stage,
             precision=precision,
@@ -188,10 +191,15 @@ def setup_distributed(self) -> None:
         colossalai.launch_from_torch({}, seed=self.seed)

     def model_init_context(self):
-        return super().model_init_context()
+        world_size = dist.get_world_size()
+        shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
+        default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
+        return ColoInitContext(
+            device=get_current_device(), dtype=torch.half, default_pg=shard_pg, default_dist_spec=default_dist_spec
+        )
+        # return super().model_init_context()

     def unwrap_model(self, model: nn.Module) -> nn.Module:
         ddp_model = model.unwrap()
         assert isinstance(ddp_model, GeminiDDP)
         return ddp_model.module
-

From 5443763f3e35d66ba319b1643beff356964dc921 Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Wed, 20 Sep 2023 17:31:16 +0800
Subject: [PATCH 10/22] [fix] fix gemini strategy

---
 .../benchmarks/benchmark_opt_lora_dummy.py     | 20 +++++++++----------
 .../Chat/coati/models/bloom/bloom_actor.py     |  3 ---
 applications/Chat/coati/models/generation.py   |  1 -
 applications/Chat/coati/models/lora.py         | 14 ++++++-------
 applications/Chat/coati/ray/utils.py           | 12 +++++------
 .../Chat/coati/trainer/strategies/base.py      |  1 -
 .../Chat/coati/trainer/strategies/ddp.py       |  4 ----
 .../community/peft/train_peft_prompts.py       |  8 ++++----
 .../examples/community/peft/train_peft_sft.py  |  8 ++++----
 applications/Chat/examples/train_prompts.py    |  8 ++++----
 .../Chat/examples/train_reward_model.py        | 11 ++++------
 applications/Chat/examples/train_sft.py        | 15 ++++++--------
 applications/Chat/tests/test_experience.py     |  4 ++--
 13 files changed, 46 insertions(+), 63 deletions(-)

diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
index 5647d8bc03d3..14a313d886d8 100644
--- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
+++ b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
@@ -75,16 +75,16 @@ def get_gpt_config(model_name: str) -> OPTConfig:
 def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
-    elif args.strategy == 'colossalai_gemini':
-        strategy = GeminiStrategy(placement_policy='auto', initial_scale=2**5)
-    elif args.strategy == 'colossalai_zero2':
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
-    elif args.strategy == 'colossalai_zero2_cpu':
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
-    elif args.strategy == 'colossalai_zero1':
-        strategy = LowLevelZeroStrategy(stage=1, placement_policy='cuda')
-    elif args.strategy == 'colossalai_zero1_cpu':
-        strategy = LowLevelZeroStrategy(stage=1, placement_policy='cpu')
+    elif args.strategy == "colossalai_gemini":
+        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+    elif args.strategy == "colossalai_zero2":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
+    elif args.strategy == "colossalai_zero2_cpu":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
+    elif args.strategy == "colossalai_zero1":
+        strategy = LowLevelZeroStrategy(stage=1, placement_policy="cuda")
+    elif args.strategy == "colossalai_zero1_cpu":
+        strategy = LowLevelZeroStrategy(stage=1, placement_policy="cpu")
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')

diff --git a/applications/Chat/coati/models/bloom/bloom_actor.py b/applications/Chat/coati/models/bloom/bloom_actor.py
index 8f37cc1a5396..73855a2245e7 100644
--- a/applications/Chat/coati/models/bloom/bloom_actor.py
+++ b/applications/Chat/coati/models/bloom/bloom_actor.py
@@ -2,8 +2,6 @@

 from transformers import BloomConfig, BloomForCausalLM

-from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
-
 from ..base import Actor

@@ -28,7 +26,6 @@ def __init__(
         lora_train_bias: str = "none",
     ) -> None:
         if pretrained is not None:
-            # model = BloomForCausalLM(BloomConfig())
             model = BloomForCausalLM.from_pretrained(pretrained)
         elif config is not None:
             model = BloomForCausalLM(config)
diff --git a/applications/Chat/coati/models/generation.py b/applications/Chat/coati/models/generation.py
index 1063edad5f26..4ab0cdc8a3ea 100644
--- a/applications/Chat/coati/models/generation.py
+++ b/applications/Chat/coati/models/generation.py
@@ -69,7 +69,6 @@ def _sample(
         next_token_logits = logits_processor(input_ids, next_token_logits)
         # sample
         probs = torch.softmax(next_token_logits, dim=-1, dtype=torch.float)
-        print(probs)
         next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)

         # finished sentences should have their next token be a padding token
diff --git a/applications/Chat/coati/models/lora.py b/applications/Chat/coati/models/lora.py
index 35613cd81b45..2114913e107b 100644
--- a/applications/Chat/coati/models/lora.py
+++ b/applications/Chat/coati/models/lora.py
@@ -70,9 +70,7 @@ def eval(self):
         def T(w):
             return w.T if self.fan_in_fan_out else w

-        print("self.merge_weights and not self.merged" + str(self.merge_weights) + str(not self.merged))
         nn.Module.eval(self)
-        self.merge_weights = True
         if self.merge_weights and not self.merged:
             # Merge the weights and mark it
             if self.r > 0:
@@ -102,12 +100,12 @@ def _lora_linear_wrapper(linear: nn.Linear, lora_rank: int) -> LoraLinear:
     return lora_linear


-def _convert_to_lora_recursively(module: nn.Module, lora_rank: int, merge_weights: bool = False) -> None:
+def _convert_to_lora_recursively(module: nn.Module, lora_rank: int) -> None:
     for name, child in module.named_children():
         if isinstance(child, nn.Linear):
-            setattr(module, name, _lora_linear_wrapper(child, lora_rank, merge_weights))
+            setattr(module, name, _lora_linear_wrapper(child, lora_rank))
         else:
-            _convert_to_lora_recursively(child, lora_rank, merge_weights)
+            _convert_to_lora_recursively(child, lora_rank)


 def convert_to_lora_module(module: nn.Module, lora_rank: int, lora_train_bias: str = "none") -> nn.Module:
@@ -122,7 +120,7 @@ def convert_to_lora_module(module: nn.Module, lora_rank: int, lora_train_bias: s
     """
     if lora_rank <= 0:
         return module
-    _convert_to_lora_recursively(module, lora_rank, merge_weights)
+    _convert_to_lora_recursively(module, lora_rank)
     lora.mark_only_lora_as_trainable(module, lora_train_bias)
     return module

@@ -143,5 +141,5 @@ def __init__(self, lora_rank: int = 0, lora_train_bias: str = "none") -> None:
         self.lora_rank = lora_rank
         self.lora_train_bias = lora_train_bias

-    def convert_to_lora(self, merge_weights: bool = False) -> None:
-        convert_to_lora_module(self, self.lora_rank, self.lora_train_bias, merge_weights)
+    def convert_to_lora(self) -> None:
+        convert_to_lora_module(self, self.lora_rank, self.lora_train_bias)
diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py
index caf22435fdb2..33a7cdcae26a 100644
--- a/applications/Chat/coati/ray/utils.py
+++ b/applications/Chat/coati/ray/utils.py
@@ -70,12 +70,12 @@ def get_reward_model_from_args(model: str, pretrained: str = None, config=None):
 def get_strategy_from_args(strategy: str):
     if strategy == "ddp":
         strategy_ = DDPStrategy()
-    elif strategy == 'colossalai_gemini':
-        strategy_ = GeminiStrategy(placement_policy='auto', initial_scale=2**5)
-    elif strategy == 'colossalai_zero2':
-        strategy_ = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
-    elif strategy == 'colossalai_zero2_cpu':
-        strategy_ = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
+    elif strategy == "colossalai_gemini":
+        strategy_ = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+    elif strategy == "colossalai_zero2":
+        strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
+    elif strategy == "colossalai_zero2_cpu":
+        strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:
         raise ValueError(f'Unsupported strategy "{strategy}"')
     return strategy_
diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py
index 2cf7e29a2135..f25cd725a633 100644
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ b/applications/Chat/coati/trainer/strategies/base.py
@@ -49,7 +49,6 @@ def setup_dataloader(self, data_buffer: ExperienceBuffer, pin_memory: bool = Fal
         pass

     def model_init_context(self):
-        # print("aaaaaaa nullcontext")
         return nullcontext()

     def prepare(self, *boost_args: _BoostArgSpec) -> Union[List[_BoostArgSpec], _BoostArgSpec]:
diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index 9187b74c987e..66ff6703da4d 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -16,7 +16,6 @@
 from colossalai.booster.plugin import TorchDDPPlugin
 from colossalai.booster.plugin.torch_ddp_plugin import TorchDDPModel

-from ...models.lora import LoraLinear
 from .base import Strategy
 from .sampler import DistributedSampler

@@ -112,9 +111,6 @@ def _replace_keys(model_path: str, replace_fn: Callable):
         if dist.get_rank() == 0:
             _replace_keys(model_path, lambda k: k.replace("model.", "", 1))

-    def load_pretrained(self, model, path):
-        self.load_model(model, path, strict=False)
-
     def get_model_state_dict_shard(self, model: nn.Module, **config):
         # TODO: implement sharding on naive strategy
         model = self.unwrap_model(model)
diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py
index accbdd28500e..dc6e0f504be7 100644
--- a/applications/Chat/examples/community/peft/train_peft_prompts.py
+++ b/applications/Chat/examples/community/peft/train_peft_prompts.py
@@ -23,10 +23,10 @@ def main(args):
     # configure strategy
     if args.strategy == "ddp":
         strategy = DDPStrategy()
-    elif args.strategy == 'colossalai_gemini':
-        strategy = GeminiStrategy(placement_policy='auto', initial_scale=2**5)
-    elif args.strategy == 'colossalai_zero2':
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
+    elif args.strategy == "colossalai_gemini":
+        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+    elif args.strategy == "colossalai_zero2":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')

diff --git a/applications/Chat/examples/community/peft/train_peft_sft.py b/applications/Chat/examples/community/peft/train_peft_sft.py
index 032d83d93c8e..091e85c1378b 100644
--- a/applications/Chat/examples/community/peft/train_peft_sft.py
+++ b/applications/Chat/examples/community/peft/train_peft_sft.py
@@ -23,10 +23,10 @@ def train(args):
     # configure strategy
     if args.strategy == "ddp":
         strategy = DDPStrategy()
-    elif args.strategy == 'colossalai_gemini':
-        strategy = GeminiStrategy(placement_policy='auto')
-    elif args.strategy == 'colossalai_zero2':
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
+    elif args.strategy == "colossalai_gemini":
+        strategy = GeminiStrategy(placement_policy="auto")
+    elif args.strategy == "colossalai_zero2":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')

diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index 8ccf73b96e4f..f04edc030ce0 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -22,10 +22,10 @@ def main(args):
     # configure strategy
     if args.strategy == "ddp":
         strategy = DDPStrategy()
-    elif args.strategy == 'colossalai_gemini':
-        strategy = GeminiStrategy(placement_policy='auto', initial_scale=2**5)
-    elif args.strategy == 'colossalai_zero2':
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
+    elif args.strategy == "colossalai_gemini":
+        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+    elif args.strategy == "colossalai_zero2":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')

diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py
index eb2ff5499e0f..6d14e42a293f 100644
--- a/applications/Chat/examples/train_reward_model.py
+++ b/applications/Chat/examples/train_reward_model.py
@@ -1,6 +1,5 @@
 import argparse
 import warnings
-from random import randint

 import torch
 import torch.distributed as dist
@@ -27,10 +26,10 @@ def train(args):
     # configure strategy
     if args.strategy == "ddp":
         strategy = DDPStrategy()
-    elif args.strategy == 'colossalai_gemini':
-        strategy = GeminiStrategy(placement_policy='auto')
-    elif args.strategy == 'colossalai_zero2':
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
+    elif args.strategy == "colossalai_gemini":
+        strategy = GeminiStrategy(placement_policy="auto")
+    elif args.strategy == "colossalai_zero2":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')

@@ -164,8 +163,6 @@ def train(args):
         use_wandb=args.use_wandb,
     )
     # save model checkpoint after fitting on only rank0
-    strategy.eval(model)
-    print(args.save_path)
     strategy.save_model(model, args.save_path, only_rank0=True)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:
diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index 3456c09b2d31..bd060301998c 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -29,12 +29,12 @@ def train(args):
     # configure strategy
     if args.strategy == "ddp":
         strategy = DDPStrategy()
-    elif args.strategy == 'colossalai_gemini':
-        strategy = GeminiStrategy(placement_policy='auto')
-    elif args.strategy == 'colossalai_zero2':
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
-    elif args.strategy == 'colossalai_zero2_cpu':
-        strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
+    elif args.strategy == "colossalai_gemini":
+        strategy = GeminiStrategy(placement_policy="auto")
+    elif args.strategy == "colossalai_zero2":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
+    elif args.strategy == "colossalai_zero2_cpu":
+        strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:
         raise ValueError(f'Unsupported strategy "{args.strategy}"')

@@ -179,9 +179,6 @@ def train(args):
     )

     # save model checkpoint after fitting on only rank0
-    # model.eval()
-    # print(type(model))
-    # print("eval eval")
     strategy.save_pretrained(model, path=args.save_path, only_rank0=True, tokenizer=tokenizer)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:
diff --git a/applications/Chat/tests/test_experience.py b/applications/Chat/tests/test_experience.py
index 9ec145b7b62e..70287a35cea0 100644
--- a/applications/Chat/tests/test_experience.py
+++ b/applications/Chat/tests/test_experience.py
@@ -42,8 +42,8 @@ def make_and_consume_experience(strategy):
         strategy = DDPStrategy()
     elif strategy == "colossalai-zero2":
         strategy = LowLevelZeroStrategy()
-    elif strategy == 'colossalai-gemini':
-        strategy = GeminiStrategy(placement_policy='auto')
+    elif strategy == "colossalai-gemini":
+        strategy = GeminiStrategy(placement_policy="auto")
     else:
         raise ValueError(f'Unsupported strategy "{strategy}"')
From 48f39679cbfb99a37f51c0defd8d1676e67fc2d4 Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Thu, 21 Sep 2023 15:04:33 +0800
Subject: [PATCH 11/22] [fix] fix gemini strategy

---
 .../coati/trainer/strategies/colossalai.py | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index e5dcfd66f5f5..1c0d1a0d2fb8 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -1,16 +1,12 @@
 import warnings
 from typing import Optional

-import torch
-import torch.distributed as dist
 import torch.nn as nn

 import colossalai
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
 from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.tensor import ProcessGroup, ShardSpec
 from colossalai.utils import get_current_device
-from colossalai.zero import ColoInitContext
 from colossalai.zero.gemini.gemini_ddp import GeminiDDP

 from .ddp import DDPStrategy
@@ -191,13 +187,13 @@ def setup_distributed(self) -> None:
         colossalai.launch_from_torch({}, seed=self.seed)

     def model_init_context(self):
-        world_size = dist.get_world_size()
-        shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
-        default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
-        return ColoInitContext(
-            device=get_current_device(), dtype=torch.half, default_pg=shard_pg, default_dist_spec=default_dist_spec
-        )
-        # return super().model_init_context()
+        return super().model_init_context()

     def unwrap_model(self, model: nn.Module) -> nn.Module:
         ddp_model = model.unwrap()

From 8fcbcb27aa2529d4be0dd6a1f1bd919a31a7885d Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 22 Sep 2023 17:57:27 +0800
Subject: [PATCH 12/22] [fix] fix gemini strategy

---
 applications/Chat/benchmarks/benchmark_opt_lora_dummy.py | 2 ++
 applications/Chat/coati/ray/utils.py                     | 2 ++
 applications/Chat/coati/trainer/strategies/colossalai.py | 9 +++------
 applications/Chat/examples/train_prompts.py              | 2 +-
 applications/Chat/tests/test_experience.py               | 2 +-
 colossalai/tensor/param_op_hook.py                       | 2 +-
 colossalai/zero/gemini/colo_init_context.py              | 2 +-
 colossalai/zero/gemini/gemini_ddp.py                     | 6 +++---
 8 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
index 14a313d886d8..583ce860e608 100644
--- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
+++ b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
@@ -77,6 +77,8 @@ def main(args):
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
         strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+    elif args.strategy == "colossalai_gemini_cpu":
+        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     elif args.strategy == "colossalai_zero2_cpu":
diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py
index 33a7cdcae26a..6f8e2d94ce8e 100644
--- a/applications/Chat/coati/ray/utils.py
+++ b/applications/Chat/coati/ray/utils.py
@@ -76,6 +76,8 @@ def get_strategy_from_args(strategy: str):
         strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     elif strategy == "colossalai_zero2_cpu":
         strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
+    elif strategy == "colossalai_gemini_cpu":
+        strategy_ = GeminiStrategy(placement_policy="static", initial_scale=2**5)
     else:
         raise ValueError(f'Unsupported strategy "{strategy}"')
     return strategy_
diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index 1c0d1a0d2fb8..0e3258304107 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -159,6 +159,9 @@ def __init__(
         plugin_initializer = lambda: GeminiPlugin(
             chunk_init_device=get_current_device(),
             placement_policy=placement_policy,
+            shard_param_frac=1.0,
+            offload_optim_frac=1.0,
+            offload_param_frac=1.0,
             precision="fp16",
             pin_memory=pin_memory,
             force_outputs_fp32=force_outputs_fp32,
@@ -187,12 +190,6 @@ def setup_distributed(self) -> None:
         colossalai.launch_from_torch({}, seed=self.seed)

     def model_init_context(self):
-        # world_size = dist.get_world_size()
-        # shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
-        # default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
-        # return ColoInitContext(
-        #     device=get_current_device(), dtype=torch.half, default_pg=shard_pg, default_dist_spec=default_dist_spec
-        # )
         return super().model_init_context()

     def unwrap_model(self, model: nn.Module) -> nn.Module:
diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index f04edc030ce0..29063295a0c2 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -23,7 +23,7 @@ def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     else:
diff --git a/applications/Chat/tests/test_experience.py b/applications/Chat/tests/test_experience.py
index 70287a35cea0..a9591259800d 100644
--- a/applications/Chat/tests/test_experience.py
+++ b/applications/Chat/tests/test_experience.py
@@ -43,7 +43,7 @@ def make_and_consume_experience(strategy):
     elif strategy == "colossalai-zero2":
         strategy = LowLevelZeroStrategy()
     elif strategy == "colossalai-gemini":
-        strategy = GeminiStrategy(placement_policy="auto")
+        strategy = GeminiStrategy(placement_policy="static")
     else:
         raise ValueError(f'Unsupported strategy "{strategy}"')

diff --git a/colossalai/tensor/param_op_hook.py b/colossalai/tensor/param_op_hook.py
index a92a0d3fc40d..1fe99cd89a4e 100644
--- a/colossalai/tensor/param_op_hook.py
+++ b/colossalai/tensor/param_op_hook.py
@@ -142,7 +142,7 @@ def _flatten_grad_args(args) -> Tuple[list, list, List[bool], TreeSpec]:
             grad_args.append(arg)
         else:
             other_args.append(arg)
-    # assert len(grad_args) > 0
+    assert len(grad_args) > 0
     return grad_args, other_args, grad_flags, spec
diff --git a/colossalai/zero/gemini/colo_init_context.py b/colossalai/zero/gemini/colo_init_context.py
index 711b0d2196db..ab2ff8f920aa 100644
--- a/colossalai/zero/gemini/colo_init_context.py
+++ b/colossalai/zero/gemini/colo_init_context.py
@@ -130,7 +130,7 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs):
                 replaced_tensors[param] = colo_param
             delattr(submodule, param_name)
             setattr(submodule, param_name, colo_param)
-            # colo_param.shared_param_modules.append(submodule)
+            colo_param.shared_param_modules.append(submodule)

         param_number = 0
         meta_param_number = 0
diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py
index c3ecd2578468..8b149a65497a 100644
--- a/colossalai/zero/gemini/gemini_ddp.py
+++ b/colossalai/zero/gemini/gemini_ddp.py
@@ -237,9 +237,9 @@ def _post_forward(self):
     def forward(self, *args, **kwargs):
         # check whether we are in a inference mode
         grad_flag = torch.is_grad_enabled()
-        # if not grad_flag:
-        #     assert not self.gemini_manager.need_warmup or not self.gemini_manager.is_warmup(
-        #     ), "You should run a completed iteration as your warmup iter"
+        if not grad_flag:
+            assert not self.gemini_manager.need_warmup or not self.gemini_manager.is_warmup(
+            ), "You should run a completed iteration as your warmup iter"

         args, kwargs = _cast_float(args, self.mixed_precision), _cast_float(kwargs, self.mixed_precision)
         self.module.zero_grad(set_to_none=True)

From 16c5d3a7672e470efd3a0b151a532b32e5f27dd8 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 22 Sep 2023 18:02:48 +0800
Subject: [PATCH 13/22] [fix] fix gemini strategy

---
 applications/Chat/examples/community/peft/train_peft_prompts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py
index dc6e0f504be7..bbac91477085 100644
--- a/applications/Chat/examples/community/peft/train_peft_prompts.py
+++ b/applications/Chat/examples/community/peft/train_peft_prompts.py
@@ -24,7 +24,7 @@ def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:

From a11f72865ae20cfc92adb2c10e5e567904e9ac5c Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 22 Sep 2023 18:38:42 +0800
Subject: [PATCH 14/22] [fix] fix gemini strategy

---
 applications/Chat/tests/test_train.sh | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/applications/Chat/tests/test_train.sh b/applications/Chat/tests/test_train.sh
index 55de269005ed..e8ea9281c22d 100755
--- a/applications/Chat/tests/test_train.sh
+++ b/applications/Chat/tests/test_train.sh
@@ -34,7 +34,7 @@ if [ -z "$PRETRAIN_DATASET" ]; then
     exit 1
 fi

-NUM_RETRY=3
+# NUM_RETRY=3
 BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
 EXAMPLES_DIR=$BASE_DIR/examples
 MODELS_DIR=$BASE_DIR/examples/models_config
@@ -80,9 +80,7 @@ SKIPPED_TESTS=(
     "llama-ddp"
     "llama-colossalai_gemini"
     "llama-colossalai_zero2"
-    "gpt2-colossalai_gemini"
-    "opt-colossalai_gemini"
-    "bloom-colossalai_gemini"
+    "bloom-colossalai_zero2-4"
 )

 GRAD_CKPTS=('' '--grad_checkpoint')

From 6903f9662ea618eda74aaac04ac5989982391c8c Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 22 Sep 2023 18:42:01 +0800
Subject: [PATCH 15/22] [fix] fix gemini strategy

---
 applications/Chat/tests/test_train.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/applications/Chat/tests/test_train.sh b/applications/Chat/tests/test_train.sh
index e8ea9281c22d..1a470a7fcfcb 100755
--- a/applications/Chat/tests/test_train.sh
+++ b/applications/Chat/tests/test_train.sh
@@ -34,7 +34,7 @@ if [ -z "$PRETRAIN_DATASET" ]; then
     exit 1
 fi

-# NUM_RETRY=3
+NUM_RETRY=3
 BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
 EXAMPLES_DIR=$BASE_DIR/examples
 MODELS_DIR=$BASE_DIR/examples/models_config

From 057b74b043b25703e9b3c9a64c8049457f05b69b Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 22 Sep 2023 18:42:56 +0800
Subject: [PATCH 16/22] [fix] fix gemini strategy

---
 applications/Chat/coati/ray/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py
index 6f8e2d94ce8e..49ab96b6720d 100644
--- a/applications/Chat/coati/ray/utils.py
+++ b/applications/Chat/coati/ray/utils.py
@@ -74,10 +74,10 @@ def get_strategy_from_args(strategy: str):
         strategy_ = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
     elif strategy == "colossalai_zero2":
         strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
-    elif strategy == "colossalai_zero2_cpu":
-        strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     elif strategy == "colossalai_gemini_cpu":
         strategy_ = GeminiStrategy(placement_policy="static", initial_scale=2**5)
+    elif strategy == "colossalai_zero2_cpu":
+        strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:
         raise ValueError(f'Unsupported strategy "{strategy}"')
     return strategy_

From 5f42f1c73143021bbda4980ab93175d010b62232 Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Mon, 25 Sep 2023 16:58:08 +0800
Subject: [PATCH 17/22] fix

---
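Note on the placement arguments this patch threads through GeminiStrategy: with placement_policy="static", the fractions shard_param_frac, offload_optim_frac and offload_param_frac control how much of the parameters and optimizer state stay on GPU versus get offloaded to CPU. A minimal sketch of the two presets used in this series, based only on the constructor arguments visible in the diffs below (the import path assumes the usual coati example layout):

    from coati.trainer.strategies import GeminiStrategy

    # "colossalai_gemini": static placement, parameters kept sharded on GPU
    strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)

    # "colossalai_gemini_cpu": same policy, but optimizer state and parameters
    # fully offloaded to CPU
    strategy_cpu = GeminiStrategy(
        placement_policy="static",
        offload_optim_frac=1.0,
        offload_param_frac=1.0,
        initial_scale=2**5,
    )
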
 .../Chat/benchmarks/benchmark_opt_lora_dummy.py |  4 ++--
 applications/Chat/coati/ray/utils.py            |  4 ++--
 .../Chat/coati/trainer/strategies/base.py       |  2 +-
 .../Chat/coati/trainer/strategies/colossalai.py | 14 ++++++++------
 .../Chat/coati/trainer/strategies/ddp.py        | 16 ++++++++--------
 .../community/peft/train_peft_prompts.py        |  2 +-
 .../examples/community/peft/train_peft_sft.py   |  2 +-
 applications/Chat/examples/train_prompts.py     |  2 +-
 8 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
index 583ce860e608..0d0e2a7d34f5 100644
--- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
+++ b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
@@ -76,9 +76,9 @@ def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="static",initial_scale=2**5)
     elif args.strategy == "colossalai_gemini_cpu":
-        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5)
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     elif args.strategy == "colossalai_zero2_cpu":
diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py
index 49ab96b6720d..b88140c0e036 100644
--- a/applications/Chat/coati/ray/utils.py
+++ b/applications/Chat/coati/ray/utils.py
@@ -71,11 +71,11 @@ def get_strategy_from_args(strategy: str):
     if strategy == "ddp":
         strategy_ = DDPStrategy()
     elif strategy == "colossalai_gemini":
-        strategy_ = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+        strategy_ = GeminiStrategy(placement_policy="static", initial_scale=2**5)
     elif strategy == "colossalai_zero2":
         strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     elif strategy == "colossalai_gemini_cpu":
-        strategy_ = GeminiStrategy(placement_policy="static", initial_scale=2**5)
+        strategy_ = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5)
     elif strategy == "colossalai_zero2_cpu":
         strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:
diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py
index f25cd725a633..c1ec1a02b6a9 100644
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ b/applications/Chat/coati/trainer/strategies/base.py
@@ -111,7 +111,7 @@ def unwrap_model(model: nn.Module) -> nn.Module:
         return model

     def save_model(self, model: nn.Module, path: str, only_rank0: bool = True, **kwargs) -> None:
-        self.booster.save_model(model, path, shard=False, **kwargs)
+        self.booster.save_model(model, path, shard=True, **kwargs)

     def load_model(self, model: nn.Module, path: str, strict: bool = True) -> None:
         self.booster.load_model(model, path, strict)
diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index 0e3258304107..7129edb060ef 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -129,6 +129,9 @@ def __init__(
         seed: int = 42,
         shard_init: bool = False,  # only for stage 3
         placement_policy: str = "auto",
+        shard_param_frac: float = 1.0,  # only for static placement
+        offload_optim_frac: float = 0.0,  # only for static placement
+        offload_param_frac: float = 0.0,  # only for static placement
         pin_memory: bool = True,  # only for stage 3
         force_outputs_fp32: bool = False,  # only for stage 3
         search_range_m: int = 32,  # only for stage 3
@@ -159,9 +162,9 @@ def __init__(
         plugin_initializer = lambda: GeminiPlugin(
             chunk_init_device=get_current_device(),
             placement_policy=placement_policy,
-            shard_param_frac=1.0,
-            offload_optim_frac=1.0,
-            offload_param_frac=1.0,
+            shard_param_frac=shard_param_frac,
+            offload_optim_frac=offload_optim_frac,
+            offload_param_frac=offload_param_frac,
             precision="fp16",
             pin_memory=pin_memory,
             force_outputs_fp32=force_outputs_fp32,
@@ -193,6 +196,5 @@ def model_init_context(self):
         return super().model_init_context()

     def unwrap_model(self, model: nn.Module) -> nn.Module:
-        ddp_model = model.unwrap()
-        assert isinstance(ddp_model, GeminiDDP)
-        return ddp_model.module
+        assert isinstance(model, GeminiDDP)
+        return model.module
diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index 66ff6703da4d..b9be24b3e7b7 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -98,18 +98,18 @@ def save_pretrained(
             pretrained_model.save_pretrained(path, save_function=lambda *args, **kwargs: None)
             if tokenizer is not None:
                 tokenizer.save_pretrained(path)
-        model_path = os.path.join(path, "pytorch_model.bin")
-        self.save_model(model, model_path, only_rank0=only_rank0)
+        # model_path = os.path.join(path, "pytorch_model.bin")
+        self.save_model(model, path, only_rank0=only_rank0)

-        def _replace_keys(model_path: str, replace_fn: Callable):
-            state_dict = torch.load(model_path, map_location="cpu")
-            state_dict = {replace_fn(k): v for k, v in state_dict.items()}
-            torch.save(state_dict, model_path)
+        # def _replace_keys(model_path: str, replace_fn: Callable):
+        #     state_dict = torch.load(model_path, map_location="cpu")
+        #     state_dict = {replace_fn(k): v for k, v in state_dict.items()}
+        #     torch.save(state_dict, model_path)

         # FIXME: save_model would add "model." prefix to keys of pytorch_model.bin
         # HACK: rename keys of pytorch_model.bin
-        if dist.get_rank() == 0:
-            _replace_keys(model_path, lambda k: k.replace("model.", "", 1))
+        # if dist.get_rank() == 0:
+        #     _replace_keys(model_path, lambda k: k.replace("model.", "", 1))

     def get_model_state_dict_shard(self, model: nn.Module, **config):
         # TODO: implement sharding on naive strategy
diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py
index bbac91477085..99a024f1463c 100644
--- a/applications/Chat/examples/community/peft/train_peft_prompts.py
+++ b/applications/Chat/examples/community/peft/train_peft_prompts.py
@@ -24,7 +24,7 @@ def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5)
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:
diff --git a/applications/Chat/examples/community/peft/train_peft_sft.py b/applications/Chat/examples/community/peft/train_peft_sft.py
index 091e85c1378b..3bbef7208374 100644
--- a/applications/Chat/examples/community/peft/train_peft_sft.py
+++ b/applications/Chat/examples/community/peft/train_peft_sft.py
@@ -24,7 +24,7 @@ def train(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="auto")
+        strategy = GeminiStrategy(placement_policy="static")
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     else:
diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index 29063295a0c2..f04edc030ce0 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -23,7 +23,7 @@ def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     else:

From 3899f734b60e55957fa6ee0407b7854424b265bd Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 25 Sep 2023 23:50:38 +0800
Subject: [PATCH 18/22] fix

---
 applications/Chat/coati/models/base/actor.py |  1 +
 .../Chat/coati/trainer/strategies/base.py    |  4 ++--
 .../coati/trainer/strategies/colossalai.py   |  1 +
 .../Chat/coati/trainer/strategies/ddp.py     | 19 ++++-----------
 applications/Chat/examples/train_prompts.py  | 23 +++++++++++--------
 .../Chat/examples/train_reward_model.py      |  3 ++-
 applications/Chat/examples/train_sft.py      |  4 +++-
 applications/Chat/requirements-test.txt      |  2 +-
 applications/Chat/requirements.txt           |  2 +-
 applications/Chat/tests/test_inference.sh    |  3 ++-
 applications/Chat/tests/test_train.sh        | 17 ++++----------
 11 files changed, 36 insertions(+), 43 deletions(-)

diff --git a/applications/Chat/coati/models/base/actor.py b/applications/Chat/coati/models/base/actor.py
index 0634631df7a3..8b2b81ed071c 100644
--- a/applications/Chat/coati/models/base/actor.py
+++ b/applications/Chat/coati/models/base/actor.py
@@ -30,3 +30,4 @@ def forward(
         """Returns model output."""
         output = self.model(input_ids, attention_mask=attention_mask, **model_kwargs)
         return output
+
diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py
index c1ec1a02b6a9..51c0b5fa6526 100644
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ b/applications/Chat/coati/trainer/strategies/base.py
@@ -110,8 +110,8 @@ def unwrap_model(model: nn.Module) -> nn.Module:
         """
         return model

-    def save_model(self, model: nn.Module, path: str, only_rank0: bool = True, **kwargs) -> None:
-        self.booster.save_model(model, path, shard=True, **kwargs)
+    def save_model(self, model: nn.Module, path: str, shard: bool = True, **kwargs) -> None:
+        self.booster.save_model(model, path, shard=shard, **kwargs)

     def load_model(self, model: nn.Module, path: str, strict: bool = True) -> None:
         self.booster.load_model(model, path, strict)
diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index 7129edb060ef..b1257a07cc88 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -196,5 +196,6 @@ def model_init_context(self):
         return super().model_init_context()

     def unwrap_model(self, model: nn.Module) -> nn.Module:
+        model = model.unwrap()
         assert isinstance(model, GeminiDDP)
         return model.module
diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index b9be24b3e7b7..4ce0d79fc124 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -87,9 +87,9 @@ def unwrap_model(self, model: nn.Module) -> nn.Module:
         return model.unwrap()

     def save_pretrained(
-        self, model: nn.Module, path: str, only_rank0: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
+        self, model: nn.Module, path: str, shard: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
     ) -> None:
-        if not only_rank0 or dist.get_rank() == 0:
+        if dist.get_rank() == 0:
             unwrapped_model = self.unwrap_model(model)
             assert isinstance(unwrapped_model, (Actor, Critic, RewardModel))
             pretrained_model = unwrapped_model.model
@@ -98,18 +98,9 @@ def save_pretrained(
             pretrained_model.save_pretrained(path, save_function=lambda *args, **kwargs: None)
             if tokenizer is not None:
                 tokenizer.save_pretrained(path)
-        # model_path = os.path.join(path, "pytorch_model.bin")
-        self.save_model(model, path, only_rank0=only_rank0)
-
-        # def _replace_keys(model_path: str, replace_fn: Callable):
-        #     state_dict = torch.load(model_path, map_location="cpu")
-        #     state_dict = {replace_fn(k): v for k, v in state_dict.items()}
-        #     torch.save(state_dict, model_path)
-
-        # FIXME: save_model would add "model." prefix to keys of pytorch_model.bin
-        # HACK: rename keys of pytorch_model.bin
-        # if dist.get_rank() == 0:
-        #     _replace_keys(model_path, lambda k: k.replace("model.", "", 1))
+
+        self.save_model(model, path, shard=shard)
+

     def get_model_state_dict_shard(self, model: nn.Module, **config):
         # TODO: implement sharding on naive strategy
diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index f04edc030ce0..81343ad8f6f4 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -23,7 +23,7 @@ def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     else:
@@ -40,13 +40,13 @@ def main(args):
     with strategy.model_init_context():
         # configure model
         if args.model == "gpt2":
-            initial_model = GPTActor(pretrained=args.pretrain)
+            initial_model = GPTActor()
         elif args.model == "bloom":
-            initial_model = BLOOMActor(pretrained=args.pretrain)
+            initial_model = BLOOMActor()
         elif args.model == "opt":
-            initial_model = OPTActor(pretrained=args.pretrain)
+            initial_model = OPTActor()
         elif args.model == "llama":
-            initial_model = LlamaActor(pretrained=args.pretrain)
+            initial_model = LlamaActor()
         else:
             raise ValueError(f'Unsupported actor model "{args.model}"')

@@ -73,13 +73,13 @@ def main(args):
         reward_model.to(torch.bfloat16).to(torch.cuda.current_device())

         if args.model == "gpt2":
-            actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+            actor = GPTActor(lora_rank=args.lora_rank)
         elif args.model == "bloom":
-            actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+            actor = BLOOMActor(lora_rank=args.lora_rank)
         elif args.model == "opt":
-            actor = OPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+            actor = OPTActor(lora_rank=args.lora_rank)
         elif args.model == "llama":
-            actor = LlamaActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+            actor = LlamaActor(lora_rank=args.lora_rank)
         else:
             raise ValueError(f'Unsupported actor model "{args.model}"')

@@ -165,6 +165,9 @@ def main(args):
         (actor, actor_optim), (critic, critic_optim), reward_model, initial_model
     )

+    strategy.load_model(initial_model, args.pretrain)
+    strategy.load_model(actor, args.pretrain)
+
     # configure trainer
     trainer = PPOTrainer(
         strategy,
@@ -197,7 +200,7 @@ def main(args):
     )

     # save model checkpoint after fitting
-    strategy.save_model(actor, args.save_path, only_rank0=True)
+    strategy.save_model(actor, args.save_path)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:
         strategy.save_optimizer(
diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py
index 6d14e42a293f..4f2a68905c5c 100644
--- a/applications/Chat/examples/train_reward_model.py
+++ b/applications/Chat/examples/train_reward_model.py
@@ -163,7 +163,8 @@ def train(args):
         use_wandb=args.use_wandb,
     )
     # save model checkpoint after fitting on only rank0
-    strategy.save_model(model, args.save_path, only_rank0=True)
+    state_dict = model.state_dict()
+    torch.save(state_dict, args.save_path)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:
         strategy.save_optimizer(
diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index bd060301998c..e93a3e523d47 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -179,7 +179,9 @@ def train(args):
     )

     # save model checkpoint after fitting on only rank0
-    strategy.save_pretrained(model, path=args.save_path, only_rank0=True, tokenizer=tokenizer)
+    print(type(model))
+    # print(model)
+    strategy.save_pretrained(model, path=args.save_path, tokenizer=tokenizer)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:
         strategy.save_optimizer(
diff --git a/applications/Chat/requirements-test.txt b/applications/Chat/requirements-test.txt
index c688935bda31..93d48bcb6f79 100644
--- a/applications/Chat/requirements-test.txt
+++ b/applications/Chat/requirements-test.txt
@@ -1,2 +1,2 @@
 pytest
-colossalai==0.3.2
+colossalai==0.3.3
diff --git a/applications/Chat/requirements.txt b/applications/Chat/requirements.txt
index 0b1ee1785fa1..e56aaca0e7cb 100644
--- a/applications/Chat/requirements.txt
+++ b/applications/Chat/requirements.txt
@@ -2,7 +2,7 @@ transformers>=4.20.1
 tqdm
 datasets
 loralib
-colossalai==0.3.2
+colossalai==0.3.3
 torch<2.0.0, >=1.12.1
 langchain
 tokenizers
diff --git a/applications/Chat/tests/test_inference.sh b/applications/Chat/tests/test_inference.sh
index 849db06e58ab..7f4a475320ff 100755
--- a/applications/Chat/tests/test_inference.sh
+++ b/applications/Chat/tests/test_inference.sh
@@ -1,6 +1,7 @@
 set -xue

-BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
+#BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
+BASE_DIR="/home/lcjmy/vepfs/ColossalAI/applications/Chat"
 EXAMPLES_DIR=$BASE_DIR/examples

 echo "[Test]: testing inference ..."
diff --git a/applications/Chat/tests/test_train.sh b/applications/Chat/tests/test_train.sh
index 1a470a7fcfcb..7a01acd1755c 100755
--- a/applications/Chat/tests/test_train.sh
+++ b/applications/Chat/tests/test_train.sh
@@ -80,11 +80,10 @@ SKIPPED_TESTS=(
     "llama-ddp"
     "llama-colossalai_gemini"
     "llama-colossalai_zero2"
-    "bloom-colossalai_zero2-4"
 )

 GRAD_CKPTS=('' '--grad_checkpoint')
-for lora_rank in '0' '4'; do
+for lora_rank in '0'; do
     for model in ${MODELS[@]}; do
         strategies=($(shuf -e "${STRATEGIES[@]}"))
         for strategy in ${strategies[@]}; do
@@ -133,14 +132,11 @@ SKIPPED_TESTS=(
     "llama-ddp"
     "llama-colossalai_gemini"
    "llama-colossalai_zero2"
-    "gpt2-colossalai_gemini"
-    "opt-colossalai_gemini"
-    "bloom-colossalai_gemini"
 )

 LOSS_FNS=('log_sig' 'log_exp')
 DATASETS=('Anthropic/hh-rlhf' 'Dahoas/rm-static')
-for lora_rank in '0' '4'; do
+for lora_rank in '0'; do
     for model in ${MODELS[@]}; do
         strategies=($(shuf -e "${STRATEGIES[@]}"))
         for strategy in ${strategies[@]}; do
@@ -191,13 +187,10 @@ SKIPPED_TESTS=(
     "llama-ddp"
     "llama-colossalai_gemini"
     "llama-colossalai_zero2"
-    "gpt2-colossalai_gemini"
-    "opt-colossalai_gemini"
-    "bloom-colossalai_gemini"
 )

 for model in ${MODELS[@]}; do
-    for lora_rank in '0' '4'; do
+    for lora_rank in '0'; do
         strategies=($(shuf -e "${STRATEGIES[@]}"))
         for strategy in ${strategies[@]}; do
             if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$strategy-$lora_rank " ]]; then
@@ -221,7 +214,7 @@ for model in ${MODELS[@]}; do
             --experience_batch_size 2 --train_batch_size 1 --lora_rank $lora_rank \
             --pretrain $EXAMPLES_DIR/rlhf_models/sft_ckpt_${model}_${lora_rank} \
             $rm_pretrain_model --rm_path $EXAMPLES_DIR/rlhf_models/rm_ckpt_${model}_${lora_rank}.pt \
-            --save_path $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts.pt
+            --save_path $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts
             passed=$?
             if [ $passed -eq 0 ]; then
                 break
@@ -236,4 +229,4 @@ for model in ${MODELS[@]}; do
         rm $EXAMPLES_DIR/rlhf_models/rm_ckpt_${model}_${lora_rank}.pt
     done
 done
-rm $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts.pt
+rm -rf $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts

From ecb821da65929d4552e19fb3f38a4f917a31a123 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Tue, 26 Sep 2023 00:08:31 +0800
Subject: [PATCH 19/22] fix

---
 .../coati/trainer/strategies/colossalai.py  |  1 -
 .../Chat/coati/trainer/strategies/ddp.py    | 18 +++++++++---------
 applications/Chat/examples/requirements.txt |  2 +-
 applications/Chat/examples/train_sft.py     |  2 --
 applications/Chat/tests/test_inference.sh   |  3 +--
 5 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index b1257a07cc88..7129edb060ef 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -196,6 +196,5 @@ def model_init_context(self):
         return super().model_init_context()

     def unwrap_model(self, model: nn.Module) -> nn.Module:
-        model = model.unwrap()
         assert isinstance(model, GeminiDDP)
         return model.module
diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index 4ce0d79fc124..2fb2a4fa7501 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -89,15 +89,15 @@ def unwrap_model(self, model: nn.Module) -> nn.Module:
     def save_pretrained(
         self, model: nn.Module, path: str, shard: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
     ) -> None:
-        if dist.get_rank() == 0:
-            unwrapped_model = self.unwrap_model(model)
-            assert isinstance(unwrapped_model, (Actor, Critic, RewardModel))
-            pretrained_model = unwrapped_model.model
-            assert isinstance(pretrained_model, PreTrainedModel)
-            # HACK: only use hf save_pretrained to save config
-            pretrained_model.save_pretrained(path, save_function=lambda *args, **kwargs: None)
-            if tokenizer is not None:
-                tokenizer.save_pretrained(path)
+        # if dist.get_rank() == 0:
+        #     unwrapped_model = self.unwrap_model(model)
+        #     assert isinstance(unwrapped_model, (Actor, Critic, RewardModel))
+        #     pretrained_model = unwrapped_model.model
+        #     assert isinstance(pretrained_model, PreTrainedModel)
+        #     # HACK: only use hf save_pretrained to save config
+        #     pretrained_model.save_pretrained(path, save_function=lambda *args, **kwargs: None)
+        #     if tokenizer is not None:
+        #         tokenizer.save_pretrained(path)

         self.save_model(model, path, shard=shard)

diff --git a/applications/Chat/examples/requirements.txt b/applications/Chat/examples/requirements.txt
index 0890917048d2..5474dfa16b3e 100644
--- a/applications/Chat/examples/requirements.txt
+++ b/applications/Chat/examples/requirements.txt
@@ -1,3 +1,3 @@
 pandas>=1.4.1
 sentencepiece
-colossalai==0.3.2
+colossalai==0.3.3
diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index e93a3e523d47..137a8267fde4 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -179,8 +179,6 @@ def train(args):
     )

     # save model checkpoint after fitting on only rank0
-    print(type(model))
-    # print(model)
     strategy.save_pretrained(model, path=args.save_path, tokenizer=tokenizer)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:
diff --git a/applications/Chat/tests/test_inference.sh b/applications/Chat/tests/test_inference.sh
index 7f4a475320ff..849db06e58ab 100755
--- a/applications/Chat/tests/test_inference.sh
+++ b/applications/Chat/tests/test_inference.sh
@@ -1,7 +1,6 @@
 set -xue

-#BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
-BASE_DIR="/home/lcjmy/vepfs/ColossalAI/applications/Chat"
+BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))
 EXAMPLES_DIR=$BASE_DIR/examples

 echo "[Test]: testing inference ..."

From 620c44865d107e88d6e02a5ceb56a4dd113c498e Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Tue, 26 Sep 2023 00:16:14 +0800
Subject: [PATCH 20/22] fix

---
 .../Chat/coati/trainer/strategies/ddp.py    | 18 +++++++++---------
 applications/Chat/examples/train_prompts.py | 16 ++++++++--------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index 2fb2a4fa7501..4ce0d79fc124 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -89,15 +89,15 @@ def unwrap_model(self, model: nn.Module) -> nn.Module:
     def save_pretrained(
         self, model: nn.Module, path: str, shard: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
     ) -> None:
-        # if dist.get_rank() == 0:
-        #     unwrapped_model = self.unwrap_model(model)
-        #     assert isinstance(unwrapped_model, (Actor, Critic, RewardModel))
-        #     pretrained_model = unwrapped_model.model
-        #     assert isinstance(pretrained_model, PreTrainedModel)
-        #     # HACK: only use hf save_pretrained to save config
-        #     pretrained_model.save_pretrained(path, save_function=lambda *args, **kwargs: None)
-        #     if tokenizer is not None:
-        #         tokenizer.save_pretrained(path)
+        if dist.get_rank() == 0:
+            unwrapped_model = self.unwrap_model(model)
+            assert isinstance(unwrapped_model, (Actor, Critic, RewardModel))
+            pretrained_model = unwrapped_model.model
+            assert isinstance(pretrained_model, PreTrainedModel)
+            # HACK: only use hf save_pretrained to save config
+            pretrained_model.save_pretrained(path, save_function=lambda *args, **kwargs: None)
+            if tokenizer is not None:
+                tokenizer.save_pretrained(path)

         self.save_model(model, path, shard=shard)

diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index 81343ad8f6f4..6624f9cebffd 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -40,13 +40,13 @@ def main(args):
     with strategy.model_init_context():
         # configure model
         if args.model == "gpt2":
-            initial_model = GPTActor()
+            initial_model = GPTActor(pretrained=args.pretrain)
         elif args.model == "bloom":
-            initial_model = BLOOMActor()
+            initial_model = BLOOMActor(pretrained=args.pretrain)
         elif args.model == "opt":
-            initial_model = OPTActor()
+            initial_model = OPTActor(pretrained=args.pretrain)
         elif args.model == "llama":
-            initial_model = LlamaActor()
+            initial_model = LlamaActor(pretrained=args.pretrain)
         else:
             raise ValueError(f'Unsupported actor model "{args.model}"')

@@ -73,13 +73,13 @@ def main(args):
         reward_model.to(torch.bfloat16).to(torch.cuda.current_device())

         if args.model == "gpt2":
-            actor = GPTActor(lora_rank=args.lora_rank)
+            actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
         elif args.model == "bloom":
-            actor = BLOOMActor(lora_rank=args.lora_rank)
+            actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
         elif args.model == "opt":
-            actor = OPTActor(lora_rank=args.lora_rank)
+            actor = OPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
         elif args.model == "llama":
-            actor = LlamaActor(lora_rank=args.lora_rank)
+            actor = LlamaActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
         else:
             raise ValueError(f'Unsupported actor model "{args.model}"')

From 3c6c7172de7bb1156bf81bb276fb4491d4322c41 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Tue, 26 Sep 2023 18:52:03 +0800
Subject: [PATCH 21/22] fix

---
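Note on the _replace_keys hack this patch restores below: booster.save_model() serializes the wrapped Actor, so every key in pytorch_model.bin gains a leading "model." prefix that from_pretrained() would not expect. A toy illustration of the rename the hook performs (illustrative only, using a made-up state dict; the lambda is the one from the diff):

    import torch

    state_dict = {"model.transformer.wte.weight": torch.zeros(1)}  # toy checkpoint
    # Same transform as _replace_keys: strip only the first "model." occurrence.
    fixed = {k.replace("model.", "", 1): v for k, v in state_dict.items()}
    assert "transformer.wte.weight" in fixed
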
 applications/Chat/coati/trainer/strategies/base.py |  2 +-
 applications/Chat/coati/trainer/strategies/ddp.py  | 13 +++++++++++--
 applications/Chat/examples/train_prompts.py        |  7 ++++---
 applications/Chat/tests/test_checkpoint.py         |  4 ++--
 applications/Chat/tests/test_train.sh              |  1 +
 colossalai/zero/gemini/gemini_ddp.py               |  3 ++-
 6 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py
index 51c0b5fa6526..a78716216ae0 100644
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ b/applications/Chat/coati/trainer/strategies/base.py
@@ -110,7 +110,7 @@ def unwrap_model(model: nn.Module) -> nn.Module:
         """
         return model

-    def save_model(self, model: nn.Module, path: str, shard: bool = True, **kwargs) -> None:
+    def save_model(self, model: nn.Module, path: str, shard: bool = False, **kwargs) -> None:
         self.booster.save_model(model, path, shard=shard, **kwargs)

     def load_model(self, model: nn.Module, path: str, strict: bool = True) -> None:
         self.booster.load_model(model, path, strict)
diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index 4ce0d79fc124..f2a44aeb0961 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -87,7 +87,7 @@ def unwrap_model(self, model: nn.Module) -> nn.Module:
         return model.unwrap()

     def save_pretrained(
-        self, model: nn.Module, path: str, shard: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
+        self, model: nn.Module, path: str, shard: bool = False, tokenizer: Optional[PreTrainedTokenizerBase] = None
     ) -> None:
         if dist.get_rank() == 0:
             unwrapped_model = self.unwrap_model(model)
@@ -99,7 +99,16 @@ def save_pretrained(
             if tokenizer is not None:
                 tokenizer.save_pretrained(path)

-        self.save_model(model, path, shard=shard)
+        model_path = os.path.join(path, "pytorch_model.bin")
+        self.save_model(model, model_path, shard=shard)
+        def _replace_keys(model_path: str, replace_fn: Callable):
+            state_dict = torch.load(model_path, map_location="cpu")
+            state_dict = {replace_fn(k): v for k, v in state_dict.items()}
+            torch.save(state_dict, model_path)
+        # FIXME: save_model would add "model." prefix to keys of pytorch_model.bin
+        # HACK: rename keys of pytorch_model.bin
+        if dist.get_rank() == 0:
+            _replace_keys(model_path, lambda k: k.replace("model.", "", 1))

     def get_model_state_dict_shard(self, model: nn.Module, **config):
diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index 6624f9cebffd..37146a7c5f68 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -74,6 +74,7 @@ def main(args):
         if args.model == "gpt2":
             actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
+            # actor = GPTActor.from_pretrained(args.pretrain)
         elif args.model == "bloom":
             actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
         elif args.model == "opt":
@@ -165,8 +166,8 @@ def main(args):
         (actor, actor_optim), (critic, critic_optim), reward_model, initial_model
     )

-    strategy.load_model(initial_model, args.pretrain)
-    strategy.load_model(actor, args.pretrain)
+    # strategy.load_model(initial_model, args.pretrain)
+    # strategy.load_model(actor, args.pretrain)

     # configure trainer
     trainer = PPOTrainer(
         strategy,
@@ -197,7 +200,7 @@ def main(args):
     )

     # save model checkpoint after fitting
-    strategy.save_model(actor, args.save_path)
+    strategy.save_pretrained(actor, path=args.save_path)
     # save optimizer checkpoint on all ranks
     if args.need_optim_ckpt:
         strategy.save_optimizer(
diff --git a/applications/Chat/tests/test_checkpoint.py b/applications/Chat/tests/test_checkpoint.py
index 9dfaa7c88206..9c08aa36c9b4 100644
--- a/applications/Chat/tests/test_checkpoint.py
+++ b/applications/Chat/tests/test_checkpoint.py
@@ -57,9 +57,9 @@ def run_test_checkpoint(strategy_name: str, shard: bool):
         rank0_dirname = rank0_dirname[0]

         model_path = os.path.join(rank0_dirname, "model" if shard else f"model.pt")
-        strategy.save_model(actor, model_path, only_rank0=not shard)
+        strategy.save_model(actor, model_path)
         optim_path = os.path.join(rank0_dirname, "optim" if shard else "optim.pt")
-        strategy.save_optimizer(actor_optim, optim_path, only_rank0=not shard)
+        strategy.save_optimizer(actor_optim, optim_path)
         dist.barrier()

         strategy.load_model(actor, model_path, strict=False)
diff --git a/applications/Chat/tests/test_train.sh b/applications/Chat/tests/test_train.sh
index 7a01acd1755c..68fca7fbf8c0 100755
--- a/applications/Chat/tests/test_train.sh
+++ b/applications/Chat/tests/test_train.sh
@@ -41,6 +41,7 @@ MODELS_DIR=$BASE_DIR/examples/models_config
 MODELS=('gpt2' 'bloom' 'opt' 'llama')
 STRATEGIES=('ddp' 'colossalai_gemini' 'colossalai_zero2')

+
 export OMP_NUM_THREADS=8

 # install requirements
diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py
index 8b149a65497a..580b497ce719 100644
--- a/colossalai/zero/gemini/gemini_ddp.py
+++ b/colossalai/zero/gemini/gemini_ddp.py
@@ -238,7 +238,8 @@ def forward(self, *args, **kwargs):
         # check whether we are in a inference mode
         grad_flag = torch.is_grad_enabled()
         if not grad_flag:
-            assert not self.gemini_manager.need_warmup or not self.gemini_manager.is_warmup(
+            assert (
+                not self.gemini_manager.need_warmup or not self.gemini_manager.is_warmup()
             ), "You should run a completed iteration as your warmup iter"

         args, kwargs = _cast_float(args, self.mixed_precision), _cast_float(kwargs, self.mixed_precision)

From f3201757c4e0a2661cfb297f9df30ea76e07e755 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Wed, 27 Sep 2023 10:52:54 +0800
Subject: [PATCH 22/22] Update train_prompts.py

---
 applications/Chat/examples/train_prompts.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index 37146a7c5f68..ecaf794f6333 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -74,7 +74,6 @@ def main(args):
         if args.model == "gpt2":
             actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
-            # actor = GPTActor.from_pretrained(args.pretrain)
         elif args.model == "bloom":
             actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
         elif args.model == "opt":
@@ -165,9 +165,6 @@ def main(args):
         (actor, actor_optim), (critic, critic_optim), reward_model, initial_model
     )

-    # strategy.load_model(initial_model, args.pretrain)
-    # strategy.load_model(actor, args.pretrain)
-
     # configure trainer
     trainer = PPOTrainer(
         strategy,