Commit

Merge branch 'main' into main
flybird11111 authored Apr 30, 2024
2 parents 6eb2655 + d3f34ee commit e0b21ea
Showing 266 changed files with 1,525 additions and 463 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/doc_test_on_pr.yml
@@ -56,7 +56,7 @@ jobs:
needs: detect-changed-doc
runs-on: [self-hosted, gpu]
container:
- image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+ image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm
timeout-minutes: 20
defaults:
2 changes: 1 addition & 1 deletion .github/workflows/release_docker_after_publish.yml
@@ -24,7 +24,7 @@ jobs:
version=$(cat version.txt)
tag=hpcaitech/colossalai:$version
latest=hpcaitech/colossalai:latest
- docker build --build-arg http_proxy=http://172.17.0.1:7890 --build-arg https_proxy=http://172.17.0.1:7890 --build-arg VERSION=v${version} -t $tag ./docker
+ docker build --build-arg VERSION=v${version} -t $tag ./docker
docker tag $tag $latest
echo "tag=${tag}" >> $GITHUB_OUTPUT
echo "latest=${latest}" >> $GITHUB_OUTPUT
15 changes: 15 additions & 0 deletions LICENSE
@@ -552,3 +552,18 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved.
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
---------------- LICENSE FOR Hugging Face accelerate ----------------

Copyright 2021 The HuggingFace Team

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
4 changes: 3 additions & 1 deletion README.md
@@ -25,6 +25,8 @@
</div>

## Latest News
* [2024/04] [Open-Sora Unveils Major Upgrade: Embracing Open Source with Single-Shot 16-Second Video Generation and 720p Resolution](https://hpc-ai.com/blog/open-soras-comprehensive-upgrade-unveiled-embracing-16-second-video-generation-and-720p-resolution-in-open-source)
* [2024/04] [Most cost-effective solutions for inference, fine-tuning and pretraining, tailored to LLaMA3 series](https://hpc-ai.com/blog/most-cost-effective-solutions-for-inference-fine-tuning-and-pretraining-tailored-to-llama3-series)
* [2024/03] [314 Billion Parameter Grok-1 Inference Accelerated by 3.8x, Efficient and Easy-to-Use PyTorch+HuggingFace version is Here](https://hpc-ai.com/blog/314-billion-parameter-grok-1-inference-accelerated-by-3.8x-efficient-and-easy-to-use-pytorchhuggingface-version-is-here)
* [2024/03] [Open-Sora: Revealing Complete Model Parameters, Training Details, and Everything for Sora-like Video Generation Models](https://hpc-ai.com/blog/open-sora-v1.0)
* [2024/03] [Open-Sora:Sora Replication Solution with 46% Cost Reduction, Sequence Expansion to Nearly a Million](https://hpc-ai.com/blog/open-sora)
@@ -131,7 +133,7 @@ distributed training and inference in a few lines.

[Open-Sora](https://github.com/hpcaitech/Open-Sora): Revealing Complete Model Parameters, Training Details, and Everything for Sora-like Video Generation Models
[[code]](https://github.com/hpcaitech/Open-Sora)
- [[blog]](https://hpc-ai.com/blog/open-sora-v1.0)
+ [[blog]](https://hpc-ai.com/blog/open-soras-comprehensive-upgrade-unveiled-embracing-16-second-video-generation-and-720p-resolution-in-open-source)
[[HuggingFace model weights]](https://huggingface.co/hpcai-tech/Open-Sora)
[[Demo]](https://github.com/hpcaitech/Open-Sora?tab=readme-ov-file#-latest-demo)

16 changes: 10 additions & 6 deletions applications/Colossal-LLaMA/colossal_llama/dataset/loader.py
@@ -80,15 +80,19 @@ def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch

# `List[torch.Tensor]`
batch_input_ids = [
torch.LongTensor(instance["input_ids"][: self.max_length])
if len(instance["input_ids"]) > self.max_length
else torch.LongTensor(instance["input_ids"])
(
torch.LongTensor(instance["input_ids"][: self.max_length])
if len(instance["input_ids"]) > self.max_length
else torch.LongTensor(instance["input_ids"])
)
for instance in instances
]
batch_labels = [
torch.LongTensor(instance["labels"][: self.max_length])
if len(instance["labels"]) > self.max_length
else torch.LongTensor(instance["labels"])
(
torch.LongTensor(instance["labels"][: self.max_length])
if len(instance["labels"]) > self.max_length
else torch.LongTensor(instance["labels"])
)
for instance in instances
]

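The change above is purely formatting: each conditional expression is wrapped in parentheses, while the truncation logic is unchanged. A minimal, self-contained sketch of that logic (the sample data and max_length are made up for illustration):

```python
import torch

max_length = 4
instances = [
    {"input_ids": [1, 2, 3, 4, 5, 6], "labels": [1, 2, 3, 4, 5, 6]},
    {"input_ids": [7, 8], "labels": [7, 8]},
]

# Same behaviour as the collator: truncate a sequence only when it exceeds max_length.
batch_input_ids = [
    (
        torch.LongTensor(instance["input_ids"][:max_length])
        if len(instance["input_ids"]) > max_length
        else torch.LongTensor(instance["input_ids"])
    )
    for instance in instances
]
print([t.tolist() for t in batch_input_ids])  # [[1, 2, 3, 4], [7, 8]]
```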
10 changes: 6 additions & 4 deletions applications/Colossal-LLaMA/train.py
@@ -136,7 +136,7 @@ def main() -> None:
# ==============================
# Initialize Distributed Training
# ==============================
- colossalai.launch_from_torch({})
+ colossalai.launch_from_torch()
accelerator = get_accelerator()
coordinator = DistCoordinator()

@@ -253,9 +253,11 @@ def main() -> None:
coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")

optimizer = HybridAdam(
- model_params=filter(lambda p: p.requires_grad, model.parameters())
- if args.freeze_non_embeds_params
- else model.parameters(),
+ model_params=(
+     filter(lambda p: p.requires_grad, model.parameters())
+     if args.freeze_non_embeds_params
+     else model.parameters()
+ ),
lr=args.lr,
betas=(0.9, 0.95),
weight_decay=args.weight_decay,
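The change from `colossalai.launch_from_torch({})` to `colossalai.launch_from_torch()` recurs across the scripts below: the launcher no longer takes a now-unused config dict as its first argument. A minimal sketch of the updated initialization, assuming the script is started with `torchrun` and a recent ColossalAI release:

```python
import colossalai
from colossalai.cluster import DistCoordinator


def main() -> None:
    # New-style launch: no empty config dict; keyword arguments (e.g. seed=42)
    # can still be passed, as in the evaluation scripts below.
    colossalai.launch_from_torch()
    coordinator = DistCoordinator()
    coordinator.print_on_master("distributed environment ready")


if __name__ == "__main__":
    main()  # e.g. torchrun --standalone --nproc_per_node=2 this_script.py
```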
2 changes: 1 addition & 1 deletion applications/ColossalChat/benchmarks/benchmark_ppo.py
@@ -66,7 +66,7 @@ def benchmark_train(args):
# ==============================
# Initialize Distributed Training
# ==============================
- colossalai.launch_from_torch({})
+ colossalai.launch_from_torch()
coordinator = DistCoordinator()

# ======================================================
@@ -37,7 +37,7 @@ def train(args):
# ==============================
# Initialize Distributed Training
# ==============================
- colossalai.launch_from_torch({})
+ colossalai.launch_from_torch()
coordinator = DistCoordinator()

# ==============================
@@ -39,7 +39,7 @@ def train(args):
# ==============================
# Initialize Distributed Training
# ==============================
- colossalai.launch_from_torch({})
+ colossalai.launch_from_torch()
coordinator = DistCoordinator()

# ======================================================
@@ -34,7 +34,7 @@ def train(args):
# ==============================
# Initialize Distributed Training
# ==============================
- colossalai.launch_from_torch({})
+ colossalai.launch_from_torch()
coordinator = DistCoordinator()

# ======================================================
@@ -29,7 +29,7 @@ def train(args):
# ==============================
# Initialize Distributed Training
# ==============================
- colossalai.launch_from_torch({})
+ colossalai.launch_from_torch()
coordinator = DistCoordinator()

# ==============================
@@ -81,7 +81,7 @@ def rm_and_merge(


def main(args):
- colossalai.launch_from_torch(config={}, seed=42)
+ colossalai.launch_from_torch(seed=42)
accelerator = get_accelerator()
world_size = dist.get_world_size()

@@ -81,7 +81,7 @@ def rm_and_merge(


def main(args):
- colossalai.launch_from_torch(config={}, seed=42)
+ colossalai.launch_from_torch(seed=42)
world_size = dist.get_world_size()

rank = dist.get_rank()
8 changes: 6 additions & 2 deletions applications/ColossalMoE/infer.py
@@ -57,7 +57,7 @@ def main():
args = parse_args()

# Launch ColossalAI
- colossalai.launch_from_torch(config={}, seed=args.seed)
+ colossalai.launch_from_torch(seed=args.seed)
coordinator = DistCoordinator()

config = MixtralConfig.from_pretrained(args.model_name)
@@ -96,7 +96,11 @@ def main():
if coordinator.rank == 0:
text = ["Hello my name is"]
else:
text = ["What's the largest country in the world?", "How many people live in China?", "帮我续写这首诗:离离原上草"]
text = [
"What's the largest country in the world?",
"How many people live in China?",
"帮我续写这首诗:离离原上草",
]
tokenizer.pad_token = tokenizer.unk_token
inputs = tokenizer(text, return_tensors="pt", padding=True).to(torch.cuda.current_device())

2 changes: 1 addition & 1 deletion applications/ColossalMoE/tests/test_mixtral_layer.py
@@ -50,7 +50,7 @@ def check_mixtral_moe_layer():


def run_dist(rank: int, world_size: int, port: int):
- colossalai.launch({}, rank, world_size, "localhost", port)
+ colossalai.launch(rank, world_size, "localhost", port)
check_mixtral_moe_layer()


2 changes: 1 addition & 1 deletion applications/ColossalMoE/tests/test_moe_checkpoint.py
@@ -133,7 +133,7 @@ def check_mixtral_moe_layer():


def run_dist(rank: int, world_size: int, port: int):
- colossalai.launch({}, rank, world_size, "localhost", port)
+ colossalai.launch(rank, world_size, "localhost", port)
check_mixtral_moe_layer()


8 changes: 4 additions & 4 deletions applications/ColossalMoE/train.py
@@ -145,7 +145,7 @@ def main():
args = parse_args()

# Launch ColossalAI
- colossalai.launch_from_torch(config={}, seed=args.seed)
+ colossalai.launch_from_torch(seed=args.seed)
coordinator = DistCoordinator()

# Set plugin
@@ -195,9 +195,9 @@ def main():
lr_scheduler = CosineAnnealingWarmupLR(
optimizer=optimizer,
total_steps=args.num_epochs * len(dataloader),
- warmup_steps=args.warmup_steps
- if args.warmup_steps is not None
- else int(args.num_epochs * len(dataloader) * 0.025),
+ warmup_steps=(
+     args.warmup_steps if args.warmup_steps is not None else int(args.num_epochs * len(dataloader) * 0.025)
+ ),
eta_min=0.1 * args.lr,
)

2 changes: 1 addition & 1 deletion colossalai/auto_parallel/offload/amp_optimizer.py
@@ -126,7 +126,7 @@ def loss_scale(self):
return self.grad_scaler.scale.item()

def zero_grad(self, *args, **kwargs):
- self.module.overflow_counter = torch.cuda.IntTensor([0])
+ self.module.overflow_counter = torch.tensor([0], dtype=torch.int, device=get_accelerator().get_current_device())
return self.optim.zero_grad(set_to_none=True)

def step(self, *args, **kwargs):
4 changes: 2 additions & 2 deletions colossalai/auto_parallel/offload/base_offload_module.py
@@ -4,7 +4,7 @@
import torch
import torch.nn as nn

- from colossalai.utils import _cast_float
+ from colossalai.utils import _cast_float, get_current_device
from colossalai.utils.common import free_storage

from .region_manager import RegionManager
@@ -25,7 +25,7 @@ def __init__(self, model: nn.Module, region_manager: RegionManager, is_sync=True
self.model = model
self.region_manager = region_manager
self.grad_hook_list = []
- self.overflow_counter = torch.cuda.IntTensor([0])
+ self.overflow_counter = torch.tensor([0], dtype=torch.int, device=get_current_device())

self.grad_offload_stream = torch.cuda.current_stream() if is_sync else GlobalRuntimeInfo.d2h_stream

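Both overflow-counter hunks replace the CUDA-only `torch.cuda.IntTensor([0])` constructor with a plain `torch.tensor` placed on whatever device the active accelerator reports, so the offload modules no longer assume a CUDA backend. A minimal sketch of the pattern; the CPU fallback branch is an assumption for machines without ColossalAI installed:

```python
import torch

try:
    from colossalai.utils import get_current_device
    device = get_current_device()  # e.g. cuda:0, npu:0, or cpu, depending on the accelerator
except ImportError:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Device-agnostic replacement for torch.cuda.IntTensor([0]).
overflow_counter = torch.tensor([0], dtype=torch.int, device=device)
overflow_counter += 1  # incremented whenever a gradient overflow is detected
print(overflow_counter.item())  # 1
```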
76 changes: 76 additions & 0 deletions colossalai/booster/booster.py
@@ -8,9 +8,18 @@
from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
from torch.utils.data import DataLoader

SUPPORT_PEFT = False
try:
import peft

SUPPORT_PEFT = True
except ImportError:
pass

import colossalai.interface.pretrained as pretrained_utils
from colossalai.checkpoint_io import GeneralCheckpointIO
from colossalai.interface import ModelWrapper, OptimizerWrapper
from colossalai.quantization import BnbQuantizationConfig

from .accelerator import Accelerator
from .mixed_precision import MixedPrecision, mixed_precision_factory
@@ -221,6 +230,56 @@ def no_sync(self, model: nn.Module = None, optimizer: OptimizerWrapper = None) -
assert self.plugin.support_no_sync(), f"The plugin {self.plugin.__class__.__name__} does not support no_sync."
return self.plugin.no_sync(model, optimizer)

def enable_lora(
self,
model: nn.Module,
pretrained_dir: Optional[str] = None,
lora_config: "peft.LoraConfig" = None,
bnb_quantization_config: Optional[BnbQuantizationConfig] = None,
quantize=False,
) -> nn.Module:
"""
Wrap the passed-in model with LoRA modules for training. If a pretrained directory is provided, LoRA configs and weights are loaded from that directory.
LoRA in ColossalAI is implemented with the Hugging Face peft library, so the arguments for LoRA configuration are the same as those of peft.
Args:
model (nn.Module): The model to be appended with LoRA modules.
pretrained_dir(str, optional): The path to the pretrained directory, can be a local directory
or model_id of a PEFT configuration hosted inside a model repo on the Hugging Face Hub.
When set to None, create new lora configs and weights for the model using the passed in lora_config. Defaults to None.
lora_config: (peft.LoraConfig, optional): Passed in LoraConfig for peft. Defaults to None.
"""
if not SUPPORT_PEFT:
raise ImportError("Please install Huggingface Peft library to enable lora features in ColossalAI!")

assert self.plugin is not None, f"Lora can only be enabled when a plugin is provided."
assert self.plugin.support_lora(), f"The plugin {self.plugin.__class__.__name__} does not support lora."
if pretrained_dir is None:
assert (
lora_config is not None
), "Please provide configuration for Lora when pretrained directory path isn't passed in."
assert isinstance(
lora_config, peft.LoraConfig
), "The passed in configuration should be an instance of peft.LoraConfig."
if lora_config is None:
assert (
pretrained_dir is not None
), "Please provide pretrained directory path if not passing in lora configuration."
if quantize is True:
if bnb_quantization_config is not None:
warnings.warn(
"User defined BnbQuantizationConfig is not fully tested in ColossalAI. Use it at your own risk."
)
else:
bnb_quantization_config = BnbQuantizationConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
)

return self.plugin.enable_lora(model, pretrained_dir, lora_config, bnb_quantization_config)

def load_model(self, model: Union[nn.Module, ModelWrapper], checkpoint: str, strict: bool = True) -> None:
"""Load model from checkpoint.
@@ -323,3 +382,20 @@ def load_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str) -> None:
checkpoint (str): Path to the checkpoint. It must be a local file path.
"""
self.checkpoint_io.load_lr_scheduler(lr_scheduler, checkpoint)

def save_lora_as_pretrained(
self, model: Union[nn.Module, ModelWrapper], checkpoint: str, use_safetensors: bool = False
) -> None:
"""
Save the lora adapters and adapter configuration file to a pretrained checkpoint directory.
Args:
model (Union[nn.Module, ModelWrapper]): A model boosted by Booster.
checkpoint (str): Path to the checkpoint directory. It must be a local path.
use_safetensors (bool, optional): Whether to use safe tensors when saving. Defaults to False.
"""
if not SUPPORT_PEFT:
raise ImportError("Please install Huggingface Peft library to enable lora features in ColossalAI!")
assert self.plugin is not None, f"Lora can only be enabled when a plugin is provided."
assert self.plugin.support_lora(), f"The plugin {self.plugin.__class__.__name__} does not support lora."
self.checkpoint_io.save_lora_as_pretrained(model, checkpoint, use_safetensors)
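The new `enable_lora` / `save_lora_as_pretrained` pair hooks ColossalAI into Hugging Face PEFT: adapters are injected before `boost`, and only the adapter weights plus config are saved afterwards. A usage sketch under stated assumptions: `peft` is installed, the chosen plugin reports `support_lora()` (the Gemini plugin below does not; Torch DDP is used purely for illustration), and the toy model and `./lora_ckpt` path are hypothetical:

```python
import torch.nn as nn
from peft import LoraConfig

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin

colossalai.launch_from_torch()  # run under torchrun

model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4))  # toy model
booster = Booster(plugin=TorchDDPPlugin())

# Inject LoRA adapters (PEFT-style config), then boost as usual.
lora_config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.05, target_modules=["0", "2"])
model = booster.enable_lora(model, lora_config=lora_config)
model, *_ = booster.boost(model)

# ... training loop ...

# Persist only the adapters and their adapter config, as PEFT does.
booster.save_lora_as_pretrained(model, "./lora_ckpt", use_safetensors=True)
```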
10 changes: 9 additions & 1 deletion colossalai/booster/plugin/gemini_plugin.py
@@ -4,7 +4,7 @@
import random
from pathlib import Path
from types import MethodType
- from typing import Callable, Iterator, List, Optional, Tuple
+ from typing import Callable, Dict, Iterator, List, Optional, Tuple

import numpy as np
import torch
@@ -446,6 +446,9 @@ def __del__(self):
def support_no_sync(self) -> bool:
return False

def support_lora(self) -> bool:
return False

def control_precision(self) -> bool:
return True

@@ -576,3 +579,8 @@ def get_checkpoint_io(self) -> CheckpointIO:

def no_sync(self, model: nn.Module, optimizer: OptimizerWrapper) -> Iterator[None]:
raise NotImplementedError

def enable_lora(
self, model: nn.Module, pretrained_dir: Optional[str] = None, lora_config: Optional[Dict] = None
) -> nn.Module:
raise NotImplementedError