Revert "some changes to support fine-tuning on Intel GPU (intel#88)" (i…
Browse files Browse the repository at this point in the history
…ntel#95)

This reverts commit a555e0c.
xwu99 authored Feb 4, 2024
1 parent bcd5d08 commit 63464ed
Showing 10 changed files with 27 additions and 102 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/workflow_finetune.yml
@@ -11,10 +11,10 @@ on:
default: '10.1.2.13:5000/llmray-build'
http_proxy:
type: string
default: 'http://10.24.221.149:911'
default: 'http://proxy-chain.intel.com:911'
https_proxy:
type: string
default: 'http://10.24.221.149:911'
default: 'http://proxy-chain.intel.com:911'
runner_config_path:
type: string
default: '/home/ci/llm-ray-actions-runner'
4 changes: 2 additions & 2 deletions .github/workflows/workflow_inference.yml
@@ -11,10 +11,10 @@ on:
default: '10.1.2.13:5000/llmray-build'
http_proxy:
type: string
default: 'http://10.24.221.149:911'
default: 'http://proxy-chain.intel.com:911'
https_proxy:
type: string
default: 'http://10.24.221.149:911'
default: 'http://proxy-chain.intel.com:911'
runner_config_path:
type: string
default: '/home/ci/llm-ray-actions-runner'
13 changes: 1 addition & 12 deletions common/model/huggingface_model_for_causal_lm.py
@@ -8,24 +8,13 @@
class HuggingFaceModelForCausalLM(Model):
def __call__(self, config):
name = config.get("name")
model_dtype = config.get("dtype")
model_config = config.get("config", {})
model = transformers.AutoModelForCausalLM.from_pretrained(
name, torch_dtype=model_dtype, **model_config
)

model = transformers.AutoModelForCausalLM.from_pretrained(name, **model_config)
lora_config = config.get("lora_config", None)
if lora_config:
peft_config = LoraConfig(**lora_config)
model = get_peft_model(model, peft_config)
deltatuner_config = config.get("deltatuner_config", None)
if deltatuner_config:
model = deltatuner.optimize(model, **deltatuner_config)

enable_gradient_checkpointing = config.get("enable_gradient_checkpointing")
if enable_gradient_checkpointing:
model.enable_input_require_grads()
model.gradient_checkpointing_enable()
model.config.use_cache = False

return model
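
For reference, the restored builder above loads the checkpoint with a single from_pretrained call (no torch_dtype argument) and only then applies the optional LoRA and deltatuner wrappers. A minimal sketch of that path, assuming a small placeholder model name and illustrative LoRA values rather than the repository's defaults:

import transformers
from peft import LoraConfig, get_peft_model

# Placeholder model name and illustrative LoRA settings (not the project's defaults).
name = "gpt2"
model_config = {"trust_remote_code": False}
lora_config = {"task_type": "CAUSAL_LM", "r": 8, "lora_alpha": 32, "lora_dropout": 0.1}

# Post-revert path: no torch_dtype is passed to from_pretrained.
model = transformers.AutoModelForCausalLM.from_pretrained(name, **model_config)

# Optional LoRA wrapping, mirroring the builder above.
if lora_config:
    model = get_peft_model(model, LoraConfig(**lora_config))

print(type(model).__name__)
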
23 changes: 5 additions & 18 deletions common/trainer/default_trainer.py
@@ -85,6 +85,8 @@ def _get_lr_scheduler(
num_steps_per_epoch,
accelerator,
):
# gradient_accumulation_steps = accelerator.gradient_accumulation_steps
# num_update_steps_per_epoch = math.ceil(num_steps_per_epoch / gradient_accumulation_steps)
enable = lr_scheduler_config.get("enable", False)
if not enable:
return None
@@ -151,7 +153,7 @@ def prepare(self, model, tokenizer, dataset, optimizer, accelerator):
def train(self):
num_train_epochs = self.config.get("num_train_epochs", 1)
checkpoint = self.config.get("checkpoint")
logging_steps = self.config.get("logging_steps", 1)
log_step = self.config.get("log_step", 1)
max_train_step = self.config.get("max_train_step")
max_eval_step = self.config.get("max_eval_step")
for idx in range(self.starting_epoch, num_train_epochs, 1):
@@ -168,17 +170,12 @@ def train(self):
if self.lr_scheduler is not None:
self.lr_scheduler.step()
self.optimizer.zero_grad()

if step % logging_steps == 0:
loss = loss.item()
ppl = math.exp(loss)
if step % log_step == 0:
logger.info(
f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}"
f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{math.exp(loss):.6f}\ttime:{time.time()-start:.6f}"
)
report(
{
"loss": loss,
"ppl": ppl,
"train_epoch": idx,
"total_epochs": num_train_epochs,
"train_step": step,
@@ -187,10 +184,6 @@ def train(self):
else total_steps,
}
)
self.accelerator.log(
{"train loss": loss, "train perplexity": ppl},
step=idx * total_steps + step,
)
start = time.time()
if max_train_step is not None:
if step >= max_train_step - 1:
@@ -221,9 +214,6 @@ def train(self):
except OverflowError:
eval_loss = float("inf")
perplexity = float("inf")
self.accelerator.log(
{"evaluate loss": eval_loss, "evaluate perplexity": perplexity}
)
logger.info(
f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]"
)
@@ -242,9 +232,6 @@ def train(self):
save_function=self.accelerator.save,
)
logger.info(f"finish save model to {output}")

self.accelerator.end_training()

self.accelerator.wait_for_everyone()

def _get_local_path(self, root_path, model_name):
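
The trainer change is largely a rename plus the removal of the Accelerate tracker calls: the step-logging knob goes back to log_step (from logging_steps), perplexity is computed inline as math.exp(loss), and accelerator.log is no longer invoked. A self-contained sketch of the per-step logging pattern that remains, using synthetic loss values:

import logging
import math
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("default_trainer_sketch")

log_step = 1      # post-revert knob name
total_steps = 5
start = time.time()
for step in range(total_steps):
    loss = 2.0 / (step + 1)  # synthetic stand-in for the training loss
    if step % log_step == 0:
        logger.info(
            f"train step:[{step}/{total_steps}]\tloss:{loss:.6f}"
            f"\tppl:{math.exp(loss):.6f}\ttime:{time.time()-start:.6f}"
        )
        start = time.time()
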
4 changes: 2 additions & 2 deletions common/trainer/rm_trainer.py
@@ -52,7 +52,7 @@ def compute_loss(self, batch, return_outputs=False):

def train(self):
num_train_epochs = self.config.get("num_train_epochs", 1)
logging_steps = self.config.get("logging_steps", 1)
log_step = self.config.get("log_step", 1)
if not os.path.exists(self.config.get("log_path", ".")):
os.makedirs(self.config.get("log_path", "."), exist_ok=True)
writer = SummaryWriter(self.config.get("log_path", "."))
@@ -69,7 +69,7 @@ def train(self):
if self.lr_scheduler is not None:
self.lr_scheduler.step()
self.optimizer.zero_grad()
if step % logging_steps == 0:
if step % log_step == 0:
logger.info(
f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{len(self.train_dataloader)}]\tloss:{loss}\tppl:{math.exp(loss)}\ttime:{time.time()-start}"
)
3 changes: 0 additions & 3 deletions docs/finetune_parameters.md
@@ -10,11 +10,9 @@ The following are the parameters supported in the finetuning workflow.
|gpt_base_model|True|This parameter is for [Transformers#22482](https://github.com/huggingface/transformers/issues/22482). It needs to be set to True when the pretrained model is related to gpt, otherwise it is False.|
|output_dir|/tmp/llm-ray/output|The output directory to store the finetuned model|
|checkpoint_dir|/tmp/llm-ray/checkpoint|The directory to store checkpoint|
|tracking_dir|/tmp/llm-ray/tracking|The path to a directory for storing logs of locally-compatible loggers|
|config|trust_remote_code: False<br> use_auth_token: None|Will be passed to the transformers `from_pretrained()` method|
|lora_config|task_type: CAUSAL_LM<br>r: 8<br>lora_alpha: 32<br>lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then it'll be used as config to build Peft model object.|
|deltatuner_config|"algo": "lora"<br>"denas": True<br>"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then it'll be used as config to build [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.|
|enable_gradient_checkpointing|False|enable gradient checkpointing to save GPU memory, but will cost more compute runtime|


## Dataset Parameters
@@ -42,4 +40,3 @@ The following are the parameters supported in the finetuning workflow.
|max_train_steps|None|Total number of training steps to perform. If provided, overrides epochs.|
|gradient_accumulation_steps|1|Number of updates steps to accumulate before performing a backward/update pass.|
|seed|None|A seed for reproducible training.|
|logging_steps|10|logging per steps|
53 changes: 9 additions & 44 deletions finetune/finetune.py
@@ -4,7 +4,6 @@
import argparse
from typing import Any, Dict, Union

import torch
import accelerate
from accelerate.utils import is_xpu_available

@@ -63,14 +62,6 @@ def get_accelerate_environment_variable(mode: str, config: Union[Dict[str, Any],
return mode_env_vars[mode]


def convert_dtype(dtype: str) -> torch.dtype:
supported_dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
if dtype in supported_dtypes:
return supported_dtypes[dtype]
else:
raise ValueError(f"only supported torch.dtype list [{supported_dtypes.keys()}]")


def train_func(config: Dict[str, Any]):
cwd = config.get("cwd")
if cwd:
@@ -88,26 +79,9 @@ def train_func(config: Dict[str, Any]):
)
else:
fsdp_plugin = None

log_with = "tensorboard" # only support tensorboard as tracker
output_dir = config["General"]["output_dir"]
tracking_dir = config["General"]["tracking_dir"]
accelerator = accelerate.Accelerator(
gradient_accumulation_steps=gradient_accumulation_steps,
fsdp_plugin=fsdp_plugin,
log_with=log_with,
project_dir=tracking_dir,
gradient_accumulation_steps=gradient_accumulation_steps, fsdp_plugin=fsdp_plugin
)
epochs = config["Training"]["epochs"]
tracker_config = {
"epochs": epochs,
"learning_rate": config["Training"]["learning_rate"],
"batch_size": config["Training"]["batch_size"],
}
base_model = config["General"]["base_model"]
dataset_file = config["Dataset"]["train_file"]
accelerator.init_trackers("fine-tuning", config=tracker_config)

common.logger.info(
f"accelerator generate finish, accelerator device type = {accelerator.device}"
)
@@ -118,25 +92,23 @@ def train_func(config: Dict[str, Any]):

datasets = common.dataset.Dataset.registory.get("HuggingfaceDataset")()(
config={
"name": dataset_file,
"name": config["Dataset"]["train_file"],
"validation_file": config["Dataset"]["validation_file"],
"validation_split_percentage": config["Dataset"]["validation_split_percentage"],
}
)

tokenizer = common.tokenizer.Tokenizer.registory.get("HuggingFaceTokenizer")()(
config={
"name": base_model,
"name": config["General"]["base_model"],
"config": config["General"]["config"],
}
)

model = common.model.Model.registory.get("HuggingFaceModelForCausalLM")()(
config={
"name": base_model,
"dtype": convert_dtype(config["Training"]["mixed_precision"]),
"name": config["General"]["base_model"],
"config": config["General"]["config"],
"enable_gradient_checkpointing": config["General"]["enable_gradient_checkpointing"],
"lora_config": config["General"]["lora_config"]
if config["General"].get("lora_config")
else None,
@@ -153,10 +125,10 @@

trainer = common.trainer.Trainer.registory.get("DefaultTrainer")(
config={
"num_train_epochs": epochs,
"num_train_epochs": config["Training"]["epochs"],
"max_train_step": config["Training"].get("max_train_steps", None),
"logging_steps": config["Training"].get("logging_steps", 1),
"output": output_dir,
"log_step": 1,
"output": config["General"]["output_dir"],
"dataprocesser": {
"type": "GeneralProcesser",
"per_device_train_batch_size": config["Training"]["batch_size"],
@@ -245,21 +217,14 @@ def main(external_config=None):
"FI_PROVIDER": "tcp",
}
}

accelerate_env_vars = get_accelerate_environment_variable(accelerate_mode, config)
runtime_env["env_vars"].update(accelerate_env_vars)

if config["General"]["gpt_base_model"] is True:
runtime_env["pip"] = ["transformers==4.26.0"]

import intel_extension_for_pytorch as ipex

if "xpu" in ipex.__version__:
num_cpus = (
resources_per_worker["CPU"] * num_training_workers + 1
) # additional 1 for head worker
ray.init(num_cpus=num_cpus, runtime_env=runtime_env)
else:
ray.init(runtime_env=runtime_env)
ray.init(runtime_env=runtime_env)

common.logger.info(f"ray available resources = {ray.available_resources()}")

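
On the driver side, the revert drops the torch import, the convert_dtype helper, the tensorboard tracker setup, and the XPU-specific num_cpus calculation, leaving a single unconditional ray.init. A rough sketch of that startup, with an illustrative runtime_env in place of the values the script computes from its config:

import ray

# Illustrative runtime_env; the real env_vars come from
# get_accelerate_environment_variable() and the parsed config.
runtime_env = {
    "env_vars": {"FI_PROVIDER": "tcp"},  # illustrative subset of the env vars the script sets
    # When gpt_base_model is true, the workers pin an older transformers, as in the diff.
    "pip": ["transformers==4.26.0"],
}

ray.init(runtime_env=runtime_env)  # no XPU-aware num_cpus branch after the revert
print(ray.available_resources())
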
4 changes: 0 additions & 4 deletions finetune/finetune.yaml
@@ -3,7 +3,6 @@ General:
gpt_base_model: true
output_dir: /tmp/llm-ray/output
checkpoint_dir: /tmp/llm-ray/checkpoint
tracking_dir: /tmp/llm-ray/tracking
config:
trust_remote_code: false
use_auth_token: null
@@ -12,7 +11,6 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
@@ -30,5 +28,3 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 2
logging_steps: 10
9 changes: 0 additions & 9 deletions finetune/finetune_config.py
@@ -26,11 +26,9 @@ class General(BaseModel):
gpt_base_model: bool
output_dir: str
checkpoint_dir: str
tracking_dir: str
config: GeneralConfig
lora_config: Optional[LoraConfig] = None
deltatuner_config: Optional[DeltatunerConfig] = None
enable_gradient_checkpointing: bool = False


class Dataset(BaseModel):
@@ -56,8 +54,6 @@ class Training(BaseModel):
resources_per_worker: RayResourceConfig
accelerate_mode: str
mixed_precision: str = "no"
gradient_accumulation_steps: int
logging_steps: int = 10

@validator("device")
def check_device(cls, v: str):
@@ -73,11 +69,6 @@ def check_accelerate_mode(cls, v: str):
raise ValueError(f"accelerate_mode must be one of {modes}")
return v

@validator("logging_steps")
def check_logging_steps(cls, v: int):
assert v > 0
return v

# @model_validator(mode='after')
# def check_device_and_accelerate_mode(self) -> "Training":
# dev = self.device
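
The config schema keeps its pydantic field validators; the revert removes the tracking_dir and enable_gradient_checkpointing fields from General, gradient_accumulation_steps and logging_steps from Training, and the logging_steps check. A small runnable sketch of the validator pattern used in this file, with an assumed list of accelerate modes (the real set lives in the repository):

from pydantic import BaseModel, validator

class Training(BaseModel):
    accelerate_mode: str
    mixed_precision: str = "no"

    @validator("accelerate_mode")
    def check_accelerate_mode(cls, v: str):
        # Assumed mode list, for illustration only.
        modes = ["CPU_DDP", "GPU_DDP", "GPU_FSDP"]
        if v not in modes:
            raise ValueError(f"accelerate_mode must be one of {modes}")
        return v

print(Training(accelerate_mode="CPU_DDP").accelerate_mode)
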
12 changes: 6 additions & 6 deletions pyproject.toml
@@ -35,7 +35,7 @@ dependencies = [
"peft>=0.4.0",
"deltatuner==1.1.9",
"py-cpuinfo",
"pydantic-yaml"
"pydantic-yaml",
]

[project.optional-dependencies]
@@ -48,11 +48,11 @@

gpu = [
"transformers>=4.35.0",
"torch==2.1.0a0",
"torchvision==0.16.0a0",
"intel_extension_for_pytorch==2.1.10+xpu",
"oneccl_bind_pt==2.1.100+xpu",
"dpctl==0.15.0"
"torch==2.0.1a0",
"torchvision==0.15.2a0",
"intel-extension-for-pytorch==2.0.110+xpu",
"oneccl_bind_pt==2.0.100+gpu",
"dpctl==0.14.5"
]

deepspeed = [
