🦙 Llama2 optimization with multi-ep managed env (#1087)

## Describe your changes

1. Llama2 optimization with multi-EP (multiple execution providers).
2. Add an info-level log that reports the system creation time.
3. Use both the device and the execution provider (EP), rather than the device alone, to determine the default value of `use_gpu`.

## Checklist before requesting a review

- [ ] Add unit tests for this change.
- [ ] Make sure all tests pass.
- [ ] Update documents if necessary.
- [ ] Lint and apply fixes to your code by running `lintrunner -a`
- [ ] Is this a user-facing change? If yes, give a description of this change to be included in the release notes.
- [ ] Does this PR include example changes? If yes, please remember to update [example documentation](https://github.com/microsoft/Olive/blob/main/docs/source/examples.md) in a follow-up PR.

## (Optional) Issue link
microsoft · Apr 24, 2024 · 60fdd5c · 60fdd5c
1 parent 3de60e0
commit 60fdd5c
Show file tree

Hide file tree

Showing 8 changed files with 1,497 additions and 1 deletion.
diff --git a/examples/llama2/notebook/llama2_multiep/config_cpu.json b/examples/llama2/notebook/llama2_multiep/config_cpu.json
@@ -0,0 +1,172 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "config": {
+            "model_script": "user_script.py",
+            "io_config": "get_merged_decoder_with_past_io_config",
+            "dummy_inputs_func": "get_merged_decoder_with_past_dummy_inputs",
+            "hf_config": {
+                "model_name": "meta-llama/Llama-2-7b-hf",
+                "model_class": "LlamaForCausalLM",
+                "from_pretrained_args": {
+                    "_attn_implementation": "eager"
+                },
+                "task": "text-generation"
+            }
+        }
+    },
+    "data_configs": {
+        "wikitext2_train": {
+            "name": "wikitext2_train",
+            "type": "HuggingfaceContainer",
+            "params_config": {
+                "data_name": "wikitext",
+                "subset": "wikitext-2-raw-v1",
+                "split": "train",
+                "component_kwargs": {
+                    "pre_process_data": {
+                        "text_cols": [
+                            "text"
+                        ],
+                        "corpus_strategy": "join",
+                        "add_special_tokens": false,
+                        "source_max_len": 2048,
+                        "max_samples": 128,
+                        "joiner": "\n\n"
+                    }
+                }
+            }
+        }
+    },
+    "systems": {
+        "local_system": {
+            "type": "LocalSystem",
+            "config": {
+                "accelerators": [
+                    {
+                        "device": "CPU",
+                        "execution_providers": [
+                            "CPUExecutionProvider"
+                        ]
+                    }
+                ]
+            }
+        }
+    },
+    "evaluators": {
+        "merged_evaluator": {
+            "metrics": [
+                {
+                    "name": "latency_prompt_processing",
+                    "type": "latency",
+                    "sub_types": [
+                        {
+                            "name": "avg",
+                            "priority": 1
+                        }
+                    ],
+                    "user_config": {
+                        "user_script": "user_script.py",
+                        "dataloader_func": "dataloader_func_for_merged",
+                        "func_kwargs": {
+                            "dataloader_func": {
+                                "model_id": "meta-llama/Llama-2-7b-hf",
+                                "past_seq_length": 0,
+                                "seq_length": 8,
+                                "max_seq_length": 2048
+                            }
+                        },
+                        "batch_size": 2,
+                        "io_bind": true
+                    }
+                },
+                {
+                    "name": "latency_token_generation",
+                    "type": "latency",
+                    "sub_types": [
+                        {
+                            "name": "avg"
+                        }
+                    ],
+                    "user_config": {
+                        "user_script": "user_script.py",
+                        "dataloader_func": "dataloader_func_for_merged",
+                        "func_kwargs": {
+                            "dataloader_func": {
+                                "model_id": "meta-llama/Llama-2-7b-hf",
+                                "past_seq_length": 8,
+                                "seq_length": 1,
+                                "max_seq_length": 2048
+                            }
+                        },
+                        "batch_size": 2,
+                        "io_bind": true
+                    }
+                }
+            ]
+        }
+    },
+    "passes": {
+        "conversion_merged": {
+            "type": "OnnxConversion",
+            "config": {
+                "target_opset": 17,
+                "save_as_external_data": true,
+                "all_tensors_to_one_file": true
+            }
+        },
+        "transformers_optimization_fp32": {
+            "type": "OrtTransformersOptimization",
+            "config": {
+                "save_as_external_data": true,
+                "all_tensors_to_one_file": true,
+                "model_type": "gpt2",
+                "opt_level": 0,
+                "only_onnxruntime": false,
+                "keep_io_types": false,
+                "float16": false,
+                "use_gqa": false
+            }
+        },
+        "transformers_optimization_fp16": {
+            "type": "OrtTransformersOptimization",
+            "config": {
+                "save_as_external_data": true,
+                "all_tensors_to_one_file": true,
+                "model_type": "gpt2",
+                "opt_level": 0,
+                "only_onnxruntime": false,
+                "keep_io_types": false,
+                "float16": true,
+                "use_gqa": true
+            }
+        },
+        "blockwise_quant_int4": {
+            "type": "OnnxMatMul4Quantizer",
+            "config": {
+                "save_as_external_data": true,
+                "all_tensors_to_one_file": true,
+                "block_size": 32,
+                "is_symmetric": true
+            }
+        }
+    },
+    "engine": {
+        "evaluator": "merged_evaluator",
+        "host": "local_system",
+        "target": "local_system",
+        "cache_dir": "cache",
+        "output_dir": "models/llama2_cpu/meta-llama/Llama-2-7b-hf"
+    },
+    "pass_flows": [
+        [
+            "conversion_merged",
+            "transformers_optimization_fp32",
+            "blockwise_quant_int4"
+        ],
+        [
+            "conversion_merged",
+            "transformers_optimization_fp16"
+        ]
+    ]
+}
diff --git a/examples/llama2/notebook/llama2_multiep/config_gpu.json b/examples/llama2/notebook/llama2_multiep/config_gpu.json
@@ -0,0 +1,172 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "config": {
+            "model_script": "user_script.py",
+            "io_config": "get_merged_decoder_with_past_io_config",
+            "dummy_inputs_func": "get_merged_decoder_with_past_dummy_inputs",
+            "hf_config": {
+                "model_name": "meta-llama/Llama-2-7b-hf",
+                "model_class": "LlamaForCausalLM",
+                "from_pretrained_args": {
+                    "_attn_implementation": "eager"
+                },
+                "task": "text-generation"
+            }
+        }
+    },
+    "data_configs": {
+        "wikitext2_train": {
+            "name": "wikitext2_train",
+            "type": "HuggingfaceContainer",
+            "params_config": {
+                "data_name": "wikitext",
+                "subset": "wikitext-2-raw-v1",
+                "split": "train",
+                "component_kwargs": {
+                    "pre_process_data": {
+                        "text_cols": [
+                            "text"
+                        ],
+                        "corpus_strategy": "join",
+                        "add_special_tokens": false,
+                        "source_max_len": 2048,
+                        "max_samples": 128,
+                        "joiner": "\n\n"
+                    }
+                }
+            }
+        }
+    },
+    "systems": {
+        "local_system": {
+            "type": "LocalSystem",
+            "config": {
+                "accelerators": [
+                    {
+                        "device": "GPU",
+                        "execution_providers": [
+                            "CUDAExecutionProvider"
+                        ]
+                    }
+                ]
+            }
+        }
+    },
+    "evaluators": {
+        "merged_evaluator": {
+            "metrics": [
+                {
+                    "name": "latency_prompt_processing",
+                    "type": "latency",
+                    "sub_types": [
+                        {
+                            "name": "avg",
+                            "priority": 1
+                        }
+                    ],
+                    "user_config": {
+                        "user_script": "user_script.py",
+                        "dataloader_func": "dataloader_func_for_merged",
+                        "func_kwargs": {
+                            "dataloader_func": {
+                                "model_id": "meta-llama/Llama-2-7b-hf",
+                                "past_seq_length": 0,
+                                "seq_length": 8,
+                                "max_seq_length": 2048
+                            }
+                        },
+                        "batch_size": 2,
+                        "io_bind": true
+                    }
+                },
+                {
+                    "name": "latency_token_generation",
+                    "type": "latency",
+                    "sub_types": [
+                        {
+                            "name": "avg"
+                        }
+                    ],
+                    "user_config": {
+                        "user_script": "user_script.py",
+                        "dataloader_func": "dataloader_func_for_merged",
+                        "func_kwargs": {
+                            "dataloader_func": {
+                                "model_id": "meta-llama/Llama-2-7b-hf",
+                                "past_seq_length": 8,
+                                "seq_length": 1,
+                                "max_seq_length": 2048
+                            }
+                        },
+                        "batch_size": 2,
+                        "io_bind": true
+                    }
+                }
+            ]
+        }
+    },
+    "passes": {
+        "conversion_merged": {
+            "type": "OnnxConversion",
+            "config": {
+                "target_opset": 17,
+                "save_as_external_data": true,
+                "all_tensors_to_one_file": true
+            }
+        },
+        "transformers_optimization_fp32": {
+            "type": "OrtTransformersOptimization",
+            "config": {
+                "save_as_external_data": true,
+                "all_tensors_to_one_file": true,
+                "model_type": "gpt2",
+                "opt_level": 0,
+                "only_onnxruntime": false,
+                "keep_io_types": false,
+                "float16": false,
+                "use_gqa": false
+            }
+        },
+        "transformers_optimization_fp16": {
+            "type": "OrtTransformersOptimization",
+            "config": {
+                "save_as_external_data": true,
+                "all_tensors_to_one_file": true,
+                "model_type": "gpt2",
+                "opt_level": 0,
+                "only_onnxruntime": false,
+                "keep_io_types": false,
+                "float16": true,
+                "use_gqa": true
+            }
+        },
+        "blockwise_quant_int4": {
+            "type": "OnnxMatMul4Quantizer",
+            "config": {
+                "save_as_external_data": true,
+                "all_tensors_to_one_file": true,
+                "block_size": 32,
+                "is_symmetric": true
+            }
+        }
+    },
+    "engine": {
+        "evaluator": "merged_evaluator",
+        "host": "local_system",
+        "target": "local_system",
+        "cache_dir": "cache",
+        "output_dir": "models/llama2_gpu/meta-llama/Llama-2-7b-hf"
+    },
+    "pass_flows": [
+        [
+            "conversion_merged",
+            "transformers_optimization_fp32",
+            "blockwise_quant_int4"
+        ],
+        [
+            "conversion_merged",
+            "transformers_optimization_fp16"
+        ]
+    ]
+}