Skip to content

Commit

Permalink
🦙 Llama2 optimization with multi-ep managed env (#1087)
Browse files Browse the repository at this point in the history
## Describe your changes
1. Llama2 optimization with multi-ep
2. Add an info-level log that reports the system creation time.
3. Use both the device and the execution provider (EP), rather than the
device alone, to determine the default value of `use_gpu`.

## Checklist before requesting a review
- [ ] Add unit tests for this change.
- [ ] Make sure all tests can pass.
- [ ] Update documents if necessary.
- [ ] Lint and apply fixes to your code by running `lintrunner -a`
- [ ] Is this a user-facing change? If yes, give a description of this
change to be included in the release notes.
- [ ] Is this PR including examples changes? If yes, please remember to
update [example
documentation](https://github.com/microsoft/Olive/blob/main/docs/source/examples.md)
in a follow-up PR.

## (Optional) Issue link
  • Loading branch information
trajepl authored Apr 24, 2024
1 parent 3de60e0 commit 60fdd5c
Show file tree
Hide file tree
Showing 8 changed files with 1,497 additions and 1 deletion.
172 changes: 172 additions & 0 deletions examples/llama2/notebook/llama2_multiep/config_cpu.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
{
"input_model": {
"type": "PyTorchModel",
"config": {
"model_script": "user_script.py",
"io_config": "get_merged_decoder_with_past_io_config",
"dummy_inputs_func": "get_merged_decoder_with_past_dummy_inputs",
"hf_config": {
"model_name": "meta-llama/Llama-2-7b-hf",
"model_class": "LlamaForCausalLM",
"from_pretrained_args": {
"_attn_implementation": "eager"
},
"task": "text-generation"
}
}
},
"data_configs": {
"wikitext2_train": {
"name": "wikitext2_train",
"type": "HuggingfaceContainer",
"params_config": {
"data_name": "wikitext",
"subset": "wikitext-2-raw-v1",
"split": "train",
"component_kwargs": {
"pre_process_data": {
"text_cols": [
"text"
],
"corpus_strategy": "join",
"add_special_tokens": false,
"source_max_len": 2048,
"max_samples": 128,
"joiner": "\n\n"
}
}
}
}
},
"systems": {
"local_system": {
"type": "LocalSystem",
"config": {
"accelerators": [
{
"device": "CPU",
"execution_providers": [
"CPUExecutionProvider"
]
}
]
}
}
},
"evaluators": {
"merged_evaluator": {
"metrics": [
{
"name": "latency_prompt_processing",
"type": "latency",
"sub_types": [
{
"name": "avg",
"priority": 1
}
],
"user_config": {
"user_script": "user_script.py",
"dataloader_func": "dataloader_func_for_merged",
"func_kwargs": {
"dataloader_func": {
"model_id": "meta-llama/Llama-2-7b-hf",
"past_seq_length": 0,
"seq_length": 8,
"max_seq_length": 2048
}
},
"batch_size": 2,
"io_bind": true
}
},
{
"name": "latency_token_generation",
"type": "latency",
"sub_types": [
{
"name": "avg"
}
],
"user_config": {
"user_script": "user_script.py",
"dataloader_func": "dataloader_func_for_merged",
"func_kwargs": {
"dataloader_func": {
"model_id": "meta-llama/Llama-2-7b-hf",
"past_seq_length": 8,
"seq_length": 1,
"max_seq_length": 2048
}
},
"batch_size": 2,
"io_bind": true
}
}
]
}
},
"passes": {
"conversion_merged": {
"type": "OnnxConversion",
"config": {
"target_opset": 17,
"save_as_external_data": true,
"all_tensors_to_one_file": true
}
},
"transformers_optimization_fp32": {
"type": "OrtTransformersOptimization",
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"model_type": "gpt2",
"opt_level": 0,
"only_onnxruntime": false,
"keep_io_types": false,
"float16": false,
"use_gqa": false
}
},
"transformers_optimization_fp16": {
"type": "OrtTransformersOptimization",
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"model_type": "gpt2",
"opt_level": 0,
"only_onnxruntime": false,
"keep_io_types": false,
"float16": true,
"use_gqa": true
}
},
"blockwise_quant_int4": {
"type": "OnnxMatMul4Quantizer",
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"block_size": 32,
"is_symmetric": true
}
}
},
"engine": {
"evaluator": "merged_evaluator",
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_dir": "models/llama2_cpu/meta-llama/Llama-2-7b-hf"
},
"pass_flows": [
[
"conversion_merged",
"transformers_optimization_fp32",
"blockwise_quant_int4"
],
[
"conversion_merged",
"transformers_optimization_fp16"
]
]
}
172 changes: 172 additions & 0 deletions examples/llama2/notebook/llama2_multiep/config_gpu.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
{
"input_model": {
"type": "PyTorchModel",
"config": {
"model_script": "user_script.py",
"io_config": "get_merged_decoder_with_past_io_config",
"dummy_inputs_func": "get_merged_decoder_with_past_dummy_inputs",
"hf_config": {
"model_name": "meta-llama/Llama-2-7b-hf",
"model_class": "LlamaForCausalLM",
"from_pretrained_args": {
"_attn_implementation": "eager"
},
"task": "text-generation"
}
}
},
"data_configs": {
"wikitext2_train": {
"name": "wikitext2_train",
"type": "HuggingfaceContainer",
"params_config": {
"data_name": "wikitext",
"subset": "wikitext-2-raw-v1",
"split": "train",
"component_kwargs": {
"pre_process_data": {
"text_cols": [
"text"
],
"corpus_strategy": "join",
"add_special_tokens": false,
"source_max_len": 2048,
"max_samples": 128,
"joiner": "\n\n"
}
}
}
}
},
"systems": {
"local_system": {
"type": "LocalSystem",
"config": {
"accelerators": [
{
"device": "GPU",
"execution_providers": [
"CUDAExecutionProvider"
]
}
]
}
}
},
"evaluators": {
"merged_evaluator": {
"metrics": [
{
"name": "latency_prompt_processing",
"type": "latency",
"sub_types": [
{
"name": "avg",
"priority": 1
}
],
"user_config": {
"user_script": "user_script.py",
"dataloader_func": "dataloader_func_for_merged",
"func_kwargs": {
"dataloader_func": {
"model_id": "meta-llama/Llama-2-7b-hf",
"past_seq_length": 0,
"seq_length": 8,
"max_seq_length": 2048
}
},
"batch_size": 2,
"io_bind": true
}
},
{
"name": "latency_token_generation",
"type": "latency",
"sub_types": [
{
"name": "avg"
}
],
"user_config": {
"user_script": "user_script.py",
"dataloader_func": "dataloader_func_for_merged",
"func_kwargs": {
"dataloader_func": {
"model_id": "meta-llama/Llama-2-7b-hf",
"past_seq_length": 8,
"seq_length": 1,
"max_seq_length": 2048
}
},
"batch_size": 2,
"io_bind": true
}
}
]
}
},
"passes": {
"conversion_merged": {
"type": "OnnxConversion",
"config": {
"target_opset": 17,
"save_as_external_data": true,
"all_tensors_to_one_file": true
}
},
"transformers_optimization_fp32": {
"type": "OrtTransformersOptimization",
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"model_type": "gpt2",
"opt_level": 0,
"only_onnxruntime": false,
"keep_io_types": false,
"float16": false,
"use_gqa": false
}
},
"transformers_optimization_fp16": {
"type": "OrtTransformersOptimization",
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"model_type": "gpt2",
"opt_level": 0,
"only_onnxruntime": false,
"keep_io_types": false,
"float16": true,
"use_gqa": true
}
},
"blockwise_quant_int4": {
"type": "OnnxMatMul4Quantizer",
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"block_size": 32,
"is_symmetric": true
}
}
},
"engine": {
"evaluator": "merged_evaluator",
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_dir": "models/llama2_gpu/meta-llama/Llama-2-7b-hf"
},
"pass_flows": [
[
"conversion_merged",
"transformers_optimization_fp32",
"blockwise_quant_int4"
],
[
"conversion_merged",
"transformers_optimization_fp16"
]
]
}
Loading

0 comments on commit 60fdd5c

Please sign in to comment.