
update gpu memory consumption (deepjavalibrary#818)
Add GPT-J and GPT-NeoX models to tests
Qing Lan committed Jun 9, 2023
1 parent 4fc5bca commit b016e5d
Showing 4 changed files with 45 additions and 14 deletions.
15 changes: 12 additions & 3 deletions .github/workflows/llm_integration.yml
@@ -358,14 +358,14 @@ jobs:
           serve -m test=file:/opt/ml/model/test/
           python3 llm/client.py fastertransformer_raw bigscience/bloom-3b
           docker rm -f $(docker ps -aq)
-      - name: Test flan-t5-xxl
+      - name: Test nomic-ai/gpt4all-j
         working-directory: tests/integration
         run: |
           rm -rf models
-          python3 llm/prepare.py fastertransformer_raw flan-t5-xxl
+          python3 llm/prepare.py fastertransformer_raw nomic-ai/gpt4all-j
           ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
           serve -m test=file:/opt/ml/model/test/
-          python3 llm/client.py fastertransformer_raw flan-t5-xxl
+          python3 llm/client.py fastertransformer_raw nomic-ai/gpt4all-j
           docker rm -f $(docker ps -aq)
       - name: On fail step
         if: ${{ failure() }}
@@ -422,6 +422,15 @@ jobs:
           serve
           python3 llm/client.py fastertransformer bigscience/bloom-3b
           docker rm -f $(docker ps -aq)
+      - name: Test EleutherAI/pythia-2.8b
+        working-directory: tests/integration
+        run: |
+          rm -rf models
+          python3 llm/prepare.py fastertransformer EleutherAI/pythia-2.8b
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
+          serve -m test=file:/opt/ml/model/test/
+          python3 llm/client.py fastertransformer EleutherAI/pythia-2.8b
+          docker rm -f $(docker ps -aq)
       - name: On fail step
         if: ${{ failure() }}
         working-directory: tests/integration
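Each of these integration steps follows the same pattern: llm/prepare.py writes a model directory under models/, launch_container.sh starts the djl-serving container against it, llm/client.py sends test requests, and the container is removed. As a rough illustration, a smoke test in the spirit of llm/client.py might look like the sketch below; the endpoint URL and payload shape are assumptions (only the "inputs" key is visible in this commit, in fastertransformer-model.py), not code from the repository.

import requests

# Hypothetical request against the model registered as "test" in the
# workflow above; djl-serving conventionally listens on port 8080.
url = "http://127.0.0.1:8080/predictions/test"  # assumed endpoint
payload = {"inputs": ["Deep learning is", "The capital of France is"]}

response = requests.post(url, json=payload, timeout=120)
response.raise_for_status()  # fail the CI step on any non-2xx status
print(response.json())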
11 changes: 8 additions & 3 deletions tests/integration/llm/client.py
@@ -168,6 +168,11 @@ def get_model_name():
         "batch_size": [1, 2],
         "seq_length": [64, 128],
         "max_memory_per_gpu": [15.0, 15.0, 15.0, 15.0]
     },
+    "EleutherAI/pythia-2.8b": {
+        "batch_size": [1, 2],
+        "seq_length": [64, 128],
+        "max_memory_per_gpu": [6.0, 6.0, 6.0, 6.0]
+    }
 }

@@ -178,7 +183,7 @@ def get_model_name():
     },
     "gpt2-xl": {
         "batch_size": [1, 2],
-        "max_memory_per_gpu": 7.0
+        "max_memory_per_gpu": 8.0
     },
     "facebook/opt-6.7b": {
         "batch_size": [1, 2],
@@ -188,9 +193,9 @@ def get_model_name():
         "batch_size": [1, 2],
         "max_memory_per_gpu": 6.0
     },
-    "flan-t5-xxl": {
+    "nomic-ai/gpt4all-j": {
         "batch_size": [1, 2],
-        "max_memory_per_gpu": 15.0
+        "max_memory_per_gpu": 6.0
     }
 }

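The max_memory_per_gpu values are per-GPU ceilings in GB, adjusted here to match observed consumption; the list form in the raw spec gives one ceiling per device (the pythia-2.8b entry allows 6 GB on each of four GPUs). The check itself is not shown in this diff; a minimal sketch of how it could be implemented with torch follows (the function name and the use of torch.cuda statistics are assumptions, not the client's actual logic).

import torch

def check_gpu_memory(max_memory_per_gpu):
    # Hypothetical validation: compare the peak allocation on each GPU
    # against its ceiling in GB. A scalar ceiling applies to every device.
    if not isinstance(max_memory_per_gpu, list):
        max_memory_per_gpu = [max_memory_per_gpu] * torch.cuda.device_count()
    for device, limit_gb in enumerate(max_memory_per_gpu):
        if device >= torch.cuda.device_count():
            break
        peak_gb = torch.cuda.max_memory_allocated(device) / 1024 ** 3
        assert peak_gb <= limit_gb, (
            f"GPU {device} peaked at {peak_gb:.1f} GB, limit is {limit_gb} GB")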
18 changes: 13 additions & 5 deletions tests/integration/llm/fastertransformer-model.py
@@ -2,15 +2,20 @@
 import fastertransformer

 model = None
+use_triton = False


 def load_model(properties):
     tensor_parallel_degree = properties["tensor_parallel_degree"]
     pipeline_parallel_degree = 1  # TODO: add tests for pp_degree > 1
     model_id = properties.get('model_id') or properties.get('model_dir')
+    use_triton = properties.get("use_triton", False)
     dtype = properties.get("dtype", "fp32")
-    return fastertransformer.init_inference(model_id, tensor_parallel_degree,
-                                            pipeline_parallel_degree, dtype)
+    return fastertransformer.init_inference(model_id,
+                                            tensor_parallel_degree,
+                                            pipeline_parallel_degree,
+                                            dtype,
+                                            use_triton=use_triton), use_triton

def partition(inputs: Input):
@@ -27,17 +32,20 @@ def partition(inputs: Input):


 def handle(inputs: Input):
-    global model
+    global model, use_triton

     if not model:
-        model = load_model(inputs.get_properties())
+        model, use_triton = load_model(inputs.get_properties())

     if inputs.is_empty():
         # Model server makes an empty call to warmup the model on startup
         return None

     input_json = inputs.get_as_json()
     input_data = input_json.pop("inputs")
-    result = model.pipeline_generate(input_data)
+    if not use_triton:
+        result = model.pipeline_generate(input_data)
+    else:
+        result = model.pipeline_generate(input_data, [64] * len(input_data))

     return Output().add(result)
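The handler change is the heart of the new path: when use_triton is set, pipeline_generate takes a second argument giving a maximum output length per input sequence, fixed at 64 tokens here. A small illustration of the request body the handler parses and the dispatch it performs (the prompts are made up; the "inputs" key and the [64] * len(...) lengths come directly from the handler above):

# Example JSON body, as retrieved by inputs.get_as_json() in handle():
payload = {"inputs": ["Hello, my name is", "DJL Serving runs"]}
input_data = payload["inputs"]

# use_triton=False: the FasterTransformer backend picks generation defaults.
#     result = model.pipeline_generate(input_data)
# use_triton=True: an explicit max output length accompanies each sequence.
output_lengths = [64] * len(input_data)  # -> [64, 64]
#     result = model.pipeline_generate(input_data, output_lengths)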
15 changes: 12 additions & 3 deletions tests/integration/llm/prepare.py
@@ -180,6 +180,12 @@
         "option.model_id": "s3://djl-llm/flan-t5-xxl/",
         "option.tensor_parallel_degree": 4,
         "option.dtype": "fp32"
     },
+    "EleutherAI/pythia-2.8b": {
+        "option.model_id": "s3://djl-llm/pythia-2.8b/",
+        "option.tensor_parallel_degree": 2,
+        "option.dtype": "fp16",
+        "gpu.maxWorkers": 1
+    }
 }

@@ -203,10 +209,11 @@
         "option.dtype": "fp16",
         "gpu.maxWorkers": 1,
     },
-    "flan-t5-xxl": {
-        "option.model_id": "s3://djl-llm/flan-t5-xxl/",
+    "nomic-ai/gpt4all-j": {
+        "option.model_id": "s3://djl-llm/gpt4all-j/",
         "option.tensor_parallel_degree": 4,
-        "option.dtype": "fp32"
+        "option.dtype": "fp32",
+        "option.use_triton": True
     }
 }

@@ -409,6 +416,8 @@ def build_ft_raw_model(model):
     )
     options = ft_model_list[model]
     options["engine"] = "FasterTransformer"
+    if "option.use_triton" in options and options["option.use_triton"]:
+        options["engine"] = "Python"
     write_properties(options)
     shutil.copyfile("llm/fastertransformer-model.py", "models/test/model.py")
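The engine override in build_ft_raw_model is what makes option.use_triton effective: entries carrying it are served through the Python engine with the copied model.py handler instead of the native FasterTransformer engine. For reference, a sketch of what write_properties plausibly emits for the nomic-ai/gpt4all-j entry (the helper's exact output format is an assumption; the key/value pairs come from the dicts above):

def write_properties(options, path="models/test/serving.properties"):
    # Hypothetical re-implementation of the helper used above: serialize
    # every option as one key=value line, the usual serving.properties form.
    with open(path, "w") as f:
        for key, value in options.items():
            f.write(f"{key}={value}\n")

# For nomic-ai/gpt4all-j this would produce roughly:
#     engine=Python
#     option.model_id=s3://djl-llm/gpt4all-j/
#     option.tensor_parallel_degree=4
#     option.dtype=fp32
#     option.use_triton=True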
