forked from runpod/skypilot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
serve.yaml
36 lines (28 loc) · 979 Bytes
/
serve.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# SkyPilot task: serve a LLaMA-65B model with the vLLM API server and a
# Gradio web front-end on a single 8x A100-80GB node.

envs:
  # HuggingFace model ID to serve; can be overridden at launch time.
  MODEL_NAME: decapoda-research/llama-65b-hf

resources:
  # 8 GPUs on one node; $SKYPILOT_NUM_GPUS_PER_NODE picks this up in `run`.
  accelerators: A100-80GB:8

setup: |
  # Reuse the conda env when it already exists; create it on first provision.
  conda activate vllm
  if [ $? -ne 0 ]; then
    conda create -n vllm python=3.9 -y
    conda activate vllm
  fi
  # `|| true` keeps re-runs idempotent when the clone is already present.
  git clone https://github.com/vllm-project/vllm.git || true
  # Install fschat and accelerate for chat completion
  pip install fschat
  pip install accelerate
  cd vllm
  # Install vLLM from source only if it is not already installed.
  pip list | grep vllm || pip install .
  pip install gradio

run: |
  conda activate vllm
  echo 'Starting vllm api server...'
  python -u -m vllm.entrypoints.api_server \
    --model $MODEL_NAME \
    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
    --tokenizer hf-internal-testing/llama-tokenizer 2>&1 | tee api_server.log &
  echo 'Waiting for vllm api server to start...'
  # Poll the server log until uvicorn reports it is listening.
  while ! grep -q 'Uvicorn running on' api_server.log; do sleep 1; done
  echo 'Starting gradio server...'
  python vllm/examples/gradio_webserver.py