Your current environment

🐛 Describe the bug

If one sends a request with n>1 to a server with speculative decoding enabled, the request will fail with an unhelpful error message.

To reproduce, start an inference server with speculative decoding enabled.
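A minimal sketch of such a launch, for illustration only — the model names and token count below are placeholders rather than the command from this report; the relevant flags are --speculative-model and --num-speculative-tokens:

python -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Llama-2-7b-chat-hf \
    --speculative-model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
    --num-speculative-tokens 5 \
    --use-v2-block-manager

Then send a request via: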
from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

# Completion API
stream = True
completion = client.completions.create(
    model=model,
    prompt="A robot may not injure a human being",
    echo=False,
    n=2,  # n > 1 is what triggers the failure with speculative decoding enabled
    stream=stream,
)

print("Completion results:")
if stream:
    for c in completion:
        print(c)
else:
    print(completion)
The client will see:
Traceback (most recent call last):
File "/home/user/vllm/send_request.py", line 27, in <module>
for c in completion:
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/openai/_streaming.py", line 46, in __iter__
for item in self._iterator:
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/openai/_streaming.py", line 58, in __stream__
for sse in iterator:
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/openai/_streaming.py", line 50, in _iter_events
yield from self._decoder.iter_bytes(self.response.iter_bytes())
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/openai/_streaming.py", line 280, in iter_bytes
for chunk in self._iter_chunks(iterator):
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/openai/_streaming.py", line 291, in _iter_chunks
for chunk in iterator:
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/httpx/_models.py", line 829, in iter_bytes
for raw_bytes in self.iter_raw():
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/httpx/_models.py", line 883, in iter_raw
for raw_stream_bytes in self.stream:
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/httpx/_client.py", line 126, in __iter__
for chunk in self._stream:
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/httpx/_transports/default.py", line 112, in __iter__
with map_httpcore_exceptions():
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/contextlib.py", line 158, in __exit__
self.gen.throw(typ, value, traceback)
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/httpx/_transports/default.py", line 86, in map_httpcore_exceptions
raise mapped_exc(message) from exc
httpx.RemoteProtocolError: peer closed connection without sending complete message body (incomplete chunked read)
and the error on the server-side is:
| Traceback (most recent call last):
| File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/starlette/responses.py", line 261, in wrap
| await func()
| File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/starlette/responses.py", line 250, in stream_response
| async for chunk in self.body_iterator:
| File "/home/user/vllm/vllm/entrypoints/openai/serving_completion.py", line 222, in completion_stream_generator
| async for prompt_idx, res in result_generator:
| File "/home/user/vllm/vllm/utils.py", line 319, in consumer
| raise e
| File "/home/user/vllm/vllm/utils.py", line 310, in consumer
| raise item
| File "/home/user/vllm/vllm/utils.py", line 294, in producer
| async for item in iterator:
| File "/home/user/vllm/vllm/engine/async_llm_engine.py", line 746, in generate
| async for output in self._process_request(
| File "/home/user/vllm/vllm/engine/async_llm_engine.py", line 859, in _process_request
| raise e
| File "/home/user/vllm/vllm/engine/async_llm_engine.py", line 855, in _process_request
| async for request_output in stream:
| File "/home/user/vllm/vllm/engine/async_llm_engine.py", line 90, in __anext__
| raise result
| File "/home/user/vllm/vllm/engine/async_llm_engine.py", line 43, in _log_task_completion
| return_value = task.result()
| ^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/engine/async_llm_engine.py", line 595, in run_engine_loop
| result = task.result()
| ^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/engine/async_llm_engine.py", line 540, in engine_step
| request_outputs = await self.engine.step_async(virtual_engine)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/engine/async_llm_engine.py", line 241, in step_async
| output = await self.model_executor.execute_model_async(
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/executor/gpu_executor.py", line 122, in execute_model_async
| output = await make_async(self.driver_worker.execute_model
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/miniforge3/envs/dev-env/lib/python3.11/concurrent/futures/thread.py", line 58, in run
| result = self.fn(*self.args, **self.kwargs)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
| return func(*args, **kwargs)
| ^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/spec_decode/spec_decode_worker.py", line 338, in execute_model
| return self._run_no_spec(execute_model_req,
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/miniforge3/envs/dev-env/lib/python3.11/contextlib.py", line 81, in inner
| return func(*args, **kwds)
| ^^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/spec_decode/spec_decode_worker.py", line 386, in _run_no_spec
| sampler_output = self.scorer_worker.execute_model(execute_model_req)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/worker/worker_base.py", line 271, in execute_model
| output = self.model_runner.execute_model(
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
| return func(*args, **kwargs)
| ^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/worker/model_runner.py", line 1245, in execute_model
| output: SamplerOutput = self.model.sample(
| ^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/model_executor/models/llama.py", line 416, in sample
| next_tokens = self.sampler(logits, sampling_metadata)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
| return self._call_impl(*args, **kwargs)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
| return forward_call(*args, **kwargs)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/model_executor/layers/sampler.py", line 96, in forward
| sample_results, maybe_sampled_tokens_tensor = _sample(
| ^^^^^^^^
| File "/home/user/vllm/vllm/model_executor/layers/sampler.py", line 658, in _sample
| return _sample_with_torch(
| ^^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/model_executor/layers/sampler.py", line 528, in _sample_with_torch
| sampled_token_ids_tensor[
| RuntimeError: shape mismatch: value tensor of shape [2] cannot be broadcast to indexing result of shape [1, 1]
+------------------------------------
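The failure surfaces in the sampler's final write of sampled token ids: the indexed destination slice has shape [1, 1] while the value tensor has shape [2], which suggests the destination was sized for a single sampled token per sequence while n=2 produced two. The standalone PyTorch snippet below — not vLLM code, just an illustration of the failing assignment pattern with placeholder shapes — raises the same error:

import torch

# Destination sized for one sampled token per sequence (placeholder shapes).
sampled_token_ids = torch.zeros(4, 1, dtype=torch.long)

seq_index = torch.tensor([0])        # one destination row -> indexing result shape [1, 1]
new_tokens = torch.tensor([11, 22])  # two sampled ids (n=2) -> value tensor shape [2]

# Raises: RuntimeError: shape mismatch: value tensor of shape [2] cannot be
# broadcast to indexing result of shape [1, 1]
sampled_token_ids[seq_index] = new_tokens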