From 8dc146db4dd02fa5314fe8fcedcf56a6a77db936 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 25 Jun 2024 10:36:26 -0700 Subject: [PATCH 1/2] add example --- docs/source/models/vlm.rst | 2 + examples/openai_vision_api_client.py | 77 ++++++++++++++++++++++++++++ vllm/multimodal/utils.py | 12 +++-- 3 files changed, 88 insertions(+), 3 deletions(-) create mode 100644 examples/openai_vision_api_client.py diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index de55a1a099192..1837dd2aa89f7 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -130,6 +130,8 @@ To consume the server, you can use the OpenAI client like in the example below: ) print("Chat response:", chat_response) +A full code example can be found in `examples/openai_vision_api_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_vision_api_client.py>`_. + .. note:: By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable: diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py new file mode 100644 index 0000000000000..2deb411c65114 --- /dev/null +++ b/examples/openai_vision_api_client.py @@ -0,0 +1,77 @@ +"""An example showing how to use vLLM to serve VLMs. + +Launch the vLLM server with the following command: +python -m vllm.entrypoints.openai.api_server \ + --model llava-hf/llava-1.5-7b-hf \ + --image-input-type pixel_values \ + --image-token-id 32000 \ + --image-input-shape 1,3,336,336 \ + --image-feature-size 576 \ + --chat-template template_llava.jinja +""" +import requests +import base64 +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. 
+openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + +image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + +# Use image url in the payload +chat_completion_from_url = client.chat.completions.create( + messages=[{ + "role": "user", + "content": [ + {"type": "text", "text": "What’s in this image?"}, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + }, + ], + }], + model=model, +) + +print(f"Chat completion results:{chat_completion_from_url.choices[0].message.content}") + +# Use base64 encoded image in the payload +def encode_image_base64_from_url(image_url: str) -> str: + """Encode an image retrieved from a remote url to base64 format.""" + + with requests.get(image_url) as response: + response.raise_for_status() + result = base64.b64encode(response.content).decode('utf-8') + + return result + +image_base64 = encode_image_base64_from_url(image_url=image_url) +chat_completion_from_base64 = client.chat.completions.create( + messages=[{ + "role": "user", + "content": [ + {"type": "text", "text": "What’s in this image?"}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + }, + }, + ], + }], + model=model, +) + +print(f"Chat completion results: {chat_completion_from_base64.choices[0].message.content}") diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 509f791d27c6f..0cf2c057f892c 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,6 +1,7 @@ import base64 from io import BytesIO from typing import Optional, Union +from urllib.parse import urlparse import aiohttp from PIL import Image @@ -28,6 +29,10 @@ async def fetch_image(cls, image_url: str) 
-> Image.Image: """Load PIL image from a url or base64 encoded openai GPT4V format""" if image_url.startswith('http'): + parsed_url = urlparse(image_url) + if parsed_url.scheme not in ["http", "https"]: + raise ValueError("Invalid 'image_url': A valid 'image_url' " + "must have scheme 'http' or 'https'.") # Avoid circular import from vllm import __version__ as VLLM_VERSION @@ -44,8 +49,9 @@ async def fetch_image(cls, image_url: str) -> Image.Image: image = load_image_from_base64(image_url.split(',', 1)[1]) else: - raise ValueError("Invalid image url: A valid image url must start " - "with either 'data:image' or 'http'.") + raise ValueError( + "Invalid 'image_url': A valid 'image_url' must start " + "with either 'data:image' or 'http'.") return image @@ -56,7 +62,7 @@ async def async_get_and_parse_image(image_url: str) -> ImagePixelData: def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str: - """encode image to base64 format.""" + """Encode a pillow image to base64 format.""" buffered = BytesIO() if format == 'JPEG': From c166805c98f31ca2b9d46ae660166feaa0d55c79 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 25 Jun 2024 10:45:56 -0700 Subject: [PATCH 2/2] format --- examples/openai_vision_api_client.py | 59 +++++++++++++++++----------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py index 2deb411c65114..26f2aa651fca7 100644 --- a/examples/openai_vision_api_client.py +++ b/examples/openai_vision_api_client.py @@ -9,8 +9,9 @@ --image-feature-size 576 \ --chat-template template_llava.jinja """ -import requests import base64 + +import requests from openai import OpenAI # Modify OpenAI's API key and API base to use vLLM's API server. 
@@ -31,21 +32,27 @@ # Use image url in the payload chat_completion_from_url = client.chat.completions.create( messages=[{ - "role": "user", - "content": [ - {"type": "text", "text": "What’s in this image?"}, - { - "type": "image_url", - "image_url": { - "url": image_url - }, - }, - ], + "role": + "user", + "content": [ + { + "type": "text", + "text": "What’s in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + }, + ], }], model=model, ) -print(f"Chat completion results:{chat_completion_from_url.choices[0].message.content}") +result = chat_completion_from_url.choices[0].message.content +print(f"Chat completion output:{result}") + # Use base64 encoded image in the payload def encode_image_base64_from_url(image_url: str) -> str: @@ -57,21 +64,27 @@ def encode_image_base64_from_url(image_url: str) -> str: return result + image_base64 = encode_image_base64_from_url(image_url=image_url) chat_completion_from_base64 = client.chat.completions.create( messages=[{ - "role": "user", - "content": [ - {"type": "text", "text": "What’s in this image?"}, - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_base64}" - }, - }, - ], + "role": + "user", + "content": [ + { + "type": "text", + "text": "What’s in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + }, + }, + ], }], model=model, ) -print(f"Chat completion results: {chat_completion_from_base64.choices[0].message.content}") +result = chat_completion_from_base64.choices[0].message.content +print(f"Chat completion output:{result}")