Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add DashScope API based multimodal service functions in AgentScope library #255

Merged
merged 11 commits into from
Jun 5, 2024
8 changes: 4 additions & 4 deletions examples/conversation_with_customized_services/main.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@
" weather_data = weather.run(f\"{city},{country}\")\n",
" return ServiceResponse(ServiceExecStatus.SUCCESS, weather_data)\n",
" except Exception as e:\n",
" return ServiceResponse(ServiceExecStatus.FAILURE, str(e))"
" return ServiceResponse(ServiceExecStatus.ERROR, str(e))"
]
},
{
Expand Down Expand Up @@ -408,7 +408,7 @@
" return ServiceResponse(ServiceExecStatus.SUCCESS, {\"urls\": urls})\n",
" else:\n",
" err_msg = f\"status_code: {response.status_code}, code: {response.code}, message: {response.message}\"\n",
" return ServiceResponse(ServiceExecStatus.FAILURE, err_msg)"
" return ServiceResponse(ServiceExecStatus.ERROR, err_msg)"
]
},
{
Expand Down Expand Up @@ -457,7 +457,7 @@
" return ServiceResponse(ServiceExecStatus.SUCCESS, description)\n",
" else:\n",
" err_msg = f\"status_code: {response.status_code}, code: {response.code}, message: {response.message}\"\n",
" return ServiceResponse(ServiceExecStatus.FAILURE, err_msg) \n",
" return ServiceResponse(ServiceExecStatus.ERROR, err_msg) \n",
" "
]
},
Expand Down Expand Up @@ -500,7 +500,7 @@
" f.write(result.get_audio_data())\n",
" return ServiceResponse(ServiceExecStatus.SUCCESS, 'output.wav')\n",
" else:\n",
" return ServiceResponse(ServiceExecStatus.FAILURE, \"Failed to generate audio file\")"
" return ServiceResponse(ServiceExecStatus.ERROR, \"Failed to generate audio file\")"
]
},
{
Expand Down
12 changes: 7 additions & 5 deletions src/agentscope/models/dashscope_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -606,7 +606,9 @@ def __call__(
messages=messages,
**kwargs,
)

# Unhandle code path here
# response could be a generator , if stream is yes
# suggest add a check here
if response.status_code != HTTPStatus.OK:
error_msg = (
f" Request id: {response.request_id},"
Expand Down Expand Up @@ -770,7 +772,7 @@ def format(
for i, unit in enumerate(input_msgs):
if i == 0 and unit.role == "system":
# system prompt
content = self._convert_url(unit.url)
content = self.convert_url(unit.url)
content.append({"text": _convert_to_str(unit.content)})

messages.append(
Expand All @@ -785,7 +787,7 @@ def format(
f"{unit.name}: {_convert_to_str(unit.content)}",
)
# image and audio
image_or_audio_dicts.extend(self._convert_url(unit.url))
image_or_audio_dicts.extend(self.convert_url(unit.url))

dialogue_history = "\n".join(dialogue)

Expand All @@ -808,7 +810,7 @@ def format(

return messages

def _convert_url(self, url: Union[str, Sequence[str], None]) -> List[dict]:
def convert_url(self, url: Union[str, Sequence[str], None]) -> List[dict]:
"""Convert the url to the format of DashScope API. Note for local
files, a prefix "file://" will be added.

Expand Down Expand Up @@ -841,7 +843,7 @@ def _convert_url(self, url: Union[str, Sequence[str], None]) -> List[dict]:
elif isinstance(url, list):
dicts = []
for _ in url:
dicts.extend(self._convert_url(_))
dicts.extend(self.convert_url(_))
return dicts
else:
raise TypeError(
Expand Down
8 changes: 8 additions & 0 deletions src/agentscope/service/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@
dblp_search_authors,
dblp_search_venues,
)
from .multi_modality.dashscope_services import (
dashscope_image_to_text,
dashscope_text_to_image,
dashscope_text_to_audio,
)
from .service_response import ServiceResponse
from .service_toolkit import ServiceToolkit
from .service_toolkit import ServiceFactory
Expand Down Expand Up @@ -78,6 +83,9 @@ def get_help() -> None:
"dblp_search_publications",
"dblp_search_authors",
"dblp_search_venues",
"dashscope_image_to_text",
"dashscope_text_to_image",
"dashscope_text_to_audio",
# to be deprecated
"ServiceFactory",
]
289 changes: 289 additions & 0 deletions src/agentscope/service/multi_modality/dashscope_services.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
# -*- coding: utf-8 -*-
"""Use DashScope API to generate images,
convert text to audio, and convert images to text.
Please refer to the official documentation for more details:
https://dashscope.aliyun.com/
"""

from typing import Union, Optional, Literal, Sequence

import os

import dashscope
from dashscope.audio.tts import SpeechSynthesizer

from agentscope.models import (
DashScopeImageSynthesisWrapper,
DashScopeMultiModalWrapper,
)

# SpeechSynthesizerWrapper is current not available


from agentscope.service.service_response import (
ServiceResponse,
ServiceExecStatus,
)
from agentscope.utils.tools import _download_file


def dashscope_text_to_image(
prompt: str,
api_key: str,
n: int = 1,
size: Literal["1024*1024", "720*1280", "1280*720"] = "1024*1024",
model: str = "wanx-v1",
save_dir: Optional[str] = None,
) -> ServiceResponse:
"""Generate image(s) based on the given prompt, and return image url(s).

Args:
prompt (`str`):
The text prompt to generate image.
api_key (`str`):
The api key for the dashscope api.
n (`int`, defaults to `1`):
The number of images to generate.
size (`Literal["1024*1024", "720*1280", "1280*720"]`, defaults to
`"1024*1024"`):
Size of the image.
model (`str`, defaults to '"wanx-v1"'):
The model to use.
save_dir (`Optional[str]`, defaults to 'None'):
The directory to save the generated images. If not specified,
will return the web urls.

Returns:
ServiceResponse:
A dictionary with two variables: `status` and`content`.
If `status` is ServiceExecStatus.SUCCESS,
the `content` is a dict with key 'fig_paths" and
value is a list of the paths to the generated images.

Example:

.. code-block:: python

prompt = "A beautiful sunset in the mountains"
print(dashscope_text_to_image(prompt, "{api_key}"))

> {
> 'status': 'SUCCESS',
> 'content': {'image_urls': ['IMAGE_URL1', 'IMAGE_URL2']}
> }

"""
text2img = DashScopeImageSynthesisWrapper(
config_name="dashscope-text-to-image-service", # Just a placeholder
model_name=model,
api_key=api_key,
)
try:
res = text2img(
prompt=prompt,
n=n,
size=size,
)
urls = res.image_urls

# save images to save_dir
if urls is not None:
if save_dir:
os.makedirs(save_dir, exist_ok=True)
urls_local = []
# Obtain the image file names in the url
for url in urls:
image_name = url.split("/")[-1]
image_path = os.path.join(save_dir, image_name)
# Download the image
_download_file(url, image_path)
urls_local.append(image_path)

return ServiceResponse(
ServiceExecStatus.SUCCESS,
{"image_urls": urls_local}
)
else:
# Return the web urls
return ServiceResponse(
ServiceExecStatus.SUCCESS,
{"image_urls": urls}
)
else:
return ServiceResponse(
ServiceExecStatus.ERROR,
"Error: Failed to generate images",
)
except Exception as e:
return ServiceResponse(
ServiceExecStatus.ERROR,
str(e),
)


def dashscope_image_to_text(
image_urls: Union[str, Sequence[str]],
api_key: str,
prompt: str = "Describe the image",
model: str = "qwen-vl-plus",
) -> ServiceResponse:
"""Generate text based on the given images.

Args:
image_urls (`Union[str, Sequence[str]]`):
The url of single or multiple images.
api_key (`str`):
The api key for the dashscope api.
prompt (`str`, defaults to 'Describe the image' ):
The text prompt.
model (`str`, defaults to 'qwen-vl-plus'):
The model to use in DashScope MultiModal API.

Returns:
`ServiceResponse`:
A dictionary with two variables: `status` and`content`.
If `status` is ServiceExecStatus.SUCCESS, the `content` is the
generated text.

Example:

.. code-block:: python

image_url = "image.jpg"
prompt = "Describe the image"
print(image_to_text(image_url, prompt))

> {'status': 'SUCCESS', 'content': 'A beautiful sunset in the mountains'}

"""

img2text = DashScopeMultiModalWrapper(
config_name="dashscope-image-to-text-service", # Just a placeholder
model_name=model,
api_key=api_key,
)

if isinstance(image_urls, str):
image_urls = [image_urls]

# Check if the local url is valid
img_abs_urls = []
for url in image_urls:
if os.path.exists(url):
if os.path.isfile(url):
img_abs_urls.append(os.path.abspath(url))
else:
return ServiceResponse(
ServiceExecStatus.ERROR,
f'Error: The input image url "{url}" is not a file.'
)
else:
# Maybe a web url or an invalid url, we leave it to the API
# to handle
img_abs_urls.append(url)

# Convert image paths according to the model requirements
contents = img2text.convert_url(img_abs_urls)
contents.append({"text": prompt})
# currently only support one round of conversation
# if multiple rounds of conversation are needed,
# it would be better to implement an Agent class
sys_message = {
"role": "system",
"content": [{"text": "You are a helpful assistant."}],
}
user_message = {
"role": "user",
"content": contents,
}
messages = [sys_message, user_message]
try:
res = img2text(messages, stream=False)
description = res.text
if description is not None:
return ServiceResponse(
ServiceExecStatus.SUCCESS,
description,
)
else:
return ServiceResponse(
ServiceExecStatus.ERROR,
"Error: Failed to generate text",
)
except Exception as e:
return ServiceResponse(
ServiceExecStatus.ERROR,
str(e),
)


def dashscope_text_to_audio(
text: str,
api_key: str,
save_dir: str,
model: str = "sambert-zhichu-v1",
sample_rate: int = 48000,
) -> ServiceResponse:
"""Convert the given text to audio.

Args:
text (`str`):
The text to be converted into audio.
api_key (`str`):
The api key for the dashscope API.
save_dir (`str`):
The directory to save the generated audio.
model (`str`, defaults to 'sambert-zhichu-v1'):
The model to use. Full model list can be found in
https://help.aliyun.com/zh/dashscope/model-list
sample_rate (`int`, defaults to 48000):
Samplerate of the audio.

Returns:
`ServiceResponse`:
A dictionary with two variables: `status` and`content`. If
`status` is ServiceExecStatus.SUCCESS, the `content` contains
a dictionary with key "audio_path" and value is the path to
the generated audio.

Example:

.. code-block:: python

text = "How is the weather today?"
print(text_to_audio(text)) gives:


> {'status': 'SUCCESS', 'content': {"audio_path": "AUDIO_PATH"}}

"""
dashscope.api_key = api_key

res = SpeechSynthesizer.call(
model=model,
text=text,
sample_rate=sample_rate,
format="wav",
)

audio_data = res.get_audio_data()

if audio_data is not None:
if save_dir is not None:
os.makedirs(save_dir, exist_ok=True)

# Save locally
text = text[0:15] if len(text) > 15 else text
audio_path = os.path.join(save_dir, f"{text.strip()}.wav")

with open(audio_path, "wb") as f:
f.write(audio_data)
return ServiceResponse(
ServiceExecStatus.SUCCESS,
{"audio_path": audio_path},
)
else:
return ServiceResponse(
ServiceExecStatus.ERROR,
"Error: Failed to generate audio"
)
Loading
Loading