diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index fda228b..a45dba5 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -22,7 +22,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install black ruff
+        python -m pip install ruff

     - name: Check quality
       run: |
diff --git a/Makefile b/Makefile
index fec4443..26287ce 100644
--- a/Makefile
+++ b/Makefile
@@ -3,9 +3,9 @@
 check_dirs := src tests

 quality:
-	black --check $(check_dirs)
 	ruff $(check_dirs)
+	ruff format --check $(check_dirs)

 style:
-	black $(check_dirs)
 	ruff $(check_dirs) --fix
+	ruff format $(check_dirs)
diff --git a/README.md b/README.md
index e25a597..c89e611 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,40 @@
 pip install -U imitater
 python -m imitater.service.app -c config/example.yaml
 ```
+<details><summary>Show configuration instructions.</summary>
+
+Add an OpenAI model.
+
+```yaml
+- name: Display name
+- token: OpenAI token
+```
+
+Add a chat model.
+
+```yaml
+- name: Display name
+- path: Model name on hub or model path
+- device: Device IDs
+- port: Port ID
+- maxlen: Maximum model length (optional)
+- agent_type: Agent type (optional) {react, aligned}
+- template: Template jinja file (optional)
+- gen_config: Generation config folder (optional)
+```
+
+Add an embedding model.
+
+```yaml
+- name: Display name
+- path: Model name on hub or model path
+- device: Device IDs (does not support multiple GPUs)
+- port: Port ID
+- batch_size: Batch size (optional)
+```
+
+</details>
+
 > [!NOTE]
 > [Chat template](https://huggingface.co/docs/transformers/chat_templating) is required for the chat models.
 >
diff --git a/pyproject.toml b/pyproject.toml
index 796f03e..e5aa684 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,30 +2,21 @@
 requires = ["setuptools>=61.0"]
 build-backend = "setuptools.build_meta"

-[tool.black]
+[tool.ruff]
+target-version = "py310"
 line-length = 119
-target-version = ["py310"]
+indent-width = 4

-[tool.ruff]
+[tool.ruff.lint]
 ignore = ["C901", "E501", "E741", "W605"]
 select = ["C", "E", "F", "I", "W"]
-line-length = 119

-[tool.ruff.isort]
+[tool.ruff.lint.isort]
 lines-after-imports = 2
+known-third-party = ["infinity_emb", "openai", "torch", "transformers", "vllm"]

-[isort]
-default_section = "FIRSTPARTY"
-known_third_party = [
-    "infinity_emb",
-    "torch",
-    "transformers",
-    "vllm"
-]
-line_length = 119
-lines_after_imports = 2
-multi_line_output = 3
-include_trailing_comma = true
-force_grid_wrap = 0
-use_parentheses = true
-ensure_newline_before_comments = true
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+skip-magic-trailing-comma = false
+line-ending = "auto"
diff --git a/requirements.txt b/requirements.txt
index ed6c436..78a603d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 numpy
 sse-starlette
-infinity-emb[torch]
+infinity-emb[torch]==0.0.17
 openai>=1.5.0
 transformers>=4.37.2
 vllm>=0.3.0
diff --git a/src/imitater/__init__.py b/src/imitater/__init__.py
index 3ced358..b5fdc75 100644
--- a/src/imitater/__init__.py
+++ b/src/imitater/__init__.py
@@ -1 +1 @@
-__version__ = "0.2.1"
+__version__ = "0.2.2"
diff --git a/src/imitater/agent/types.py b/src/imitater/agent/types.py
index d17020e..db3a3c5 100644
--- a/src/imitater/agent/types.py
+++ b/src/imitater/agent/types.py
@@ -29,8 +29,7 @@ def extract_tool(self, answer: str, tools: List[Dict[str, Any]]) -> Union[str, T
             tools: the tool specification in the OpenAI format.

         Returns:
-            name, arguments (if tool call exists): the tool name with JSON formatted arguments.
-            response (if tool call does not exist): the assistant response.
+            response | (name, arguments): response text or tool name with JSON arguments if tool exists.
         """
         ...
diff --git a/src/imitater/model/chat_model.py b/src/imitater/model/chat_model.py
index 86a5f3e..b8969c6 100644
--- a/src/imitater/model/chat_model.py
+++ b/src/imitater/model/chat_model.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass, fields
-from typing import TYPE_CHECKING, Any, AsyncIterator, Dict, Generator, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Tuple, Union

 from transformers import AutoTokenizer, GenerationConfig
 from typing_extensions import Self
@@ -17,6 +17,14 @@

 @dataclass
 class ChatConfig:
+    r"""
+    Creates configuration for a chat model.
+
+    Methods:
+        add_cli_args: adds arguments to an argument parser.
+        from_cli_args: builds configuration based on the command line arguments.
+    """
+
     name: str
     path: str
     device: List[int]
@@ -44,6 +52,15 @@ def from_cli_args(cls, args: "Namespace") -> Self:


 class ChatModel:
+    r"""
+    Creates a chat model for chat completions.
+
+    Methods:
+        chat: generates chat completions.
+        stream_chat: streams chat completions.
+        function_call: generates tool calls.
+    """
+
     def __init__(self, config: "ChatConfig") -> None:
         config.path = try_download_model_from_ms(config.path)
         self.config = config
@@ -89,7 +106,7 @@ def _load_generation_config(self) -> None:
             self._generation_config.top_p = 1.0

         if not self._generation_config.max_new_tokens:
-            self._generation_config.max_new_tokens = 1024
+            self._generation_config.max_new_tokens = 2048

         if isinstance(self._generation_config.eos_token_id, int):
             self._generation_config.eos_token_id = [self._generation_config.eos_token_id]
@@ -121,6 +138,22 @@ async def _generate(
         return result_generator

     async def chat(self, messages: List[Dict[str, str]], request_id: str, **gen_kwargs) -> Tuple[str, int, int]:
+        r"""
+        Generates chat completions.
+
+        Args:
+            messages: input messages.
+            request_id: request ID.
+            temperature: generation parameter.
+            top_p: generation parameter.
+            max_tokens: generation parameter.
+            stop_token_ids: generation parameter.
+
+        Returns:
+            generated_text: the generated text.
+            prompt_tokens: the number of prompt tokens.
+            completion_tokens: the number of completion tokens.
+        """
         generated_text, prompt_tokens, completion_tokens = "", 0, 0
         generator = await self._generate(messages, request_id, **gen_kwargs)
         async for result in generator:
@@ -133,7 +166,21 @@ async def chat(self, messages: List[Dict[str, str]], request_id: str, **gen_kwar

     async def stream_chat(
         self, messages: List[Dict[str, str]], request_id: str, **gen_kwargs
-    ) -> Generator[str, None, None]:
+    ) -> AsyncGenerator[str, None]:
+        r"""
+        Streams chat completions.
+
+        Args:
+            messages: input messages.
+            request_id: request ID.
+            temperature: generation parameter.
+            top_p: generation parameter.
+            max_tokens: generation parameter.
+            stop_token_ids: generation parameter.
+
+        Yields:
+            generated_token: the generated token.
+        """
         generated_text = ""
         generator = await self._generate(messages, request_id, **gen_kwargs)
         async for result in generator:
@@ -144,21 +191,33 @@ async def stream_chat(
     async def function_call(
         self, messages: List[Dict[str, str]], tools: List[Dict[str, Any]], request_id: str, **gen_kwargs
     ) -> Tuple[Union[str, Tuple[str, str]], int, int]:
-        generated_text, prompt_tokens, completion_tokens = "", 0, 0
+        r"""
+        Generates tool calls.
+
+        Args:
+            messages: input messages.
+            tools: tools available.
+            request_id: request ID.
+            temperature: generation parameter.
+            top_p: generation parameter.
+            max_tokens: generation parameter.
+            stop_token_ids: generation parameter.
+
+        Returns:
+            response | (name, arguments): response text or tool name with JSON arguments if tool exists.
+            prompt_tokens: the number of prompt tokens.
+            completion_tokens: the number of completion tokens.
+        """
         agent_messages = self._agent.build_prompt(messages, tools)
         stop_word = self._agent.get_stop_word()
         if stop_word is not None:
-            gen_kwargs["stop_token_ids"] = [self._tokenizer.encode(stop_word)[0]]
+            stop_word_id = self._tokenizer.convert_tokens_to_ids(self._tokenizer.tokenize(stop_word)[0])
+            gen_kwargs["stop_token_ids"] = gen_kwargs.pop("stop_token_ids", []) + [stop_word_id]

-        generator = await self._generate(agent_messages, request_id, **gen_kwargs)
-        async for result in generator:
-            if result.finished:
-                generated_text = result.outputs[0].text
-                prompt_tokens = len(result.prompt_token_ids)
-                completion_tokens = len(result.outputs[0].token_ids)
+        generated_text, prompt_tokens, completion_tokens = await self.chat(agent_messages, request_id, **gen_kwargs)

         if stop_word is not None:
-            stop_token = self._tokenizer.decode(gen_kwargs["stop_token_ids"])
+            stop_token = self._tokenizer.convert_ids_to_tokens(stop_word_id)
             if generated_text.endswith(stop_token):
                 generated_text = generated_text[: -len(stop_token)]
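The stop-word handling above replaces `encode(stop_word)[0]` with a tokenize-and-convert round trip and merges the resulting id into any caller-supplied `stop_token_ids`. A standalone sketch of those tokenizer calls, with a placeholder model and stop word (neither is taken from this patch):

```python
from transformers import AutoTokenizer

# Placeholder model and stop word, for illustration only.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat")
stop_word = "Observation:"

# Map the first sub-token of the stop word to its id and register it as a stop token.
stop_word_id = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(stop_word)[0])
gen_kwargs = {}  # no stop ids supplied by the caller in this sketch
gen_kwargs["stop_token_ids"] = gen_kwargs.pop("stop_token_ids", []) + [stop_word_id]

# The reverse mapping recovers the exact sub-token string that gets trimmed from the output.
stop_token = tokenizer.convert_ids_to_tokens(stop_word_id)
print(gen_kwargs["stop_token_ids"], stop_token)
```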
diff --git a/src/imitater/model/embed_model.py b/src/imitater/model/embed_model.py
index a8c751d..113bca9 100644
--- a/src/imitater/model/embed_model.py
+++ b/src/imitater/model/embed_model.py
@@ -16,6 +16,14 @@

 @dataclass
 class EmbedConfig:
+    r"""
+    Creates configuration for an embedding model.
+
+    Methods:
+        add_cli_args: adds arguments to an argument parser.
+        from_cli_args: builds configuration based on the command line arguments.
+    """
+
     name: str
     path: str
     device: List[int]
@@ -37,6 +45,15 @@ def from_cli_args(cls, args: "Namespace") -> Self:


 class EmbedModel:
+    r"""
+    Creates an embedding model for text embeddings.
+
+    Methods:
+        startup: starts the embedding engine.
+        shutdown: stops the embedding engine.
+        embed: calculates text embeddings.
+    """
+
     def __init__(self, config: "EmbedConfig") -> None:
         config.path = try_download_model_from_ms(config.path)
         self.config = config
@@ -50,15 +67,30 @@ def _init_infinity_engine(self) -> None:
         self._engine = AsyncEmbeddingEngine(
             model_name_or_path=self.config.path,
             batch_size=self.config.batch_size,
-            engine="torch",
             device="cuda",
         )

     async def startup(self) -> None:
+        r"""
+        Starts the embedding engine.
+        """
         await self._engine.astart()

     async def shutdown(self) -> None:
+        r"""
+        Stops the embedding engine.
+        """
         await self._engine.astop()

     async def embed(self, texts: List[str]) -> Tuple[List["NDArray[float32]"], int]:
+        r"""
+        Calculates the text embeddings.
+
+        Args:
+            texts: the batched text input.
+
+        Returns:
+            embeddings: the batched embeddings.
+            usage: the number of input tokens.
+        """
         return await self._engine.embed(texts)
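As a quick orientation to the `EmbedModel` surface documented above, a minimal asynchronous usage sketch. The model path, device, port, and batch size are placeholder values, and the import assumes `imitater.model` exports `EmbedConfig` and `EmbedModel` the same way the service modules consume them.

```python
import asyncio

from imitater.model import EmbedConfig, EmbedModel


async def main() -> None:
    # Placeholder values; mirror an `embed` entry from config/example.yaml.
    config = EmbedConfig(name="bge-small", path="BAAI/bge-small-en-v1.5", device=[0], port=8030, batch_size=16)
    model = EmbedModel(config)

    await model.startup()  # start the async embedding engine
    embeddings, usage = await model.embed(["hello", "world"])
    print(len(embeddings), usage)  # one vector per input text, plus the token usage
    await model.shutdown()  # stop the engine


asyncio.run(main())
```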
diff --git a/src/imitater/service/app.py b/src/imitater/service/app.py
index 1f95a65..e9cb962 100644
--- a/src/imitater/service/app.py
+++ b/src/imitater/service/app.py
@@ -1,8 +1,9 @@
 import argparse
 import os
+from copy import deepcopy
 from subprocess import PIPE, STDOUT, Popen
 from threading import Thread
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import Any, AsyncGenerator, Dict, List, Union

 import uvicorn
 import yaml
@@ -12,14 +13,13 @@
 from openai.types.chat import ChatCompletion, ChatCompletionChunk
 from sse_starlette import EventSourceResponse

-from ..utils.generic import dictify, jsonify
+from ..utils.generic import dictify
+from .common import create_stream_chunk, print_subprocess_stdout
 from .protocol import (
     ChatCompletionMessage,
     ChatCompletionRequest,
     ChatCompletionResponse,
     ChatCompletionResponseChoice,
-    ChatCompletionStreamResponse,
-    ChatCompletionStreamResponseChoice,
     Embeddings,
     EmbeddingsRequest,
     EmbeddingsResponse,
@@ -33,25 +33,7 @@
 )


-def read_message(process: "Popen") -> None:
-    while process.stdout.readable():
-        line: bytes = process.stdout.readline()
-
-        if not line:
-            break
-
-        print(line.decode("utf-8").strip())
-
-
-def create_stream_chunk(
-    request_id: str, model: str, delta: "ChatCompletionMessage", finish_reason: Optional[Finish] = None
-) -> str:
-    choice = ChatCompletionStreamResponseChoice(index=0, delta=delta, finish_reason=finish_reason)
-    chunk = ChatCompletionStreamResponse(id=request_id, model=model, choices=[choice])
-    return jsonify(chunk)
-
-
-async def stream(response: AsyncStream[ChatCompletionChunk]) -> AsyncGenerator[str, None]:
+async def _stream_openai_chat_completion(response: AsyncStream[ChatCompletionChunk]) -> AsyncGenerator[str, None]:
     request_id, model = None, None
     async for chunk in response:
         if request_id is None:
@@ -67,10 +49,10 @@ async def stream(response: AsyncStream[ChatCompletionChunk]) -> AsyncGenerator[s
     yield "[DONE]"


-async def create_chat_completion(
-    request: "ChatCompletionRequest", chat_model: "AsyncOpenAI"
+async def _create_openai_chat_completion(
+    request: "ChatCompletionRequest", model: "AsyncOpenAI"
 ) -> "ChatCompletionResponse":
-    response: "ChatCompletion" = await chat_model.chat.completions.create(
+    response: "ChatCompletion" = await model.chat.completions.create(
         model=request.model,
         messages=[dictify(message) for message in request.messages],
         tools=[dictify(tool) for tool in request.tools] if request.tools is not None else None,
@@ -82,7 +64,7 @@
     )

     if request.stream:
-        return EventSourceResponse(stream(response), media_type="text/event-stream")
+        return EventSourceResponse(_stream_openai_chat_completion(response), media_type="text/event-stream")

     choices = []
     for i, choice in enumerate(response.choices):
@@ -123,8 +105,8 @@
     )


-async def create_embeddings(request: "EmbeddingsRequest", embed_model: "AsyncOpenAI"):
-    response = await embed_model.embeddings.create(
+async def _create_openai_embeddings(request: "EmbeddingsRequest", model: "AsyncOpenAI") -> "EmbeddingsResponse":
+    response = await model.embeddings.create(
         model=request.model,
         input=request.input,
         encoding_format=request.encoding_format,
@@ -145,6 +127,33 @@
     )


+def _launch_chat_server(chat_config: Dict[str, Any]) -> "Popen":
+    cmd = "python -m imitater.service.chat"
+    cmd += " --name {}".format(chat_config["name"])
+    cmd += " --path {}".format(chat_config["path"])
+    cmd += " --device {}".format(" ".join(map(str, chat_config["device"])))
+    cmd += " --port {}".format(chat_config["port"])
+    cmd += " --maxlen {}".format(chat_config["maxlen"]) if "maxlen" in chat_config else ""
+    cmd += " --agent_type {}".format(chat_config["agent_type"]) if "agent_type" in chat_config else ""
+    cmd += " --template {}".format(chat_config["template"]) if "template" in chat_config else ""
+    cmd += " --gen_config {}".format(chat_config["gen_config"]) if "gen_config" in chat_config else ""
+    env = deepcopy(os.environ)
+    env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, chat_config["device"]))
+    return Popen(cmd, env=env, shell=True, stdout=PIPE, stderr=STDOUT)
+
+
+def _launch_embed_server(embed_config: Dict[str, Any]) -> "Popen":
+    cmd = "python -m imitater.service.embed"
+    cmd += " --name {}".format(embed_config["name"])
+    cmd += " --path {}".format(embed_config["path"])
+    cmd += " --device {}".format(" ".join(map(str, embed_config["device"])))
+    cmd += " --port {}".format(embed_config["port"])
+    cmd += " --batch_size {}".format(embed_config["batch_size"]) if "batch_size" in embed_config else ""
+    env = deepcopy(os.environ)
+    env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, embed_config["device"]))
+    return Popen(cmd, env=env, shell=True, stdout=PIPE, stderr=STDOUT)
+
+
 def launch_server(config_file: str) -> None:
     with open(config_file, "r", encoding="utf-8") as f:
         config: Dict[str, Union[Dict[str, Any], List[Dict[str, Any]]]] = yaml.safe_load(f)
@@ -155,40 +164,22 @@
     processes: List["Popen"] = []
     for chat_config in config["chat"]:
-        cmd = "python -m imitater.service.chat"
-        cmd += " --name {}".format(chat_config.get("name"))
-        cmd += " --path {}".format(chat_config.get("path"))
-        cmd += " --device {}".format(" ".join(map(str, chat_config.get("device"))))
-        cmd += " --port {}".format(chat_config.get("port"))
-        if chat_config.get("maxlen", None):
-            cmd += " --maxlen {}".format(chat_config.get("maxlen"))
-        if chat_config.get("agent_type", None):
-            cmd += " --agent_type {}".format(chat_config.get("agent_type"))
-        if chat_config.get("template", None):
-            cmd += " --template {}".format(chat_config.get("template"))
-        if chat_config.get("gen_config", None):
-            cmd += " --gen_config {}".format(chat_config.get("gen_config"))
-        env = os.environ
-        env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, chat_config.get("device")))
-        processes.append(Popen(cmd, env=env, shell=True, stdout=PIPE, stderr=STDOUT))
-        chat_models[chat_config.get("name")] = AsyncOpenAI(
-            api_key="0", base_url="http://127.0.0.1:{}/v1".format(chat_config.get("port"))
-        )
+        if "token" in chat_config:
+            chat_models[chat_config["name"]] = AsyncOpenAI(api_key=chat_config["token"])
+        else:
+            processes.append(_launch_chat_server(chat_config))
+            chat_models[chat_config["name"]] = AsyncOpenAI(
+                api_key="0", base_url="http://localhost:{}/v1".format(chat_config["port"])
+            )

     for embed_config in config["embed"]:
-        cmd = "python -m imitater.service.embed"
-        cmd += " --name {}".format(embed_config.get("name"))
-        cmd += " --path {}".format(embed_config.get("path"))
-        cmd += " --device {}".format(" ".join(map(str, embed_config.get("device"))))
-        cmd += " --port {}".format(embed_config.get("port"))
-        if chat_config.get("batch_size", None):
-            cmd += " --batch_size {}".format(chat_config.get("batch_size"))
-        env = os.environ
-        env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, embed_config.get("device")))
-        processes.append(Popen(cmd, env=env, shell=True, stdout=PIPE, stderr=STDOUT))
-        embed_models[embed_config.get("name")] = AsyncOpenAI(
-            api_key="0", base_url="http://127.0.0.1:{}/v1".format(embed_config.get("port"))
-        )
+        if "token" in embed_config:
+            embed_models[embed_config["name"]] = AsyncOpenAI(api_key=embed_config["token"])
+        else:
+            processes.append(_launch_embed_server(embed_config))
+            embed_models[embed_config["name"]] = AsyncOpenAI(
+                api_key="0", base_url="http://localhost:{}/v1".format(embed_config["port"])
+            )

     app = FastAPI()
     app.add_middleware(
@@ -203,22 +194,22 @@
     async def list_models():
         return ModelList(data=[ModelCard(id=name) for name in model_names])

     @app.post("/v1/chat/completions", response_model=ChatCompletionResponse, status_code=status.HTTP_200_OK)
-    async def create_chat_completion_v1(request: "ChatCompletionRequest"):
+    async def create_chat_completion(request: "ChatCompletionRequest"):
         if request.model not in chat_models:
             raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Model not found.")

-        return await create_chat_completion(request, chat_models[request.model])
+        return await _create_openai_chat_completion(request, chat_models[request.model])

     @app.post("/v1/embeddings", response_model=EmbeddingsResponse, status_code=status.HTTP_200_OK)
-    async def create_embeddings_v1(request: "EmbeddingsRequest"):
+    async def create_embeddings(request: "EmbeddingsRequest"):
         if request.model not in embed_models:
             raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Model not found.")

-        return await create_embeddings(request, embed_models[request.model])
+        return await _create_openai_embeddings(request, embed_models[request.model])

     for process in processes:
-        thread = Thread(target=read_message, args=[process])
+        thread = Thread(target=print_subprocess_stdout, args=[process])
         thread.start()

-    uvicorn.run(app, host="0.0.0.0", port=port, workers=1)
+    uvicorn.run(app, host="0.0.0.0", port=port)


 if __name__ == "__main__":
diff --git a/src/imitater/service/chat.py b/src/imitater/service/chat.py
index 1389f73..0337aaf 100644
--- a/src/imitater/service/chat.py
+++ b/src/imitater/service/chat.py
@@ -1,20 +1,19 @@
 import argparse
 import uuid
-from typing import Any, AsyncGenerator, Dict, Optional, Union
+from typing import Any, AsyncGenerator, Dict

 import uvicorn
 from fastapi import FastAPI, HTTPException, status
 from sse_starlette import EventSourceResponse

 from ..model import ChatConfig, ChatModel
-from ..utils.generic import dictify, jsonify
+from ..utils.generic import dictify
+from .common import create_stream_chunk
 from .protocol import (
     ChatCompletionMessage,
     ChatCompletionRequest,
     ChatCompletionResponse,
     ChatCompletionResponseChoice,
-    ChatCompletionStreamResponse,
-    ChatCompletionStreamResponseChoice,
     Finish,
     Function,
     FunctionCall,
@@ -23,9 +22,24 @@
 )


-async def create_chat_completion(
-    chat_model: "ChatModel", request: "ChatCompletionRequest"
-) -> Union["ChatCompletionResponse", "EventSourceResponse"]:
+async def _create_stream_chat_completion(
+    request: "ChatCompletionRequest", model: "ChatModel", input_kwargs: Dict[str, Any]
+) -> AsyncGenerator[str, None]:
+    yield create_stream_chunk(
+        input_kwargs["request_id"], request.model, ChatCompletionMessage(role=Role.ASSISTANT, content="")
+    )
+    async for new_token in model.stream_chat(**input_kwargs):
+        yield create_stream_chunk(input_kwargs["request_id"], request.model, ChatCompletionMessage(content=new_token))
+
+    yield create_stream_chunk(
+        input_kwargs["request_id"], request.model, ChatCompletionMessage(), finish_reason=Finish.STOP
+    )
+    yield "[DONE]"
+
+
+async def _create_local_chat_completion(
+    request: "ChatCompletionRequest", model: "ChatModel"
+) -> "ChatCompletionResponse":
     msg_id = uuid.uuid4().hex
     input_kwargs = {
         "messages": [dictify(message) for message in request.messages],
@@ -39,14 +53,14 @@
         if request.tools is not None:
             raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream function calls.")

-        generator = create_stream_chat_completion(chat_model, request, input_kwargs)
+        generator = _create_stream_chat_completion(request, model, input_kwargs)
         return EventSourceResponse(generator, media_type="text/event-stream")

     if request.tools is not None:
         input_kwargs["tools"] = [dictify(tool) for tool in request.tools]
-        result, prompt_tokens, completion_tokens = await chat_model.function_call(**input_kwargs)
+        result, prompt_tokens, completion_tokens = await model.function_call(**input_kwargs)
     else:
-        result, prompt_tokens, completion_tokens = await chat_model.chat(**input_kwargs)
+        result, prompt_tokens, completion_tokens = await model.chat(**input_kwargs)

     if isinstance(result, tuple):
         name, arguments = result[0], result[1]
@@ -74,38 +88,15 @@
     )


-def create_stream_chunk(
-    request_id: str, model: str, delta: "ChatCompletionMessage", finish_reason: Optional[Finish] = None
-) -> str:
-    choice = ChatCompletionStreamResponseChoice(index=0, delta=delta, finish_reason=finish_reason)
-    chunk = ChatCompletionStreamResponse(id=request_id, model=model, choices=[choice])
-    return jsonify(chunk)
-
-
-async def create_stream_chat_completion(
-    chat_model: "ChatModel", request: "ChatCompletionRequest", input_kwargs: Dict[str, Any]
-) -> AsyncGenerator[str, None]:
-    yield create_stream_chunk(
-        input_kwargs["request_id"], request.model, ChatCompletionMessage(role=Role.ASSISTANT, content="")
-    )
-    async for new_token in chat_model.stream_chat(**input_kwargs):
-        yield create_stream_chunk(input_kwargs["request_id"], request.model, ChatCompletionMessage(content=new_token))
-
-    yield create_stream_chunk(
-        input_kwargs["request_id"], request.model, ChatCompletionMessage(), finish_reason=Finish.STOP
-    )
-    yield "[DONE]"
-
-
 def launch_server(config: "ChatConfig") -> None:
     model = ChatModel(config)
     app = FastAPI()

     @app.post("/v1/chat/completions", response_model=ChatCompletionResponse, status_code=status.HTTP_200_OK)
-    async def create_chat_completion_v1(request: "ChatCompletionRequest"):
-        return await create_chat_completion(model, request)
+    async def create_chat_completion(request: "ChatCompletionRequest"):
+        return await _create_local_chat_completion(request, model)

-    uvicorn.run(app, host="127.0.0.1", port=config.port, workers=1)
+    uvicorn.run(app, port=config.port)


 if __name__ == "__main__":
diff --git a/src/imitater/service/common.py b/src/imitater/service/common.py
new file mode 100644
index 0000000..50803df
--- /dev/null
+++ b/src/imitater/service/common.py
@@ -0,0 +1,27 @@
+from subprocess import Popen
+from typing import Optional
+
+from ..utils.generic import jsonify
+from .protocol import ChatCompletionMessage, ChatCompletionStreamResponse, ChatCompletionStreamResponseChoice, Finish
+
+
+def print_subprocess_stdout(process: "Popen") -> None:
+    while process.stdout.readable():
+        line: bytes = process.stdout.readline()
+
+        if not line:
+            break
+
+        print(line.decode("utf-8").strip())
+
+
+def create_stream_chunk(
+    request_id: str,
+    model: str,
+    delta: "ChatCompletionMessage",
+    index: Optional[int] = 0,
+    finish_reason: Optional[Finish] = None,
+) -> str:
+    choice = ChatCompletionStreamResponseChoice(index=index, delta=delta, finish_reason=finish_reason)
+    chunk = ChatCompletionStreamResponse(id=request_id, model=model, choices=[choice])
+    return jsonify(chunk)
diff --git a/src/imitater/service/embed.py b/src/imitater/service/embed.py
index df306a9..6405efb 100644
--- a/src/imitater/service/embed.py
+++ b/src/imitater/service/embed.py
@@ -14,12 +14,12 @@
 )


-async def create_embeddings(embed_model: "EmbedModel", request: "EmbeddingsRequest") -> "EmbeddingsResponse":
+async def _create_local_embeddings(request: "EmbeddingsRequest", model: "EmbedModel") -> "EmbeddingsResponse":
     texts = request.input
     if isinstance(texts, str):
         texts = [texts]

-    embed_output, embed_tokens = await embed_model.embed(texts)
+    embed_output, embed_tokens = await model.embed(texts)
     embeddings = []
     for i, embed_data in enumerate(embed_output):
         if request.encoding_format == "base64":
@@ -40,7 +40,7 @@ def launch_server(config: "EmbedConfig") -> None:
     model = EmbedModel(config)

     @asynccontextmanager
-    async def lifespan(app: "FastAPI") -> None:
+    async def lifespan(app: "FastAPI"):
         await model.startup()
         yield
         await model.shutdown()
@@ -49,9 +49,9 @@ async def lifespan(app: "FastAPI") -> None:

     @app.post("/v1/embeddings", response_model=EmbeddingsResponse, status_code=status.HTTP_200_OK)
     async def create_embeddings_v1(request: "EmbeddingsRequest"):
-        return await create_embeddings(model, request)
+        return await _create_local_embeddings(request, model)

-    uvicorn.run(app, host="127.0.0.1", port=config.port, workers=1)
+    uvicorn.run(app, port=config.port)


 if __name__ == "__main__":
diff --git a/src/imitater/utils/generic.py b/src/imitater/utils/generic.py
index c4cf37f..5ad9a07 100644
--- a/src/imitater/utils/generic.py
+++ b/src/imitater/utils/generic.py
@@ -9,12 +9,12 @@
 def dictify(data: "BaseModel") -> Dict[str, Any]:
     try:  # pydantic v2
         return data.model_dump(exclude_unset=True)
-    except Exception:  # pydantic v1
+    except AttributeError:  # pydantic v1
         return data.dict(exclude_unset=True)


 def jsonify(data: "BaseModel") -> str:
     try:  # pydantic v2
         return json.dumps(data.model_dump(exclude_unset=True), ensure_ascii=False)
-    except Exception:  # pydantic v1
+    except AttributeError:  # pydantic v1
         return data.json(exclude_unset=True, ensure_ascii=False)
diff --git a/tests/test_openai.py b/tests/test_openai.py
index 4f904ba..7037953 100644
--- a/tests/test_openai.py
+++ b/tests/test_openai.py
@@ -69,9 +69,9 @@ def test_tool(client: "OpenAI") -> None:
             {"role": "user", "content": "What is the weather like in Boston?"},
             {
                 "role": "function",
-                "content": '{"name": "get_current_weather", "arguments": {"location": "Boston, MA"}}',
+                "content": """{"name": "get_current_weather", "arguments": {"location": "Boston, MA"}}""",
             },
-            {"role": "tool", "content": '{"temperature": 22, "unit": "celsius", "description": "Sunny"}'},
+            {"role": "tool", "content": """{"temperature": 22, "unit": "celsius", "description": "Sunny"}"""},
         ],
         model="gpt-3.5-turbo",
         tools=tools,
@@ -81,7 +81,7 @@

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("-c", "--config", type=str, help="Path to config file.")
+    parser.add_argument("-c", "--config", type=str, required=True, help="Path to config file.")
     args = parser.parse_args()
     with open(getattr(args, "config"), "r", encoding="utf-8") as f:
         config: Dict[str, Dict[str, Any]] = yaml.safe_load(f)
@@ -89,7 +89,7 @@ def test_tool(client: "OpenAI") -> None:
     client = OpenAI(
         api_key="0",
         base_url="http://{host}:{port}/v1".format(
-            host=config["service"].get("host", "192.168.0.1"),
+            host=config["service"].get("host", "localhost"),
            port=config["service"].get("port", 8000),
        ),
    )