From 1ff742b6ecbe0d0963efe1ed668e52b51ce007e2 Mon Sep 17 00:00:00 2001
From: Driss Guessous <32754868+drisspg@users.noreply.github.com>
Date: Wed, 16 Aug 2023 14:24:21 -0700
Subject: [PATCH] Neurips client (#1693)

---
 src/helm/benchmark/static/schema.yaml          |  10 ++
 .../http_model_window_service.py               |  28 ++++
 .../window_services/window_service_factory.py  |   3 +
 src/helm/proxy/clients/auto_client.py          |   5 +
 src/helm/proxy/clients/http_model_client.py    | 140 ++++++++++++++++++
 src/helm/proxy/models.py                       |   6 +
 6 files changed, 192 insertions(+)
 create mode 100644 src/helm/benchmark/window_services/http_model_window_service.py
 create mode 100644 src/helm/proxy/clients/http_model_client.py

diff --git a/src/helm/benchmark/static/schema.yaml b/src/helm/benchmark/static/schema.yaml
index 00142344b27..1e9272c3a0e 100644
--- a/src/helm/benchmark/static/schema.yaml
+++ b/src/helm/benchmark/static/schema.yaml
@@ -76,6 +76,16 @@ models:
     access: limited
     num_parameters: 70000000000
     release_date: 2022-01-01
+
+  # TODO: Remove once we have configurable model names
+  - name: neurips/local
+    display_name: Local service
+    description: Local competition service
+    creator_organization: neurips
+    access: open
+    num_parameters: 1
+    release_date: 2021-12-01
+
 
   # Anthropic
   - name: anthropic/stanford-online-all-v4-s3
diff --git a/src/helm/benchmark/window_services/http_model_window_service.py b/src/helm/benchmark/window_services/http_model_window_service.py
new file mode 100644
index 00000000000..dac3bb70fbb
--- /dev/null
+++ b/src/helm/benchmark/window_services/http_model_window_service.py
@@ -0,0 +1,28 @@
+from .local_window_service import LocalWindowService
+from .tokenizer_service import TokenizerService
+
+
+# TODO: Remove once we have configurable model names since this hardcodes the tokenizer name
+class HTTPModelWindowService(LocalWindowService):
+    def __init__(self, service: TokenizerService):
+        super().__init__(service)
+
+    @property
+    def max_sequence_length(self) -> int:
+        return 2048
+
+    @property
+    def max_request_length(self) -> int:
+        return self.max_sequence_length
+
+    @property
+    def end_of_text_token(self) -> str:
+        return "<|endoftext|>"
+
+    @property
+    def tokenizer_name(self) -> str:
+        return "neurips/local"
+
+    @property
+    def prefix_token(self) -> str:
+        return self.end_of_text_token
diff --git a/src/helm/benchmark/window_services/window_service_factory.py b/src/helm/benchmark/window_services/window_service_factory.py
index 64f64313f11..bfdce679bb0 100644
--- a/src/helm/benchmark/window_services/window_service_factory.py
+++ b/src/helm/benchmark/window_services/window_service_factory.py
@@ -51,6 +51,7 @@
 from .llama_window_service import LlamaWindowService, Llama2WindowService
 from .window_service import WindowService
 from .tokenizer_service import TokenizerService
+from .http_model_window_service import HTTPModelWindowService
 from helm.proxy.clients.huggingface_client import get_huggingface_model_config
 from helm.proxy.clients.remote_model_registry import get_remote_model
 
@@ -86,6 +87,8 @@ def get_window_service(model_name: str, service: TokenizerService) -> WindowServ
             )
         elif get_remote_model(model_name):
             window_service = get_remote_window_service(service, model_name)
+        elif organization == "neurips":
+            window_service = HTTPModelWindowService(service)
         elif huggingface_model_config:
             window_service = HuggingFaceWindowService(service=service, model_config=huggingface_model_config)
         elif organization == "openai":
diff --git a/src/helm/proxy/clients/auto_client.py b/src/helm/proxy/clients/auto_client.py
index b0af9dbbbb2..06db243f6f8 100644
--- a/src/helm/proxy/clients/auto_client.py
+++ b/src/helm/proxy/clients/auto_client.py
@@ -18,6 +18,7 @@
 from helm.proxy.retry import retry_request, NonRetriableException
 from helm.proxy.clients.critique_client import CritiqueClient
 from helm.proxy.clients.client import Client
+from .http_model_client import HTTPModelClient
 from helm.proxy.clients.huggingface_model_registry import get_huggingface_model_config
 from helm.proxy.clients.toxicity_classifier_client import ToxicityClassifierClient
 
@@ -86,6 +87,8 @@ def _get_client(self, model: str) -> Client:
                 from helm.proxy.clients.huggingface_client import HuggingFaceClient
 
                 client = HuggingFaceClient(cache_config=cache_config)
+            elif organization == "neurips":
+                client = HTTPModelClient(cache_config=cache_config)
             elif organization == "openai":
                 from helm.proxy.clients.chat_gpt_client import ChatGPTClient
                 from helm.proxy.clients.openai_client import OpenAIClient
@@ -216,6 +219,8 @@ def _get_tokenizer_client(self, tokenizer: str) -> Client:
                 from helm.proxy.clients.huggingface_client import HuggingFaceClient
 
                 client = HuggingFaceClient(cache_config=cache_config)
+            elif organization == "neurips":
+                client = HTTPModelClient(cache_config=cache_config)
             elif organization in [
                 "bigscience",
                 "bigcode",
diff --git a/src/helm/proxy/clients/http_model_client.py b/src/helm/proxy/clients/http_model_client.py
new file mode 100644
index 00000000000..7966f85c72c
--- /dev/null
+++ b/src/helm/proxy/clients/http_model_client.py
@@ -0,0 +1,140 @@
+from dataclasses import asdict
+from typing import Optional
+
+from helm.common.cache import Cache, CacheConfig
+from helm.common.request import (
+    Request,
+    RequestResult,
+    Sequence,
+    Token,
+    EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
+)
+from helm.common.tokenization_request import (
+    DecodeRequest,
+    DecodeRequestResult,
+    TokenizationRequest,
+    TokenizationRequestResult,
+    TokenizationToken,
+)
+from .client import Client, wrap_request_time
+
+import requests
+
+
+class HTTPModelClient(Client):
+    """Implements a simple client for a model being served over HTTP."""
+
+    def __init__(
+        self,
+        cache_config: CacheConfig,
+        base_url: str = "http://localhost:8080",
+        timeout: int = 10,
+        do_cache: bool = False,
+    ):
+        self.cache: Optional[Cache] = Cache(cache_config) if do_cache else None
+        self.base_url = base_url
+        self.timeout = timeout
+
+    def make_request(self, request: Request) -> RequestResult:
+        cache_key = asdict(request)
+        # This needs to match whatever we define in pydantic
+        if request.embedding:
+            return EMBEDDING_UNAVAILABLE_REQUEST_RESULT
+
+        raw_request = {
+            "prompt": request.prompt,
+            "temperature": 1e-7 if request.temperature == 0 else request.temperature,
+            "num_return_sequences": request.num_completions,
+            "max_new_tokens": request.max_tokens,
+            "top_p": request.top_p,
+            "echo_prompt": request.echo_prompt,
+            "top_k_per_token": request.top_k_per_token,
+            "stop_sequences": request.stop_sequences,
+        }
+
+        try:
+
+            def do_it():
+                url = f"{self.base_url}/process"
+                response = requests.post(url, json=raw_request, timeout=self.timeout)
+                response.raise_for_status()
+                response_data = response.json()
+                return response_data
+
+            if self.cache:
+                response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+            else:
+                response, cached = do_it(), False
+
+            tokens = [
+                Token(text=token["text"], logprob=token["logprob"], top_logprobs=token["top_logprob"])
+                for token in response["tokens"]
+            ]
+            completions = [Sequence(text=response["text"], logprob=response["logprob"], tokens=tokens)]
+
+            return RequestResult(
+                success=True,
+                cached=cached,
+                error=None,
+                completions=completions,
+                embedding=[],
+                request_time=response["request_time"],
+            )
+        except requests.exceptions.RequestException as e:
+            error: str = f"Request error: {e}"
+            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+
+    def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
+        cache_key = asdict(request)
+        raw_request = {
+            "text": request.text,
+            "truncation": request.truncation,
+            "max_length": request.max_length,
+        }
+
+        try:
+
+            def do_it():
+                url = f"{self.base_url}/tokenize"
+                response = requests.post(url, json=raw_request)
+                response.raise_for_status()
+                response_data = response.json()
+                return response_data
+
+            if self.cache:
+                result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+            else:
+                result, cached = do_it(), False
+        except Exception as e:
+            error: str = f"Local Model error: {e}"
+            return TokenizationRequestResult(success=False, cached=False, error=error, text="", tokens=[])
+
+        return TokenizationRequestResult(
+            success=True,
+            cached=cached,
+            text=request.text,
+            tokens=[TokenizationToken(value) for value in result["tokens"]],
+            request_time=result["request_time"],
+        )
+
+    def decode(self, request: DecodeRequest) -> DecodeRequestResult:
+        raise NotImplementedError("Not implemented yet.")
+        # cache_key = asdict(request)
+
+        # try:
+
+        #     def do_it():
+        #         url = f"{self.base_url}/decode"
+        #         response = requests.post(url, json={"tokens": request.tokens})
+        #         response.raise_for_status()
+        #         response_data = response.json()
+        #         return response_data
+
+        #     result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+        # except Exception as e:
+        #     error: str = f"Local Model error: {e}"
+        #     return DecodeRequestResult(success=False, cached=False, error=error, text="")
+
+        # return DecodeRequestResult(
+        #     success=True, cached=cached, text=result["text"], request_time=result["request_time"]
+        # )
diff --git a/src/helm/proxy/models.py b/src/helm/proxy/models.py
index 5c26d75de2e..1cfb055d56f 100644
--- a/src/helm/proxy/models.py
+++ b/src/helm/proxy/models.py
@@ -112,6 +112,12 @@ def engine(self) -> str:
 
 # Over time, we should add more information there.
 ALL_MODELS = [
+    # Local Model
+    Model(
+        group="neurips",
+        name="neurips/local",
+        tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
+    ),
     # AI21: https://studio.ai21.com/pricing
     Model(
         group="jurassic",
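
Note (not part of the patch): for local testing, the endpoints this client calls can be stubbed out with a small HTTP server. The sketch below is only an illustration of the interface HTTPModelClient expects: the /process and /tokenize routes, the request fields, and the response keys ("text", "tokens", "logprob", "top_logprob", "request_time") mirror what the client sends and reads in the diff above, while the FastAPI wiring, the ProcessRequest/TokenizeRequest model names, and the whitespace "tokenizer" are assumptions made for the example.

# Minimal stand-in for the HTTP service that HTTPModelClient talks to.
# Routes, request fields, and response keys follow the client code in the
# patch; the dummy "model" logic here is purely illustrative.
from typing import List, Optional

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class ProcessRequest(BaseModel):
    prompt: str
    temperature: float = 1.0
    num_return_sequences: int = 1
    max_new_tokens: int = 100
    top_p: float = 1.0
    echo_prompt: bool = False
    top_k_per_token: int = 1
    stop_sequences: List[str] = []


class TokenizeRequest(BaseModel):
    text: str
    truncation: bool = False
    max_length: Optional[int] = None


@app.post("/process")
def process(request: ProcessRequest):
    # A real server would run generation here; this stub just echoes the prompt.
    tokens = [
        {"text": piece, "logprob": 0.0, "top_logprob": {piece: 0.0}}
        for piece in request.prompt.split()
    ]
    return {
        "text": request.prompt,
        "tokens": tokens,
        "logprob": 0.0,
        "request_time": 0.0,
    }


@app.post("/tokenize")
def tokenize(request: TokenizeRequest):
    # Whitespace splitting stands in for a real tokenizer.
    return {"tokens": request.text.split(), "request_time": 0.0}

Saved as, say, stub_server.py (name is arbitrary), it can be served with "uvicorn stub_server:app --port 8080", which matches the client's default base_url of http://localhost:8080.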