Unable to connect to Hugging Face Dedicated Endpoints? #945

Open

michael-newsrx opened this issue Jul 8, 2024 · 3 comments
michael-newsrx commented Jul 8, 2024

Is your feature request related to a problem? Please describe.
I want to use guidance with a large Hugging Face hosted model.
I don't see any classes specific to Hugging Face, and setting the api_key does not appear to set the required Authorization: Bearer header.

Describe the solution you'd like
A handler for Hugging Face hosted endpoints.

Describe alternatives you've considered
I don't know how to pass a Hugging Face endpoint directly into the guidance code.

Ideally, a Hugging Face InferenceEndpoint object could be passed directly into an appropriate guidance model wrapper.

import huggingface_hub
from huggingface_hub import (
    InferenceEndpointType,
    create_inference_endpoint,
    list_inference_endpoints,
)
from requests.exceptions import RequestException


def inference_endpoint1():
    """Return the existing dedicated endpoint, or create it if it does not exist yet."""
    hf_login_check()
    ep_name = "llama-3-70b-instruct-aws1"
    for ep in list_inference_endpoints(namespace="my company"):
        if ep.name == ep_name:
            ep.wait()
            return ep
    ep = create_inference_endpoint(
            ep_name,
            repository="meta-llama/Meta-Llama-3-70B-Instruct",
            framework="pytorch",
            accelerator="gpu",
            instance_size="x4",
            instance_type="nvidia-a100",
            region="us-east-1",
            vendor="aws",
            min_replica=0,
            max_replica=1,
            task="text-generation",
            type=InferenceEndpointType.PROTECTED,
            namespace="newsrx",
            custom_image={
                "health_route": "/health",
                "url": "ghcr.io/huggingface/text-generation-inference:2.1.1",
                "env": {
                    "MAX_BATCH_PREFILL_TOKENS": "32768",
                    "MAX_BATCH_TOTAL_TOKENS": "32768",
                    "MAX_INPUT_LENGTH": "16384",
                    "MAX_TOTAL_TOKENS": "32768",
                    "MODEL_ID": "/repository"},
            })
    ep.scale_to_zero()
    ep.wait()
    return ep


def hf_login_check():
    try:
        huggingface_hub.whoami()
    except RequestException:
        print("Not logged in or not connected to internet")
        huggingface_hub.login()
from pprint import pprint

import guidance
import huggingface_hub
from guidance.models import Model
from huggingface_hub import InferenceEndpoint

from local_models.mixtral_guidelines import load_llama_3_70b_instruct_chat_q2 as test_model

api_key = huggingface_hub.get_token()

# BlockTimer and local_utils are local helper modules from this project (not shown here).
with BlockTimer() as timer:
    end_point: InferenceEndpoint = local_utils.hf.inference.inference_endpoint1()
    n_ctx: int = 16384  # Context window size
    llm: Model = guidance.models.LlamaCpp(end_point.url, api_key=api_key, compute_log_probs=True)
    print(f"Model load elapsed: {timer.formatted}")
@michael-newsrx (Author)

Additional attempts:

Using OpenAI client

Works

    ep1 = inference_endpoint1()
    while ep1.status != "running":
        if ep1.status == "failed":
            raise RuntimeError(f"Failed to create inference endpoint: {ep1.name}")
        ep1.wait()

    import openai
    client = openai.OpenAI(
            base_url=ep1.url + "/v1/",
            api_key=hf_bearer_token(),
    )

    role_system = {"role": "system", "content": "You are a helpful assistant."}
    role_user = {"role": "user", "content": "What is deep learning?"}
    chat_completion = client.chat.completions.create(model="gpt-4o",
                                                     messages=[role_system, role_user],
                                                     stream=True, max_tokens=1024,
                                                     temperature=0.0)
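
The streamed response can then be drained chunk by chunk (assuming the endpoint is up):

    # Print the generated text as it arrives from the stream.
    for chunk in chat_completion:
        content = chunk.choices[0].delta.content
        if content is not None:
            print(content, end="")
    print()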

Using @guidance

Fails with streaming error

    llm = guidance.models.OpenAI(model="gpt-4o",  #
                                 base_url=ep1.url + "/v1/",  #
                                 api_key=hf_bearer_token(),  #
                                 echo=False)

    # llm = Transformers(ep1.url, echo=False, api_key=ep1.client.token)
    with system():
        llm += "I am an evil robot overlord."
    with user():
        llm += "What is your command?"
    with assistant():
        llm += gen()
    print(str(llm))

Stacktrace

Traceback (most recent call last):
  File "/home/michael/git/ai_newsletters/local_utils/hf/inference.py", line 290, in <module>
    main()
  File "/home/michael/git/ai_newsletters/local_utils/hf/inference.py", line 266, in main
    llm += gen()
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/guidance/models/_model.py", line 1159, in __add__
    out = lm._run_stateless(value)
          ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/guidance/models/_model.py", line 1364, in _run_stateless
    for chunk in gen_obj:
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/guidance/models/_model.py", line 760, in __call__
    logits = self.get_logits(token_ids, forced_bytes, current_temp)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/guidance/models/_grammarless.py", line 360, in get_logits
    raise new_bytes
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/guidance/models/_grammarless.py", line 165, in _start_generator_stream
    for chunk in generator:
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/guidance/models/_openai.py", line 156, in _generator_chat
    raise e
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/guidance/models/_openai.py", line 145, in _generator_chat
    for part in generator:
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/openai/_streaming.py", line 46, in __iter__
    for item in self._iterator:
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/openai/_streaming.py", line 72, in __stream__
    raise APIError(
openai.APIError: An error occurred during streaming

@michael-newsrx (Author)

top_p must be > 0.0 and < 1.0

Container logs show:

{
    "timestamp":"2024-07-11T17:32:24.361684Z",
    "level":"ERROR",
    "message":"`top_p` must be > 0.0 and < 1.0",
    "target":"text_generation_router::infer",
    "filename":"router/src/infer.rs",
    "line_number":137,
    "span":{
        "name":"generate_stream"
    },
    "spans":[
        {
            "name":"chat_completions"
        },{
            "name":"async_stream"
        },{
            "name":"generate_stream"
        }
    ]
}
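
This suggests the openai.APIError above comes from TGI's request validation rather than from the client: guidance's OpenAI wrapper presumably sends top_p=1.0, which TGI rejects. A rough sketch that should reproduce the same server-side error with the plain OpenAI client from the earlier snippet:

client = openai.OpenAI(base_url=ep1.url + "/v1/", api_key=hf_bearer_token())
# Passing top_p=1.0 explicitly should trigger the same TGI validation error
# ("`top_p` must be > 0.0 and < 1.0") seen in the container log; in the trace
# above it surfaced as openai.APIError while the stream was being iterated.
chat_completion = client.chat.completions.create(model="tgi",
                                                 messages=[role_system, role_user],
                                                 stream=True,
                                                 max_tokens=32,
                                                 top_p=1.0)
for chunk in chat_completion:
    pass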


michael-conrad commented Jul 15, 2024

Workaround

  1. Use a custom httpx client that alters the JSON of the request.
  2. Supply the custom httpx client to the OpenAI constructor.

Create a custom class extending DefaultHttpxClient

This class strips top_p from the JSON payload when it is set to 1.0, so TGI's validation (top_p must be strictly between 0.0 and 1.0) no longer rejects the request; replacing the value with something just under 1.0 (e.g. 0.9999) would also work.

import typing

from httpx import Request
from httpx._client import USE_CLIENT_DEFAULT, UseClientDefault
# Note: these aliases live in httpx's private _types module; exact paths may vary by httpx version.
from httpx._types import (CookieTypes, HeaderTypes, QueryParamTypes, RequestContent,
                          RequestData, RequestExtensions, RequestFiles, TimeoutTypes, URLTypes)
from openai import DefaultHttpxClient


class LocalHttpxClient(DefaultHttpxClient):
    """httpx client that drops top_p=1.0 from outgoing JSON payloads."""

    def build_request(self,
                      method: str,
                      url: URLTypes,
                      *, content: RequestContent | None = None,
                      data: RequestData | None = None,
                      files: RequestFiles | None = None,
                      json: typing.Any | None = None,
                      params: QueryParamTypes | None = None,
                      headers: HeaderTypes | None = None,
                      cookies: CookieTypes | None = None,
                      timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
                      extensions: RequestExtensions | None = None,
                      ) -> Request:
        # TGI rejects top_p values that are not strictly between 0.0 and 1.0,
        # so drop the default top_p=1.0 before the request is built.
        if json is not None and isinstance(json, dict):
            if "top_p" in json and json["top_p"] == 1.0:
                del json["top_p"]
        return Request(method, url, content=content, data=data, files=files, json=json,
                       params=params, headers=headers, cookies=cookies, extensions=extensions)
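
An alternative (untested) variant of the same idea delegates to the parent's build_request so the client's own request handling still applies, and clamps the value just below 1.0 (0.9999) instead of deleting the key:

class PatchedHttpxClient(DefaultHttpxClient):
    """Hypothetical variant: clamp top_p instead of removing it."""

    def build_request(self, *args, **kwargs) -> Request:
        json_payload = kwargs.get("json")
        if isinstance(json_payload, dict) and json_payload.get("top_p") == 1.0:
            # Keep top_p strictly below 1.0 so TGI's validation accepts it.
            json_payload["top_p"] = 0.9999
        # Delegate to the parent so headers, cookies, and timeouts are handled as usual.
        return super().build_request(*args, **kwargs)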
def test() -> None:
    test_ep: InferenceEndpoint = inference_endpoint1()
    while test_ep.status != "running":
        if test_ep.status == "failed":
            raise RuntimeError(f"Failed to create inference endpoint: {test_ep.name}")
        try:
            test_ep.wait(timeout=1)
        except InferenceEndpointTimeoutError:
            pass

    import openai
    httpx_client = LocalHttpxClient()
    client = openai.OpenAI(
            base_url=test_ep.url + "/v1",
            api_key=hf_bearer_token(),
            organization="NewsRx",
            http_client=httpx_client,
    )

    # print(f"Available models: {client.models.list()}")
    role_system = {"role": "system", "content": "I am an evil robot overlord."}
    role_user = {"role": "user", "content": "What is your command? Be very succinct."}
    chat_completion = client.chat.completions.create(model="tgi",
                                                     messages=[role_system, role_user],
                                                     stream=True,
                                                     max_tokens=1024,
                                                     temperature=0.0,
                                                     )

    print("=" * 40)
    for chunk in chat_completion:
        content = chunk.choices[0].delta.content
        if content is not None:
            print(content, end="")
    print()
    print()

    xllm = guidance.models.OpenAI(model="gpt-3.5-turbo",
                                  base_url=test_ep.url + "/v1",
                                  api_key=hf_bearer_token(),
                                  echo=False,
                                  organization="NewsRx",
                                  http_client=httpx_client,
                                  )
    llm = xllm
    with system():
        llm += "I am an evil robot overlord."
    with user():
        llm += "What is your command? Be very succinct."
    with assistant():
        llm += gen()
    print(str(llm))
