-from __future__ import annotations
-
import os
from typing import TYPE_CHECKING, Optional

+import requests
+
from ._chat import Chat
from ._provider_openai import OpenAIProvider
-from ._utils import MISSING, MISSING_TYPE, is_testing
+from ._turn import Turn

if TYPE_CHECKING:
-    from ._provider_openai import ChatCompletion
-    from .types.openai import ChatClientArgs, SubmitInputArgs
+    from openai.types.chat import ChatCompletionToolParam
+
+    from .types.openai import ChatClientArgs


-def ChatVllm(
+def ChatVLLM(
    *,
    base_url: str,
    system_prompt: Optional[str] = None,
+    turns: Optional[list[Turn]] = None,
    model: Optional[str] = None,
    api_key: Optional[str] = None,
-    seed: Optional[int] | MISSING_TYPE = MISSING,
+    seed: Optional[int] = None,
    kwargs: Optional["ChatClientArgs"] = None,
-) -> Chat["SubmitInputArgs", ChatCompletion]:
+) -> Chat:
    """
-    Chat with a model hosted by vLLM.
+    Chat with a model hosted by vLLM

    [vLLM](https://docs.vllm.ai/en/latest/) is an open source library that
    provides an efficient and convenient LLMs model server. You can use
-    `ChatVllm()` to connect to endpoints powered by vLLM.
+    `ChatVLLM()` to connect to endpoints powered by vLLM.

    Prerequisites
    -------------

    ::: {.callout-note}
-    ## vLLM Server
+    ## vLLM runtime

-    You need access to a running vLLM server instance. vLLM provides
-    OpenAI-compatible API endpoints, so this function works with any
-    vLLM deployment that exposes the `/v1/chat/completions` endpoint.
+    `ChatVLLM` requires a vLLM server to be running somewhere (either on your
+    machine or a remote server). If you want to run a vLLM server locally, see
+    the [vLLM documentation](https://docs.vllm.ai/en/v0.5.3/getting_started/quickstart.html).
    :::

-    Examples
-    --------
+    ::: {.callout-note}
+    ## Python requirements

-    ```python
-    import os
-    from chatlas import ChatVllm
+    `ChatVLLM` requires the `openai` package (e.g., `pip install openai`).
+    :::

-    # Connect to a vLLM server
-    chat = ChatVllm(
-        base_url="http://localhost:8000/v1",
-        model="meta-llama/Llama-2-7b-chat-hf",
-        api_key=os.getenv("VLLM_API_KEY"),  # Optional, depends on server config
-    )
-    chat.chat("What is the capital of France?")
-    ```

    Parameters
    ----------
    base_url
-        The base URL of the vLLM server endpoint. This should include the
-        `/v1` path if the server follows OpenAI API conventions.
-    system_prompt
        A system prompt to set the behavior of the assistant.
+    system_prompt
+        Optional system prompt to prepend to conversation.
+    turns
+        A list of turns to start the chat with (i.e., continuing a previous
+        conversation). If not provided, the conversation begins from scratch. Do
+        not provide non-`None` values for both `turns` and `system_prompt`. Each
+        message in the list should be a dictionary with at least `role` (usually
+        `system`, `user`, or `assistant`, but `tool` is also possible). Normally
+        there is also a `content` field, which is a string.
    model
-        The model to use for the chat. If None, you may need to specify
-        the model name that's loaded on your vLLM server.
-    api_key
-        The API key to use for authentication. Some vLLM deployments may
-        not require authentication. You can set the `VLLM_API_KEY`
-        environment variable instead of passing it directly.
+        Model identifier to use.
    seed
-        Optional integer seed that vLLM uses to try and make output more
-        reproducible.
+        Random seed for reproducibility.
+    api_key
+        API key for authentication. If not provided, the `VLLM_API_KEY` environment
+        variable will be used.
    kwargs
-        Additional arguments to pass to the `openai.OpenAI()` client constructor.
-
-    Returns
-    -------
-    Chat
-        A chat object that retains the state of the conversation.
-
-    Note
-    ----
-    This function is a lightweight wrapper around [](`~chatlas.ChatOpenAI`) with
-    the defaults tweaked for vLLM endpoints.
-
-    Note
-    ----
-    vLLM servers are OpenAI-compatible, so this provider uses the same underlying
-    client as OpenAI but configured for your vLLM endpoint. Some advanced OpenAI
-    features may not be available depending on your vLLM server configuration.
-
-    Note
-    ----
-    Pasting an API key into a chat constructor (e.g., `ChatVllm(api_key="...")`)
-    is the simplest way to get started, and is fine for interactive use, but is
-    problematic for code that may be shared with others.
-
-    Instead, consider using environment variables or a configuration file to manage
-    your credentials. One popular way to manage credentials is to use a `.env` file
-    to store your credentials, and then use the `python-dotenv` package to load them
-    into your environment.
-
-    ```shell
-    pip install python-dotenv
-    ```
-
-    ```shell
-    # .env
-    VLLM_API_KEY=...
-    ```
-
-    ```python
-    from chatlas import ChatVllm
-    from dotenv import load_dotenv
-
-    load_dotenv()
-    chat = ChatVllm(base_url="http://localhost:8000/v1")
-    chat.console()
-    ```
-
-    Another, more general, solution is to load your environment variables into the shell
-    before starting Python (maybe in a `.bashrc`, `.zshrc`, etc. file):
-
-    ```shell
-    export VLLM_API_KEY=...
-    ```
+        Additional arguments to pass to the LLM client.
+
+    Returns:
+        Chat instance configured for vLLM
    """
-    if api_key is None:
-        api_key = os.getenv("VLLM_API_KEY")

-    if isinstance(seed, MISSING_TYPE):
-        seed = 1014 if is_testing() else None
+    if api_key is None:
+        api_key = get_vllm_key()

    if model is None:
-        raise ValueError(
-            "Must specify model. vLLM servers can host different models, so you need to "
-            "specify which one to use. Check your vLLM server's /v1/models endpoint "
-            "to see available models."
-        )
+        models = get_vllm_models(base_url, api_key)
+        available_models = ", ".join(models)
+        raise ValueError(f"Must specify model. Available models: {available_models}")

    return Chat(
-        provider=VllmProvider(
-            api_key=api_key,
-            model=model,
+        provider=VLLMProvider(
            base_url=base_url,
+            model=model,
            seed=seed,
-            name="vLLM",
+            api_key=api_key,
            kwargs=kwargs,
        ),
        system_prompt=system_prompt,
    )

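For reference, a minimal usage sketch of the `ChatVLLM()` signature added above, adapted from the example that the removed docstring carried. It assumes `ChatVLLM` is exported from the `chatlas` package (as `ChatVllm` was in the old version); the URL and model name are placeholders for whatever your vLLM deployment serves.

```python
import os

from chatlas import ChatVLLM  # assumes the renamed function is exported from chatlas

# Connect to a locally running vLLM server (URL and model are placeholders).
chat = ChatVLLM(
    base_url="http://localhost:8000/v1",
    model="meta-llama/Llama-2-7b-chat-hf",
    api_key=os.getenv("VLLM_API_KEY"),  # if omitted, ChatVLLM falls back to get_vllm_key()
    system_prompt="You are a terse assistant.",
)
chat.chat("What is the capital of France?")
```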
-class VllmProvider(OpenAIProvider):
-    """
-    Provider for vLLM endpoints.
+class VLLMProvider(OpenAIProvider):
+    # Just like OpenAI but no strict
+    @staticmethod
+    def _tool_schema_json(
+        schema: "ChatCompletionToolParam",
+    ) -> "ChatCompletionToolParam":
+        schema["function"]["strict"] = False
+        return schema

-    vLLM is OpenAI-compatible but may have some differences in tool handling
-    and other advanced features.
-    """

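To make the effect of the `_tool_schema_json` override above concrete, here is a small illustrative sketch. The tool definition below is a made-up example, not taken from the library; the override simply flips the same flag before the request goes out, trading OpenAI's schema-enforced ("strict") tool arguments for broader compatibility with vLLM's OpenAI-compatible endpoint.

```python
# A made-up OpenAI-style tool definition, shaped like ChatCompletionToolParam.
tool = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Look up the weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
        "strict": True,  # OpenAI structured-output mode
    },
}

# What VLLMProvider._tool_schema_json does: force strict mode off.
tool["function"]["strict"] = False
```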
-    def _chat_perform_args(self, *args, **kwargs):
-        """
-        Customize request arguments for vLLM compatibility.
+def get_vllm_key() -> str:
+    key = os.getenv("VLLM_API_KEY", os.getenv("VLLM_KEY"))
+    if not key:
+        raise ValueError("VLLM_API_KEY environment variable not set")
+    return key
+
+
+def get_vllm_models(base_url: str, api_key: Optional[str] = None) -> list[str]:
+    if api_key is None:
+        api_key = get_vllm_key()
+
+    headers = {"Authorization": f"Bearer {api_key}"}
+    response = requests.get(f"{base_url}/v1/models", headers=headers)
+    response.raise_for_status()
+    data = response.json()

-        vLLM may not support all OpenAI features like stream_options,
-        so we remove potentially unsupported parameters.
-        """
-        # Get the base arguments from OpenAI provider
-        result = super()._chat_perform_args(*args, **kwargs)
+    return [model["id"] for model in data["data"]]

-        # Remove stream_options if present (some vLLM versions don't support it)
-        if "stream_options" in result:
-            del result["stream_options"]

-        return result
+# def chat_vllm_test(**kwargs) -> Chat:
+#     """Create a test chat instance with default parameters."""
+#     return ChatVLLM(base_url="https://llm.nrp-nautilus.io/", model="llama3", **kwargs)
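The new `get_vllm_models()` helper is a thin wrapper over the server's OpenAI-compatible `/v1/models` endpoint. A standalone sketch of the same request, useful for checking which model names a deployment accepts before calling `ChatVLLM()`. The URL is a placeholder, and it assumes `VLLM_API_KEY` is set, as `get_vllm_key()` requires:

```python
import os

import requests

base_url = "http://localhost:8000"  # placeholder; get_vllm_models() appends /v1/models itself
headers = {"Authorization": f"Bearer {os.environ['VLLM_API_KEY']}"}

response = requests.get(f"{base_url}/v1/models", headers=headers)
response.raise_for_status()

# The OpenAI-compatible models endpoint returns {"data": [{"id": ...}, ...]}.
model_ids = [model["id"] for model in response.json()["data"]]
print(model_ids)
```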