Commit e215876

add README.md and test
1 parent b5fcd04 commit e215876

File tree: 3 files changed (+42, -45 lines)


README.md

Lines changed: 20 additions & 0 deletions
````diff
@@ -31,6 +31,7 @@
 <li><a href="#worker-aware-async-scheduler">Schedule jobs</a></li>
 <li><a href="#smtp-setup">Email Configuration</a></li>
 <li><a href="#uv-knowledge-and-inspirations">UV knowledge and inspirations</a></li>
+<li><a href="#large-language-model">Integration with local LLM</a></li>
 </ul>
 </li>
 <li><a href="#acknowledgments">Acknowledgments</a></li>
@@ -162,6 +163,24 @@ This service supports plaintext and HTML emails, and also allows sending templat
 It is implemented as a singleton to ensure that only one SMTP connection is maintained
 throughout the application lifecycle, optimizing resource usage.
 
+<p align="right">(<a href="#readme-top">back to top</a>)</p>
+
+### Large Language Model
+The `/v1/ml/chat/` endpoint is designed to handle chat-based interactions with the LLM model.
+It accepts a user prompt and streams responses back in real-time.
+The endpoint leverages FastAPI's asynchronous capabilities to efficiently manage multiple simultaneous requests,
+ensuring low latency and high throughput.
+
+FastAPI's async support is particularly beneficial for reducing I/O bottlenecks when connecting to the LLM model.
+By using asynchronous HTTP clients like `httpx`,
+the application can handle multiple I/O-bound tasks concurrently,
+such as sending requests to the LLM server and streaming responses back to the client.
+This approach minimizes idle time and optimizes resource utilization, making it ideal for high-performance applications.
+
+Install ollama and run the server
+```shell
+ollama run llama3.2
+```
 
 <p align="right">(<a href="#readme-top">back to top</a>)</p>
 
@@ -215,6 +234,7 @@ I've included a few of my favorites to kick things off!
 - **[DEC 16 2024]** bump project to Python 3.13 :fast_forward:
 - **[JAN 28 2025]** add SMTP setup :email:
 - **[MAR 8 2025]** switch from poetry to uv :fast_forward:
+- **[MAY 3 2025]** add large language model integration :robot:
 
 <p align="right">(<a href="#readme-top">back to top</a>)</p>
````

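The README text above describes the endpoint in prose only. As a rough orientation, the streaming route it refers to could be wired to the service in `app/services/llm.py` along these lines; this is a minimal sketch, assuming the `StreamLLMService` and `get_llm_service` helpers visible in the `tests/chat.py` diff further down, and an illustrative route path (the README advertises `/v1/ml/chat/`, while the new test script posts to `/chat/`).

```python
# Illustrative sketch only, not the repository's actual router code.
# Assumes StreamLLMService / get_llm_service as defined in app/services/llm.py
# (the full class is visible in the deleted tests/chat.py code below).
from fastapi import APIRouter, Depends, Form
from fastapi.responses import StreamingResponse

from app.services.llm import StreamLLMService, get_llm_service

router = APIRouter()


@router.post("/chat/")
async def chat(
    prompt: str = Form(...),  # matches data={"prompt": ...} sent by tests/chat.py
    llm: StreamLLMService = Depends(get_llm_service),
) -> StreamingResponse:
    # stream_chat() yields newline-delimited JSON chunks as bytes;
    # StreamingResponse forwards each chunk as soon as it is produced.
    return StreamingResponse(llm.stream_chat(prompt), media_type="application/x-ndjson")
```

Handing the async generator straight to `StreamingResponse` keeps the connection open and lets chunks reach the client as soon as the model produces them, which is the low-latency behaviour the README describes.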
app/services/llm.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -10,7 +10,7 @@ def __init__(self, base_url: str = "http://localhost:11434/v1"):
 
     async def stream_chat(self, prompt: str) -> AsyncGenerator[bytes, None]:
         """Stream chat completion responses from LLM."""
-        # Send user message first
+        # Send the user a message first
         user_msg = {
             "role": "user",
             "content": prompt,
```

tests/chat.py

Lines changed: 21 additions & 44 deletions
```diff
@@ -1,53 +1,30 @@
-from typing import Optional, AsyncGenerator
-
+import anyio
 import httpx
 import orjson
 
+async def chat_with_endpoint():
+    async with httpx.AsyncClient() as client:
+        while True:
+            # Get user input
+            prompt = input("\nYou: ")
+            if prompt.lower() == "exit":
+                break
 
-class StreamLLMService:
-    def __init__(self, base_url: str = "http://localhost:11434/v1"):
-        self.base_url = base_url
-        self.model = "llama3.2"
-
-    async def stream_chat(self, prompt: str) -> AsyncGenerator[bytes, None]:
-        """Stream chat completion responses from LLM."""
-        # Send user message first
-        user_msg = {
-            "role": "user",
-            "content": prompt,
-        }
-        yield orjson.dumps(user_msg) + b"\n"
-
-        # Open client as context manager and stream responses
-        async with httpx.AsyncClient(base_url=self.base_url) as client:
+            # Send request to the API
+            print("\nModel: ", end="", flush=True)
             async with client.stream(
                 "POST",
-                "/chat/completions",
-                json={
-                    "model": self.model,
-                    "messages": [{"role": "user", "content": prompt}],
-                    "stream": True,
-                },
-                timeout=60.0,
+                "http://localhost:8000/chat/",
+                data={"prompt": prompt},
+                timeout=60
             ) as response:
-                async for line in response.aiter_lines():
-                    print(line)
-                    if line.startswith("data: ") and line != "data: [DONE]":
+                async for chunk in response.aiter_lines():
+                    if chunk:
                         try:
-                            json_line = line[6:]  # Remove "data: " prefix
-                            data = orjson.loads(json_line)
-                            content = (
-                                data.get("choices", [{}])[0]
-                                .get("delta", {})
-                                .get("content", "")
-                            )
-                            if content:
-                                model_msg = {"role": "model", "content": content}
-                                yield orjson.dumps(model_msg) + b"\n"
-                        except Exception:
-                            pass
-
+                            data = orjson.loads(chunk)
+                            print(data["content"], end="", flush=True)
+                        except Exception as e:
+                            print(f"\nError parsing chunk: {e}")
 
-# FastAPI dependency
-def get_llm_service(base_url: Optional[str] = None) -> StreamLLMService:
-    return StreamLLMService(base_url=base_url or "http://localhost:11434/v1")
+if __name__ == "__main__":
+    anyio.run(chat_with_endpoint)
```

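Each line the new `tests/chat.py` parses is one NDJSON message produced by the service's `stream_chat` generator (the deleted code above shows the `user_msg` and `model_msg` payloads). The toy snippet below, with made-up literal lines rather than a live server, illustrates why `data["content"]` is all the client needs to print:

```python
# Toy illustration of the NDJSON lines the client consumes; the literal
# chunks below are invented, but their shape mirrors the user_msg / model_msg
# dicts yielded by stream_chat in the deleted tests/chat.py code above.
import orjson

example_lines = [
    b'{"role": "user", "content": "Hello"}',
    b'{"role": "model", "content": "Hi"}',
    b'{"role": "model", "content": " there!"}',
]

for line in example_lines:
    data = orjson.loads(line)
    print(data["content"], end="", flush=True)  # same access pattern as the client
print()
```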