diff --git a/ChatQnA/chatqna.py b/ChatQnA/chatqna.py index 2e462b0f6e..ac67f4a16c 100644 --- a/ChatQnA/chatqna.py +++ b/ChatQnA/chatqna.py @@ -3,10 +3,11 @@ import argparse import json +import logging import os import re -from comps import MegaServiceEndpoint, MicroService, ServiceOrchestrator, ServiceRoleType, ServiceType +from comps import CustomLogger, MegaServiceEndpoint, MicroService, ServiceOrchestrator, ServiceRoleType, ServiceType from comps.cores.mega.utils import handle_message from comps.cores.proto.api_protocol import ( ChatCompletionRequest, @@ -20,6 +21,10 @@ from fastapi.responses import StreamingResponse from langchain_core.prompts import PromptTemplate +logger = CustomLogger(__name__) +log_level = logging.DEBUG if os.getenv("LOGFLAG", "").lower() == "true" else logging.INFO +logging.basicConfig(level=log_level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") + class ChatTemplate: @staticmethod @@ -62,6 +67,10 @@ def generate_rag_prompt(question, documents): def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs): + logger.debug( + f"Aligning inputs for service: {self.services[cur_node].name}, type: {self.services[cur_node].service_type}" + ) + if self.services[cur_node].service_type == ServiceType.EMBEDDING: inputs["inputs"] = inputs["text"] del inputs["text"] @@ -83,6 +92,9 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k # next_inputs["repetition_penalty"] = inputs["repetition_penalty"] next_inputs["temperature"] = inputs["temperature"] inputs = next_inputs + + # Log the aligned inputs (be careful with sensitive data) + logger.debug(f"Aligned inputs for {self.services[cur_node].name}: {type(inputs)}") return inputs @@ -123,7 +135,9 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di elif input_variables == ["question"]: prompt = prompt_template.format(question=data["initial_query"]) else: - print(f"{prompt_template} not used, we only 
support 2 input variables ['question', 'context']") + logger.warning( + f"{prompt_template} not used, we only support 2 input variables ['question', 'context']" + ) prompt = ChatTemplate.generate_rag_prompt(data["initial_query"], docs) else: prompt = ChatTemplate.generate_rag_prompt(data["initial_query"], docs) @@ -152,7 +166,7 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di elif input_variables == ["question"]: prompt = prompt_template.format(question=prompt) else: - print(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']") + logger.warning(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']") prompt = ChatTemplate.generate_rag_prompt(prompt, reranked_docs) else: prompt = ChatTemplate.generate_rag_prompt(prompt, reranked_docs) @@ -171,27 +185,65 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di def align_generator(self, gen, **kwargs): - # OpenAI response format - # b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"},"logprobs":null,"finish_reason":null}]}\n\n' + """Aligns the generator output to match ChatQnA's format of sending bytes. + + Handles different LLM output formats (TGI, OpenAI) and properly filters + empty or null content chunks to avoid UI display issues. 
+ """ + # OpenAI response format example: + # b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct", + # "system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"}, + # "logprobs":null,"finish_reason":null}]}\n\n' + for line in gen: - line = line.decode("utf-8") - chunks = [chunk.strip() for chunk in line.split("\n\n") if chunk.strip()] - for line in chunks: + try: + line = line.decode("utf-8") start = line.find("{") end = line.rfind("}") + 1 + + # Skip lines with invalid JSON structure + if start == -1 or end <= start: + logger.debug("Skipping line with invalid JSON structure") + continue + json_str = line[start:end] - try: - # sometimes yield empty chunk, do a fallback here - json_data = json.loads(json_str) - if "ops" in json_data and "op" in json_data["ops"][0]: - if "value" in json_data["ops"][0] and isinstance(json_data["ops"][0]["value"], str): - yield f"data: {repr(json_data['ops'][0]['value'].encode('utf-8'))}\n\n" - else: - pass - elif "content" in json_data["choices"][0]["delta"]: - yield f"data: {repr(json_data['choices'][0]['delta']['content'].encode('utf-8'))}\n\n" - except Exception as e: - yield f"data: {repr(json_str.encode('utf-8'))}\n\n" + + # Parse the JSON data + json_data = json.loads(json_str) + + # Handle TGI format responses + if "ops" in json_data and "op" in json_data["ops"][0]: + if "value" in json_data["ops"][0] and isinstance(json_data["ops"][0]["value"], str): + yield f"data: {repr(json_data['ops'][0]['value'].encode('utf-8'))}\n\n" + # Empty value chunks are silently skipped + + # Handle OpenAI format responses + elif "choices" in json_data and len(json_data["choices"]) > 0: + # Only yield content if it exists and is not null + if ( + "delta" in json_data["choices"][0] + and "content" in json_data["choices"][0]["delta"] + and json_data["choices"][0]["delta"]["content"] is not None + ): + content = 
json_data["choices"][0]["delta"]["content"] + yield f"data: {repr(content.encode('utf-8'))}\n\n" + # Null content chunks are silently skipped + elif ( + "delta" in json_data["choices"][0] + and "content" in json_data["choices"][0]["delta"] + and json_data["choices"][0]["delta"]["content"] is None + ): + logger.debug("Skipping null content chunk") + + except json.JSONDecodeError as e: + # Log the error with the problematic JSON string for better debugging + logger.error(f"JSON parsing error in align_generator: {e}\nProblematic JSON: {json_str[:200]}") + # Skip sending invalid JSON to avoid UI issues + continue + except Exception as e: + logger.error(f"Unexpected error in align_generator: {e}, line snippet: {line[:100]}...") + # Skip sending to avoid UI issues + continue yield "data: [DONE]\n\n" diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README_endpoint_openai.md b/ChatQnA/docker_compose/intel/cpu/xeon/README_endpoint_openai.md new file mode 100644 index 0000000000..f9e84d07ec --- /dev/null +++ b/ChatQnA/docker_compose/intel/cpu/xeon/README_endpoint_openai.md @@ -0,0 +1,453 @@ +# Build Mega Service of ChatQnA on Xeon with an LLM Endpoint + +This document outlines the single node deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservices on Intel Xeon server. The steps include pulling Docker images, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank` and `llm`. + +## Table of contents + +1. [ChatQnA Quick Start Deployment](#chatqna-quick-start-Deployment) +2. [ChatQnA Docker Compose file Options](#chatqna-docker-compose-files) +3. [ChatQnA with Conversational UI](#chatqna-with-conversational-ui-optional) + +## ChatQnA Quick Start Deployment + +This section describes how to quickly deploy and test the ChatQnA service manually on an Intel® Xeon® processor. The basic steps are: + +1. 
[Access the Code](#access-the-code) +2. [Generate a HuggingFace Access Token](#generate-a-huggingface-access-token) +3. [Configure the Deployment Environment](#configure-the-deployment-environment) +4. [Deploy the Services Using Docker Compose](#deploy-the-services-using-docker-compose) +5. [Check the Deployment Status](#check-the-deployment-status) +6. [Test the Pipeline](#test-the-pipeline) +7. [Cleanup the Deployment](#cleanup-the-deployment) + +### Access the Code + +Clone the GenAIExamples repository and access the ChatQnA Intel® Xeon® platform Docker Compose files and supporting scripts: + +``` +git clone https://github.com/opea-project/GenAIComps +cd GenAIComps + +# Build the opea/llm-textgen image. + +docker build \ + --no-cache \ + --build-arg https_proxy=$https_proxy \ + --build-arg http_proxy=$http_proxy \ + -t opea/llm-textgen:latest \ + -f comps/llms/src/text-generation/Dockerfile . + + +cd ../ +git clone https://github.com/opea-project/GenAIExamples.git +cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/ +``` + +### Generate a HuggingFace Access Token + +Some HuggingFace resources, such as some models, are only accessible if the developer has an access token. In the absence of a HuggingFace access token, the developer can create one by first creating an account by following the steps provided at [HuggingFace](https://huggingface.co/) and then generating a [user access token](https://huggingface.co/docs/transformers.js/en/guides/private#step-1-generating-a-user-access-token). + +## Endpoint Access + +An OpenAI-compatible endpoint is required, e.g., OpenRouter.ai. Please obtain a valid API key. 
+ +### Configure the Deployment Environment + +To set up environment variables for deploying ChatQnA services, set up some parameters specific to the deployment environment and source the _set_env.sh_ script in this directory: + +```bash +cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon +source set_env.sh # source environment variables then override below. + +export host_ip="External_Public_IP" # e.g. export host_ip=$(hostname -I | awk '{print $1}') +export HF_TOKEN="Your_Huggingface_API_Token" +export OPENAI_API_KEY="key for openAI-like endpoint" + +export LLM_MODEL_ID="" # e.g. "google/gemma-3-1b-it:free" +export LLM_ENDPOINT="" # e.g. "https://openrouter.ai/api" (please make sure to omit /v1 suffix) +export no_proxy="" # Can set if any no proxy variables. See set_env.sh +``` + +Consult the section on [ChatQnA Service configuration](#chatqna-configuration) for information on how service specific configuration parameters affect deployments. + +### Deploy the Services Using Docker Compose + +To deploy the ChatQnA services, execute the `docker compose up` command with the appropriate arguments. For the endpoint-based deployment, execute the command below. It uses the `compose_endpoint_openai.yaml` file. + +```bash +NGINX_PORT=8080 docker compose -f compose_endpoint_openai.yaml up -d +``` + +Setting NGINX_PORT=8080 lets you access the chat console on localhost:8080, since port 80 may already be in use on the host. + +To enable OpenTelemetry tracing, the compose.telemetry.yaml file needs to be merged with the compose_endpoint_openai.yaml file. +CPU example with the OpenTelemetry feature: + +> NOTE: To get the supported Grafana dashboards, run download_opea_dashboard.sh as shown in the commands below. 
+ +```bash +./grafana/dashboards/download_opea_dashboard.sh +NGINX_PORT=8080 docker compose -f compose_endpoint_openai.yaml -f compose.telemetry.yaml up -d +``` + +**Note**: developers should build docker image from source when: + +- Developing off the git main branch (as the container's ports in the repo may be different from the published docker image). +- Unable to download the docker image. +- Use a specific version of Docker image. + +Please refer to the table below to build different microservices from source: + +| Microservice | Deployment Guide | +| ------------ | --------------------------------------------------------------------------------------------- | +| Dataprep | https://github.com/opea-project/GenAIComps/tree/main/comps/dataprep | +| Embedding | https://github.com/opea-project/GenAIComps/tree/main/comps/embeddings | +| Retriever | https://github.com/opea-project/GenAIComps/tree/main/comps/retrievers | +| Reranker | https://github.com/opea-project/GenAIComps/tree/main/comps/rerankings | +| LLM | https://github.com/opea-project/GenAIComps/tree/main/comps/llms | +| Megaservice | [Megaservice build guide](../../../../README_miscellaneous.md#build-megaservice-docker-image) | +| UI | [Basic UI build guide](../../../../README_miscellaneous.md#build-ui-docker-image) | + +### Check the Deployment Status + +After running docker compose, check if all the containers launched via docker compose have started: + +``` +docker ps -a +``` + +For the endpoint-based deployment, the following 9 containers should be running: + +```bash +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +04f0e3607457 opea/nginx:${RELEASE_VERSION} "/docker-entrypoint.…" 17 minutes ago Up 16 minutes 0.0.0.0:8080->80/tcp, [::]:8080->80/tcp chatqna-xeon-nginx-server +6d7fe1bfd0a5 opea/chatqna-ui:${RELEASE_VERSION} "docker-entrypoint.s…" 17 minutes ago Up 16 minutes 0.0.0.0:5173->5173/tcp, :::5173->5173/tcp chatqna-xeon-ui-server +71d01fe8bc94 opea/chatqna:${RELEASE_VERSION} "python 
chatqna.py" 17 minutes ago Up 16 minutes 0.0.0.0:8888->8888/tcp, :::8888->8888/tcp chatqna-xeon-backend-server +ea12fab1c70e opea/retriever:${RELEASE_VERSION} "python opea_retriev…" 17 minutes ago Up 17 minutes 0.0.0.0:7000->7000/tcp, :::7000->7000/tcp retriever-redis-server +253622403ed6 opea/dataprep:${RELEASE_VERSION} "sh -c 'python $( [ …" 17 minutes ago Up 17 minutes (healthy) 0.0.0.0:6007->5000/tcp, [::]:6007->5000/tcp dataprep-redis-server +a552cf4f0dd0 redis/redis-stack:7.2.0-v9 "/entrypoint.sh" 17 minutes ago Up 17 minutes (healthy) 0.0.0.0:6379->6379/tcp, :::6379->6379/tcp, 0.0.0.0:8001->8001/tcp, :::8001->8001/tcp redis-vector-db +6795a52137f7 ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 "text-embeddings-rou…" 17 minutes ago Up 17 minutes 0.0.0.0:6006->80/tcp, [::]:6006->80/tcp tei-embedding-server +3e55313e714b opea/llm-textgen:${RELEASE_VERSION} "bash entrypoint.sh" 17 minutes ago Up 17 minutes 0.0.0.0:9000->9000/tcp, :::9000->9000/tcp textgen-service-endpoint-openai +10318f82c943 ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 "text-embeddings-rou…" 17 minutes ago Up 17 minutes 0.0.0.0:8808->80/tcp, [::]:8808->80/tcp tei-reranking-server +``` + +If any issues are encountered during deployment, refer to the [troubleshooting](../../../../README_miscellaneous.md#troubleshooting) section. + +### Test the Pipeline + +Once the ChatQnA services are running, test the pipeline using the following command. This will send a sample query to the ChatQnA service and return a response. + +```bash +curl http://${host_ip}:8888/v1/chatqna \ + -H "Content-Type: application/json" \ + -d '{ + "messages": "What is the revenue of Nike in 2023?" + }' +``` + +**Note**: Access the ChatQnA UI by web browser through this URL: `http://${host_ip}:8080`. Please confirm the `8080` port is open in the firewall. To validate each microservice used in the pipeline, refer to the [Validate microservices](#validate-microservices) section. 
+ +### Cleanup the Deployment + +To stop the containers associated with the deployment, execute the following command: + +``` +docker compose -f compose_endpoint_openai.yaml down +``` + +## ChatQnA Docker Compose Files + +In the context of deploying a ChatQnA pipeline on an Intel® Xeon® platform, we can pick and choose different vector databases, large language model serving frameworks, and remove pieces of the pipeline such as the reranker. The table below outlines the various configurations that are available as part of the application. These configurations can be used as templates and can be extended to different components available in [GenAIComps](https://github.com/opea-project/GenAIComps.git). + +| File | Description | +| -------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [compose.yaml](./compose.yaml) | Default compose file using vllm as serving framework and redis as vector database | +| [compose_endpoint_openai.yaml](./compose_endpoint_openai.yaml) | Uses OpenAI-compatible endpoint (remote or local) as LLM serving framework with redis as vector database. | +| [compose_milvus.yaml](./compose_milvus.yaml) | Uses Milvus as the vector database. All other configurations remain the same as the default | +| [compose_pinecone.yaml](./compose_pinecone.yaml) | Uses Pinecone as the vector database. All other configurations remain the same as the default. For more details, refer to [README_pinecone.md](./README_pinecone.md). | +| [compose_qdrant.yaml](./compose_qdrant.yaml) | Uses Qdrant as the vector database. All other configurations remain the same as the default. For more details, refer to [README_qdrant.md](./README_qdrant.md). | +| [compose_tgi.yaml](./compose_tgi.yaml) | Uses TGI as the LLM serving framework. 
All other configurations remain the same as the default | +| [compose_without_rerank.yaml](./compose_without_rerank.yaml) | Default configuration without the reranker | +| [compose_faqgen.yaml](./compose_faqgen.yaml) | Enables FAQ generation using vLLM as the LLM serving framework. For more details, refer to [README_faqgen.md](./README_faqgen.md). | +| [compose_faqgen_tgi.yaml](./compose_faqgen_tgi.yaml) | Enables FAQ generation using TGI as the LLM serving framework. For more details, refer to [README_faqgen.md](./README_faqgen.md). | +| [compose.telemetry.yaml](./compose.telemetry.yaml) | Helper file for telemetry features for vllm. Can be used along with any compose files that serves vllm | +| [compose_tgi.telemetry.yaml](./compose_tgi.telemetry.yaml) | Helper file for telemetry features for tgi. Can be used along with any compose files that serves tgi | +| [compose_mariadb.yaml](./compose_mariadb.yaml) | Uses MariaDB Server as the vector database. All other configurations remain the same as the default | + +## ChatQnA with Conversational UI (Optional) + +To access the Conversational UI (react based) frontend, modify the UI service in the `compose` file used to deploy. Replace `chatqna-xeon-ui-server` service with the `chatqna-xeon-conversation-ui-server` service as per the config below: + +```yaml +chatqna-xeon-conversation-ui-server: + image: opea/chatqna-conversation-ui:latest + container_name: chatqna-xeon-conversation-ui-server + environment: + - APP_BACKEND_SERVICE_ENDPOINT=${BACKEND_SERVICE_ENDPOINT} + - APP_DATA_PREP_SERVICE_URL=${DATAPREP_SERVICE_ENDPOINT} + ports: + - "5174:80" + depends_on: + - chatqna-xeon-backend-server + ipc: host + restart: always +``` + +Once the services are up, open the following URL in the browser: http://{host_ip}:5174. By default, the UI runs on port 80 internally. 
If the developer prefers to use a different host port to access the frontend, it can be modified by port mapping in the `compose.yaml` file as shown below: + +```yaml + chatqna-xeon-conversation-ui-server: + image: opea/chatqna-conversation-ui:latest + ... + ports: + - "80:80" +``` + +Here is an example of running ChatQnA (default UI): + +![project-screenshot](../../../../assets/img/chat_ui_response.png) + +Here is an example of running ChatQnA with Conversational UI (React): + +![project-screenshot](../../../../assets/img/conversation_ui_response.png) + +### Validate Microservices + +Note, when verifying the microservices by curl or API from remote client, please make sure the **ports** of the microservices are opened in the firewall of the cloud node. +Follow the instructions to validate MicroServices. +For details on how to verify the correctness of the response, refer to [how-to-validate_service](../../hpu/gaudi/how_to_validate_service.md). + +1. **TEI Embedding Service** + Send a test request to the TEI Embedding Service to ensure it is running correctly: + + ```bash + curl http://${host_ip}:6006/embed \ + -X POST \ + -d '{"inputs":"What is Deep Learning?"}' \ + -H 'Content-Type: application/json' + ``` + + If you receive a connection error, ensure that the service is running and the port 6006 is open in the firewall. + +2. **Retriever Microservice** + + To consume the retriever microservice, you need to generate a mock embedding vector by Python script. The length of embedding vector + is determined by the embedding model. + Here we use the model `EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"`, which vector size is 768. + + Check the vector dimension of your embedding model, set `your_embedding` dimension equal to it. 
+ + ```bash + export your_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") + curl http://${host_ip}:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"test\",\"embedding\":${your_embedding}}" \ + -H 'Content-Type: application/json' + ``` + + If the response indicates an invalid embedding vector, verify that the vector size matches the model's expected dimension. + +3. **TEI Reranking Service** + + To test the TEI Reranking Service, use the following `curl` command: + + > Skip for ChatQnA without Rerank pipeline + + ```bash + curl http://${host_ip}:8808/rerank \ + -X POST \ + -d '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' \ + -H 'Content-Type: application/json' + ``` + +4. **LLM Backend Service** + + In the first startup, this service will take more time to download, load and warm up the model. After it's finished, the service will be ready. + + Try the command below to check whether the LLM serving is ready. + + ```bash + docker logs textgen-service-endpoint-openai 2>&1 | grep complete + # If the service is ready, you will get the response like below. + INFO: Application startup complete. + ``` + + Then try the `cURL` command below to validate services. + +You may also test your underlying LLM endpoint. E.g., if OpenRouter.ai: + +```bash +curl https://openrouter.ai/api/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $OPENAI_API_KEY" \ + -d '{ + "model": ${LLM_MODEL_ID}, + "messages": [ + { + "role": "user", + "content": "What is the meaning of life?" + } + ] +}' +``` + +To test the OPEA service that is based on the above: + +```bash + curl http://${host_ip}:9000/v1/chat/completions \ + -X POST \ + -d '{"model": "${LLM_MODEL_ID}", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ + -H 'Content-Type: application/json' +``` + +5. 
**MegaService** + + Use the following `curl` command to test the MegaService: + + ```bash + curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{ + "messages": "What is the revenue of Nike in 2023?" + }' + ``` + +6. **Nginx Service** + + Use the following curl command to test the Nginx Service: + + ```bash + curl http://${host_ip}:${NGINX_PORT}/v1/chatqna \ + -H "Content-Type: application/json" \ + -d '{"messages": "What is the revenue of Nike in 2023?"}' + ``` + +7. **Dataprep Microservice (Optional)** + + If you want to update the default knowledge base, you can use the following commands: + + Update Knowledge Base via Local File [nke-10k-2023.pdf](https://github.com/opea-project/GenAIComps/blob/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf). Or + click [here](https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf) to download the file via any web browser. + Or run this command to get the file on a terminal. + + ```bash + wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf + ``` + + Upload: + + ```bash + curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./nke-10k-2023.pdf" + ``` + + This command updates a knowledge base by uploading a local file for processing. Update the file path according to your environment. + + Add Knowledge Base via HTTP Links: + + ```bash + curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \ + -H "Content-Type: multipart/form-data" \ + -F 'link_list=["https://opea.dev"]' + ``` + + This command updates a knowledge base by submitting a list of HTTP links for processing. + + You can also retrieve the list of files that you uploaded: + + ```bash + curl -X POST "http://${host_ip}:6007/v1/dataprep/get" \ + -H "Content-Type: application/json" + ``` + + The response is a JSON list like the one below. 
Notice that the returned `name`/`id` of the uploaded link is `https://xxx.txt`. + + ```json + [ + { + "name": "nke-10k-2023.pdf", + "id": "nke-10k-2023.pdf", + "type": "File", + "parent": "" + }, + { + "name": "https://opea.dev.txt", + "id": "https://opea.dev.txt", + "type": "File", + "parent": "" + } + ] + ``` + + To delete the file/link you uploaded: + + The `file_path` here should be the `id` returned by the `/v1/dataprep/get` API. + + ```bash + # delete link + curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \ + -d '{"file_path": "https://opea.dev.txt"}' \ + -H "Content-Type: application/json" + + # delete file + curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \ + -d '{"file_path": "nke-10k-2023.pdf"}' \ + -H "Content-Type: application/json" + + # delete all uploaded files and links + curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \ + -d '{"file_path": "all"}' \ + -H "Content-Type: application/json" + ``` + +### Profile Microservices + +To further analyze MicroService Performance, users could follow the instructions to profile MicroServices. + +#### 1. LLM Endpoint Service + +Users can profile the performance of the endpoint service using standard HTTP/network profiling tools such as: + +- cURL timing statistics +- Browser developer tools +- Network monitoring tools + +Example using cURL with timing data: + +```bash +curl -w "\nTime Statistics:\n-----------------\n\ +DNS Lookup: %{time_namelookup}s\n\ +TCP Connect: %{time_connect}s\n\ +TLS Handshake: %{time_appconnect}s\n\ +First Byte: %{time_starttransfer}s\n\ +Total Time: %{time_total}s\n" \ +-H "Content-Type: application/json" \ +-H "Authorization: Bearer $OPENAI_API_KEY" \ +-d '{ + "model": "${LLM_MODEL_ID}", + "messages": [ + { + "role": "user", + "content": "What is machine learning?" 
+ } + ] +}' \ +${LLM_ENDPOINT}/v1/chat/completions +``` + +You can also use tools like `ab` (Apache Benchmark) for load testing: + +```bash +ab -n 100 -c 10 -p payload.json -T 'application/json' \ + -H "Authorization: Bearer $OPENAI_API_KEY" \ + ${LLM_ENDPOINT}/v1/chat/completions +``` + +For detailed API latency monitoring, consider using: + +- Grafana for visualization +- Prometheus for metrics collection +- OpenTelemetry for distributed tracing + +## Conclusion + +This guide should enable developer to deploy the default configuration or any of the other compose yaml files for different configurations. It also highlights the configurable parameters that can be set before deployment. diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_endpoint_openai.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_endpoint_openai.yaml new file mode 100644 index 0000000000..ed8045276e --- /dev/null +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_endpoint_openai.yaml @@ -0,0 +1,173 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + redis-vector-db: + image: redis/redis-stack:7.2.0-v9 + container_name: redis-vector-db + ports: + - "6379:6379" + - "8001:8001" + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 10 + dataprep-redis-service: + image: ${REGISTRY:-opea}/dataprep:${TAG:-latest} + container_name: dataprep-redis-server + depends_on: + redis-vector-db: + condition: service_healthy + tei-embedding-service: + condition: service_started + ports: + - "6007:5000" + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + REDIS_URL: redis://redis-vector-db:6379 + REDIS_HOST: redis-vector-db + INDEX_NAME: ${INDEX_NAME} + TEI_ENDPOINT: http://tei-embedding-service:80 + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"] + interval: 10s + timeout: 5s + retries: 50 + 
restart: unless-stopped + tei-embedding-service: + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.7 + container_name: tei-embedding-server + ports: + - "6006:80" + volumes: + - "${MODEL_CACHE:-./data}:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate + retriever: + image: ${REGISTRY:-opea}/retriever:${TAG:-latest} + container_name: retriever-redis-server + depends_on: + - redis-vector-db + ports: + - "7000:7000" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + REDIS_URL: redis://redis-vector-db:6379 + REDIS_HOST: redis-vector-db + INDEX_NAME: ${INDEX_NAME} + TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80 + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + LOGFLAG: ${LOGFLAG} + RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS" + restart: unless-stopped + tei-reranking-service: + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.7 + container_name: tei-reranking-server + ports: + - "8808:80" + volumes: + - "${MODEL_CACHE:-./data}:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + command: --model-id ${RERANK_MODEL_ID} --auto-truncate + # Substitute vllm with OpeaTextGenService + textgen-service-endpoint-openai: # Used instead of vllm + image: opea/llm-textgen:${TAG:-latest} # Changed image + container_name: textgen-service-endpoint-openai # Updated container name + ipc: host + ports: + - "9000:9000" # Changed port mapping + environment: + LLM_COMPONENT_NAME: OpeaTextGenService + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + LLM_MODEL_ID: ${LLM_MODEL_ID} # Set to model ID + LLM_ENDPOINT: ${LLM_ENDPOINT} # An openai compatible endpoint, e.g. 
Hugging Face, OpenRouter, OpenAI + OPENAI_API_KEY: ${OPENAI_API_KEY} # Add OpenRouter API Key + chatqna-xeon-backend-server: + image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} + container_name: chatqna-xeon-backend-server + depends_on: + redis-vector-db: + condition: service_started + dataprep-redis-service: + condition: service_healthy + tei-embedding-service: + condition: service_started + retriever: + condition: service_started + tei-reranking-service: + condition: service_started + textgen-service-endpoint-openai: + condition: service_started + ports: + - "8888:8888" + environment: + - no_proxy=${no_proxy} + - https_proxy=${https_proxy} + - http_proxy=${http_proxy} + - MEGA_SERVICE_HOST_IP=chatqna-xeon-backend-server + - EMBEDDING_SERVER_HOST_IP=tei-embedding-service + - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-80} + - RETRIEVER_SERVICE_HOST_IP=retriever + - RERANK_SERVER_HOST_IP=tei-reranking-service + - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80} + - LLM_SERVER_HOST_IP=textgen-service-endpoint-openai # Updated host IP + - LLM_SERVER_PORT=${LLM_SERVER_PORT:-9000} + - LLM_MODEL=${LLM_MODEL_ID} + - LOGFLAG=${LOGFLAG} + ipc: host + restart: always + chatqna-xeon-ui-server: + image: ${REGISTRY:-opea}/chatqna-ui:${TAG:-latest} + container_name: chatqna-xeon-ui-server + depends_on: + - chatqna-xeon-backend-server + ports: + - "5173:5173" + environment: + - no_proxy=${no_proxy} + - https_proxy=${https_proxy} + - http_proxy=${http_proxy} + ipc: host + restart: always + chatqna-xeon-nginx-server: + image: ${REGISTRY:-opea}/nginx:${TAG:-latest} + container_name: chatqna-xeon-nginx-server + depends_on: + - chatqna-xeon-backend-server + - chatqna-xeon-ui-server + ports: + - "${NGINX_PORT:-80}:80" + environment: + - no_proxy=${no_proxy} + - https_proxy=${https_proxy} + - http_proxy=${http_proxy} + - FRONTEND_SERVICE_IP=chatqna-xeon-ui-server + - FRONTEND_SERVICE_PORT=5173 + - BACKEND_SERVICE_NAME=chatqna + - BACKEND_SERVICE_IP=chatqna-xeon-backend-server + - 
BACKEND_SERVICE_PORT=8888 + - DATAPREP_SERVICE_IP=dataprep-redis-service + - DATAPREP_SERVICE_PORT=5000 + ipc: host + restart: always + +networks: + default: + driver: bridge