Skip to content

Commit 15da47d

Browse files
Yuval-Roth, olgaoznovich, omerdor001, adiaybgu
committed
Optimizations and fixes
* Updates to EmbeddingDispatcher: Added catch for exceptions in worker, set the processes to run in high priority * Offloaded some CPU-intensive and blocking code in adapt_insert and adapt_query to a background thread instead of having it run on the main asyncio event-loop * Fixed not inserting into memory cache after memory cache miss. * Fixes in WTINYLFU memory cache class. * Replaced hardcoded similarity threshold in cosine similarity with dynamic value Co-authored-by: olgaoznovich <ol.oznovich@gmail.com> Co-authored-by: Yuval-Roth <rothyuv@post.bgu.ac.il> Co-authored-by: omerdor001 <omerdo@post.bgu.ac.il> Co-authored-by: adiaybgu <adiay@post.bgu.ac.il>
1 parent d8afc32 commit 15da47d

File tree

13 files changed

+116
-76
lines changed

13 files changed

+116
-76
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ celerybeat.pid
9393

9494
# Environments
9595
.env
96-
.venv
96+
.venv*
9797
env/
9898
venv/
9999
ENV/

modelcache/adapter/adapter.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,9 @@ async def create_insert(cls, *args, **kwargs):
3535
return str(e)
3636

3737
@classmethod
38-
def create_remove(cls, *args, **kwargs):
38+
async def create_remove(cls, *args, **kwargs):
3939
try:
40-
return adapt_remove(
40+
return await adapt_remove(
4141
*args,
4242
**kwargs
4343
)
@@ -46,9 +46,9 @@ def create_remove(cls, *args, **kwargs):
4646
return str(e)
4747

4848
@classmethod
49-
def create_register(cls, *args, **kwargs):
49+
async def create_register(cls, *args, **kwargs):
5050
try:
51-
return adapt_register(
51+
return await adapt_register(
5252
*args,
5353
**kwargs
5454
)

modelcache/adapter/adapter_insert.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ async def adapt_insert(*args, **kwargs):
1616

1717
pre_embedding_data_list = []
1818
embedding_futures_list = []
19-
# embedding_data_list = []
2019
llm_data_list = []
2120

2221
for row in chat_info:
@@ -37,7 +36,8 @@ async def adapt_insert(*args, **kwargs):
3736

3837
embedding_data_list = await asyncio.gather(*embedding_futures_list)
3938

40-
chat_cache.data_manager.save(
39+
await asyncio.to_thread(
40+
chat_cache.data_manager.save,
4141
pre_embedding_data_list,
4242
llm_data_list,
4343
embedding_data_list,

modelcache/adapter/adapter_query.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# -*- coding: utf-8 -*-
2+
import asyncio
23
import logging
34
from modelcache.embedding import MetricType
45
from modelcache.utils.time import time_cal
@@ -24,17 +25,20 @@ async def adapt_query(cache_data_convert, *args, **kwargs):
2425
cache_obj=chat_cache
2526
)(pre_embedding_data)
2627

27-
cache_data_list = time_cal(
28+
search_time_cal = time_cal(
2829
chat_cache.data_manager.search,
2930
func_name="vector_search",
3031
report_func=chat_cache.report.search,
3132
cache_obj=chat_cache
32-
)(
33+
)
34+
cache_data_list = await asyncio.to_thread(
35+
search_time_cal,
3336
embedding_data,
3437
extra_param=context.get("search_func", None),
3538
top_k=kwargs.pop("top_k", -1),
3639
model=model
3740
)
41+
3842
cache_answers = []
3943
cache_questions = []
4044
cache_ids = []
@@ -43,7 +47,7 @@ async def adapt_query(cache_data_convert, *args, **kwargs):
4347
if chat_cache.similarity_metric_type == MetricType.COSINE:
4448
cosine_similarity = cache_data_list[0][0]
4549
# This code uses the built-in cosine similarity evaluation in milvus
46-
if cosine_similarity < 0.9:
50+
if cosine_similarity < chat_cache.similarity_threshold:
4751
return None
4852
elif chat_cache.similarity_metric_type == MetricType.L2:
4953
## this is the code that uses L2 for similarity evaluation
@@ -87,8 +91,9 @@ async def adapt_query(cache_data_convert, *args, **kwargs):
8791
reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=False)
8892
for cache_data in cache_data_list:
8993
primary_id = cache_data[1]
90-
ret = chat_cache.data_manager.get_scalar_data(
91-
cache_data, extra_param=context.get("get_scalar_data", None),model=model
94+
ret = await asyncio.to_thread(
95+
chat_cache.data_manager.get_scalar_data,
96+
cache_data, extra_param=context.get("get_scalar_data", None), model=model
9297
)
9398
if ret is None:
9499
continue
@@ -133,8 +138,9 @@ async def adapt_query(cache_data_convert, *args, **kwargs):
133138
# 不使用 reranker 时,走原来的逻辑
134139
for cache_data in cache_data_list:
135140
primary_id = cache_data[1]
136-
ret = chat_cache.data_manager.get_scalar_data(
137-
cache_data, extra_param=context.get("get_scalar_data", None),model=model
141+
ret = await asyncio.to_thread(
142+
chat_cache.data_manager.get_scalar_data,
143+
cache_data, extra_param=context.get("get_scalar_data", None), model=model
138144
)
139145
if ret is None:
140146
continue
@@ -204,7 +210,7 @@ async def adapt_query(cache_data_convert, *args, **kwargs):
204210
)
205211
# 更新命中次数
206212
try:
207-
chat_cache.data_manager.update_hit_count(return_id)
213+
asyncio.create_task(asyncio.to_thread(chat_cache.data_manager.update_hit_count,return_id))
208214
except Exception:
209215
logging.info('update_hit_count except, please check!')
210216

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
11
# -*- coding: utf-8 -*-
2+
import asyncio
23

34

4-
def adapt_register(*args, **kwargs):
5+
async def adapt_register(*args, **kwargs):
56
chat_cache = kwargs.pop("cache_obj")
67
model = kwargs.pop("model", None)
78
if model is None or len(model) == 0:
89
return ValueError('')
910

10-
register_resp = chat_cache.data_manager.create_index(model)
11+
register_resp = await asyncio.to_thread(
12+
chat_cache.data_manager.create_index,
13+
model
14+
)
15+
1116
return register_resp

modelcache/adapter/adapter_remove.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
# -*- coding: utf-8 -*-
2-
from modelcache.utils.error import NotInitError, RemoveError
2+
import asyncio
33

4+
from modelcache.utils.error import RemoveError
45

5-
def adapt_remove(*args, **kwargs):
6+
7+
async def adapt_remove(*args, **kwargs):
68
chat_cache = kwargs.pop("cache_obj")
79
model = kwargs.pop("model", None)
810
remove_type = kwargs.pop("remove_type", None)
@@ -13,9 +15,15 @@ def adapt_remove(*args, **kwargs):
1315
# delete data
1416
if remove_type == 'delete_by_id':
1517
id_list = kwargs.pop("id_list", [])
16-
resp = chat_cache.data_manager.delete(id_list, model=model)
18+
resp = await asyncio.to_thread(
19+
chat_cache.data_manager.delete,
20+
id_list, model=model
21+
)
1722
elif remove_type == 'truncate_by_model':
18-
resp = chat_cache.data_manager.truncate(model)
23+
resp = await asyncio.to_thread(
24+
chat_cache.data_manager.truncate,
25+
model
26+
)
1927
else:
2028
# resp = "remove_type_error"
2129
raise RemoveError()

modelcache/cache.py

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,6 @@
2727
#==================== Cache class definition =========================#
2828
#=====================================================================#
2929

30-
executor = ThreadPoolExecutor(max_workers=2)
31-
32-
def response_text(cache_resp):
33-
return cache_resp['data']
34-
35-
def response_hitquery(cache_resp):
36-
return cache_resp['hitQuery']
3730

3831
# noinspection PyMethodMayBeStatic
3932
class Cache:
@@ -80,11 +73,16 @@ def close():
8073
modelcache_log.error(e)
8174

8275
def save_query_resp(self, query_resp_dict, **kwargs):
83-
self.data_manager.save_query_resp(query_resp_dict, **kwargs)
76+
asyncio.create_task(asyncio.to_thread(
77+
self.data_manager.save_query_resp,
78+
query_resp_dict, **kwargs
79+
))
8480

8581
def save_query_info(self,result, model, query, delta_time_log):
86-
self.data_manager.save_query_resp(result, model=model, query=json.dumps(query, ensure_ascii=False),
87-
delta_time=delta_time_log)
82+
asyncio.create_task(asyncio.to_thread(
83+
self.data_manager.save_query_resp,
84+
result, model=model, query=json.dumps(query, ensure_ascii=False), delta_time=delta_time_log
85+
))
8886

8987
async def handle_request(self, param_dict: dict):
9088
# param parsing
@@ -103,7 +101,7 @@ async def handle_request(self, param_dict: dict):
103101
result = {"errorCode": 102,
104102
"errorDesc": "type exception, should one of ['query', 'insert', 'remove', 'register']",
105103
"cacheHit": False, "delta_time": 0, "hit_query": '', "answer": ''}
106-
self.data_manager.save_query_resp(result, model=model, query='', delta_time=0)
104+
self.save_query_resp(result, model=model, query='', delta_time=0)
107105
return result
108106
except Exception as e:
109107
return {"errorCode": 103, "errorDesc": str(e), "cacheHit": False, "delta_time": 0, "hit_query": '',
@@ -120,14 +118,14 @@ async def handle_request(self, param_dict: dict):
120118
elif request_type == 'insert':
121119
return await self.handle_insert(chat_info, model)
122120
elif request_type == 'remove':
123-
return self.handle_remove(model, param_dict)
121+
return await self.handle_remove(model, param_dict)
124122
elif request_type == 'register':
125-
return self.handle_register(model)
123+
return await self.handle_register(model)
126124
else:
127125
return {"errorCode": 400, "errorDesc": "bad request"}
128126

129-
def handle_register(self, model):
130-
response = adapter.ChatCompletion.create_register(
127+
async def handle_register(self, model):
128+
response = await adapter.ChatCompletion.create_register(
131129
model=model,
132130
cache_obj=self
133131
)
@@ -137,10 +135,10 @@ def handle_register(self, model):
137135
result = {"errorCode": 502, "errorDesc": "", "response": response, "writeStatus": "exception"}
138136
return result
139137

140-
def handle_remove(self, model, param_dict):
138+
async def handle_remove(self, model, param_dict):
141139
remove_type = param_dict.get("remove_type")
142140
id_list = param_dict.get("id_list", [])
143-
response = adapter.ChatCompletion.create_remove(
141+
response = await adapter.ChatCompletion.create_remove(
144142
model=model,
145143
remove_type=remove_type,
146144
id_list=id_list,
@@ -191,12 +189,12 @@ async def handle_query(self, model, query):
191189
result = {"errorCode": 201, "errorDesc": response, "cacheHit": False, "delta_time": delta_time,
192190
"hit_query": '', "answer": ''}
193191
else:
194-
answer = response_text(response)
195-
hit_query = response_hitquery(response)
192+
answer = response['data']
193+
hit_query = response['hitQuery']
196194
result = {"errorCode": 0, "errorDesc": '', "cacheHit": True, "delta_time": delta_time,
197195
"hit_query": hit_query, "answer": answer}
198196
delta_time_log = round(time.time() - start_time, 2)
199-
executor.submit(self.save_query_info, result, model, query, delta_time_log)
197+
self.save_query_info(result, model, query, delta_time_log)
200198
except Exception as e:
201199
result = {"errorCode": 202, "errorDesc": str(e), "cacheHit": False, "delta_time": 0,
202200
"hit_query": '', "answer": ''}
@@ -265,7 +263,9 @@ async def init(
265263
#==================================================#
266264

267265
# switching based on embedding_model
268-
if embedding_model == EmbeddingModel.HUGGINGFACE_ALL_MPNET_BASE_V2:
266+
if (embedding_model == EmbeddingModel.HUGGINGFACE_ALL_MPNET_BASE_V2
267+
or embedding_model == EmbeddingModel.HUGGINGFACE_ALL_MINILM_L6_V2
268+
or embedding_model == EmbeddingModel.HUGGINGFACE_ALL_MINILM_L12_V2):
269269
query_pre_embedding_func = query_with_role
270270
insert_pre_embedding_func = query_with_role
271271
post_process_messages_func = first
@@ -287,8 +287,8 @@ async def init(
287287

288288
# add more configurations for other embedding models as needed
289289
else:
290-
modelcache_log.error(f"Please add configuration for {embedding_model} in modelcache/__init__.py.")
291-
raise CacheError(f"Please add configuration for {embedding_model} in modelcache/__init__.py.")
290+
modelcache_log.error(f"Please add configuration for {embedding_model} in modelcache/cache.py.")
291+
raise CacheError(f"Please add configuration for {embedding_model} in modelcache/cache.py.")
292292

293293
# ====================== Data manager ==============================#
294294

@@ -300,7 +300,7 @@ async def init(
300300
config=vector_config,
301301
metric_type=similarity_metric_type,
302302
),
303-
eviction='ARC',
303+
memory_cache_policy='ARC',
304304
max_size=10000,
305305
normalize=normalize,
306306
)

modelcache/embedding/base.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
# -*- coding: utf-8 -*-
22
from abc import abstractmethod, ABCMeta
33

4+
from modelcache.utils.error import CacheError
45
from modelcache.utils.lazy_import import LazyImport
56
from enum import Enum
7+
8+
from modelcache.utils.log import modelcache_log
9+
610
huggingface = LazyImport("huggingface", globals(), "modelcache.embedding.huggingface")
711
data2vec = LazyImport("data2vec", globals(), "modelcache.embedding.data2vec")
812
llmEmb = LazyImport("llmEmb", globals(), "modelcache.embedding.llmEmb")
@@ -21,7 +25,7 @@ class EmbeddingModel(Enum):
2125
HUGGINGFACE_ALL_MPNET_BASE_V2 = {"dimension":768, "model_path":"sentence-transformers/all-mpnet-base-v2"}
2226
HUGGINGFACE_ALL_MINILM_L6_V2 = {"dimension":384, "model_path":"sentence-transformers/all-MiniLM-L6-v2"}
2327
HUGGINGFACE_ALL_MINILM_L12_V2 = {"dimension":384, "model_path":"sentence-transformers/all-MiniLM-L12-v2"}
24-
DATA2VEC_AUDIO = {"dimension":None, "model_path":"model/text2vec-base-chinese/"}
28+
DATA2VEC_AUDIO = {"dimension":768, "model_path":"model/text2vec-base-chinese/"}
2529
LLM_EMB2VEC_AUDIO = {"dimension":None, "model_path":None}
2630
FASTTEXT = {"dimension":None, "model_path":None}
2731
PADDLE_NLP = {"dimension":None, "model_path":None}
@@ -68,6 +72,14 @@ def get(model:EmbeddingModel, **kwargs):
6872
model_path = kwargs.pop("model_path","sentence-transformers/all-mpnet-base-v2")
6973
return huggingface.Huggingface(model_path)
7074

75+
elif model == EmbeddingModel.HUGGINGFACE_ALL_MINILM_L6_V2:
76+
model_path = kwargs.pop("model_path","sentence-transformers/all-MiniLM-L6-v2")
77+
return huggingface.Huggingface(model_path)
78+
79+
elif model == EmbeddingModel.HUGGINGFACE_ALL_MINILM_L12_V2:
80+
model_path = kwargs.pop("model_path","sentence-transformers/all-MiniLM-L12-v2")
81+
return huggingface.Huggingface(model_path)
82+
7183
elif model == EmbeddingModel.DATA2VEC_AUDIO:
7284
model_path = kwargs.pop("model_path","model/text2vec-base-chinese/")
7385
return data2vec.Data2VecAudio(model_path)
@@ -99,5 +111,7 @@ def get(model:EmbeddingModel, **kwargs):
99111
return bge_m3.BgeM3Embedding(model_path)
100112

101113
else:
102-
raise ValueError(f"Unsupported embedding model: {model}")
114+
modelcache_log.error(f"Please add configuration for {model} in modelcache/embedding/base.py.")
115+
raise CacheError(f"Please add configuration for {model} in modelcache/embedding/base.py.")
116+
103117

modelcache/embedding/embedding_dispatcher.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import threading
33
import uuid
44
import asyncio
5+
import psutil
56
from asyncio import Future, AbstractEventLoop
67

78
from modelcache.embedding import EmbeddingModel
@@ -11,13 +12,18 @@
1112
def worker_func(embedding_model: EmbeddingModel, model_path, task_queue, result_queue, worker_id):
1213
base_embedding = BaseEmbedding.get(embedding_model, model_path=model_path)
1314
print(f"Embedding worker {worker_id} started.")
14-
while True:
15-
job_id, data = task_queue.get()
16-
try:
17-
result = base_embedding.to_embeddings(data)
18-
except Exception as e:
19-
result = e
20-
result_queue.put((job_id, result))
15+
try:
16+
while True:
17+
job_id, data = task_queue.get()
18+
try:
19+
result = base_embedding.to_embeddings(data)
20+
except Exception as e:
21+
result = e
22+
result_queue.put((job_id, result))
23+
except KeyboardInterrupt:
24+
print(f"Embedding worker {worker_id} stopped.")
25+
except Exception as e:
26+
print(f"Embedding worker {worker_id} encountered an error: {e}")
2127

2228

2329
class EmbeddingDispatcher:
@@ -46,6 +52,7 @@ def __init__(
4652
)
4753
p.daemon = True
4854
p.start()
55+
psutil.Process(p.pid).nice(psutil.HIGH_PRIORITY_CLASS)
4956
self.workers.append(p)
5057

5158
def _start_result_collector_thread(self):

0 commit comments

Comments (0)