diff --git a/.gitignore b/.gitignore
index 83cb7854..99126390 100644
--- a/.gitignore
+++ b/.gitignore
@@ -134,3 +134,4 @@ emd_models/
 **artifacts
 src/pipeline/emd
 *.log
+.venv-vl/*
diff --git a/docs/en/supported_models.md b/docs/en/supported_models.md
index ed78a163..79b56171 100644
--- a/docs/en/supported_models.md
+++ b/docs/en/supported_models.md
@@ -78,3 +78,6 @@
 | bge-reranker-v2-m3 | bge | rerank | g4dn.2xlarge,g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
 | bge-reranker-large | bge | rerank | g4dn.2xlarge,g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
 | jina-reranker-v2-base-multilingual | jina | rerank | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
+| bge-vl-base | bge | embedding | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
+| bge-vl-large | bge | embedding | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | ecs | ✅ |
+| bge-vl-mllm-s1 | bge | embedding | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | ecs | ✅ |
diff --git a/src/emd/models/embeddings/bge_vl.py b/src/emd/models/embeddings/bge_vl.py
index c639e4ed..72738f54 100644
--- a/src/emd/models/embeddings/bge_vl.py
+++ b/src/emd/models/embeddings/bge_vl.py
@@ -1,5 +1,5 @@
 from .. import Model
-from ..engines import huggingface_embedding_engine449
+from ..engines import huggingface_embedding_engine449, huggingface_embedding_engine_447
 from ..services import sagemaker_service, local_service, ecs_service
 from ..frameworks import fastapi_framework
 from ..instances import (
@@ -50,6 +50,7 @@
         model_id="bge-vl-large",
         supported_engines=[huggingface_embedding_engine449],
         supported_instances=[
+            g5dxlarge_instance,
             g5d2xlarge_instance,
             g5d4xlarge_instance,
             g5d8xlarge_instance,
@@ -72,3 +73,32 @@
         description="BGE-VL-large is a larger multimodal embedding model that supports text, image, and text-image pair inputs for high-performance multimodal representation learning and cross-modal retrieval tasks."
     )
 )
+
+Model.register(
+    dict(
+        model_id="bge-vl-mllm-s1",
+        supported_engines=[huggingface_embedding_engine_447],
+        supported_instances=[
+            g5dxlarge_instance,
+            g5d2xlarge_instance,
+            g5d4xlarge_instance,
+            g5d8xlarge_instance,
+            g5d16xlarge_instance,
+            local_instance,
+        ],
+        supported_services=[
+            ecs_service
+        ],
+        supported_frameworks=[
+            fastapi_framework
+        ],
+        allow_china_region=True,
+        huggingface_model_id="BAAI/BGE-VL-MLLM-S1",
+        modelscope_model_id="BAAI/BGE-VL-MLLM-S1",
+        require_huggingface_token=False,
+        application_scenario="Multimodal RAG, composed image retrieval, visual search",
+        model_type=ModelType.EMBEDDING,
+        model_series=BGE_SERIES,
+        description="BGE-VL-MLLM-S1 is an MLLM-based multimodal embedding model that supports text, image, and text-image pair inputs for high-performance multimodal representation learning and cross-modal retrieval tasks."
+    )
+)
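
The table rows and registrations above determine where each new BGE-VL variant can run: bge-vl-base keeps SageMaker real-time and ECS, while bge-vl-large and bge-vl-mllm-s1 are ECS-only, and bge-vl-mllm-s1 is bound to the new transformers-4.47 engine defined in the next file. As a rough illustration of what a client call against a deployed endpoint could look like, here is a hedged sketch; the route, host, and payload field names are assumptions (they are not taken from this repository), and only the three input shapes (text, image, text plus image pair) mirror the backend parser shown further below.

    # Hypothetical client sketch: endpoint URL and payload schema are assumed, not documented here.
    import base64
    import requests

    def image_b64(path: str) -> str:
        # Base64-encode a local image so it can be sent inside a JSON payload.
        with open(path, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    payload = {
        "model": "bge-vl-mllm-s1",
        "inputs": [
            "a red bicycle leaning against a brick wall",                            # text-only
            {"image": image_b64("bike.jpg")},                                        # image-only
            {"text": "the same bicycle at night", "image": image_b64("bike.jpg")},   # text + image pair
        ],
    }

    # Assumed route; the actual path exposed by the FastAPI framework on ECS may differ.
    resp = requests.post("http://<endpoint-host>/v1/embeddings", json=payload, timeout=60)
    print(len(resp.json()["embeddings"]))
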
diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py
index f2812746..782d4122 100644
--- a/src/emd/models/engines.py
+++ b/src/emd/models/engines.py
@@ -507,6 +507,17 @@ class KtransformersEngine(OpenAICompitableEngine):
     "pretrained_tokenizer_init_kwargs":{"trust_remote_code":True}
 })
 
+huggingface_embedding_engine_447 = HuggingFaceLLMEngine(**{
+    "engine_type":EngineType.HUGGINGFACE,
+    "engine_cls":"huggingface.embedding.transformers_embedding_backend.TransformerEmbeddingBackend",
+    "python_name":"python3",
+    "base_image_host":"public.ecr.aws",
+    "use_public_ecr":True,
+    "docker_login_region":"us-east-1",
+    "engine_dockerfile_config": {"VERSION":"4.47.0"},
+    "pretrained_model_init_kwargs":{"trust_remote_code":True,"torch_dtype":"float16"},
+})
+
 huggingface_embedding_engine449 = HuggingFaceLLMEngine(**{
     "engine_type":EngineType.HUGGINGFACE,
     "engine_cls":"huggingface.embedding.transformers_embedding_backend.TransformerEmbeddingBackend",
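
huggingface_embedding_engine_447 pins the serving image to transformers 4.47.0 for bge-vl-mllm-s1 and forwards pretrained_model_init_kwargs with torch_dtype given as the string "float16". The backend diff below shows that start() reads this string and resolves it through a small lookup table before calling AutoModel.from_pretrained; the exact table is outside the visible hunk, so the sketch below assumes a typical mapping.

    # Minimal sketch of the engine-config -> model-load handoff, assuming a typical
    # string-to-dtype lookup table (the real table is not visible in this diff).
    import torch
    from transformers import AutoModel

    pretrained_model_init_kwargs = {"trust_remote_code": True, "torch_dtype": "float16"}

    torch_dtype = pretrained_model_init_kwargs.get("torch_dtype")
    if torch_dtype is not None:
        pretrained_model_init_kwargs["torch_dtype"] = {
            "float16": torch.float16,
            "bfloat16": torch.bfloat16,
            "float32": torch.float32,
        }[torch_dtype]

    model = AutoModel.from_pretrained(
        "BAAI/BGE-VL-MLLM-S1",   # the backend passes a local snapshot path instead
        device_map="cuda",
        **pretrained_model_init_kwargs,
    )
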
diff --git a/src/pipeline/backend/huggingface/embedding/transformers_embedding_backend.py b/src/pipeline/backend/huggingface/embedding/transformers_embedding_backend.py
index 559c090f..6177fb6e 100644
--- a/src/pipeline/backend/huggingface/embedding/transformers_embedding_backend.py
+++ b/src/pipeline/backend/huggingface/embedding/transformers_embedding_backend.py
@@ -36,6 +36,8 @@ def __init__(self,*args,**kwargs):
         self.model = None
         self.pretrained_model_init_kwargs = self.execute_model.executable_config.current_engine.pretrained_model_init_kwargs or {}
         self.is_bge_vl = "bge-vl" in self.model_id.lower()
+        self.is_bge_vl_mllm = "bge-vl-mllm" in self.model_id.lower()
+        self.model_abs_path = None
 
 
     def start(self):
@@ -48,7 +50,7 @@ def start(self):
             s3_key = model_dir,
             model_files_s3_path=self.model_files_s3_path
         )
-        model_abs_path = os.path.abspath(model_dir)
+        self.model_abs_path = os.path.abspath(model_dir)
 
         # TODO add model init args from model's definition
         torch_dtype = self.pretrained_model_init_kwargs.get("torch_dtype")
@@ -61,15 +63,18 @@
             }[torch_dtype]
 
         self.model = AutoModel.from_pretrained(
-            model_abs_path,
+            self.model_abs_path,
             device_map="cuda",
             **self.pretrained_model_init_kwargs
         )
 
+        if self.is_bge_vl_mllm:
+            self.model.eval()
+
         # BGE-VL specific initialization
-        if self.is_bge_vl:
+        if self.is_bge_vl and not self.is_bge_vl_mllm:
             try:
-                self.model.set_processor(model_abs_path)
+                self.model.set_processor(self.model_abs_path)
                 logger.info(f"BGE-VL processor set successfully for model: {self.model_id}")
             except Exception as e:
                 logger.warning(f"Failed to set BGE-VL processor: {e}")
@@ -165,6 +170,74 @@
         return text_inputs, image_inputs, multimodal_inputs
 
+    def _generate_bge_vl_mllm_embeddings(self, inputs):
+        """Generate embeddings using BGE-VL-MLLM model"""
+        text_inputs, image_inputs, multimodal_inputs = self._parse_multimodal_inputs(inputs)
+        all_embeddings = []
+
+        # Process text-only inputs
+        if text_inputs:
+            try:
+                with torch.no_grad():
+                    self.model.set_processor(self.model_abs_path)
+                    candidate_inputs = self.model.data_process(
+                        text=text_inputs,
+                        q_or_c="c"
+                    )
+                    text_emb = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
+                    text_emb = torch.nn.functional.normalize(text_emb, dim=-1)
+                    if hasattr(text_emb, 'tolist'):
+                        all_embeddings.extend(text_emb.tolist())
+                    else:
+                        all_embeddings.extend(text_emb)
+            except Exception as e:
+                logger.error(f"Failed to encode text inputs with MLLM: {e}")
+                raise ValueError(f"BGE-VL-MLLM text encoding failed: {e}")
+
+        # Process image-only inputs
+        if image_inputs:
+            try:
+                with torch.no_grad():
+                    self.model.set_processor(self.model_abs_path)
+                    candidate_inputs = self.model.data_process(
+                        images=image_inputs,
+                        q_or_c="c"
+                    )
+                    image_embeddings = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
+                    image_embeddings = torch.nn.functional.normalize(image_embeddings, dim=-1)
+                    if hasattr(image_embeddings, 'tolist'):
+                        all_embeddings.extend(image_embeddings.tolist())
+                    else:
+                        all_embeddings.extend(image_embeddings)
+            except Exception as e:
+                logger.error(f"Failed to encode image inputs with MLLM: {e}")
+                raise ValueError(f"BGE-VL-MLLM image encoding failed: {e}")
+
+        # Process multimodal inputs (text + image)
+        if multimodal_inputs:
+            with torch.no_grad():
+                self.model.set_processor(self.model_abs_path)
+                for text, bytesio_image in multimodal_inputs:
+                    try:
+                        # Encode each text/image pair together through data_process
+                        candidate_inputs = self.model.data_process(
+                            text=[text],
+                            images=[bytesio_image],
+                            q_or_c="c"
+                        )
+                        with torch.no_grad():
+                            multimodal_emb = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
+                            multimodal_emb = torch.nn.functional.normalize(multimodal_emb, dim=-1)
+                        if hasattr(multimodal_emb, 'tolist'):
+                            all_embeddings.extend(multimodal_emb.tolist())
+                        else:
+                            all_embeddings.extend(multimodal_emb)
+                    except Exception as e:
+                        logger.error(f"Failed to encode multimodal input with MLLM: {e}")
+                        raise ValueError(f"BGE-VL-MLLM multimodal encoding failed: {e}")
+
+        return all_embeddings
+
 
     def _generate_bge_vl_embeddings(self, inputs):
         """Generate embeddings using BGE-VL model"""
         text_inputs, image_inputs, multimodal_inputs = self._parse_multimodal_inputs(inputs)
@@ -220,7 +293,10 @@ def invoke(self, request:dict):
         logger.info(f'request: {request}')
 
         t0 = time.time()
-        if self.is_bge_vl:
+        if self.is_bge_vl_mllm:
+            # Use BGE-VL-MLLM multimodal processing
+            embeddings_list = self._generate_bge_vl_mllm_embeddings(inputs)
+        elif self.is_bge_vl:
             # Use BGE-VL multimodal processing
             embeddings_list = self._generate_bge_vl_embeddings(inputs)
         else:
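
The new _generate_bge_vl_mllm_embeddings method follows the candidate-encoding recipe from the BGE-VL-MLLM model card: set the processor, build inputs with data_process(..., q_or_c="c"), take the last-token hidden state, and L2-normalize. For context, a standalone sketch of that flow, including the query side (q_or_c="q") that the backend does not exercise, is shown below; file paths and the instruction string are placeholders, and the data_process signature should be confirmed against the BAAI/BGE-VL-MLLM-S1 model card.

    # Standalone sketch of the BGE-VL-MLLM encode-and-score flow mirrored by the backend.
    # Assumes the remote-code API of BAAI/BGE-VL-MLLM-S1 (set_processor, data_process);
    # paths and the task instruction are placeholders.
    import torch
    from transformers import AutoModel

    MODEL_PATH = "BAAI/BGE-VL-MLLM-S1"

    model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True)
    model.eval()
    model.cuda()

    with torch.no_grad():
        model.set_processor(MODEL_PATH)

        # Query side: a composed text+image query (the backend only encodes candidates).
        query_inputs = model.data_process(
            text="Make the background a starry night sky.",
            images="./query.png",
            q_or_c="q",
            task_instruction="Retrieve the target image that best meets the combined criteria.",
        )
        # Candidate side: the q_or_c="c" path used by _generate_bge_vl_mllm_embeddings.
        candidate_inputs = model.data_process(
            images=["./candidate_1.png", "./candidate_2.png"],
            q_or_c="c",
        )

        # Last-token hidden state as the embedding, then L2-normalize (same as the backend).
        query_embs = model(**query_inputs, output_hidden_states=True)[:, -1, :]
        candi_embs = model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
        query_embs = torch.nn.functional.normalize(query_embs, dim=-1)
        candi_embs = torch.nn.functional.normalize(candi_embs, dim=-1)

        scores = query_embs @ candi_embs.T   # cosine similarities, shape (1, 2)
        print(scores)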