feat: add embedding model bge-vl-mllm-s1 #164

Open · wants to merge 9 commits into main

1 change: 1 addition & 0 deletions .gitignore
@@ -134,3 +134,4 @@ emd_models/
 **artifacts
 src/pipeline/emd
 *.log
+.venv-vl/*
3 changes: 3 additions & 0 deletions docs/en/supported_models.md
@@ -78,3 +78,6 @@
 | bge-reranker-v2-m3 | bge | rerank | g4dn.2xlarge,g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
 | bge-reranker-large | bge | rerank | g4dn.2xlarge,g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
 | jina-reranker-v2-base-multilingual | jina | rerank | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
+| bge-vl-base | bge | embedding | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
+| bge-vl-large | bge | embedding | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | ecs | ✅ |
+| bge-vl-mllm-s1 | bge | embedding | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | ecs | ✅ |
32 changes: 31 additions & 1 deletion src/emd/models/embeddings/bge_vl.py
@@ -1,5 +1,5 @@
 from .. import Model
-from ..engines import huggingface_embedding_engine449
+from ..engines import huggingface_embedding_engine449, huggingface_embedding_engine_447
 from ..services import sagemaker_service, local_service, ecs_service
 from ..frameworks import fastapi_framework
 from ..instances import (
@@ -50,6 +50,7 @@
         model_id="bge-vl-large",
         supported_engines=[huggingface_embedding_engine449],
         supported_instances=[
+            g5dxlarge_instance,
             g5d2xlarge_instance,
             g5d4xlarge_instance,
             g5d8xlarge_instance,
@@ -72,3 +73,32 @@
         description="BGE-VL-large is a larger multimodal embedding model that supports text, image, and text-image pair inputs for high-performance multimodal representation learning and cross-modal retrieval tasks."
     )
 )
+
+Model.register(
+    dict(
+        model_id="bge-vl-mllm-s1",
+        supported_engines=[huggingface_embedding_engine_447],
+        supported_instances=[
+            g5dxlarge_instance,
+            g5d2xlarge_instance,
+            g5d4xlarge_instance,
+            g5d8xlarge_instance,
+            g5d16xlarge_instance,
+            local_instance,
+        ],
+        supported_services=[
+            ecs_service
+        ],
+        supported_frameworks=[
+            fastapi_framework
+        ],
+        allow_china_region=True,
+        huggingface_model_id="BAAI/BGE-VL-MLLM-S1",
+        modelscope_model_id="BAAI/BGE-VL-MLLM-S1",
+        require_huggingface_token=False,
+        application_scenario="Multimodal RAG, composed image retrieval, visual search",
+        model_type=ModelType.EMBEDDING,
+        model_series=BGE_SERIES,
+        description="BGE-VL-MLLM-S1 is an MLLM-based multimodal embedding model that supports text, image, and text-image pair inputs for high-performance multimodal representation learning and cross-modal retrieval tasks."
+    )
+)
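
For reviewers who want to try the new model end to end, here is a hypothetical client-side sketch. It assumes the fastapi framework exposes an OpenAI-compatible `/v1/embeddings` route and that multimodal items are sent as `{"text": ..., "image": <base64>}` objects; the real schema is whatever `_parse_multimodal_inputs` in the backend accepts, so treat the endpoint path and field names as assumptions.

```python
# Hypothetical client sketch (not part of this PR). The endpoint path and the
# {"text": ..., "image": ...} item shape are assumptions; the backend's
# _parse_multimodal_inputs defines the real schema.
import base64

import requests

with open("product.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "model": "bge-vl-mllm-s1",
    "input": [
        "a red sneaker, side view",                           # text-only input
        {"text": "same shoe, top view", "image": image_b64},  # text + image pair
    ],
}
resp = requests.post("http://<endpoint>/v1/embeddings", json=payload, timeout=60)
resp.raise_for_status()
print([len(d["embedding"]) for d in resp.json()["data"]])
```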
11 changes: 11 additions & 0 deletions src/emd/models/engines.py
@@ -507,6 +507,17 @@ class KtransformersEngine(OpenAICompitableEngine):
     "pretrained_tokenizer_init_kwargs":{"trust_remote_code":True}
 })

+huggingface_embedding_engine_447 = HuggingFaceLLMEngine(**{
+    "engine_type":EngineType.HUGGINGFACE,
+    "engine_cls":"huggingface.embedding.transformers_embedding_backend.TransformerEmbeddingBackend",
+    "python_name":"python3",
+    "base_image_host":"public.ecr.aws",
+    "use_public_ecr":True,
+    "docker_login_region":"us-east-1",
+    "engine_dockerfile_config": {"VERSION":"4.47.0"},
+    "pretrained_model_init_kwargs":{"trust_remote_code":True,"torch_dtype":"float16"},
+})
+
 huggingface_embedding_engine449 = HuggingFaceLLMEngine(**{
     "engine_type":EngineType.HUGGINGFACE,
     "engine_cls":"huggingface.embedding.transformers_embedding_backend.TransformerEmbeddingBackend",
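
The new engine pins transformers 4.47.0 (the existing embedding engine uses 4.49), presumably because the MLLM's `trust_remote_code` modeling code is sensitive to the transformers version. Below is a quick local sanity check of that pin, mirroring the backend's load path; a sketch, not part of this PR, assuming `transformers==4.47.0` and a CUDA machine.

```python
# Sanity-check sketch for the 4.47.0 pin (not part of this PR).
# Mirrors the backend's load path: trust_remote_code + float16, eval() for the MLLM.
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "BAAI/BGE-VL-MLLM-S1",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="cuda",
)
model.eval()
print(type(model).__name__)  # the model's remote-code class should load cleanly
```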
transformers_embedding_backend.py
@@ -36,6 +36,8 @@ def __init__(self,*args,**kwargs):
         self.model = None
         self.pretrained_model_init_kwargs = self.execute_model.executable_config.current_engine.pretrained_model_init_kwargs or {}
         self.is_bge_vl = "bge-vl" in self.model_id.lower()
+        self.is_bge_vl_mllm = "bge-vl-mllm" in self.model_id.lower()
+        self.model_abs_path = None


     def start(self):
@@ -48,7 +50,7 @@ def start(self):
                 s3_key = model_dir,
                 model_files_s3_path=self.model_files_s3_path
             )
-        model_abs_path = os.path.abspath(model_dir)
+        self.model_abs_path = os.path.abspath(model_dir)

         # TODO add model init args from model's definition
         torch_dtype = self.pretrained_model_init_kwargs.get("torch_dtype")
@@ -61,15 +63,18 @@ def start(self):
         }[torch_dtype]

         self.model = AutoModel.from_pretrained(
-            model_abs_path,
+            self.model_abs_path,
             device_map="cuda",
             **self.pretrained_model_init_kwargs
         )

+        if self.is_bge_vl_mllm:
+            self.model.eval()
+
         # BGE-VL specific initialization
-        if self.is_bge_vl:
+        if self.is_bge_vl and not self.is_bge_vl_mllm:
             try:
-                self.model.set_processor(model_abs_path)
+                self.model.set_processor(self.model_abs_path)
                 logger.info(f"BGE-VL processor set successfully for model: {self.model_id}")
             except Exception as e:
                 logger.warning(f"Failed to set BGE-VL processor: {e}")
@@ -165,6 +170,74 @@ def _parse_multimodal_inputs(self, inputs):

         return text_inputs, image_inputs, multimodal_inputs

+    def _generate_bge_vl_mllm_embeddings(self, inputs):
+        """Generate embeddings using the BGE-VL-MLLM model."""
+        text_inputs, image_inputs, multimodal_inputs = self._parse_multimodal_inputs(inputs)
+        all_embeddings = []
+
+        # Process text-only inputs
+        if text_inputs:
+            try:
+                with torch.no_grad():
+                    self.model.set_processor(self.model_abs_path)
+                    candidate_inputs = self.model.data_process(
+                        text=text_inputs,
+                        q_or_c="c"
+                    )
+                    text_emb = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
+                    text_emb = torch.nn.functional.normalize(text_emb, dim=-1)
+                    if hasattr(text_emb, 'tolist'):
+                        all_embeddings.extend(text_emb.tolist())
+                    else:
+                        all_embeddings.extend(text_emb)
+            except Exception as e:
+                logger.error(f"Failed to encode text inputs with MLLM: {e}")
+                raise ValueError(f"BGE-VL-MLLM text encoding failed: {e}")
+
+        # Process image-only inputs
+        if image_inputs:
+            try:
+                with torch.no_grad():
+                    self.model.set_processor(self.model_abs_path)
+                    candidate_inputs = self.model.data_process(
+                        images=image_inputs,
+                        q_or_c="c"
+                    )
+                    image_embeddings = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
+                    image_embeddings = torch.nn.functional.normalize(image_embeddings, dim=-1)
+                    if hasattr(image_embeddings, 'tolist'):
+                        all_embeddings.extend(image_embeddings.tolist())
+                    else:
+                        all_embeddings.extend(image_embeddings)
+            except Exception as e:
+                logger.error(f"Failed to encode image inputs with MLLM: {e}")
+                raise ValueError(f"BGE-VL-MLLM image encoding failed: {e}")
+
+        # Process multimodal inputs (text + image pairs)
+        if multimodal_inputs:
+            with torch.no_grad():
+                self.model.set_processor(self.model_abs_path)
+                for text, bytesio_image in multimodal_inputs:
+                    try:
+                        # Encode each text-image pair as a single candidate
+                        candidate_inputs = self.model.data_process(
+                            text=[text],
+                            images=[bytesio_image],
+                            q_or_c="c"
+                        )
+                        multimodal_emb = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
+                        multimodal_emb = torch.nn.functional.normalize(multimodal_emb, dim=-1)
+                        if hasattr(multimodal_emb, 'tolist'):
+                            all_embeddings.extend(multimodal_emb.tolist())
+                        else:
+                            all_embeddings.extend(multimodal_emb)
+                    except Exception as e:
+                        logger.error(f"Failed to encode multimodal input with MLLM: {e}")
+                        raise ValueError(f"BGE-VL-MLLM multimodal encoding failed: {e}")
+
+        return all_embeddings
+
     def _generate_bge_vl_embeddings(self, inputs):
         """Generate embeddings using BGE-VL model"""
         text_inputs, image_inputs, multimodal_inputs = self._parse_multimodal_inputs(inputs)
@@ -220,7 +293,10 @@ def invoke(self, request:dict):
         logger.info(f'request: {request}')
         t0 = time.time()

-        if self.is_bge_vl:
+        if self.is_bge_vl_mllm:
+            # Use BGE-VL-MLLM multimodal processing
+            embeddings_list = self._generate_bge_vl_mllm_embeddings(inputs)
+        elif self.is_bge_vl:
             # Use BGE-VL multimodal processing
             embeddings_list = self._generate_bge_vl_embeddings(inputs)
         else:
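
Finally, a standalone retrieval sketch of the encode-and-score flow the new backend implements: `set_processor`, `data_process` with `q_or_c`, last-token pooling over the hidden states, then L2 normalization. The query-side `q_or_c="q"` call and the local image paths follow the upstream BGE-VL-MLLM examples and are assumptions here, not part of this PR.

```python
# Standalone retrieval sketch (not part of this PR); mirrors the calls in
# _generate_bge_vl_mllm_embeddings above. The q_or_c="q" usage and file
# paths follow upstream BGE-VL-MLLM examples and are assumptions.
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "BAAI/BGE-VL-MLLM-S1", trust_remote_code=True, torch_dtype=torch.float16
).to("cuda")
model.eval()
model.set_processor("BAAI/BGE-VL-MLLM-S1")

with torch.no_grad():
    query_inputs = model.data_process(text=["a red sneaker, side view"], q_or_c="q")
    cand_inputs = model.data_process(images=["shoe1.jpg", "shoe2.jpg"], q_or_c="c")

    # Last-token hidden state as the embedding, then L2-normalize (as the backend does)
    q_emb = torch.nn.functional.normalize(
        model(**query_inputs, output_hidden_states=True)[:, -1, :], dim=-1
    )
    c_emb = torch.nn.functional.normalize(
        model(**cand_inputs, output_hidden_states=True)[:, -1, :], dim=-1
    )

scores = q_emb @ c_emb.T  # cosine similarity; higher = better match
print(scores)
```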