From d657fa38226245dae22f67cdb32ba0b9ce10f6cf Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 28 Jul 2025 22:10:27 +0200 Subject: [PATCH] mtmd : support home-cooked Mistral Small Omni --- tools/mtmd/clip-impl.h | 2 ++ tools/mtmd/clip.cpp | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index c8822dcf5c34c..bf0741d4b98d5 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -30,6 +30,7 @@ #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" // vision-specific +#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities #define KEY_IMAGE_SIZE "clip.vision.image_size" #define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_IMAGE_MEAN "clip.vision.image_mean" @@ -46,6 +47,7 @@ #define KEY_MINICPMV_VERSION "clip.minicpmv_version" // audio-specific +#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities #define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins" #define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor" diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index a4b62f9afe3bf..b232eed75aa69 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2098,15 +2098,27 @@ struct clip_model_loader { // projector type std::string proj_type; { + // default key get_string(KEY_PROJ_TYPE, proj_type, false); - if (!proj_type.empty()) { - model.proj_type = clip_projector_type_from_string(proj_type); + + // for models with mixed modalities + if (proj_type.empty()) { + if (modality == CLIP_MODALITY_VISION) { + get_string(KEY_VISION_PROJ_TYPE, proj_type, false); + } else if (modality == CLIP_MODALITY_AUDIO) { + get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false); + } else { + GGML_ABORT("unknown modality"); + } } + + model.proj_type = clip_projector_type_from_string(proj_type); + if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) { throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str())); } - // correct arch for multimodal models + // correct arch for multimodal models (legacy method) if (model.proj_type == PROJECTOR_TYPE_QWEN25O) { model.proj_type = modality == CLIP_MODALITY_VISION ? PROJECTOR_TYPE_QWEN25VL