diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java
index 690f1bd5258..6892774f52c 100644
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java
@@ -387,4 +387,126 @@ public interface WhisperCppJnaLibrary extends Library {
      * @return The result of the benchmark as a string.
      */
     String whisper_bench_ggml_mul_mat_str(int nThreads);
+
+    // ============================================================================
+    // Voice Activity Detection (VAD) Functions
+    // ============================================================================
+
+    /**
+     * Get default VAD parameters.
+     *
+     * @return Default VAD parameters. NOTE(review): confirm the native function returns a pointer and not the struct by value.
+     */
+    Pointer whisper_vad_default_params();
+
+    /**
+     * Get default VAD context parameters.
+     *
+     * @return Default VAD context parameters. NOTE(review): confirm the native function returns a pointer and not the struct by value.
+     */
+    Pointer whisper_vad_default_context_params();
+
+    /**
+     * Initialize VAD context from file with parameters.
+     *
+     * @param path_model Path to the VAD model file
+     * @param params VAD context parameters. NOTE(review): confirm the native side takes a pointer and not the struct by value.
+     * @return VAD context pointer on success, null on failure; release with {@link #whisper_vad_free(Pointer)}
+     */
+    Pointer whisper_vad_init_from_file_with_params(String path_model, Pointer params);
+
+    /**
+     * Initialize VAD context with model loader and parameters.
+     *
+     * @param loader Model loader
+     * @param params VAD context parameters. NOTE(review): confirm the native side takes a pointer and not the struct by value.
+     * @return VAD context pointer on success, null on failure; release with {@link #whisper_vad_free(Pointer)}
+     */
+    Pointer whisper_vad_init_with_params(WhisperModelLoader loader, Pointer params);
+
+    /**
+     * Detect speech in audio samples.
+     *
+     * @param vctx VAD context
+     * @param samples Audio samples (float array)
+     * @param n_samples Number of samples
+     * @return true if speech detected, false otherwise. NOTE(review): JNA maps Java boolean to a native int by default; verify against the native return type.
+     */
+    boolean whisper_vad_detect_speech(Pointer vctx, float[] samples, int n_samples);
+
+    /**
+     * Get number of probability values in VAD context.
+     *
+     * @param vctx VAD context
+     * @return Number of probability values
+     */
+    int whisper_vad_n_probs(Pointer vctx);
+
+    /**
+     * Get probability array from VAD context.
+     *
+     * @param vctx VAD context
+     * @return Pointer to probability array; its length is reported by {@link #whisper_vad_n_probs(Pointer)}
+     */
+    Pointer whisper_vad_probs(Pointer vctx);
+
+    /**
+     * Get VAD segments from pre-computed probabilities.
+     *
+     * @param vctx VAD context
+     * @param params VAD parameters. NOTE(review): confirm the native side takes a pointer and not the struct by value.
+     * @return Pointer to VAD segments; release with {@link #whisper_vad_free_segments(Pointer)}
+     */
+    Pointer whisper_vad_segments_from_probs(Pointer vctx, Pointer params);
+
+    /**
+     * Get VAD segments directly from audio samples.
+     *
+     * @param vctx VAD context
+     * @param params VAD parameters. NOTE(review): confirm the native side takes a pointer and not the struct by value.
+     * @param samples Audio samples (float array)
+     * @param n_samples Number of samples
+     * @return Pointer to VAD segments; release with {@link #whisper_vad_free_segments(Pointer)}
+     */
+    Pointer whisper_vad_segments_from_samples(Pointer vctx, Pointer params, float[] samples, int n_samples);
+
+    /**
+     * Get number of segments in VAD segments result.
+     *
+     * @param segments VAD segments pointer
+     * @return Number of segments
+     */
+    int whisper_vad_segments_n_segments(Pointer segments);
+
+    /**
+     * Get start time of a specific segment.
+     *
+     * @param segments VAD segments pointer
+     * @param i_segment Segment index
+     * @return Start time in seconds
+     */
+    float whisper_vad_segments_get_segment_t0(Pointer segments, int i_segment);
+
+    /**
+     * Get end time of a specific segment.
+     *
+     * @param segments VAD segments pointer
+     * @param i_segment Segment index
+     * @return End time in seconds
+     */
+    float whisper_vad_segments_get_segment_t1(Pointer segments, int i_segment);
+
+    /**
+     * Free VAD segments memory.
+     *
+     * @param segments VAD segments pointer to free
+     */
+    void whisper_vad_free_segments(Pointer segments);
+
+    /**
+     * Free VAD context memory.
+     *
+     * @param ctx VAD context pointer to free
+     */
+    void whisper_vad_free(Pointer ctx);
 }
diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
index 76ce80fb4cc..86a1dd6456f 100644
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
@@ -331,6 +331,48 @@ public void setLogitsFilterCallback(WhisperLogitsFilterCallback callback) {
     public long i_start_rule;
     public float grammar_penalty;
 
+    // Voice Activity Detection (VAD) parameters
+
+    /** Enable VAD (default = false) */
+    public CBool vad;
+
+    /** Enable or disable VAD. */
+    public void enableVAD(boolean enable) {
+        vad = enable ? CBool.TRUE : CBool.FALSE;
+    }
+
+    /** Path to VAD model file */
+    public String vad_model_path;
+
+    /** Set VAD model path */
+    public void setVADModelPath(String path) {
+        this.vad_model_path = path;
+    }
+
+    /** VAD parameters */
+    public WhisperVADParams.ByValue vad_params;
+
+    /**
+     * Copy the given VAD parameters, field by field, into {@code vad_params}.
+     *
+     * @param params VAD parameters to copy; must not be null
+     * @throws NullPointerException if {@code params} is null
+     */
+    public void setVADParams(WhisperVADParams params) {
+        if (params == null) {
+            throw new NullPointerException("params must not be null");
+        }
+        // Fill a local copy first so vad_params is only replaced once fully populated.
+        WhisperVADParams.ByValue copy = new WhisperVADParams.ByValue();
+        copy.threshold = params.threshold;
+        copy.min_speech_duration_ms = params.min_speech_duration_ms;
+        copy.min_silence_duration_ms = params.min_silence_duration_ms;
+        copy.max_speech_duration_s = params.max_speech_duration_s;
+        copy.speech_pad_ms = params.speech_pad_ms;
+        copy.samples_overlap = params.samples_overlap;
+        this.vad_params = copy;
+    }
+
     @Override
     protected List getFieldOrder() {
         return Arrays.asList("strategy", "n_threads", "n_max_text_ctx",
@@ -349,7 +391,8 @@ protected List getFieldOrder() {
                 "encoder_begin_callback", "encoder_begin_callback_user_data",
                 "abort_callback", "abort_callback_user_data",
                 "logits_filter_callback", "logits_filter_callback_user_data",
-                "grammar_rules", "n_grammar_rules", "i_start_rule", "grammar_penalty");
+                "grammar_rules", "n_grammar_rules", "i_start_rule", "grammar_penalty",
+                "vad", "vad_model_path", "vad_params");
     }
 
     public static class ByValue extends WhisperFullParams implements Structure.ByValue {
diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperVADContextParams.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperVADContextParams.java
new file mode 100644
index 00000000000..e06e06a9fdf
--- /dev/null
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperVADContextParams.java
@@ -0,0 +1,66 @@
+package io.github.ggerganov.whispercpp.params;
+
+import com.sun.jna.*;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Parameters for initializing a VAD context.
+ */
+public class WhisperVADContextParams extends Structure {
+
+    public WhisperVADContextParams() {
+        super();
+    }
+
+    public WhisperVADContextParams(Pointer p) {
+        super(p);
+    }
+
+    /** Number of threads to use for VAD processing (default = 4) */
+    public int n_threads;
+
+    /** Use GPU for VAD (default = true) */
+    public CBool use_gpu;
+
+    /** CUDA device to use (default = 0) */
+    public int gpu_device;
+
+    /**
+     * Set number of threads for VAD processing.
+     * @param threads Number of threads
+     */
+    public void setThreads(int threads) {
+        this.n_threads = threads;
+    }
+
+    /**
+     * Enable or disable GPU for VAD.
+     * @param enable Whether to use GPU
+     */
+    public void useGpu(boolean enable) {
+        use_gpu = enable ? CBool.TRUE : CBool.FALSE;
+    }
+
+    /**
+     * Set CUDA device for VAD.
+     * @param device CUDA device ID
+     */
+    public void setGpuDevice(int device) {
+        this.gpu_device = device;
+    }
+
+    @Override
+    protected List<String> getFieldOrder() {
+        return Arrays.asList(
+            "n_threads",
+            "use_gpu",
+            "gpu_device"
+        );
+    }
+
+    public static class ByValue extends WhisperVADContextParams implements Structure.ByValue {
+        public ByValue() { super(); }
+        public ByValue(Pointer p) { super(p); }
+    }
+}
diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperVADParams.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperVADParams.java
new file mode 100644
index 00000000000..0b27c5d1250
--- /dev/null
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperVADParams.java
@@ -0,0 +1,103 @@
+package io.github.ggerganov.whispercpp.params;
+
+import com.sun.jna.*;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Voice Activity Detection (VAD) parameters.
+ * Used for detecting speech segments in audio.
+ */
+public class WhisperVADParams extends Structure {
+
+    public WhisperVADParams() {
+        super();
+    }
+
+    public WhisperVADParams(Pointer p) {
+        super(p);
+    }
+
+    /** Probability threshold to consider as speech (default = 0.5) */
+    public float threshold;
+
+    /** Minimum duration for a valid speech segment in milliseconds (default = 250) */
+    public int min_speech_duration_ms;
+
+    /** Minimum silence duration to consider speech as ended in milliseconds (default = 2000) */
+    public int min_silence_duration_ms;
+
+    /** Maximum duration of a speech segment before forcing a new segment in seconds (default = Float.MAX_VALUE) */
+    public float max_speech_duration_s;
+
+    /** Padding added before and after speech segments in milliseconds (default = 400) */
+    public int speech_pad_ms;
+
+    /** Overlap in seconds when copying audio samples from speech segment (default = 1.0) */
+    public float samples_overlap;
+
+    /**
+     * Set probability threshold for speech detection.
+     * @param threshold Probability threshold (0.0 to 1.0)
+     */
+    public void setThreshold(float threshold) {
+        this.threshold = threshold;
+    }
+
+    /**
+     * Set minimum speech duration.
+     * @param durationMs Duration in milliseconds
+     */
+    public void setMinSpeechDuration(int durationMs) {
+        this.min_speech_duration_ms = durationMs;
+    }
+
+    /**
+     * Set minimum silence duration.
+     * @param durationMs Duration in milliseconds
+     */
+    public void setMinSilenceDuration(int durationMs) {
+        this.min_silence_duration_ms = durationMs;
+    }
+
+    /**
+     * Set maximum speech duration.
+     * @param durationS Duration in seconds
+     */
+    public void setMaxSpeechDuration(float durationS) {
+        this.max_speech_duration_s = durationS;
+    }
+
+    /**
+     * Set speech padding.
+     * @param paddingMs Padding in milliseconds
+     */
+    public void setSpeechPadding(int paddingMs) {
+        this.speech_pad_ms = paddingMs;
+    }
+
+    /**
+     * Set samples overlap.
+     * @param overlapS Overlap in seconds
+     */
+    public void setSamplesOverlap(float overlapS) {
+        this.samples_overlap = overlapS;
+    }
+
+    @Override
+    protected List<String> getFieldOrder() {
+        return Arrays.asList(
+            "threshold",
+            "min_speech_duration_ms",
+            "min_silence_duration_ms",
+            "max_speech_duration_s",
+            "speech_pad_ms",
+            "samples_overlap"
+        );
+    }
+
+    public static class ByValue extends WhisperVADParams implements Structure.ByValue {
+        public ByValue() { super(); }
+        public ByValue(Pointer p) { super(p); }
+    }
+}