whisper : support speaker segmentation (local diarization) of mono audio via tinydiarize (#1058)

* add HuggingFace mirror to download ggml model * support tdrz via simple hack overriding solm tokens * fix incorrect translate/transcribe token_ids that are not static const * add apollo 13 sample for tdrz demo * render [SPEAKER TURN] consistently in all terminal output using vocab.id_to_token * extend whisper_segment with speaker_turn_next field and save in json output * fix failing go build * slipped in some python syntax whoops * whisper : finalize tinydiarize support (add flag + fixes) * whisper : tdrz support for word-level timestamps (respect max_len) * java : try to fix tests after adding tdrz_enable flag * main : remove TODO leftover * java : fix params order list after adding "tdrz_enable" * whisper : fix solm and add nosp token * main : print tinydiarize help --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-11-04 02:52:44 +03:00 · 2023-07-03 23:45:00 -07:00
parent fdf58a6668
commit c8d0f5fe98
8 changed files with 215 additions and 130 deletions
--- a/whisper.h
+++ b/whisper.h
@@ -277,15 +277,16 @@ extern "C" {
    // Special tokens
    WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
    WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
-    WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
    WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
+    WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
+    WHISPER_API whisper_token whisper_token_nosp(struct whisper_context * ctx);
    WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
    WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
    WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);

    // Task tokens
-    WHISPER_API whisper_token whisper_token_translate (void);
-    WHISPER_API whisper_token whisper_token_transcribe(void);
+    WHISPER_API whisper_token whisper_token_translate (struct whisper_context * ctx);
+    WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);

    // Performance information from the default state.
    WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
@@ -358,6 +359,9 @@ extern "C" {
        bool speed_up;          // speed-up the audio by 2x using Phase Vocoder
        int  audio_ctx;         // overwrite the audio context size (0 = use default)

+        // [EXPERIMENTAL] [TDRZ] tinydiarize
+        bool tdrz_enable;       // enable tinydiarize speaker turn detection
+
        // tokens to provide to the whisper decoder as initial prompt
        // these are prepended to any existing text context from a previous call
        const char * initial_prompt;
@@ -460,6 +464,9 @@ extern "C" {
    WHISPER_API int64_t whisper_full_get_segment_t1           (struct whisper_context * ctx, int i_segment);
    WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);

+    // Get whether the next segment is predicted as a speaker turn
+    WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment);
+
    // Get the text of the specified segment
    WHISPER_API const char * whisper_full_get_segment_text           (struct whisper_context * ctx, int i_segment);
    WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
@@ -488,9 +495,9 @@ extern "C" {

    // Temporary helpers needed for exposing ggml interface

-    WHISPER_API int whisper_bench_memcpy(int n_threads);
-    WHISPER_API const char * whisper_bench_memcpy_str(int n_threads);
-    WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
+    WHISPER_API int          whisper_bench_memcpy          (int n_threads);
+    WHISPER_API const char * whisper_bench_memcpy_str      (int n_threads);
+    WHISPER_API int          whisper_bench_ggml_mul_mat    (int n_threads);
    WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);

 #ifdef __cplusplus