stream : add "audio_ctx" parameter

Used to overwrite the audio context size of the Encoder. For example, setting "audio_ctx = 512" will make it run about 3 times faster, processing about 10s of audio, instead of 30s. The transcription quality drops, but this can be used for real-time streaming purposes where performance is important.
2023-11-04 02:52:44 +03:00 · 2022-11-20 21:12:01 +02:00
parent 62b5ff875c
commit fb8d77f760
3 changed files with 24 additions and 10 deletions
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -424,6 +424,9 @@ struct whisper_context {
    int64_t t_last;
    whisper_token tid_last;
    std::vector<float> energy; // PCM signal energy
+
+    // [EXPERIMENTAL] speed-up techniques
+    int32_t exp_n_audio_ctx; // 0 - use default
 };

 // load the model from a ggml file
@@ -974,9 +977,6 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx

            model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
            model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-
-            //memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
-            //memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
        }

        const size_t memory_size =
@@ -1079,7 +1079,7 @@ static bool whisper_encode(
    const auto & mel_inp = wctx.mel;
    const auto & hparams = model.hparams;

-    const int n_ctx   = WHISPER_EXPERIMENT_AUDIO_CTX;
+    const int n_ctx   = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;
    const int n_state = hparams.n_audio_state;
    const int n_head  = hparams.n_audio_head;
    const int n_layer = hparams.n_audio_layer;
@@ -1133,6 +1133,8 @@ static bool whisper_encode(
        cur = ggml_gelu(ctx0, cur);
    }

+    // ===================================================================
+    // NOTE: experimenting with partial evaluation of the encoder (ignore)
    //static int iter = -1;
    //const int n_iter = 1500/n_ctx;

@@ -1151,6 +1153,10 @@ static bool whisper_encode(
    struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);

    cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
+    // ===================================================================
+
+    // original:
+    //cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));

    struct ggml_tensor * inpL = cur;

@@ -1494,8 +1500,7 @@ static bool whisper_decode(
    const int n_layer = hparams.n_text_layer;

    const int N = n_tokens;
-    //const int M = hparams.n_audio_ctx;
-    const int M = WHISPER_EXPERIMENT_AUDIO_CTX;
+    const int M = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;

    struct ggml_init_params params = {
            .mem_size   = wctx.buf_compute.size(),
@@ -2405,6 +2410,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
                    /*.max_tokens           =*/ 0,

                    /*.speed_up             =*/ false,
+                    /*.audio_ctx            =*/ 0,

                    /*.language             =*/ "en",

@@ -2447,6 +2453,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
                    /*.max_tokens           =*/ 0,

                    /*.speed_up             =*/ false,
+                    /*.audio_ctx            =*/ 0,

                    /*.language             =*/ "en",

@@ -2577,6 +2584,9 @@ int whisper_full(
        prompt_past.clear();
    }

+    // overwrite audio_ctx
+    ctx->exp_n_audio_ctx = params.audio_ctx;
+
    // these tokens determine the task that will be performed
    std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };
    if (whisper_is_multilingual(ctx)) {