stream : add "audio_ctx" parameter

Overrides the audio context size of the Encoder. For example, setting "audio_ctx = 512" will make it run about 3 times faster, because it processes about 10 s of audio instead of 30 s. The transcription quality drops, but this can be used for real-time streaming purposes, where performance is important.
2023-11-04 02:52:44 +03:00 · 2022-11-20 21:12:01 +02:00
parent 62b5ff875c
commit fb8d77f760
3 changed files with 24 additions and 10 deletions
--- a/whisper.h
+++ b/whisper.h
@@ -24,8 +24,6 @@
 #define WHISPER_HOP_LENGTH  160
 #define WHISPER_CHUNK_SIZE  30

-#define WHISPER_EXPERIMENT_AUDIO_CTX 512
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -207,7 +205,8 @@ extern "C" {
        int   max_tokens;       // max tokens per segment (0 = no limit)

        // [EXPERIMENTAL] speed-up techniques
-        bool speed_up; // speed-up the audio by 2x using Phase Vocoder
+        bool speed_up;  // speed-up the audio by 2x using Phase Vocoder
+        int  audio_ctx; // overwrite the audio context size (0 = use default)

        const char * language;