Mirror of https://github.com/ggerganov/whisper.cpp.git
stream : add "audio_ctx" parameter
Used to override the audio context size of the Encoder. For example, setting "audio_ctx = 512" makes it run about 3 times faster, processing about 10 s of audio instead of 30 s. Transcription quality drops, but this can be useful for real-time streaming purposes where performance is important.
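For illustration, here is a minimal sketch of how a caller could use the new parameter through the C API (whisper_init, whisper_full_default_params and whisper_full from whisper.h); the model path, dummy PCM buffer and segment printing are placeholders and are not part of this change:

```cpp
// Sketch only: run whisper_full with a reduced encoder audio context.
#include "whisper.h"

#include <cstdio>
#include <vector>

int main() {
    struct whisper_context * ctx = whisper_init("models/ggml-base.en.bin"); // placeholder model path
    if (ctx == nullptr) {
        return 1;
    }

    // 10 seconds of 16 kHz mono audio; a real caller would fill this from a file or microphone
    std::vector<float> pcmf32(16000*10, 0.0f);

    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    wparams.audio_ctx = 512; // ~10 s of encoder context instead of the full 30 s (0 keeps the default)
    wparams.language  = "en";

    if (whisper_full(ctx, wparams, pcmf32.data(), (int) pcmf32.size()) != 0) {
        whisper_free(ctx);
        return 1;
    }

    for (int i = 0; i < whisper_full_n_segments(ctx); ++i) {
        printf("%s\n", whisper_full_get_segment_text(ctx, i));
    }

    whisper_free(ctx);
    return 0;
}
```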
Changed file: whisper.cpp (22 changed lines)
```diff
@@ -424,6 +424,9 @@ struct whisper_context {
     int64_t t_last;
     whisper_token tid_last;
     std::vector<float> energy; // PCM signal energy
+
+    // [EXPERIMENTAL] speed-up techniques
+    int32_t exp_n_audio_ctx; // 0 - use default
 };
 
 // load the model from a ggml file
```
```diff
@@ -974,9 +977,6 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
 
             model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
             model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-
-            //memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
-            //memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
         }
 
         const size_t memory_size =
```
```diff
@@ -1079,7 +1079,7 @@ static bool whisper_encode(
     const auto & mel_inp = wctx.mel;
     const auto & hparams = model.hparams;
 
-    const int n_ctx   = WHISPER_EXPERIMENT_AUDIO_CTX;
+    const int n_ctx   = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;
     const int n_state = hparams.n_audio_state;
     const int n_head  = hparams.n_audio_head;
     const int n_layer = hparams.n_audio_layer;
```
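As a side note on the numbers in the commit message: Whisper's log-mel spectrogram has 100 frames per second and the encoder's strided convolutions halve that, so each encoder position covers about 20 ms of audio. The full context of 1500 positions therefore spans 30 s, and audio_ctx = 512 spans roughly 10.2 s, which is where the "about 10s" figure comes from. A throwaway helper expressing that arithmetic (not part of the commit) could look like:

```cpp
// Illustration only: how many seconds of audio a given encoder context covers.
// Assumes Whisper's fixed front-end: 100 mel frames per second, downsampled 2x
// by the encoder convolutions, i.e. 50 encoder positions per second.
static double audio_ctx_to_seconds(int n_audio_ctx) {
    const double positions_per_second = 100.0/2.0;
    return n_audio_ctx/positions_per_second;
}

// audio_ctx_to_seconds(1500) == 30.0   // the default full context
// audio_ctx_to_seconds(512)  == 10.24  // the value suggested in the commit message
```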
```diff
@@ -1133,6 +1133,8 @@ static bool whisper_encode(
         cur = ggml_gelu(ctx0, cur);
     }
 
+    // ===================================================================
+    // NOTE: experimenting with partial evaluation of the encoder (ignore)
     //static int iter = -1;
     //const int n_iter = 1500/n_ctx;
 
```
```diff
@@ -1151,6 +1153,10 @@ static bool whisper_encode(
     struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
 
     cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
+    // ===================================================================
+
+    // original:
+    //cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
 
     struct ggml_tensor * inpL = cur;
 
```
```diff
@@ -1494,8 +1500,7 @@ static bool whisper_decode(
     const int n_layer = hparams.n_text_layer;
 
     const int N = n_tokens;
-    //const int M = hparams.n_audio_ctx;
-    const int M = WHISPER_EXPERIMENT_AUDIO_CTX;
+    const int M = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;
 
     struct ggml_init_params params = {
         .mem_size = wctx.buf_compute.size(),
```
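The decoder needs the same override because its cross-attention keys and values are computed from the encoder output; M is the length of that sequence, so it must match whatever number of positions the encoder actually produced. Both functions apply the same selection rule, which could be written as a small helper (hypothetical, not introduced by this commit):

```cpp
// Hypothetical helper mirroring the ternary used in whisper_encode and
// whisper_decode: a positive exp_n_audio_ctx overrides the model's
// n_audio_ctx, while 0 keeps the model default.
static int effective_audio_ctx(int exp_n_audio_ctx, int n_audio_ctx_model) {
    return exp_n_audio_ctx > 0 ? exp_n_audio_ctx : n_audio_ctx_model;
}
```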
```diff
@@ -2405,6 +2410,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
                     /*.max_tokens =*/ 0,
 
                     /*.speed_up =*/ false,
+                    /*.audio_ctx =*/ 0,
 
                     /*.language =*/ "en",
 
```
```diff
@@ -2447,6 +2453,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
                     /*.max_tokens =*/ 0,
 
                     /*.speed_up =*/ false,
+                    /*.audio_ctx =*/ 0,
 
                     /*.language =*/ "en",
 
```
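The default of 0 in both parameter sets keeps the previous behaviour (use the model's full audio context). This change does not appear to validate the value, so a caller passing something larger than the model's n_audio_ctx is on their own; a defensive, caller-side clamp (purely a sketch, relying on the fact that the released Whisper models use an encoder context of 1500) might look like:

```cpp
// Defensive sketch (not part of the commit): keep a requested audio_ctx in a
// sane range before handing it to whisper_full. 0 means "use the default".
static int clamp_audio_ctx(int requested, int n_audio_ctx_max = 1500) {
    if (requested <= 0) {
        return 0; // keep the default behaviour
    }
    return requested < n_audio_ctx_max ? requested : n_audio_ctx_max;
}
```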
```diff
@@ -2577,6 +2584,9 @@ int whisper_full(
         prompt_past.clear();
     }
 
+    // overwrite audio_ctx
+    ctx->exp_n_audio_ctx = params.audio_ctx;
+
     // these tokens determine the task that will be performed
     std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };
     if (whisper_is_multilingual(ctx)) {
```
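Since whisper_full only copies params.audio_ctx into the context, the override takes effect on the next encoder/decoder run and can differ between calls. A rough sketch of the real-time use case mentioned in the commit message, reusing one context with a reduced audio_ctx per chunk (get_next_chunk is a placeholder audio source, and this is not the actual examples/stream code):

```cpp
// Sketch only: repeatedly transcribe ~10 s chunks with a reduced encoder context.
#include "whisper.h"

#include <vector>

std::vector<float> get_next_chunk(); // placeholder: blocks until new 16 kHz mono samples arrive

void run_stream(struct whisper_context * ctx) {
    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    wparams.audio_ctx = 512; // ~10 s of encoder context per iteration, trading accuracy for speed
    wparams.n_threads = 4;   // illustrative

    while (true) {
        const std::vector<float> pcmf32 = get_next_chunk();
        if (pcmf32.empty()) {
            break;
        }

        if (whisper_full(ctx, wparams, pcmf32.data(), (int) pcmf32.size()) != 0) {
            break; // inference failed
        }
    }
}
```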