Skip to content

Commit 481cd68

Browse files
committed
ref #10 : option to keep context in "stream" example
It seems the results become worse when we keep the context, so this option is not enabled by default.
1 parent 3f15bb8 commit 481cd68

File tree

3 files changed

+18
-7
lines changed

3 files changed

+18
-7
lines changed

stream.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ struct whisper_params {
4040

4141
bool verbose = false;
4242
bool translate = false;
43+
bool no_context = true;
4344
bool print_special_tokens = false;
4445
bool no_timestamps = true;
4546

@@ -64,6 +65,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
6465
params.verbose = true;
6566
} else if (arg == "--translate") {
6667
params.translate = true;
68+
} else if (arg == "-kc" || arg == "--keep-context") {
69+
params.no_context = false;
6770
} else if (arg == "-l" || arg == "--language") {
6871
params.language = argv[++i];
6972
if (whisper_lang_id(params.language.c_str()) == -1) {
@@ -103,6 +106,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
103106
fprintf(stderr, " --step N audio step size in milliseconds (default: %d)\n", params.step_ms);
104107
fprintf(stderr, " -v, --verbose verbose output\n");
105108
fprintf(stderr, " --translate translate from source language to english\n");
109+
// Help text must name the flag the argument parser actually accepts ("-kc" / "--keep-context"); context is dropped unless it is given.
fprintf(stderr, " -kc, --keep-context keep context from earlier audio (default: false)\n");
106110
fprintf(stderr, " -ps, --print_special print special tokens\n");
107111
fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
108112
fprintf(stderr, " -l LANG, --language LANG spoken language (default: %s)\n", params.language.c_str());
@@ -273,6 +277,7 @@ int main(int argc, char ** argv) {
273277
wparams.print_realtime = false;
274278
wparams.print_timestamps = !params.no_timestamps;
275279
wparams.translate = params.translate;
280+
wparams.no_context = params.no_context;
276281
wparams.language = params.language.c_str();
277282
wparams.n_threads = params.n_threads;
278283

whisper.cpp

+12-7
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,8 @@ struct whisper_context {
405405

406406
std::vector<whisper_result> result_cur;
407407
std::vector<whisper_segment> result_all;
408+
409+
std::vector<whisper_token> prompt_past;
408410
};
409411

410412
// load the model from a ggml file
@@ -1020,8 +1022,6 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
10201022
// - model: the model
10211023
// - n_threads: number of threads to use
10221024
// - mel_offset: offset in the mel spectrogram (i.e. audio offset)
1023-
// - mel_inp: input mel spectrogram
1024-
// - features: output encoded features
10251025
//
10261026
bool whisper_encode(
10271027
whisper_context & wctx,
@@ -1405,10 +1405,9 @@ bool whisper_encode(
14051405
//
14061406
// - model: the model
14071407
// - n_threads: number of threads to use
1408-
// - n_past: prompt length
1409-
// - prompt: text prompt
1410-
// - logits_out: output logits
1411-
// - probs_out: output probabilities
1408+
// - tokens: text prompt
1409+
// - n_tokens: number of tokens in the prompt
1410+
// - n_past: number of past tokens to prefix the prompt with
14121411
//
14131412
bool whisper_decode(
14141413
whisper_context & wctx,
@@ -2259,6 +2258,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_decode_strat
22592258
.offset_ms = 0,
22602259

22612260
.translate = false,
2261+
.no_context = false,
22622262
.print_special_tokens = false,
22632263
.print_progress = true,
22642264
.print_realtime = false,
@@ -2279,6 +2279,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_decode_strat
22792279
.offset_ms = 0,
22802280

22812281
.translate = false,
2282+
.no_context = false,
22822283
.print_special_tokens = false,
22832284
.print_progress = true,
22842285
.print_realtime = false,
@@ -2297,6 +2298,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_decode_strat
22972298

22982299
return result;
22992300
}
2301+
23002302
int whisper_full(
23012303
struct whisper_context * ctx,
23022304
struct whisper_full_params params,
@@ -2309,7 +2311,10 @@ int whisper_full(
23092311
}
23102312

23112313
// the accumulated text context so far
2312-
std::vector<whisper_token> prompt_past = { };
2314+
auto & prompt_past = ctx->prompt_past;
2315+
if (params.no_context) {
2316+
prompt_past.clear();
2317+
}
23132318

23142319
// these tokens determine the task that will be performed
23152320
std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };

whisper.h

+1
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ extern "C" {
105105
int offset_ms;
106106

107107
bool translate;
108+
bool no_context;
108109
bool print_special_tokens;
109110
bool print_progress;
110111
bool print_realtime;

0 commit comments

Comments
 (0)