@@ -124,6 +124,9 @@ typedef struct {
     char sz_model_path[MAX_PATH_LEN];
     size_t n_threads;
 
+    // 03-20-2024, referenced by: https://github.com/futo-org/whisper-acft
+    size_t n_decoding_mode;  // 0: WHISPER_SAMPLING_GREEDY, 1: WHISPER_SAMPLING_BEAM_SEARCH
+
     size_t n_asr_mode;       // 0: normal transcription, 1: ASR pressure test, 2: benchmark, 3: transcription + audio record
     size_t n_benchmark_type; // what to benchmark: 0: asr, 1: memcpy, 2: mulmat, 3: whisper_encode / whisper full benchmark
     bool b_use_gpu;
@@ -847,7 +850,9 @@ class whisper_asr {
         n_end_time = ggml_time_us();
         n_durtion  = (n_end_time - n_begin_time) / 1000;
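         // ggml_time_us() returns microseconds, so divide by 1000 to get milliseconds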
 
-        if (n_durtion > 1000) { // 1 second, very good on Xiaomi 14, about 500-700 ms with GGML model ggml-tiny.en-q8_0.bin
+        // 1 second is very good on a Xiaomi 14: about 500-700 ms with the GGML model ggml-tiny.en-q8_0.bin.
+        // 0.8 second with the new method (adjusting audio_ctx dynamically) could make the app crash suddenly
+        // or produce sketchy/incorrect/repeated tokens.
+        if (n_durtion > 900) {
             LOGGD("duration of audio data gathering is: %d milliseconds\n", n_durtion);
             LOGGD("size of gathered audio data: %d\n", _n_whisper_in_size);
             LOGGD("total audio sample counts %d\n", _n_total_sample_counts);
@@ -1186,6 +1191,21 @@ static const char * whisper_asr_audio_to_text(const float * pf32_audio_buffer, i
 
     begin_time = ggml_time_ms();
     whisper_reset_timings(p_asr_ctx->p_context);
+
+    // 03-20-2024, referenced by: https://github.com/futo-org/whisper-acft
+    p_asr_ctx->p_params->max_tokens      = 256;
+    p_asr_ctx->p_params->temperature_inc = 0.0f;
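+    // audio_ctx is measured in encoder positions: at 16 kHz one position covers 320 samples
+    // (160-sample mel hop * conv stride 2) and 1500 positions span the full 30-second window,
+    // so the line below shrinks the encoder context to the gathered audio plus a 16-position margin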
+    p_asr_ctx->p_params->audio_ctx       = std::min(1500, (int)ceil((double)num_samples / (double)(320.0)) + 16);
+    if (WHISPER_SAMPLING_GREEDY == p_asr_ctx->n_decoding_mode) {
+        p_asr_ctx->p_params->strategy       = WHISPER_SAMPLING_GREEDY;
+        p_asr_ctx->p_params->greedy.best_of = 1; // https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
+    } else {
+        p_asr_ctx->p_params->strategy              = WHISPER_SAMPLING_BEAM_SEARCH;
+        p_asr_ctx->p_params->beam_search.beam_size = 5; // https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
+        p_asr_ctx->p_params->greedy.best_of        = 5;
+    }
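+    // best_of is set on the beam-search path too, presumably so whisper.cpp's greedy
+    // temperature-fallback still sees a sane value; with temperature_inc forced to 0.0f
+    // above, that fallback never triggers anyway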
+    // LOGGD("decoding_mode=%d, audio_ctx=%d\n", p_asr_ctx->n_decoding_mode, p_asr_ctx->p_params->audio_ctx);
+
     result = whisper_full(p_asr_ctx->p_context, *p_asr_ctx->p_params, pf32_audio_buffer, num_samples);
     if (0 != result) {
         LOGW("whisper inference failure, please check why\n");
@@ -1350,9 +1370,19 @@ int whisper_asr_init(const char * sz_model_path, int n_threads, int n_asrmode) {
     params.speed_up   = false;
     params.debug_mode = false;
 
+    params.audio_ctx  = 0;
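+    // audio_ctx = 0 selects the full 1500-position encoder context by default;
+    // whisper_asr_audio_to_text() overrides it per call from the sample count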
+
+    params.suppress_blank = false;
+    // params.suppress_non_speech_tokens = true;
+    // params.language = "en";
+
+    // 03-20-2024, referenced by: https://github.com/futo-org/whisper-acft
+    p_asr_ctx->n_decoding_mode = WHISPER_SAMPLING_GREEDY;
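+    // default decoding mode; whisper_asr_audio_to_text() selects the actual sampling
+    // strategy from this field at inference time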
+
     // params.tdrz_enable = false; // whisper complains "failed to compute log mel spectrogram" when this flag is enabled
     // params.suppress_blank = true;
-    // params.suppress_non_speech_tokens = true;
+    params.suppress_non_speech_tokens = true;
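+    // presumably mirrors OpenAI's suppression of non-speech tokens (bracketed sound
+    // annotations, music symbols, etc.) during sampling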
 
     memcpy(p_asr_ctx->p_params, &params, sizeof(struct whisper_full_params));