Skip to content

Commit 4cd35dd

Browse files
committed
Achieved true real-time transcription (real-time subtitles) with English online TV on Xiaomi 14 for the first time, but bug fixes are still required
1 parent 8b14792 commit 4cd35dd

File tree

7 files changed

+48
-30
lines changed

7 files changed

+48
-30
lines changed

cdeosplayer/kantv/src/main/java/com/cdeos/kantv/app/IApplication.java

+1
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,7 @@ public void initGlobal() {
370370
CDELibraryLoader.load("whispercpp");
371371
CDELog.d(TAG, "cpu core counts:" + whispercpp.get_cpu_core_counts());
372372
CDELog.j(TAG, "asr mode: " + mSettings.getASRMode());
373+
CDELog.j(TAG, "thread counts:" + mSettings.getASRThreadCounts());
373374
if ((CDEUtils.ASR_MODE_NORMAL == mSettings.getASRMode()) || (CDEUtils.ASR_MODE_TRANSCRIPTION_RECORD == mSettings.getASRMode())) {
374375
result = whispercpp.asr_init(modelPath, mSettings.getASRThreadCounts(), WHISPER_ASR_MODE_NORMAL);
375376
} else {

cdeosplayer/kantv/src/main/java/com/cdeos/kantv/player/ffplayer/FFPlayerView.java

+1
Original file line numberDiff line numberDiff line change
@@ -1575,6 +1575,7 @@ private void onASRStart(int asrMode) {
15751575
return;
15761576
} else {
15771577
CDELog.j(TAG, "ASR with GGML model file:" + file.getAbsolutePath());
1578+
CDELog.j(TAG, "thread counts:" + mSettings.getASRThreadCounts());
15781579
}
15791580

15801581
if (CDEUtils.getASRSubsystemInit()) {

cdeosplayer/kantv/src/main/java/com/cdeos/kantv/ui/fragment/settings/ASRSettingFragment.java

+6-25
Original file line numberDiff line numberDiff line change
@@ -120,37 +120,18 @@ public void onPause() {
120120
@Override
121121
public void onSharedPreferenceChanged(SharedPreferences sharedPreferences, String key) {
122122
CDELog.j(TAG, "key : " + key);
123-
if (key.contains("pref.asrmode")) {
124-
CDELog.j(TAG, "asrmode: " + mSettings.getASRMode());
125-
CDELog.j(TAG, "asrthreadCounts " + mSettings.getASRThreadCounts());
126-
CDELog.j(TAG, "GGML mode: " + mSettings.getGGMLMode());
127-
CDELog.j(TAG, "GGML mode name: " + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()));
128-
String modelPath = CDEUtils.getDataPath() + "ggml-" + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()) + ".bin";
129-
CDELog.j(TAG, "modelPath:" + modelPath);
130-
CDEUtils.setASRConfig("whispercpp", modelPath, mSettings.getASRThreadCounts() + 1, mSettings.getASRMode());
131-
}
132-
133-
if (key.contains("pref.asrthreadcounts")) {
134-
CDELog.j(TAG, "asrmode: " + mSettings.getASRMode());
135-
CDELog.j(TAG, "asrthreadCounts " + mSettings.getASRThreadCounts() + 1);
136-
CDELog.j(TAG, "GGML mode: " + mSettings.getGGMLMode());
137-
CDELog.j(TAG, "GGML mode name: " + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()));
138-
String modelPath = CDEUtils.getDataPath() + "ggml-" + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()) + ".bin";
139-
CDELog.j(TAG, "modelPath:" + modelPath);
140-
CDEUtils.setASRConfig("whispercpp", modelPath, mSettings.getASRThreadCounts() + 1, mSettings.getASRMode());
141-
}
142-
143-
144-
if (key.contains("pref.ggmlmodel")) {
145-
CDELog.j(TAG, "GGML mode: " + mSettings.getGGMLMode());
146-
CDELog.j(TAG, "GGML mode name: " + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()));
123+
if (
124+
(key.contains("pref.asrmode"))
125+
|| (key.contains("pref.asrthreadcounts"))
126+
|| (key.contains("pref.ggmlmodel"))
127+
) {
147128
CDELog.j(TAG, "asrmode: " + mSettings.getASRMode());
148129
CDELog.j(TAG, "asrthreadCounts " + mSettings.getASRThreadCounts());
149130
CDELog.j(TAG, "GGML mode: " + mSettings.getGGMLMode());
150131
CDELog.j(TAG, "GGML mode name: " + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()));
151132
String modelPath = CDEUtils.getDataPath() + "ggml-" + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()) + ".bin";
152133
CDELog.j(TAG, "modelPath:" + modelPath);
153-
CDEUtils.setASRConfig("whispercpp", modelPath, mSettings.getASRThreadCounts() + 1, mSettings.getASRMode());
134+
CDEUtils.setASRConfig("whispercpp", modelPath, mSettings.getASRThreadCounts(), mSettings.getASRMode());
154135
}
155136
}
156137
};

cdeosplayer/kantv/src/main/java/com/cdeos/kantv/utils/Settings.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -78,12 +78,12 @@ public int getASRMode() {
7878

7979
public int getASRThreadCounts() {
8080
String key = mAppContext.getString(R.string.pref_key_asrthreadcounts);
81-
String value = mSharedPreferences.getString(key, "3"); // thread counts 4
81+
String value = mSharedPreferences.getString(key, "3"); // actual thread counts is 3 + 1 = 4
8282
try {
83-
return Integer.valueOf(value).intValue();
83+
return Integer.valueOf(value).intValue() + 1;
8484
} catch (NumberFormatException e) {
8585
CDELog.j(TAG, "exception occurred");
86-
return 3;
86+
return 4;
8787
}
8888
}
8989

external/.gitignore

+5
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,12 @@ gstreamer/
44
ncnn/
55
CLBlast/
66
llamacpp/
7+
ff-deps/
8+
ffdeps/
9+
ffmepg-deps/
710

811

12+
ffmpeg-6.1
13+
914
*.a
1015
*.so
File renamed without changes.

external/whispercpp/jni/whispercpp-jni-impl.cpp

+32-2
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,9 @@ typedef struct {
124124
char sz_model_path[MAX_PATH_LEN];
125125
size_t n_threads;
126126

127+
//03-20-2024,referenced by:https://github.com/futo-org/whisper-acft
128+
size_t n_decoding_mode; // 0:WHISPER_SAMPLING_GREEDY 1:WHISPER_SAMPLING_BEAM_SEARCH
129+
127130
size_t n_asr_mode; // 0: normal transcription 1: asr pressure test 2:benchmark 3: transcription + audio record
128131
size_t n_benchmark_type; // what to benchmark: 0: asr, 1: memcpy 2: mulmat 3: whisper_encode/whisper full benchmark
129132
bool b_use_gpu;
@@ -847,7 +850,9 @@ class whisper_asr {
847850
n_end_time = ggml_time_us();
848851
n_durtion = (n_end_time - n_begin_time) / 1000;
849852

850-
if (n_durtion > 1000) { // 1 seconds, very good on Xiaomi 14, about 500-700 ms with GGML model ggml-tiny.en-q8_0.bin
853+
// 1 second, very good on Xiaomi 14, about 500-700 ms with GGML model ggml-tiny.en-q8_0.bin
854+
// 0.8 second with the new method (adjusting audio_context dynamically) would cause sudden app crashes or produce sketchy/incorrect/repeated tokens
855+
if (n_durtion > 900) {
851856
LOGGD("duration of audio data gathering is: %d milliseconds\n", n_durtion);
852857
LOGGD("size of gathered audio data: %d\n", _n_whisper_in_size);
853858
LOGGD("total audio sample counts %d\n", _n_total_sample_counts);
@@ -1186,6 +1191,21 @@ static const char * whisper_asr_audio_to_text(const float * pf32_audio_buffer, i
11861191

11871192
begin_time = ggml_time_ms();
11881193
whisper_reset_timings(p_asr_ctx->p_context);
1194+
1195+
//03-20-2024,referenced by:https://github.com/futo-org/whisper-acft
1196+
p_asr_ctx->p_params->max_tokens = 256;
1197+
p_asr_ctx->p_params->temperature_inc = 0.0f;
1198+
p_asr_ctx->p_params->audio_ctx = std::min(1500, (int)ceil((double)num_samples / (double)(320.0)) + 16);
1199+
if (WHISPER_SAMPLING_GREEDY == p_asr_ctx->n_decoding_mode) {
1200+
p_asr_ctx->p_params->strategy = WHISPER_SAMPLING_GREEDY;
1201+
p_asr_ctx->p_params->greedy.best_of = 1;//https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
1202+
} else {
1203+
p_asr_ctx->p_params->strategy = WHISPER_SAMPLING_BEAM_SEARCH;
1204+
p_asr_ctx->p_params->beam_search.beam_size = 5;//https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
1205+
p_asr_ctx->p_params->greedy.best_of = 5;
1206+
}
1207+
//LOGGD("decoding_mode=%d, audio_ctx=%d\n", p_asr_ctx->n_decoding_mode, p_asr_ctx->p_params->audio_ctx);
1208+
11891209
result = whisper_full(p_asr_ctx->p_context, *p_asr_ctx->p_params, pf32_audio_buffer, num_samples);
11901210
if (0 != result) {
11911211
LOGW("whisper inference failure, pls check why?\n");
@@ -1350,9 +1370,19 @@ int whisper_asr_init(const char * sz_model_path, int n_threads, int n_asrmode) {
13501370
params.speed_up = false;
13511371
params.debug_mode = false;
13521372

1373+
params.audio_ctx = 0;
1374+
1375+
params.suppress_blank = false;
1376+
//params.suppress_non_speech_tokens = true;
1377+
//params.language = "en";
1378+
1379+
//03-20-2024,referenced by:https://github.com/futo-org/whisper-acft
1380+
p_asr_ctx->n_decoding_mode = WHISPER_SAMPLING_GREEDY;
1381+
1382+
13531383
//params.tdrz_enable = false;//whisper complain failed to compute log mel spectrogram when this flag was enabled
13541384
//params.suppress_blank = true;
1355-
//params.suppress_non_speech_tokens = true;
1385+
params.suppress_non_speech_tokens = true;
13561386

13571387
memcpy(p_asr_ctx->p_params, &params, sizeof(struct whisper_full_params));
13581388

0 commit comments

Comments
 (0)