@@ -124,6 +124,9 @@ typedef struct {
     char sz_model_path[MAX_PATH_LEN];
     size_t n_threads;
 
+    // 03-20-2024, referenced by: https://github.com/futo-org/whisper-acft
+    size_t n_decoding_mode;  // 0: WHISPER_SAMPLING_GREEDY, 1: WHISPER_SAMPLING_BEAM_SEARCH
+
     size_t n_asr_mode;       // 0: normal transcription, 1: ASR pressure test, 2: benchmark, 3: transcription + audio record
     size_t n_benchmark_type; // what to benchmark: 0: asr, 1: memcpy, 2: mulmat, 3: whisper_encode / whisper full benchmark
     bool b_use_gpu;
@@ -847,7 +850,9 @@ class whisper_asr {
         n_end_time = ggml_time_us();
         n_durtion  = (n_end_time - n_begin_time) / 1000;
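         // ggml_time_us() returns microseconds, so divide by 1000 to get milliseconds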
 
-        if (n_durtion > 1000) { // 1 second, very good on Xiaomi 14, about 500-700 ms with GGML model ggml-tiny.en-q8_0.bin
+        // 1 second is very good on a Xiaomi 14: about 500-700 ms with the GGML model ggml-tiny.en-q8_0.bin.
+        // 0.8 second with the new method (adjusting audio_ctx dynamically) could make the app crash suddenly
+        // or produce sketchy/incorrect/repeated tokens.
+        if (n_durtion > 900) {
             LOGGD("duration of audio data gathering is: %d milliseconds\n", n_durtion);
             LOGGD("size of gathered audio data: %d\n", _n_whisper_in_size);
             LOGGD("total audio sample counts %d\n", _n_total_sample_counts);
@@ -1186,6 +1191,21 @@ static const char * whisper_asr_audio_to_text(const float * pf32_audio_buffer, i
 
     begin_time = ggml_time_ms();
     whisper_reset_timings(p_asr_ctx->p_context);
+
+    // 03-20-2024, referenced by: https://github.com/futo-org/whisper-acft
+    p_asr_ctx->p_params->max_tokens      = 256;
+    p_asr_ctx->p_params->temperature_inc = 0.0f;
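+    // audio_ctx is measured in encoder positions: at 16 kHz one position covers 320 samples
+    // (160-sample mel hop * conv stride 2) and 1500 positions span the full 30-second window,
+    // so the line below shrinks the encoder context to the gathered audio plus a 16-position margin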
+    p_asr_ctx->p_params->audio_ctx       = std::min(1500, (int)ceil((double)num_samples / (double)(320.0)) + 16);
+    if (WHISPER_SAMPLING_GREEDY == p_asr_ctx->n_decoding_mode) {
+        p_asr_ctx->p_params->strategy       = WHISPER_SAMPLING_GREEDY;
+        p_asr_ctx->p_params->greedy.best_of = 1; // https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
+    } else {
+        p_asr_ctx->p_params->strategy              = WHISPER_SAMPLING_BEAM_SEARCH;
+        p_asr_ctx->p_params->beam_search.beam_size = 5; // https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
+        p_asr_ctx->p_params->greedy.best_of        = 5;
+    }
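+    // best_of is set on the beam-search path too, presumably so whisper.cpp's greedy
+    // temperature-fallback still sees a sane value; with temperature_inc forced to 0.0f
+    // above, that fallback never triggers anyway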
+    // LOGGD("decoding_mode=%d, audio_ctx=%d\n", p_asr_ctx->n_decoding_mode, p_asr_ctx->p_params->audio_ctx);
+
     result = whisper_full(p_asr_ctx->p_context, *p_asr_ctx->p_params, pf32_audio_buffer, num_samples);
     if (0 != result) {
         LOGW("whisper inference failure, please check why\n");
@@ -1350,9 +1370,19 @@ int whisper_asr_init(const char * sz_model_path, int n_threads, int n_asrmode) {
     params.speed_up   = false;
     params.debug_mode = false;
 
+    params.audio_ctx  = 0;
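+    // audio_ctx = 0 selects the full 1500-position encoder context by default;
+    // whisper_asr_audio_to_text() overrides it per call from the sample count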
+
+    params.suppress_blank = false;
+    // params.suppress_non_speech_tokens = true;
+    // params.language = "en";
+
+    // 03-20-2024, referenced by: https://github.com/futo-org/whisper-acft
+    p_asr_ctx->n_decoding_mode = WHISPER_SAMPLING_GREEDY;
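+    // default decoding mode; whisper_asr_audio_to_text() selects the actual sampling
+    // strategy from this field at inference time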
+
     // params.tdrz_enable = false; // whisper complains "failed to compute log mel spectrogram" when this flag is enabled
     // params.suppress_blank = true;
-    // params.suppress_non_speech_tokens = true;
+    params.suppress_non_speech_tokens = true;
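+    // presumably mirrors OpenAI's suppression of non-speech tokens (bracketed sound
+    // annotations, music symbols, etc.) during sampling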
 
     memcpy(p_asr_ctx->p_params, &params, sizeof(struct whisper_full_params));