Merge pull request #359 from makaveli10/remove_blank_segment

makaveli10 · web-flow · commit d9cb4ffdd0e7 · 2025-04-22T17:59:10.000+05:30
Remove blank segment feature: drop get_previous_output() and the show-previous-output/pause fallback from the backends
diff --git a/whisper_live/backend/base.py b/whisper_live/backend/base.py
@@ -20,19 +20,13 @@ def __init__(self, client_uid, websocket):
         self.text = []
         self.current_out = ''
         self.prev_out = ''
-        self.t_start = None
         self.exit = False
         self.same_output_count = 0
-        self.show_prev_out_thresh = 5   # if pause(no output from whisper) show previous output for 5 seconds
-        self.add_pause_thresh = 3       # add a blank to segment list as a pause(no speech) for 3 seconds
         self.transcript = []
         self.send_last_n_segments = 10
         self.no_speech_thresh = 0.45
         self.clip_audio = False
 
-        # text formatting
-        self.pick_previous_segments = 2
-
         # threading
         self.lock = threading.Lock()
 
@@ -45,9 +39,7 @@ def speech_to_text(self):
 
         If the client's language is not detected, it waits for 30 seconds of audio input to make a language prediction.
         It utilizes the Whisper ASR model to transcribe the audio, continuously processing and streaming results. Segments
-        are sent to the client in real-time, and a history of segments is maintained to provide context.Pauses in speech
-        (no output from Whisper) are handled by showing the previous output for a set duration. A blank segment is added if
-        there is no speech for a specified duration to indicate a pause.
+        are sent to the client in real-time, and a history of segments is maintained to provide context.
 
         Raises:
             Exception: If there is an issue with audio processing or WebSocket communication.
@@ -85,7 +77,7 @@ def speech_to_text(self):
     def transcribe_audio(self):
         raise NotImplementedError
 
-    def handle_transcription_output(self):
+    def handle_transcription_output(self, result, duration):
         raise NotImplementedError
     
     def format_segment(self, start, end, text, completed=False):
@@ -228,33 +220,6 @@ def send_transcription_to_client(self, segments):
         except Exception as e:
             logging.error(f"[ERROR]: Sending data to client: {e}")
 
-    def get_previous_output(self):
-        """
-        Retrieves previously generated transcription outputs if no new transcription is available
-        from the current audio chunks.
-
-        Checks the time since the last transcription output and, if it is within a specified
-        threshold, returns the most recent segments of transcribed text. It also manages
-        adding a pause (blank segment) to indicate a significant gap in speech based on a defined
-        threshold.
-
-        Returns:
-            segments (list): A list of transcription segments. This may include the most recent
-                            transcribed text segments or a blank segment to indicate a pause
-                            in speech.
-        """
-        segments = []
-        if self.t_start is None:
-            self.t_start = time.time()
-        if time.time() - self.t_start < self.show_prev_out_thresh:
-            segments = self.prepare_segments()
-
-        # add a blank if there is no speech for 3 seconds
-        if len(self.text) and self.text[-1] != '':
-            if time.time() - self.t_start > self.add_pause_thresh:
-                self.text.append('')
-        return segments
-
     def disconnect(self):
         """
         Notify the client of disconnection and send a disconnect message.
diff --git a/whisper_live/backend/faster_whisper_backend.py b/whisper_live/backend/faster_whisper_backend.py
@@ -175,33 +175,6 @@ def transcribe_audio(self, input_sample):
             self.set_language(info)
         return result
 
-    def get_previous_output(self):
-        """
-        Retrieves previously generated transcription outputs if no new transcription is available
-        from the current audio chunks.
-
-        Checks the time since the last transcription output and, if it is within a specified
-        threshold, returns the most recent segments of transcribed text. It also manages
-        adding a pause (blank segment) to indicate a significant gap in speech based on a defined
-        threshold.
-
-        Returns:
-            segments (list): A list of transcription segments. This may include the most recent
-                            transcribed text segments or a blank segment to indicate a pause
-                            in speech.
-        """
-        segments = []
-        if self.t_start is None:
-            self.t_start = time.time()
-        if time.time() - self.t_start < self.show_prev_out_thresh:
-            segments = self.prepare_segments()
-
-        # add a blank if there is no speech for 3 seconds
-        if len(self.text) and self.text[-1] != '':
-            if time.time() - self.t_start > self.add_pause_thresh:
-                self.text.append('')
-        return segments
-
     def handle_transcription_output(self, result, duration):
         """
         Handle the transcription output, updating the transcript and sending data to the client.
@@ -215,9 +188,6 @@ def handle_transcription_output(self, result, duration):
             self.t_start = None
             last_segment = self.update_segments(result, duration)
             segments = self.prepare_segments(last_segment)
-        else:
-            # show previous output if there is pause i.e. no output from whisper
-            segments = self.get_previous_output()
 
         if len(segments):
             self.send_transcription_to_client(segments)
diff --git a/whisper_live/backend/openvino_backend.py b/whisper_live/backend/openvino_backend.py
@@ -118,9 +118,6 @@ def handle_transcription_output(self, result, duration):
             self.t_start = None
             last_segment = self.update_segments(result, duration)
             segments = self.prepare_segments(last_segment)
-        else:
-            # show previous output if there is pause i.e. no output from whisper
-            segments = self.get_previous_output()
 
         if len(segments):
             self.send_transcription_to_client(segments)
diff --git a/whisper_live/backend/trt_backend.py b/whisper_live/backend/trt_backend.py
@@ -149,9 +149,7 @@ def speech_to_text(self):
 
         If the client's language is not detected, it waits for 30 seconds of audio input to make a language prediction.
         It utilizes the Whisper ASR model to transcribe the audio, continuously processing and streaming results. Segments
-        are sent to the client in real-time, and a history of segments is maintained to provide context.Pauses in speech
-        (no output from Whisper) are handled by showing the previous output for a set duration. A blank segment is added if
-        there is no speech for a specified duration to indicate a pause.
+        are sent to the client in real-time, and a history of segments is maintained to provide context.
 
         Raises:
             Exception: If there is an issue with audio processing or WebSocket communication.