Commit d9cb4ff

Merge pull request #359 from makaveli10/remove_blank_segment
Remove blank segment feature
2 parents 617f587 + 9b364f2

File tree

4 files changed (+3 -73 lines)

whisper_live/backend/base.py (+2 -37)
@@ -20,19 +20,13 @@ def __init__(self, client_uid, websocket):
         self.text = []
         self.current_out = ''
         self.prev_out = ''
-        self.t_start = None
         self.exit = False
         self.same_output_count = 0
-        self.show_prev_out_thresh = 5  # if pause(no output from whisper) show previous output for 5 seconds
-        self.add_pause_thresh = 3  # add a blank to segment list as a pause(no speech) for 3 seconds
         self.transcript = []
         self.send_last_n_segments = 10
         self.no_speech_thresh = 0.45
         self.clip_audio = False

-        # text formatting
-        self.pick_previous_segments = 2
-
         # threading
         self.lock = threading.Lock()

@@ -45,9 +39,7 @@ def speech_to_text(self):

         If the client's language is not detected, it waits for 30 seconds of audio input to make a language prediction.
         It utilizes the Whisper ASR model to transcribe the audio, continuously processing and streaming results. Segments
-        are sent to the client in real-time, and a history of segments is maintained to provide context.Pauses in speech
-        (no output from Whisper) are handled by showing the previous output for a set duration. A blank segment is added if
-        there is no speech for a specified duration to indicate a pause.
+        are sent to the client in real-time, and a history of segments is maintained to provide context.

         Raises:
             Exception: If there is an issue with audio processing or WebSocket communication.
@@ -85,7 +77,7 @@ def speech_to_text(self):
     def transcribe_audio(self):
         raise NotImplementedError

-    def handle_transcription_output(self):
+    def handle_transcription_output(self, result, duration):
         raise NotImplementedError

     def format_segment(self, start, end, text, completed=False):
@@ -228,33 +220,6 @@ def send_transcription_to_client(self, segments):
         except Exception as e:
             logging.error(f"[ERROR]: Sending data to client: {e}")

-    def get_previous_output(self):
-        """
-        Retrieves previously generated transcription outputs if no new transcription is available
-        from the current audio chunks.
-
-        Checks the time since the last transcription output and, if it is within a specified
-        threshold, returns the most recent segments of transcribed text. It also manages
-        adding a pause (blank segment) to indicate a significant gap in speech based on a defined
-        threshold.
-
-        Returns:
-            segments (list): A list of transcription segments. This may include the most recent
-                             transcribed text segments or a blank segment to indicate a pause
-                             in speech.
-        """
-        segments = []
-        if self.t_start is None:
-            self.t_start = time.time()
-        if time.time() - self.t_start < self.show_prev_out_thresh:
-            segments = self.prepare_segments()
-
-        # add a blank if there is no speech for 3 seconds
-        if len(self.text) and self.text[-1] != '':
-            if time.time() - self.t_start > self.add_pause_thresh:
-                self.text.append('')
-        return segments
-
     def disconnect(self):
         """
         Notify the client of disconnection and send a disconnect message.
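Taken together, the base.py hunks narrow the backend contract: the pause bookkeeping (t_start, show_prev_out_thresh, add_pause_thresh, pick_previous_segments) leaves __init__, and handle_transcription_output is now declared abstract with (result, duration) arguments. A minimal sketch of a backend satisfying the new contract, mirroring the control flow left in faster_whisper_backend.py below; the base class name ServeClientBase is assumed, and only helpers visible in this diff are used:

# Hypothetical backend sketch; not part of this commit.
class CustomBackend(ServeClientBase):  # base class name assumed
    def transcribe_audio(self, input_sample):
        # Run the ASR model on the raw audio and return its segments.
        ...

    def handle_transcription_output(self, result, duration):
        # With get_previous_output() removed, a pause (empty result)
        # now sends nothing rather than replaying old segments or
        # appending a blank pause marker.
        segments = []
        if result:
            last_segment = self.update_segments(result, duration)
            segments = self.prepare_segments(last_segment)
        if len(segments):
            self.send_transcription_to_client(segments)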

whisper_live/backend/faster_whisper_backend.py (-30)
@@ -175,33 +175,6 @@ def transcribe_audio(self, input_sample):
             self.set_language(info)
         return result

-    def get_previous_output(self):
-        """
-        Retrieves previously generated transcription outputs if no new transcription is available
-        from the current audio chunks.
-
-        Checks the time since the last transcription output and, if it is within a specified
-        threshold, returns the most recent segments of transcribed text. It also manages
-        adding a pause (blank segment) to indicate a significant gap in speech based on a defined
-        threshold.
-
-        Returns:
-            segments (list): A list of transcription segments. This may include the most recent
-                             transcribed text segments or a blank segment to indicate a pause
-                             in speech.
-        """
-        segments = []
-        if self.t_start is None:
-            self.t_start = time.time()
-        if time.time() - self.t_start < self.show_prev_out_thresh:
-            segments = self.prepare_segments()
-
-        # add a blank if there is no speech for 3 seconds
-        if len(self.text) and self.text[-1] != '':
-            if time.time() - self.t_start > self.add_pause_thresh:
-                self.text.append('')
-        return segments
-
     def handle_transcription_output(self, result, duration):
         """
         Handle the transcription output, updating the transcript and sending data to the client.
@@ -215,9 +188,6 @@ def handle_transcription_output(self, result, duration):
             self.t_start = None
             last_segment = self.update_segments(result, duration)
             segments = self.prepare_segments(last_segment)
-        else:
-            # show previous output if there is pause i.e. no output from whisper
-            segments = self.get_previous_output()

         if len(segments):
             self.send_transcription_to_client(segments)

whisper_live/backend/openvino_backend.py (-3)
@@ -118,9 +118,6 @@ def handle_transcription_output(self, result, duration):
             self.t_start = None
             last_segment = self.update_segments(result, duration)
             segments = self.prepare_segments(last_segment)
-        else:
-            # show previous output if there is pause i.e. no output from whisper
-            segments = self.get_previous_output()

         if len(segments):
             self.send_transcription_to_client(segments)

whisper_live/backend/trt_backend.py (+1 -3)
@@ -149,9 +149,7 @@ def speech_to_text(self):

         If the client's language is not detected, it waits for 30 seconds of audio input to make a language prediction.
         It utilizes the Whisper ASR model to transcribe the audio, continuously processing and streaming results. Segments
-        are sent to the client in real-time, and a history of segments is maintained to provide context.Pauses in speech
-        (no output from Whisper) are handled by showing the previous output for a set duration. A blank segment is added if
-        there is no speech for a specified duration to indicate a pause.
+        are sent to the client in real-time, and a history of segments is maintained to provide context.

         Raises:
             Exception: If there is an issue with audio processing or WebSocket communication.
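After this change, the removed thresholds (replay previous output for 5 seconds, append a blank segment after 3 seconds of silence) have no server-side equivalent, so a pause simply produces no new messages. A client that depended on the blank segments to detect pauses could approximate the old behavior on the receiving side; a hypothetical sketch, where only the 3-second threshold comes from the deleted add_pause_thresh and everything else is illustrative:

import time

class PauseDetector:
    """Client-side stand-in for the removed blank-segment pause marker."""

    def __init__(self, pause_thresh=3.0):
        self.pause_thresh = pause_thresh  # seconds, mirrors the old add_pause_thresh
        self.last_segment_time = None

    def on_segments(self, segments):
        # Call this whenever a transcription message arrives from the server.
        if segments:
            self.last_segment_time = time.time()

    def in_pause(self):
        # True once no segments have arrived for pause_thresh seconds.
        return (self.last_segment_time is not None
                and time.time() - self.last_segment_time > self.pause_thresh)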
