@@ -20,19 +20,13 @@ def __init__(self, client_uid, websocket):
20
20
self .text = []
21
21
self .current_out = ''
22
22
self .prev_out = ''
23
- self .t_start = None
24
23
self .exit = False
25
24
self .same_output_count = 0
26
- self .show_prev_out_thresh = 5 # if pause(no output from whisper) show previous output for 5 seconds
27
- self .add_pause_thresh = 3 # add a blank to segment list as a pause(no speech) for 3 seconds
28
25
self .transcript = []
29
26
self .send_last_n_segments = 10
30
27
self .no_speech_thresh = 0.45
31
28
self .clip_audio = False
32
29
33
- # text formatting
34
- self .pick_previous_segments = 2
35
-
36
30
# threading
37
31
self .lock = threading .Lock ()
38
32
@@ -45,9 +39,7 @@ def speech_to_text(self):
45
39
46
40
If the client's language is not detected, it waits for 30 seconds of audio input to make a language prediction.
47
41
It utilizes the Whisper ASR model to transcribe the audio, continuously processing and streaming results. Segments
48
- are sent to the client in real-time, and a history of segments is maintained to provide context.Pauses in speech
49
- (no output from Whisper) are handled by showing the previous output for a set duration. A blank segment is added if
50
- there is no speech for a specified duration to indicate a pause.
42
+ are sent to the client in real-time, and a history of segments is maintained to provide context.
51
43
52
44
Raises:
53
45
Exception: If there is an issue with audio processing or WebSocket communication.
@@ -85,7 +77,7 @@ def speech_to_text(self):
85
77
def transcribe_audio (self ):
86
78
raise NotImplementedError
87
79
88
- def handle_transcription_output (self ):
80
+ def handle_transcription_output (self , result , duration ):
89
81
raise NotImplementedError
90
82
91
83
def format_segment (self , start , end , text , completed = False ):
@@ -228,33 +220,6 @@ def send_transcription_to_client(self, segments):
228
220
except Exception as e :
229
221
logging .error (f"[ERROR]: Sending data to client: { e } " )
230
222
231
- def get_previous_output (self ):
232
- """
233
- Retrieves previously generated transcription outputs if no new transcription is available
234
- from the current audio chunks.
235
-
236
- Checks the time since the last transcription output and, if it is within a specified
237
- threshold, returns the most recent segments of transcribed text. It also manages
238
- adding a pause (blank segment) to indicate a significant gap in speech based on a defined
239
- threshold.
240
-
241
- Returns:
242
- segments (list): A list of transcription segments. This may include the most recent
243
- transcribed text segments or a blank segment to indicate a pause
244
- in speech.
245
- """
246
- segments = []
247
- if self .t_start is None :
248
- self .t_start = time .time ()
249
- if time .time () - self .t_start < self .show_prev_out_thresh :
250
- segments = self .prepare_segments ()
251
-
252
- # add a blank if there is no speech for 3 seconds
253
- if len (self .text ) and self .text [- 1 ] != '' :
254
- if time .time () - self .t_start > self .add_pause_thresh :
255
- self .text .append ('' )
256
- return segments
257
-
258
223
def disconnect (self ):
259
224
"""
260
225
Notify the client of disconnection and send a disconnect message.
0 commit comments