Skip to content

Examples and Dependencies

Dimitrii Voronin edited this page Jun 27, 2024 · 25 revisions

Dependencies

Open In Colab

We are keeping the colab examples up-to-date, but you can manually manage your dependencies:

  • pytorch >= 1.12.0
  • torchaudio >= 0.9.0 (used only for examples, IO and resampling, can be omitted in production)

The provided JIT-models can be run with other torch backends as well.

Examples

Open In Colab

Imports
#@title Install and Import Dependencies

# this assumes that you have a relevant version of PyTorch installed
!pip install -q torchaudio

SAMPLING_RATE = 16000

import torch
torch.set_num_threads(1)

from IPython.display import Audio
from pprint import pprint
# download example
torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')

USE_ONNX = False # change this to True if you want to test onnx model
if USE_ONNX:
    !pip install -q onnxruntime
  
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True,
                              onnx=USE_ONNX)

(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = utils
Speech timestapms from full audio
wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)
# get speech timestamps from full audio file
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)
pprint(speech_timestamps)
# merge all speech chunks to one audio
save_audio('only_speech.wav',
           collect_chunks(speech_timestamps, wav), sampling_rate=SAMPLING_RATE) 
Audio('only_speech.wav')
Entire audio inference
wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)
# audio is being splitted into 31.25 ms long pieces
# so output length equals ceil(input_length * 31.25 / SAMPLING_RATE)
predicts = model.audio_forward(wav, sr=SAMPLING_RATE)
Stream imitation example
## using VADIterator class

vad_iterator = VADIterator(model, sampling_rate=SAMPLING_RATE)
wav = read_audio(f'en_example.wav', sampling_rate=SAMPLING_RATE)

window_size_samples = 512 if SAMPLING_RATE == 16000 else 256
for i in range(0, len(wav), window_size_samples):
    speech_dict = vad_iterator(wav[i: i+ window_size_samples], return_seconds=True)
    if speech_dict:
        print(speech_dict, end=' ')
vad_iterator.reset_states() # reset model states after each audio
## just probabilities

wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)
speech_probs = []
window_size_samples = 512 if SAMPLING_RATE == 16000 else 256
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i+window_size_samples]
    if len(chunk) < window_size_samples:
        break
    speech_prob = model(chunk, SAMPLING_RATE).item()
    speech_probs.append(speech_prob)
model.reset_states() # reset model states after each audio

print(speech_probs[:10]) # first 10 chunks predicts
Clone this wiki locally