Closed
Description
We encountered an issue where the audio resampler adds noise to the audio. Here is the code to reproduce the issue:
import numpy as np
from livekit import rtc
from numpy.typing import NDArray
from scipy import signal
from scipy.io import wavfile
# Target sample rate (Hz) shared by both resampling paths in this script.
output_rate = 16000
# wavfile.read returns (sample_rate, samples); the file's own rate is used as the input rate.
input_rate, data = wavfile.read("normal.wav")
# Log the shape, dtype and byte order of the decoded samples for the bug report.
print(
f"data shape: {data.shape}, data dtype: {data.dtype}, data endianness: {data.dtype.byteorder}"
)
# Module-level resampler used by resample_frames(): single channel,
# input_rate -> output_rate, VERY_HIGH quality setting.
resampler = rtc.AudioResampler(
input_rate=input_rate,
output_rate=output_rate,
num_channels=1,
quality=rtc.AudioResamplerQuality.VERY_HIGH,
)
def array2frames(data: np.ndarray) -> list[rtc.AudioFrame]:
    """Split ``data`` into consecutive single-channel ``rtc.AudioFrame`` objects.

    Each frame holds up to ``input_rate`` samples (the module-level rate read
    from the WAV file); the last frame may be shorter. The raw bytes of each
    chunk are passed to the frame unchanged.

    NOTE(review): assumes ``data`` is 1-D int16 audio -- confirm against the
    WAV file being loaded.
    """
    chunk_len = input_rate
    # Slice each chunk once and reuse it for both the payload and its length.
    chunks = (
        data[start : start + chunk_len]
        for start in range(0, len(data), chunk_len)
    )
    return [
        rtc.AudioFrame(
            data=chunk.reshape(-1, 1).tobytes(),
            sample_rate=input_rate,
            num_channels=1,
            samples_per_channel=len(chunk),
        )
        for chunk in chunks
    ]
def resample_frames(frames: list[rtc.AudioFrame]) -> list[rtc.AudioFrame]:
    """Run ``frames`` through the module-level ``resampler`` and collect the output.

    Every input frame is pushed in order; once all frames have been pushed,
    the resampler is flushed and whatever it returns is appended as well.
    """
    collected = []
    for source_frame in frames:
        collected.extend(resampler.push(source_frame))
    # Flush only after the last push so any remaining output ends the list.
    collected.extend(resampler.flush())
    return collected
def frames2array(frames: list[rtc.AudioFrame]) -> np.ndarray:
    """Concatenate the payloads of ``frames`` into one flat int16 array.

    Each ``frame.data`` buffer is interpreted as int16 samples, flattened,
    and joined in the order the frames are given.
    """
    pieces = []
    for frame in frames:
        samples = np.frombuffer(frame.data, dtype=np.int16)
        pieces.append(samples.reshape(-1))
    return np.concatenate(pieces)
# This is fine: the original samples are split into AudioFrame objects and
# joined back without resampling, then written at the input rate.
frames = array2frames(data)
wavfile.write("output_unchanged.wav", input_rate, frames2array(frames))
# This is not fine: the same frames pushed through rtc.AudioResampler give
# the noisy output reported in this issue.
resampled_frames = resample_frames(frames)
resampled_array = frames2array(resampled_frames)
# Log the shape, dtype and byte order of the resampled samples for comparison with the input.
print(
f"resampled array shape: {resampled_array.shape}, resampled array dtype: {resampled_array.dtype}, resampled array endianness: {resampled_array.dtype.byteorder}"
)
# Written with output_rate, the rate the resampler was configured to produce.
wavfile.write(
"output_resampled.wav",
output_rate,
resampled_array,
)
# custom resampling works fine
def resample(
    data: NDArray,
    current_sample_rate: int,
    target_sample_rate: int,
) -> NDArray:
    """
    Resample the audio data to the target sample rate.

    The output keeps the dtype of ``data``. For integer input the resampled
    values are rounded to the nearest integer and clipped to the dtype's
    range before the cast, so overshoot becomes saturation instead of
    wrapping around to the opposite sign.

    Parameters
    ----------
    data : NDArray
        The audio data to resample. Resampling is done along axis 0.
    current_sample_rate : int
        The current sample rate of the audio data.
    target_sample_rate : int
        The target sample rate to resample the audio data to.

    Returns
    -------
    NDArray
        The resampled audio data, with the same dtype as ``data``.
    """
    n_samples = round(
        data.shape[0] * float(target_sample_rate) / current_sample_rate
    )
    resampled = signal.resample(data, n_samples, axis=0)
    if np.issubdtype(data.dtype, np.integer):
        # The floating-point result can ring past the integer range near
        # sharp edges; a bare astype() would wrap those samples around and
        # truncate the rest toward zero.
        limits = np.iinfo(data.dtype)
        resampled = np.clip(np.rint(resampled), limits.min, limits.max)
    return resampled.astype(data.dtype)
# Reference path: resample the same samples with SciPy and write the result
# so it can be compared with the livekit resampler's output.
resampled_data = resample(data, input_rate, output_rate)
wavfile.write("output_resampled_scipy.wav", output_rate, resampled_data)
Warning
The noise can be very loud. Be sure to turn down your volume before opening the files.
Here are the files:
- normal.wav
- normal_unchanged.wav
- output_resampled.wav (this is the problematic one)
- output_resampled_scipy.wav
I have tried other audio files with the same attributes (int16, 24 kHz) but different voices, and they all worked fine with the same code. The original audio was generated with Cartesia. Any ideas?
Metadata
Metadata
Assignees
Labels
No labels