Skip to content

Commit 11b8876

Browse files
Fix Docker failing to start due to missing PortAudio, and adjust the code to stabilize latency to prevent audio instability when using a VAD
1 parent 63049a7 commit 11b8876

3 files changed

Lines changed: 25 additions & 19 deletions

File tree

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ WORKDIR /app
1010
# Install system dependencies, then clean up the apt cache to keep the image size small
1111
RUN apt update && \
1212
apt install -y -qq ffmpeg && \
13+
apt install -y -qq libportaudio2 && \
1314
apt clean && rm -rf /var/lib/apt/lists/*
1415

1516
# Copy application files into the container

rvc/realtime/core.py

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def __init__(
8686
# noise reduce
8787
self.reduced_noise = (
8888
TorchGate(
89-
AUDIO_SAMPLE_RATE,
89+
self.pipeline.tgt_sr,
9090
prop_decrease=clean_strength,
9191
).to(self.device)
9292
if clean_audio
@@ -244,7 +244,7 @@ def inference(
244244
# Busy wait to keep power manager happy and clocks stable. Running pipeline on-demand seems to lag when the delay between
245245
# voice changer activations is too high.
246246
# https://forums.developer.nvidia.com/t/why-kernel-calculate-speed-got-slower-after-waiting-for-a-while/221059/9
247-
self.pipeline.voice_conversion(
247+
audio_model = self.pipeline.voice_conversion(
248248
self.convert_buffer,
249249
self.pitch_buffer,
250250
self.pitchf_buffer,
@@ -260,14 +260,14 @@ def inference(
260260
f0_autotune_strength,
261261
proposed_pitch,
262262
proposed_pitch_threshold,
263+
self.reduced_noise,
264+
self.board,
263265
)
264-
return None, vol
266+
267+
return torch.zeros(audio_model.shape, dtype=self.dtype, device=self.device), vol
265268

266269
if vol < self.input_sensitivity:
267-
# Busy wait to keep power manager happy and clocks stable. Running pipeline on-demand seems to lag when the delay between
268-
# voice changer activation is too high.
269-
# https://forums.developer.nvidia.com/t/why-kernel-calculate-speed-got-slower-after-waiting-for-a-while/221059/9
270-
self.pipeline.voice_conversion(
270+
audio_model = self.pipeline.voice_conversion(
271271
self.convert_buffer,
272272
self.pitch_buffer,
273273
self.pitchf_buffer,
@@ -283,9 +283,11 @@ def inference(
283283
f0_autotune_strength,
284284
proposed_pitch,
285285
proposed_pitch_threshold,
286+
self.reduced_noise,
287+
self.board,
286288
)
287289

288-
return None, vol
290+
return torch.zeros(audio_model.shape, dtype=self.dtype, device=self.device), vol
289291

290292
circular_write(audio_input_16k, self.convert_buffer)
291293

@@ -305,18 +307,11 @@ def inference(
305307
f0_autotune_strength,
306308
proposed_pitch,
307309
proposed_pitch_threshold,
310+
self.reduced_noise,
311+
self.board,
308312
)
309313

310314
audio_out: torch.Tensor = self.resample_out(audio_model * torch.sqrt(vol_t))
311-
312-
if self.reduced_noise is not None:
313-
audio_out = self.reduced_noise(audio_out.unsqueeze(0)).squeeze(0)
314-
if self.board is not None:
315-
audio_out = torch.as_tensor(
316-
self.board(audio_out.cpu().numpy(), AUDIO_SAMPLE_RATE),
317-
device=self.device,
318-
)
319-
320315
return audio_out, vol
321316

322317
def __del__(self):
@@ -424,9 +419,9 @@ def process_audio(
424419
proposed_pitch_threshold,
425420
)
426421

427-
if audio is None:
422+
# if audio is None:
428423
# In case there's an actual silence - send full block with zeros
429-
return np.zeros(block_size, dtype=np.float32), vol
424+
# return np.zeros(block_size, dtype=np.float32), vol
430425

431426
conv_input = audio[None, None, : self.crossfade_frame + self.sola_search_frame]
432427
cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :])

rvc/realtime/pipeline.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,8 @@ def voice_conversion(
228228
f0_autotune_strength: float = 1,
229229
proposed_pitch: bool = False,
230230
proposed_pitch_threshold: float = 155.0,
231+
reduced_noise = None,
232+
board = None,
231233
):
232234
"""
233235
Performs realtime voice conversion on a given audio segment.
@@ -321,6 +323,14 @@ def voice_conversion(
321323
out_audio[: return_length * scaled_window]
322324
)
323325

326+
if reduced_noise is not None:
327+
out_audio = reduced_noise(out_audio.unsqueeze(0)).squeeze(0)
328+
if board is not None:
329+
out_audio = torch.as_tensor(
330+
board(out_audio.cpu().numpy(), self.tgt_sr),
331+
device=self.device,
332+
)
333+
324334
return out_audio
325335

326336
def _retrieve_speaker_embeddings(

0 commit comments

Comments
 (0)