From 8c5227ed8489ba1ae528371a6df46de77a144333 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Fri, 24 Nov 2023 12:26:37 +0100 Subject: [PATCH] Fix tts_with_vc (#3275) * Revert "fix for issue 3067" This reverts commit 041b4b6723a1c07a540059c5d2854a8698579de4. Fixes #3143. The original issue (#3067) was people trying to use tts.tts_with_vc_to_file() with XTTS and was "fixed" in #3109. But XTTS has integrated VC and you can just do tts.tts_to_file(..., speaker_wav="..."), there is no point in passing it through FreeVC afterwards. So, reverting this commit because it breaks tts.tts_with_vc_to_file() for any model that doesn't have integrated VC, i.e. all models this method is meant for. * fix: support multi-speaker models in tts_with_vc/tts_with_vc_to_file * fix: only compute spk embeddings for models that support it Fixes #1440. Passing a `speaker_wav` argument to regular Vits models failed because they don't support voice cloning. Now that argument is simply ignored. --- TTS/api.py | 19 +++++++++++++++---- TTS/utils/synthesizer.py | 6 +++++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index c8600dcd38..fdf97d107f 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -440,7 +440,7 @@ def voice_conversion_to_file( save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) return file_path - def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None): + def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None, speaker: str = None): """Convert text to speech with voice conversion. It combines tts with voice conversion to fake voice cloning. @@ -457,17 +457,25 @@ def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None): speaker_wav (str, optional): Path to a reference wav file to use for voice cloning with supporting models like YourTTS. Defaults to None. + speaker (str, optional): + Speaker name for multi-speaker.
You can check whether loaded model is multi-speaker by + `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. """ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: # Lazy code... save it to a temp file to resample it while reading it for VC - self.tts_to_file(text=text, speaker=None, language=language, file_path=fp.name, speaker_wav=speaker_wav) + self.tts_to_file(text=text, speaker=speaker, language=language, file_path=fp.name) if self.voice_converter is None: self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24") wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav) return wav def tts_with_vc_to_file( - self, text: str, language: str = None, speaker_wav: str = None, file_path: str = "output.wav" + self, + text: str, + language: str = None, + speaker_wav: str = None, + file_path: str = "output.wav", + speaker: str = None, ): """Convert text to speech with voice conversion and save to file. @@ -484,6 +492,9 @@ def tts_with_vc_to_file( Defaults to None. file_path (str, optional): Output file path. Defaults to "output.wav". + speaker (str, optional): + Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by + `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. """ - wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav) + wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav, speaker=speaker) save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 8efe608bac..0d0eb78a42 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -358,7 +358,11 @@ def tts( ) # compute a new d_vector from the given clip.
- if speaker_wav is not None and self.tts_model.speaker_manager is not None: + if ( + speaker_wav is not None + and self.tts_model.speaker_manager is not None + and self.tts_model.speaker_manager.encoder_ap is not None + ): speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav) vocoder_device = "cpu"