diff --git a/TTS/api.py b/TTS/api.py index c8600dcd38..fdf97d107f 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -440,7 +440,7 @@ def voice_conversion_to_file( save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) return file_path - def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None): + def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None, speaker: str = None): """Convert text to speech with voice conversion. It combines tts with voice conversion to fake voice cloning. @@ -457,17 +457,25 @@ def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None): speaker_wav (str, optional): Path to a reference wav file to use for voice cloning with supporting models like YourTTS. Defaults to None. + speaker (str, optional): + Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by + `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. """ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: # Lazy code... save it to a temp file to resample it while reading it for VC - self.tts_to_file(text=text, speaker=None, language=language, file_path=fp.name, speaker_wav=speaker_wav) + self.tts_to_file(text=text, speaker=speaker, language=language, file_path=fp.name) if self.voice_converter is None: self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24") wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav) return wav def tts_with_vc_to_file( - self, text: str, language: str = None, speaker_wav: str = None, file_path: str = "output.wav" + self, + text: str, + language: str = None, + speaker_wav: str = None, + file_path: str = "output.wav", + speaker: str = None, ): """Convert text to speech with voice conversion and save to file. @@ -484,6 +492,9 @@ def tts_with_vc_to_file( Defaults to None. file_path (str, optional): Output file path. Defaults to "output.wav". + speaker (str, optional): + Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by + `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. """ - wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav) + wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav, speaker=speaker) save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 8efe608bac..0d0eb78a42 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -358,7 +358,11 @@ def tts( ) # compute a new d_vector from the given clip. - if speaker_wav is not None and self.tts_model.speaker_manager is not None: + if ( + speaker_wav is not None + and self.tts_model.speaker_manager is not None + and self.tts_model.speaker_manager.encoder_ap is not None + ): speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav) vocoder_device = "cpu"