From 5cd750ac7ede641415051c16f1f9c8111caf13de Mon Sep 17 00:00:00 2001
From: WeberJulian
Date: Mon, 11 Dec 2023 20:21:53 +0100
Subject: [PATCH] Fix API and CI

---
 TTS/tts/layers/xtts/xtts_manager.py | 16 ++++++++++++++++
 TTS/tts/models/xtts.py              |  6 +++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/TTS/tts/layers/xtts/xtts_manager.py b/TTS/tts/layers/xtts/xtts_manager.py
index 2de5ff14c7..3e7d0f6c91 100644
--- a/TTS/tts/layers/xtts/xtts_manager.py
+++ b/TTS/tts/layers/xtts/xtts_manager.py
@@ -8,6 +8,14 @@ def __init__(self, speaker_file_path=None):
     def name_to_id(self):
         return self.speakers.keys()
 
+    @property
+    def num_speakers(self):
+        return len(self.name_to_id)
+
+    @property
+    def speaker_names(self):
+        return list(self.name_to_id.keys())
+
 
 class LanguageManager():
     def __init__(self, config):
@@ -16,3 +24,11 @@ def __init__(self, config):
     @property
     def name_to_id(self):
         return self.langs
+
+    @property
+    def num_languages(self):
+        return len(self.name_to_id)
+
+    @property
+    def language_names(self):
+        return list(self.name_to_id)
diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py
index 12ed774269..83812f377f 100644
--- a/TTS/tts/models/xtts.py
+++ b/TTS/tts/models/xtts.py
@@ -379,7 +379,7 @@ def get_conditioning_latents(
 
         return gpt_cond_latents, speaker_embedding
 
-    def synthesize(self, text, config, speaker_wav, language, speaker_id, **kwargs):
+    def synthesize(self, text, config, speaker_wav, language, speaker_id=None, **kwargs):
         """Synthesize speech with the given input text.
 
         Args:
@@ -520,6 +520,8 @@ def inference(
     ):
         language = language.split("-")[0]  # remove the country code
        length_scale = 1.0 / max(speed, 0.05)
+        gpt_cond_latent = gpt_cond_latent.to(self.device)
+        speaker_embedding = speaker_embedding.to(self.device)
         if enable_text_splitting:
             text = split_sentence(text, language, self.tokenizer.char_limits[language])
         else:
@@ -628,6 +630,8 @@ def inference_stream(
     ):
         language = language.split("-")[0]  # remove the country code
         length_scale = 1.0 / max(speed, 0.05)
+        gpt_cond_latent = gpt_cond_latent.to(self.device)
+        speaker_embedding = speaker_embedding.to(self.device)
         if enable_text_splitting:
             text = split_sentence(text, language, self.tokenizer.char_limits[language])
         else: