coqui-ai · erogol · Jun 5, 2023 · Jun 5, 2023
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
@@ -131,9 +131,8 @@ def wav_to_spec(y, n_fft, hop_length, win_length, center=False):
         pad_mode="reflect",
         normalized=False,
         onesided=True,
-        return_complex=False,
     )
-
+    spec = torch.view_as_real(spec)
     spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
     return spec
 
@@ -199,7 +198,6 @@ def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fm
         pad_mode="reflect",
         normalized=False,
         onesided=True,
-        return_complex=False,
     )
 
     spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

diff --git a/TTS/utils/audio/torch_transforms.py b/TTS/utils/audio/torch_transforms.py
@@ -129,8 +129,8 @@ def __call__(self, x):
             pad_mode="reflect",  # compatible with audio.py
             normalized=self.normalized,
             onesided=True,
-            return_complex=False,
         )
+        o = torch.view_as_real(o)
         M = o[:, :, :, 0]
         P = o[:, :, :, 1]
         S = torch.sqrt(torch.clamp(M**2 + P**2, min=1e-8))

diff --git a/TTS/vc/modules/freevc/mel_processing.py b/TTS/vc/modules/freevc/mel_processing.py
@@ -64,9 +64,8 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False)
         pad_mode="reflect",
         normalized=False,
         onesided=True,
-        return_complex=False,
     )
-
+    spec = torch.view_as_real(spec)
     spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
     return spec
 
@@ -114,9 +113,8 @@ def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size,
         pad_mode="reflect",
         normalized=False,
         onesided=True,
-        return_complex=False,
     )
-
+    spec = torch.view_as_real(spec)
     spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
 
     spec = torch.matmul(mel_basis[fmax_dtype_device], spec)

diff --git a/docs/source/models/tortoise.md b/docs/source/models/tortoise.md
@@ -1,7 +1,7 @@
 # Tortoise 🐢
 Tortoise is a very expressive TTS system with impressive voice cloning capabilities. It is based on a GPT-like autoregressive acoustic model that converts input
 text to discretized acoustic tokens, a diffusion model that converts these tokens to mel-spectrogram frames and a Univnet vocoder to convert the spectrograms to
-the final audio signal. The important downside is that Tortoise is very slow compared to the parallel TTS models like VITS. 
+the final audio signal. The important downside is that Tortoise is very slow compared to the parallel TTS models like VITS.
 
 Big thanks to 👑[@manmay-nakhashi](https://github.com/manmay-nakhashi) who helped us implement Tortoise in 🐸TTS.
 
@@ -12,7 +12,7 @@ from TTS.tts.configs.tortoise_config import TortoiseConfig
 from TTS.tts.models.tortoise import Tortoise
 
 config = TortoiseConfig()
-model = Tortoise.inif_from_config(config)
+model = Tortoise.init_from_config(config)
 model.load_checkpoint(config, checkpoint_dir="paths/to/models_dir/", eval=True)
 
 # with random speaker
@@ -29,23 +29,23 @@ from TTS.api import TTS
 tts = TTS("tts_models/en/multi-dataset/tortoise-v2")
 
 # cloning `lj` voice from `TTS/tts/utils/assets/tortoise/voices/lj`
-# with custom inference settings overriding defaults. 
-tts.tts_to_file(text="Hello, my name is Manmay , how are you?", 
+# with custom inference settings overriding defaults.
+tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
                 file_path="output.wav",
                 voice_dir="TTS/tts/utils/assets/tortoise/voices/",
                 speaker="lj",
                 num_autoregressive_samples=1,
                 diffusion_iterations=10)
 
 # Using presets with the same voice
-tts.tts_to_file(text="Hello, my name is Manmay , how are you?", 
+tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
                 file_path="output.wav",
                 voice_dir="TTS/tts/utils/assets/tortoise/voices/",
                 speaker="lj",
                 preset="ultra_fast")
 
 # Random voice generation
-tts.tts_to_file(text="Hello, my name is Manmay , how are you?", 
+tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
                 file_path="output.wav")
 ```
 
@@ -54,7 +54,7 @@ Using 🐸TTS Command line:
 ```console
 # cloning the `lj` voice
 tts --model_name  tts_models/en/multi-dataset/tortoise-v2 \
---text "This is an example." \ 
+--text "This is an example." \
 --out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \
 --voice_dir TTS/tts/utils/assets/tortoise/voices/ \
 --speaker_idx "lj" \