From 66a1e248d03f2cd3cc2ae27b46ebcc9add91a223 Mon Sep 17 00:00:00 2001
From: Gorkem
Date: Thu, 9 Nov 2023 18:28:39 +0300
Subject: [PATCH 1/5] torchaudio should use proper backend to load audio
 (#3179)

---
 TTS/tts/models/xtts.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py
index 7cc9836a67..656a80bc99 100644
--- a/TTS/tts/models/xtts.py
+++ b/TTS/tts/models/xtts.py
@@ -69,12 +69,9 @@ def wav_to_mel_cloning(
 
 def load_audio(audiopath, sampling_rate):
     # better load setting following: https://github.com/faroit/python_audio_loading_benchmark
-    if audiopath[-4:] == ".mp3":
-        # it uses torchaudio with sox backend to load mp3
-        audio, lsr = torchaudio.backend.sox_io_backend.load(audiopath)
-    else:
-        # it uses torchaudio soundfile backend to load all the others data type
-        audio, lsr = torchaudio.backend.soundfile_backend.load(audiopath)
+
+    # torchaudio should choose the proper backend to load audio depending on the platform
+    audio, lsr = torchaudio.load(audiopath)
 
     # stereo to mono if needed
     if audio.size(0) != 1:
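Note on the change above: since torchaudio 2.x, torchaudio.load() dispatches
to whichever I/O backend is available (FFmpeg, SoX, or soundfile), so the
per-format branching is no longer needed. A minimal sketch of the resulting
load path -- the function name and the resampling step are illustrative, not
part of the patch:

    import torchaudio

    def load_audio_sketch(audiopath, sampling_rate):
        # torchaudio picks an available backend for wav/mp3/flac alike
        audio, lsr = torchaudio.load(audiopath)
        # stereo to mono, as in the patched code
        if audio.size(0) != 1:
            audio = audio.mean(dim=0, keepdim=True)
        # hypothetical resample step, added here only for illustration
        if lsr != sampling_rate:
            audio = torchaudio.functional.resample(audio, lsr, sampling_rate)
        return audio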
From 1b9c400bca7bd92d8f2bcee853c8008b7b16834d Mon Sep 17 00:00:00 2001
From: Matthew Boakes
Date: Thu, 9 Nov 2023 15:31:03 +0000
Subject: [PATCH 2/5] PyTorch 2.1 Updates (Weight Norm and TorchAudio I/O)
 (#3176)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Replaced PyTorch weight_norm With parametrizations.weight_norm

* TorchAudio: Migrating The I/O Functions To Use The Dispatcher Mechanism

* Corrected Code Style

---------

Co-authored-by: Eren Gölge
---
 TTS/tts/layers/delightful_tts/conv_layers.py  | 13 +++---
 .../layers/delightful_tts/kernel_predictor.py | 23 +++++-----
 TTS/tts/layers/generic/wavenet.py             | 13 +++---
 TTS/tts/layers/glow_tts/glow.py               |  2 +-
 TTS/tts/layers/tortoise/vocoder.py            | 42 ++++++++++---------
 TTS/tts/layers/vits/discriminator.py          |  2 +-
 TTS/tts/layers/xtts/hifigan_decoder.py        | 19 +++++----
 TTS/tts/models/xtts.py                        |  3 +-
 TTS/vc/models/freevc.py                       | 10 +++--
 TTS/vc/modules/freevc/modules.py              | 28 ++++++-------
 TTS/vc/modules/freevc/wavlm/wavlm.py          |  2 +-
 TTS/vocoder/layers/hifigan.py                 | 23 +++++-----
 TTS/vocoder/layers/melgan.py                  | 11 ++---
 TTS/vocoder/layers/wavegrad.py                | 17 ++++----
 TTS/vocoder/models/hifigan_discriminator.py   |  4 +-
 TTS/vocoder/models/hifigan_generator.py       | 19 +++++----
 TTS/vocoder/models/melgan_discriminator.py    |  2 +-
 TTS/vocoder/models/melgan_generator.py        |  4 +-
 .../models/parallel_wavegan_discriminator.py  |  9 ++--
 .../models/parallel_wavegan_generator.py      |  5 ++-
 TTS/vocoder/models/univnet_discriminator.py   |  3 +-
 TTS/vocoder/models/univnet_generator.py       |  5 ++-
 TTS/vocoder/models/wavegrad.py                | 15 +++----
 requirements.txt                              |  2 +-
 24 files changed, 147 insertions(+), 129 deletions(-)

diff --git a/TTS/tts/layers/delightful_tts/conv_layers.py b/TTS/tts/layers/delightful_tts/conv_layers.py
index 354a0336a1..fb9aa4495f 100644
--- a/TTS/tts/layers/delightful_tts/conv_layers.py
+++ b/TTS/tts/layers/delightful_tts/conv_layers.py
@@ -3,6 +3,7 @@
 import torch
 import torch.nn as nn  # pylint: disable=consider-using-from-import
 import torch.nn.functional as F
+from torch.nn.utils import parametrize
 
 from TTS.tts.layers.delightful_tts.kernel_predictor import KernelPredictor
 
@@ -73,7 +74,7 @@ def __init__(
         )
         nn.init.xavier_uniform_(self.conv.weight, gain=nn.init.calculate_gain(w_init_gain))
         if self.use_weight_norm:
-            self.conv = nn.utils.weight_norm(self.conv)
+            self.conv = nn.utils.parametrizations.weight_norm(self.conv)
 
     def forward(self, signal, mask=None):
         conv_signal = self.conv(signal)
@@ -113,7 +114,7 @@ def __init__(
             dilation=1,
             w_init_gain="relu",
         )
-        conv_layer = nn.utils.weight_norm(conv_layer.conv, name="weight")
+        conv_layer = nn.utils.parametrizations.weight_norm(conv_layer.conv, name="weight")
         convolutions.append(conv_layer)
 
         self.convolutions = nn.ModuleList(convolutions)
@@ -567,7 +568,7 @@ def __init__(  # pylint: disable=dangerous-default-value
 
         self.convt_pre = nn.Sequential(
             nn.LeakyReLU(lReLU_slope),
-            nn.utils.weight_norm(
+            nn.utils.parametrizations.weight_norm(
                 nn.ConvTranspose1d(
                     in_channels,
                     in_channels,
@@ -584,7 +585,7 @@ def __init__(  # pylint: disable=dangerous-default-value
             self.conv_blocks.append(
                 nn.Sequential(
                     nn.LeakyReLU(lReLU_slope),
-                    nn.utils.weight_norm(
+                    nn.utils.parametrizations.weight_norm(
                         nn.Conv1d(
                             in_channels,
                             in_channels,
@@ -665,6 +666,6 @@ def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=25
 
     def remove_weight_norm(self):
         self.kernel_predictor.remove_weight_norm()
-        nn.utils.remove_weight_norm(self.convt_pre[1])
+        parametrize.remove_parametrizations(self.convt_pre[1], "weight")
         for block in self.conv_blocks:
-            nn.utils.remove_weight_norm(block[1])
+            parametrize.remove_parametrizations(block[1], "weight")
diff --git a/TTS/tts/layers/delightful_tts/kernel_predictor.py b/TTS/tts/layers/delightful_tts/kernel_predictor.py
index 19dfd57e7b..96c550b6c2 100644
--- a/TTS/tts/layers/delightful_tts/kernel_predictor.py
+++ b/TTS/tts/layers/delightful_tts/kernel_predictor.py
@@ -1,4 +1,5 @@
 import torch.nn as nn  # pylint: disable=consider-using-from-import
+from torch.nn.utils import parametrize
 
 
 class KernelPredictor(nn.Module):
@@ -36,7 +37,9 @@ def __init__(  # pylint: disable=dangerous-default-value
         kpnet_bias_channels = conv_out_channels * conv_layers  # l_b
 
         self.input_conv = nn.Sequential(
-            nn.utils.weight_norm(nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)),
+            nn.utils.parametrizations.weight_norm(
+                nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)
+            ),
             getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
         )
 
@@ -46,7 +49,7 @@ def __init__(  # pylint: disable=dangerous-default-value
             self.residual_convs.append(
                 nn.Sequential(
                     nn.Dropout(kpnet_dropout),
-                    nn.utils.weight_norm(
+                    nn.utils.parametrizations.weight_norm(
                         nn.Conv1d(
                             kpnet_hidden_channels,
                             kpnet_hidden_channels,
@@ -56,7 +59,7 @@ def __init__(  # pylint: disable=dangerous-default-value
                         )
                     ),
                     getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
-                    nn.utils.weight_norm(
+                    nn.utils.parametrizations.weight_norm(
                         nn.Conv1d(
                             kpnet_hidden_channels,
                             kpnet_hidden_channels,
@@ -68,7 +71,7 @@ def __init__(  # pylint: disable=dangerous-default-value
                     getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
                 )
             )
-        self.kernel_conv = nn.utils.weight_norm(
+        self.kernel_conv = nn.utils.parametrizations.weight_norm(
             nn.Conv1d(
                 kpnet_hidden_channels,
                 kpnet_kernel_channels,
@@ -77,7 +80,7 @@ def __init__(  # pylint: disable=dangerous-default-value
                 bias=True,
             )
         )
-        self.bias_conv = nn.utils.weight_norm(
+        self.bias_conv = nn.utils.parametrizations.weight_norm(
             nn.Conv1d(
                 kpnet_hidden_channels,
                 kpnet_bias_channels,
@@ -117,9 +120,9 @@ def forward(self, c):
         return kernels, bias
 
     def remove_weight_norm(self):
-        nn.utils.remove_weight_norm(self.input_conv[0])
-        nn.utils.remove_weight_norm(self.kernel_conv)
-        nn.utils.remove_weight_norm(self.bias_conv)
+        parametrize.remove_parametrizations(self.input_conv[0], "weight")
+        parametrize.remove_parametrizations(self.kernel_conv, "weight")
+        parametrize.remove_parametrizations(self.bias_conv, "weight")
         for block in self.residual_convs:
-            nn.utils.remove_weight_norm(block[1])
-            nn.utils.remove_weight_norm(block[3])
+            parametrize.remove_parametrizations(block[1], "weight")
+            parametrize.remove_parametrizations(block[3], "weight")
diff --git a/TTS/tts/layers/generic/wavenet.py b/TTS/tts/layers/generic/wavenet.py
index bc89da4fbe..f8de63b49f 100644
--- a/TTS/tts/layers/generic/wavenet.py
+++ b/TTS/tts/layers/generic/wavenet.py
@@ -1,5 +1,6 @@
 import torch
 from torch import nn
+from torch.nn.utils import parametrize
 
 
 @torch.jit.script
@@ -62,7 +63,7 @@ def __init__(
         # init conditioning layer
         if c_in_channels > 0:
             cond_layer = torch.nn.Conv1d(c_in_channels, 2 * hidden_channels * num_layers, 1)
-            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
+            self.cond_layer = torch.nn.utils.parametrizations.weight_norm(cond_layer, name="weight")
         # intermediate layers
         for i in range(num_layers):
             dilation = dilation_rate**i
@@ -75,7 +76,7 @@ def __init__(
             in_layer = torch.nn.Conv1d(
                 hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=padding
             )
-            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
+            in_layer = torch.nn.utils.parametrizations.weight_norm(in_layer, name="weight")
             self.in_layers.append(in_layer)
 
             if i < num_layers - 1:
@@ -84,7 +85,7 @@ def __init__(
                 res_skip_channels = hidden_channels
 
             res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
-            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
+            res_skip_layer = torch.nn.utils.parametrizations.weight_norm(res_skip_layer, name="weight")
             self.res_skip_layers.append(res_skip_layer)
         # setup weight norm
         if not weight_norm:
@@ -115,11 +116,11 @@ def forward(self, x, x_mask=None, g=None, **kwargs):  # pylint: disable=unused-a
 
     def remove_weight_norm(self):
         if self.c_in_channels != 0:
-            torch.nn.utils.remove_weight_norm(self.cond_layer)
+            parametrize.remove_parametrizations(self.cond_layer, "weight")
         for l in self.in_layers:
-            torch.nn.utils.remove_weight_norm(l)
+            parametrize.remove_parametrizations(l, "weight")
         for l in self.res_skip_layers:
-            torch.nn.utils.remove_weight_norm(l)
+            parametrize.remove_parametrizations(l, "weight")
 
 
 class WNBlocks(nn.Module):
diff --git a/TTS/tts/layers/glow_tts/glow.py b/TTS/tts/layers/glow_tts/glow.py
index 273c62a5c0..b02c311808 100644
--- a/TTS/tts/layers/glow_tts/glow.py
+++ b/TTS/tts/layers/glow_tts/glow.py
@@ -186,7 +186,7 @@ def __init__(
         self.sigmoid_scale = sigmoid_scale
         # input layer
         start = torch.nn.Conv1d(in_channels // 2, hidden_channels, 1)
-        start = torch.nn.utils.weight_norm(start)
+        start = torch.nn.utils.parametrizations.weight_norm(start)
         self.start = start
         # output layer
         # Initializing last layer to 0 makes the affine coupling layers
diff --git a/TTS/tts/layers/tortoise/vocoder.py b/TTS/tts/layers/tortoise/vocoder.py
index 47365eb58d..a5200c2673 100644
--- a/TTS/tts/layers/tortoise/vocoder.py
+++ b/TTS/tts/layers/tortoise/vocoder.py
@@ -1,4 +1,3 @@
-import json
 from dataclasses import dataclass
 from enum import Enum
 from typing import Callable, Optional
@@ -6,6 +5,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import torch.nn.utils.parametrize as parametrize
 
 MAX_WAV_VALUE = 32768.0
 
@@ -44,7 +44,9 @@ def __init__(
         kpnet_bias_channels = conv_out_channels * conv_layers  # l_b
 
         self.input_conv = nn.Sequential(
-            nn.utils.weight_norm(nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)),
+            nn.utils.parametrizations.weight_norm(
+                nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)
+            ),
             getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
         )
 
@@ -54,7 +56,7 @@ def __init__(
             self.residual_convs.append(
                 nn.Sequential(
                     nn.Dropout(kpnet_dropout),
-                    nn.utils.weight_norm(
+                    nn.utils.parametrizations.weight_norm(
                         nn.Conv1d(
                             kpnet_hidden_channels,
                             kpnet_hidden_channels,
@@ -64,7 +66,7 @@ def __init__(
                         )
                     ),
                     getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
-                    nn.utils.weight_norm(
+                    nn.utils.parametrizations.weight_norm(
                         nn.Conv1d(
                             kpnet_hidden_channels,
                             kpnet_hidden_channels,
@@ -76,7 +78,7 @@ def __init__(
                     getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
                 )
             )
-        self.kernel_conv = nn.utils.weight_norm(
+        self.kernel_conv = nn.utils.parametrizations.weight_norm(
             nn.Conv1d(
                 kpnet_hidden_channels,
                 kpnet_kernel_channels,
@@ -85,7 +87,7 @@ def __init__(
                 bias=True,
             )
         )
-        self.bias_conv = nn.utils.weight_norm(
+        self.bias_conv = nn.utils.parametrizations.weight_norm(
             nn.Conv1d(
                 kpnet_hidden_channels,
                 kpnet_bias_channels,
@@ -125,12 +127,12 @@ def forward(self, c):
         return kernels, bias
 
     def remove_weight_norm(self):
-        nn.utils.remove_weight_norm(self.input_conv[0])
-        nn.utils.remove_weight_norm(self.kernel_conv)
-        nn.utils.remove_weight_norm(self.bias_conv)
+        parametrize.remove_parametrizations(self.input_conv[0], "weight")
+        parametrize.remove_parametrizations(self.kernel_conv, "weight")
+        parametrize.remove_parametrizations(self.bias_conv, "weight")
         for block in self.residual_convs:
-            nn.utils.remove_weight_norm(block[1])
-            nn.utils.remove_weight_norm(block[3])
+            parametrize.remove_parametrizations(block[1], "weight")
+            parametrize.remove_parametrizations(block[3], "weight")
 
 
 class LVCBlock(torch.nn.Module):
@@ -169,7 +171,7 @@ def __init__(
 
         self.convt_pre = nn.Sequential(
             nn.LeakyReLU(lReLU_slope),
-            nn.utils.weight_norm(
+            nn.utils.parametrizations.weight_norm(
                 nn.ConvTranspose1d(
                     in_channels,
                     in_channels,
@@ -186,7 +188,7 @@ def __init__(
             self.conv_blocks.append(
                 nn.Sequential(
                     nn.LeakyReLU(lReLU_slope),
-                    nn.utils.weight_norm(
+                    nn.utils.parametrizations.weight_norm(
                         nn.Conv1d(
                             in_channels,
                             in_channels,
@@ -267,9 +269,9 @@ def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=25
 
     def remove_weight_norm(self):
         self.kernel_predictor.remove_weight_norm()
-        nn.utils.remove_weight_norm(self.convt_pre[1])
+        parametrize.remove_parametrizations(self.convt_pre[1], "weight")
         for block in self.conv_blocks:
-            nn.utils.remove_weight_norm(block[1])
+            parametrize.remove_parametrizations(block[1], "weight")
 
 
 class UnivNetGenerator(nn.Module):
@@ -314,11 +316,13 @@ def __init__(
             )
         )
 
-        self.conv_pre = nn.utils.weight_norm(nn.Conv1d(noise_dim, channel_size, 7, padding=3, padding_mode="reflect"))
+        self.conv_pre = nn.utils.parametrizations.weight_norm(
+            nn.Conv1d(noise_dim, channel_size, 7, padding=3, padding_mode="reflect")
+        )
 
         self.conv_post = nn.Sequential(
             nn.LeakyReLU(lReLU_slope),
-            nn.utils.weight_norm(nn.Conv1d(channel_size, 1, 7, padding=3, padding_mode="reflect")),
+            nn.utils.parametrizations.weight_norm(nn.Conv1d(channel_size, 1, 7, padding=3, padding_mode="reflect")),
             nn.Tanh(),
         )
 
@@ -346,11 +350,11 @@ def eval(self, inference=False):
             self.remove_weight_norm()
 
     def remove_weight_norm(self):
-        nn.utils.remove_weight_norm(self.conv_pre)
+        parametrize.remove_parametrizations(self.conv_pre, "weight")
 
         for layer in self.conv_post:
             if len(layer.state_dict()) != 0:
-                nn.utils.remove_weight_norm(layer)
+                parametrize.remove_parametrizations(layer, "weight")
 
         for res_block in self.res_stack:
             res_block.remove_weight_norm()
diff --git a/TTS/tts/layers/vits/discriminator.py b/TTS/tts/layers/vits/discriminator.py
index 148f283c90..c27d11bef6 100644
--- a/TTS/tts/layers/vits/discriminator.py
+++ b/TTS/tts/layers/vits/discriminator.py
@@ -14,7 +14,7 @@ class DiscriminatorS(torch.nn.Module):
 
     def __init__(self, use_spectral_norm=False):
         super().__init__()
-        norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.weight_norm
+        norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.parametrizations.weight_norm
         self.convs = nn.ModuleList(
             [
                 norm_f(Conv1d(1, 16, 15, 1, padding=7)),
diff --git a/TTS/tts/layers/xtts/hifigan_decoder.py b/TTS/tts/layers/xtts/hifigan_decoder.py
index 5fcff8703b..9add7826e6 100644
--- a/TTS/tts/layers/xtts/hifigan_decoder.py
+++ b/TTS/tts/layers/xtts/hifigan_decoder.py
@@ -3,7 +3,8 @@
 from torch import nn
 from torch.nn import Conv1d, ConvTranspose1d
 from torch.nn import functional as F
-from torch.nn.utils import remove_weight_norm, weight_norm
+from torch.nn.utils.parametrizations import weight_norm
+from torch.nn.utils.parametrize import remove_parametrizations
 
 from TTS.utils.io import load_fsspec
 
@@ -120,9 +121,9 @@ def forward(self, x):
 
     def remove_weight_norm(self):
         for l in self.convs1:
-            remove_weight_norm(l)
+            remove_parametrizations(l, "weight")
         for l in self.convs2:
-            remove_weight_norm(l)
+            remove_parametrizations(l, "weight")
 
 
 class ResBlock2(torch.nn.Module):
@@ -176,7 +177,7 @@ def forward(self, x):
 
     def remove_weight_norm(self):
         for l in self.convs:
-            remove_weight_norm(l)
+            remove_parametrizations(l, "weight")
 
 
 class HifiganGenerator(torch.nn.Module):
@@ -251,10 +252,10 @@ def __init__(
             self.cond_layer = nn.Conv1d(cond_channels, upsample_initial_channel, 1)
 
         if not conv_pre_weight_norm:
-            remove_weight_norm(self.conv_pre)
+            remove_parametrizations(self.conv_pre, "weight")
 
         if not conv_post_weight_norm:
-            remove_weight_norm(self.conv_post)
+            remove_parametrizations(self.conv_post, "weight")
 
         if self.cond_in_each_up_layer:
             self.conds = nn.ModuleList()
@@ -317,11 +318,11 @@ def inference(self, c):
     def remove_weight_norm(self):
         print("Removing weight norm...")
         for l in self.ups:
-            remove_weight_norm(l)
+            remove_parametrizations(l, "weight")
         for l in self.resblocks:
             l.remove_weight_norm()
-        remove_weight_norm(self.conv_pre)
-        remove_weight_norm(self.conv_post)
+        remove_parametrizations(self.conv_pre, "weight")
+        remove_parametrizations(self.conv_post, "weight")
 
     def load_checkpoint(
         self, config, checkpoint_path, eval=False, cache=False
diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py
index 656a80bc99..f41bcfb944 100644
--- a/TTS/tts/models/xtts.py
+++ b/TTS/tts/models/xtts.py
@@ -1,5 +1,4 @@
 import os
-from contextlib import contextmanager
 from dataclasses import dataclass
 
 import librosa
@@ -8,7 +7,7 @@
 import torchaudio
 from coqpit import Coqpit
 
-from TTS.tts.layers.tortoise.audio_utils import denormalize_tacotron_mel, wav_to_univnet_mel
+from TTS.tts.layers.tortoise.audio_utils import wav_to_univnet_mel
 from TTS.tts.layers.xtts.gpt import GPT
 from TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder
 from TTS.tts.layers.xtts.stream_generator import init_stream_support
diff --git a/TTS/vc/models/freevc.py b/TTS/vc/models/freevc.py
index fd53a77fc5..8bb9989224 100644
--- a/TTS/vc/models/freevc.py
+++ b/TTS/vc/models/freevc.py
@@ -5,9 +5,11 @@
 import torch
 from coqpit import Coqpit
 from torch import nn
-from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
+from torch.nn import Conv1d, Conv2d, ConvTranspose1d
 from torch.nn import functional as F
-from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
+from torch.nn.utils import spectral_norm
+from torch.nn.utils.parametrizations import weight_norm
+from torch.nn.utils.parametrize import remove_parametrizations
 
 import TTS.vc.modules.freevc.commons as commons
 import TTS.vc.modules.freevc.modules as modules
@@ -152,9 +154,9 @@ def forward(self, x, g=None):
     def remove_weight_norm(self):
         print("Removing weight norm...")
         for l in self.ups:
-            remove_weight_norm(l)
+            remove_parametrizations(l, "weight")
         for l in self.resblocks:
-            l.remove_weight_norm()
+            remove_parametrizations(l, "weight")
 
 
 class DiscriminatorP(torch.nn.Module):
diff --git a/TTS/vc/modules/freevc/modules.py b/TTS/vc/modules/freevc/modules.py
index 0503a13c8a..9bb5499003 100644
--- a/TTS/vc/modules/freevc/modules.py
+++ b/TTS/vc/modules/freevc/modules.py
@@ -1,13 +1,9 @@
-import copy
-import math
-
-import numpy as np
-import scipy
 import torch
 from torch import nn
-from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
+from torch.nn import Conv1d
 from torch.nn import functional as F
-from torch.nn.utils import remove_weight_norm, weight_norm
+from torch.nn.utils.parametrizations import weight_norm
+from torch.nn.utils.parametrize import remove_parametrizations
 
 import TTS.vc.modules.freevc.commons as commons
 from TTS.vc.modules.freevc.commons import get_padding, init_weights
@@ -122,7 +118,7 @@ def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_ch
 
         if gin_channels != 0:
             cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
-            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
+            self.cond_layer = torch.nn.utils.parametrizations.weight_norm(cond_layer, name="weight")
 
         for i in range(n_layers):
             dilation = dilation_rate**i
@@ -130,7 +126,7 @@ def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_ch
             in_layer = torch.nn.Conv1d(
                 hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=padding
             )
-            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
+            in_layer = torch.nn.utils.parametrizations.weight_norm(in_layer, name="weight")
             self.in_layers.append(in_layer)
 
             # last one is not necessary
@@ -140,7 +136,7 @@ def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_ch
                 res_skip_channels = hidden_channels
 
             res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
-            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
+            res_skip_layer = torch.nn.utils.parametrizations.weight_norm(res_skip_layer, name="weight")
             self.res_skip_layers.append(res_skip_layer)
 
     def forward(self, x, x_mask, g=None, **kwargs):
@@ -172,11 +168,11 @@ def forward(self, x, x_mask, g=None, **kwargs):
 
     def remove_weight_norm(self):
         if self.gin_channels != 0:
-            torch.nn.utils.remove_weight_norm(self.cond_layer)
+            remove_parametrizations(self.cond_layer, "weight")
         for l in self.in_layers:
-            torch.nn.utils.remove_weight_norm(l)
+            remove_parametrizations(l, "weight")
         for l in self.res_skip_layers:
-            torch.nn.utils.remove_weight_norm(l)
+            remove_parametrizations(l, "weight")
 
 
 class ResBlock1(torch.nn.Module):
@@ -250,9 +246,9 @@ def forward(self, x, x_mask=None):
 
     def remove_weight_norm(self):
         for l in self.convs1:
-            remove_weight_norm(l)
+            remove_parametrizations(l, "weight")
         for l in self.convs2:
-            remove_weight_norm(l)
+            remove_parametrizations(l, "weight")
 
 
 class ResBlock2(torch.nn.Module):
@@ -297,7 +293,7 @@ def forward(self, x, x_mask=None):
 
     def remove_weight_norm(self):
         for l in self.convs:
-            remove_weight_norm(l)
+            remove_parametrizations(l, "weight")
 
 
 class Log(nn.Module):
diff --git a/TTS/vc/modules/freevc/wavlm/wavlm.py b/TTS/vc/modules/freevc/wavlm/wavlm.py
index 7efb11bfc6..fc93bd4f50 100644
--- a/TTS/vc/modules/freevc/wavlm/wavlm.py
+++ b/TTS/vc/modules/freevc/wavlm/wavlm.py
@@ -497,7 +497,7 @@ def __init__(self, args):
             nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
             nn.init.constant_(self.pos_conv.bias, 0)
 
-        self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
+        self.pos_conv = nn.utils.parametrizations.weight_norm(self.pos_conv, name="weight", dim=2)
         self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU())
 
         if hasattr(args, "relative_position_embedding"):
diff --git a/TTS/vocoder/layers/hifigan.py b/TTS/vocoder/layers/hifigan.py
index f512007248..8dd75133bb 100644
--- a/TTS/vocoder/layers/hifigan.py
+++ b/TTS/vocoder/layers/hifigan.py
@@ -1,4 +1,5 @@
 from torch import nn
+from torch.nn.utils.parametrize import remove_parametrizations
 
 
 # pylint: disable=dangerous-default-value
@@ -10,14 +11,16 @@ def __init__(self, kernel, channel, padding, dilations=[1, 3, 5]):
             resstack += [
                 nn.LeakyReLU(0.2),
                 nn.ReflectionPad1d(dilation),
-                nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=kernel, dilation=dilation)),
+                nn.utils.parametrizations.weight_norm(
+                    nn.Conv1d(channel, channel, kernel_size=kernel, dilation=dilation)
+                ),
                 nn.LeakyReLU(0.2),
                 nn.ReflectionPad1d(padding),
-                nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)),
+                nn.utils.parametrizations.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)),
             ]
         self.resstack = nn.Sequential(*resstack)
 
-        self.shortcut = nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=1))
+        self.shortcut = nn.utils.parametrizations.weight_norm(nn.Conv1d(channel, channel, kernel_size=1))
 
     def forward(self, x):
         x1 = self.shortcut(x)
@@ -25,13 +28,13 @@ def forward(self, x):
         return x1 + x2
 
     def remove_weight_norm(self):
-        nn.utils.remove_weight_norm(self.shortcut)
-        nn.utils.remove_weight_norm(self.resstack[2])
-        nn.utils.remove_weight_norm(self.resstack[5])
-        nn.utils.remove_weight_norm(self.resstack[8])
-        nn.utils.remove_weight_norm(self.resstack[11])
-        nn.utils.remove_weight_norm(self.resstack[14])
-        nn.utils.remove_weight_norm(self.resstack[17])
+        remove_parametrizations(self.shortcut, "weight")
+        remove_parametrizations(self.resstack[2], "weight")
+        remove_parametrizations(self.resstack[5], "weight")
+        remove_parametrizations(self.resstack[8], "weight")
+        remove_parametrizations(self.resstack[11], "weight")
+        remove_parametrizations(self.resstack[14], "weight")
+        remove_parametrizations(self.resstack[17], "weight")
 
 
 class MRF(nn.Module):
diff --git a/TTS/vocoder/layers/melgan.py b/TTS/vocoder/layers/melgan.py
index 4bb328e983..7ad41a0f78 100644
--- a/TTS/vocoder/layers/melgan.py
+++ b/TTS/vocoder/layers/melgan.py
@@ -1,5 +1,6 @@
 from torch import nn
-from torch.nn.utils import weight_norm
+from torch.nn.utils.parametrizations import weight_norm
+from torch.nn.utils.parametrize import remove_parametrizations
 
 
 class ResidualStack(nn.Module):
@@ -27,7 +28,7 @@ def __init__(self, channels, num_res_blocks, kernel_size):
             ]
 
         self.shortcuts = nn.ModuleList(
-            [weight_norm(nn.Conv1d(channels, channels, kernel_size=1, bias=True)) for i in range(num_res_blocks)]
+            [weight_norm(nn.Conv1d(channels, channels, kernel_size=1, bias=True)) for _ in range(num_res_blocks)]
        )
 
     def forward(self, x):
@@ -37,6 +38,6 @@ def forward(self, x):
 
     def remove_weight_norm(self):
         for block, shortcut in zip(self.blocks, self.shortcuts):
-            nn.utils.remove_weight_norm(block[2])
-            nn.utils.remove_weight_norm(block[4])
-            nn.utils.remove_weight_norm(shortcut)
+            remove_parametrizations(block[2], "weight")
+            remove_parametrizations(block[4], "weight")
+            remove_parametrizations(shortcut, "weight")
diff --git a/TTS/vocoder/layers/wavegrad.py b/TTS/vocoder/layers/wavegrad.py
index 24b905f994..9f1512c6d4 100644
--- a/TTS/vocoder/layers/wavegrad.py
+++ b/TTS/vocoder/layers/wavegrad.py
@@ -1,7 +1,8 @@
 import torch
 import torch.nn.functional as F
 from torch import nn
-from torch.nn.utils import weight_norm
+from torch.nn.utils.parametrizations import weight_norm
+from torch.nn.utils.parametrize import remove_parametrizations
 
 
 class Conv1d(nn.Conv1d):
@@ -56,8 +57,8 @@ def forward(self, x, noise_scale):
         return shift, scale
 
     def remove_weight_norm(self):
-        nn.utils.remove_weight_norm(self.input_conv)
-        nn.utils.remove_weight_norm(self.output_conv)
+        remove_parametrizations(self.input_conv, "weight")
+        remove_parametrizations(self.output_conv, "weight")
 
     def apply_weight_norm(self):
         self.input_conv = weight_norm(self.input_conv)
@@ -111,13 +112,13 @@ def forward(self, x, shift, scale):
         return o
 
     def remove_weight_norm(self):
-        nn.utils.remove_weight_norm(self.res_block)
+        remove_parametrizations(self.res_block, "weight")
         for _, layer in enumerate(self.main_block):
             if len(layer.state_dict()) != 0:
-                nn.utils.remove_weight_norm(layer)
+                remove_parametrizations(layer, "weight")
         for _, layer in enumerate(self.out_block):
             if len(layer.state_dict()) != 0:
-                nn.utils.remove_weight_norm(layer)
+                remove_parametrizations(layer, "weight")
 
     def apply_weight_norm(self):
         self.res_block = weight_norm(self.res_block)
@@ -153,10 +154,10 @@ def forward(self, x):
         return o + res
 
     def remove_weight_norm(self):
-        nn.utils.remove_weight_norm(self.res_block)
+        remove_parametrizations(self.res_block, "weight")
         for _, layer in enumerate(self.main_block):
             if len(layer.state_dict()) != 0:
-                nn.utils.remove_weight_norm(layer)
+                remove_parametrizations(layer, "weight")
 
     def apply_weight_norm(self):
         self.res_block = weight_norm(self.res_block)
diff --git a/TTS/vocoder/models/hifigan_discriminator.py b/TTS/vocoder/models/hifigan_discriminator.py
index ca5eaf408c..7447a5fbc4 100644
--- a/TTS/vocoder/models/hifigan_discriminator.py
+++ b/TTS/vocoder/models/hifigan_discriminator.py
@@ -30,7 +30,7 @@ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
         super().__init__()
         self.period = period
         get_padding = lambda k, d: int((k * d - d) / 2)
-        norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.weight_norm
+        norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.parametrizations.weight_norm
         self.convs = nn.ModuleList(
             [
                 norm_f(nn.Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
@@ -125,7 +125,7 @@ class DiscriminatorS(torch.nn.Module):
 
     def __init__(self, use_spectral_norm=False):
         super().__init__()
-        norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.weight_norm
+        norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.parametrizations.weight_norm
         self.convs = nn.ModuleList(
             [
                 norm_f(nn.Conv1d(1, 128, 15, 1, padding=7)),
diff --git a/TTS/vocoder/models/hifigan_generator.py b/TTS/vocoder/models/hifigan_generator.py
index 4916d1e697..9247532259 100644
--- a/TTS/vocoder/models/hifigan_generator.py
+++ b/TTS/vocoder/models/hifigan_generator.py
@@ -3,7 +3,8 @@
 from torch import nn
 from torch.nn import Conv1d, ConvTranspose1d
 from torch.nn import functional as F
-from torch.nn.utils import remove_weight_norm, weight_norm
+from torch.nn.utils.parametrizations import weight_norm
+from torch.nn.utils.parametrize import remove_parametrizations
 
 from TTS.utils.io import load_fsspec
 
@@ -99,9 +100,9 @@ def forward(self, x):
 
     def remove_weight_norm(self):
         for l in self.convs1:
-            remove_weight_norm(l)
+            remove_parametrizations(l, "weight")
         for l in self.convs2:
-            remove_weight_norm(l)
+            remove_parametrizations(l, "weight")
 
 
 class ResBlock2(torch.nn.Module):
@@ -155,7 +156,7 @@ def forward(self, x):
 
     def remove_weight_norm(self):
         for l in self.convs:
-            remove_weight_norm(l)
+            remove_parametrizations(l, "weight")
 
 
 class HifiganGenerator(torch.nn.Module):
@@ -227,10 +228,10 @@ def __init__(
             self.cond_layer = nn.Conv1d(cond_channels, upsample_initial_channel, 1)
 
         if not conv_pre_weight_norm:
-            remove_weight_norm(self.conv_pre)
+            remove_parametrizations(self.conv_pre, "weight")
 
         if not conv_post_weight_norm:
-            remove_weight_norm(self.conv_post)
+            remove_parametrizations(self.conv_post, "weight")
 
     def forward(self, x, g=None):
         """
@@ -283,11 +284,11 @@ def inference(self, c):
     def remove_weight_norm(self):
         print("Removing weight norm...")
         for l in self.ups:
-            remove_weight_norm(l)
+            remove_parametrizations(l, "weight")
         for l in self.resblocks:
             l.remove_weight_norm()
-        remove_weight_norm(self.conv_pre)
-        remove_weight_norm(self.conv_post)
+        remove_parametrizations(self.conv_pre, "weight")
+        remove_parametrizations(self.conv_post, "weight")
 
     def load_checkpoint(
         self, config, checkpoint_path, eval=False, cache=False
diff --git a/TTS/vocoder/models/melgan_discriminator.py b/TTS/vocoder/models/melgan_discriminator.py
index 14f00c5927..e41467da3c 100644
--- a/TTS/vocoder/models/melgan_discriminator.py
+++ b/TTS/vocoder/models/melgan_discriminator.py
@@ -1,6 +1,6 @@
 import numpy as np
 from torch import nn
-from torch.nn.utils import weight_norm
+from torch.nn.utils.parametrizations import weight_norm
 
 
 class MelganDiscriminator(nn.Module):
diff --git a/TTS/vocoder/models/melgan_generator.py b/TTS/vocoder/models/melgan_generator.py
index 989797f0b8..bb3fee789c 100644
--- a/TTS/vocoder/models/melgan_generator.py
+++ b/TTS/vocoder/models/melgan_generator.py
@@ -1,6 +1,6 @@
 import torch
 from torch import nn
-from torch.nn.utils import weight_norm
+from torch.nn.utils.parametrizations import weight_norm
 
 from TTS.utils.io import load_fsspec
 from TTS.vocoder.layers.melgan import ResidualStack
@@ -80,7 +80,7 @@ def remove_weight_norm(self):
         for _, layer in enumerate(self.layers):
             if len(layer.state_dict()) != 0:
                 try:
-                    nn.utils.remove_weight_norm(layer)
+                    nn.utils.parametrize.remove_parametrizations(layer, "weight")
                 except ValueError:
                     layer.remove_weight_norm()
diff --git a/TTS/vocoder/models/parallel_wavegan_discriminator.py b/TTS/vocoder/models/parallel_wavegan_discriminator.py
index adf1bdaea0..d02af75f05 100644
--- a/TTS/vocoder/models/parallel_wavegan_discriminator.py
+++ b/TTS/vocoder/models/parallel_wavegan_discriminator.py
@@ -2,6 +2,7 @@
 
 import torch
 from torch import nn
+from torch.nn.utils.parametrize import remove_parametrizations
 
 from TTS.vocoder.layers.parallel_wavegan import ResidualBlock
 
@@ -68,7 +69,7 @@ def forward(self, x):
     def apply_weight_norm(self):
         def _apply_weight_norm(m):
             if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)):
-                torch.nn.utils.weight_norm(m)
+                torch.nn.utils.parametrizations.weight_norm(m)
 
         self.apply(_apply_weight_norm)
 
@@ -76,7 +77,7 @@ def remove_weight_norm(self):
         def _remove_weight_norm(m):
             try:
                 # print(f"Weight norm is removed from {m}.")
-                nn.utils.remove_weight_norm(m)
+                remove_parametrizations(m, "weight")
             except ValueError:  # this module didn't have weight norm
                 return
 
@@ -171,7 +172,7 @@ def forward(self, x):
     def apply_weight_norm(self):
         def _apply_weight_norm(m):
             if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)):
-                torch.nn.utils.weight_norm(m)
+                torch.nn.utils.parametrizations.weight_norm(m)
 
         self.apply(_apply_weight_norm)
 
@@ -179,7 +180,7 @@ def remove_weight_norm(self):
         def _remove_weight_norm(m):
             try:
                 print(f"Weight norm is removed from {m}.")
-                nn.utils.remove_weight_norm(m)
+                remove_parametrizations(m, "weight")
             except ValueError:  # this module didn't have weight norm
                 return
diff --git a/TTS/vocoder/models/parallel_wavegan_generator.py b/TTS/vocoder/models/parallel_wavegan_generator.py
index 5587fb7264..8338d94653 100644
--- a/TTS/vocoder/models/parallel_wavegan_generator.py
+++ b/TTS/vocoder/models/parallel_wavegan_generator.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 import torch
+from torch.nn.utils.parametrize import remove_parametrizations
 
 from TTS.utils.io import load_fsspec
 from TTS.vocoder.layers.parallel_wavegan import ResidualBlock
@@ -126,7 +127,7 @@ def remove_weight_norm(self):
         def _remove_weight_norm(m):
            try:
                 # print(f"Weight norm is removed from {m}.")
-                torch.nn.utils.remove_weight_norm(m)
+                remove_parametrizations(m, "weight")
             except ValueError:  # this module didn't have weight norm
                 return
 
@@ -135,7 +136,7 @@ def _remove_weight_norm(m):
     def apply_weight_norm(self):
         def _apply_weight_norm(m):
             if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)):
-                torch.nn.utils.weight_norm(m)
+                torch.nn.utils.parametrizations.weight_norm(m)
                 # print(f"Weight norm is applied to {m}.")
 
         self.apply(_apply_weight_norm)
diff --git a/TTS/vocoder/models/univnet_discriminator.py b/TTS/vocoder/models/univnet_discriminator.py
index 4c09520c2a..497d67ac76 100644
--- a/TTS/vocoder/models/univnet_discriminator.py
+++ b/TTS/vocoder/models/univnet_discriminator.py
@@ -1,7 +1,8 @@
 import torch
 import torch.nn.functional as F
 from torch import nn
-from torch.nn.utils import spectral_norm, weight_norm
+from torch.nn.utils import spectral_norm
+from torch.nn.utils.parametrizations import weight_norm
 
 from TTS.utils.audio.torch_transforms import TorchSTFT
 from TTS.vocoder.models.hifigan_discriminator import MultiPeriodDiscriminator
diff --git a/TTS/vocoder/models/univnet_generator.py b/TTS/vocoder/models/univnet_generator.py
index 2ee28c7b85..5e66b70df8 100644
--- a/TTS/vocoder/models/univnet_generator.py
+++ b/TTS/vocoder/models/univnet_generator.py
@@ -3,6 +3,7 @@
 import numpy as np
 import torch
 import torch.nn.functional as F
+from torch.nn.utils import parametrize
 
 from TTS.vocoder.layers.lvc_block import LVCBlock
 
@@ -113,7 +114,7 @@ def remove_weight_norm(self):
         def _remove_weight_norm(m):
             try:
                 # print(f"Weight norm is removed from {m}.")
-                torch.nn.utils.remove_weight_norm(m)
+                parametrize.remove_parametrizations(m, "weight")
             except ValueError:  # this module didn't have weight norm
                 return
 
@@ -124,7 +125,7 @@ def apply_weight_norm(self):
         def _apply_weight_norm(m):
             if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)):
-                torch.nn.utils.weight_norm(m)
+                torch.nn.utils.parametrizations.weight_norm(m)
                 # print(f"Weight norm is applied to {m}.")
 
         self.apply(_apply_weight_norm)
diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py
index a0f9221a8f..c1166e0914 100644
--- a/TTS/vocoder/models/wavegrad.py
+++ b/TTS/vocoder/models/wavegrad.py
@@ -5,7 +5,8 @@
 import torch
 from coqpit import Coqpit
 from torch import nn
-from torch.nn.utils import weight_norm
+from torch.nn.utils.parametrizations import weight_norm
+from torch.nn.utils.parametrize import remove_parametrizations
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
 from trainer.trainer_utils import get_optimizer, get_scheduler
@@ -178,27 +179,27 @@ def remove_weight_norm(self):
         for _, layer in enumerate(self.dblocks):
             if len(layer.state_dict()) != 0:
                 try:
-                    nn.utils.remove_weight_norm(layer)
+                    remove_parametrizations(layer, "weight")
                 except ValueError:
                     layer.remove_weight_norm()
 
         for _, layer in enumerate(self.film):
             if len(layer.state_dict()) != 0:
                 try:
-                    nn.utils.remove_weight_norm(layer)
+                    remove_parametrizations(layer, "weight")
                 except ValueError:
                     layer.remove_weight_norm()
 
         for _, layer in enumerate(self.ublocks):
             if len(layer.state_dict()) != 0:
                 try:
-                    nn.utils.remove_weight_norm(layer)
+                    remove_parametrizations(layer, "weight")
                 except ValueError:
                     layer.remove_weight_norm()
 
-        nn.utils.remove_weight_norm(self.x_conv)
-        nn.utils.remove_weight_norm(self.out_conv)
-        nn.utils.remove_weight_norm(self.y_conv)
+        remove_parametrizations(self.x_conv, "weight")
+        remove_parametrizations(self.out_conv, "weight")
+        remove_parametrizations(self.y_conv, "weight")
 
     def apply_weight_norm(self):
         for _, layer in enumerate(self.dblocks):
diff --git a/requirements.txt b/requirements.txt
index 04343c848d..53e8af590c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ numpy==1.22.0;python_version<="3.10"
 numpy==1.24.3;python_version>"3.10"
 cython==0.29.30
 scipy>=1.11.2
-torch>=1.7
+torch>=2.1
 torchaudio
 soundfile==0.12.*
 librosa==0.10.*
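Note on the migration pattern used throughout the patch above: PyTorch 2.1
deprecates torch.nn.utils.weight_norm in favor of
torch.nn.utils.parametrizations.weight_norm, and the matching removal call
becomes remove_parametrizations(module, "weight"), where the tensor name
"weight" must now be passed explicitly. A minimal self-contained sketch
(layer sizes are arbitrary):

    import torch
    from torch import nn
    from torch.nn.utils.parametrizations import weight_norm
    from torch.nn.utils.parametrize import remove_parametrizations

    conv = weight_norm(nn.Conv1d(80, 256, kernel_size=7, padding=3))
    x = torch.randn(1, 80, 100)
    y = conv(x)  # weight is recomputed on the fly from the parametrization

    # bake the current weight back into the module, e.g. before export
    remove_parametrizations(conv, "weight")
    assert torch.allclose(y, conv(x))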
From a8e9163fb30324624765e4dcb3244cd7e2bbeb05 Mon Sep 17 00:00:00 2001
From: Aarni Koskela
Date: Thu, 9 Nov 2023 17:32:12 +0200
Subject: [PATCH 3/5] xtts/tokenizer: merge duplicate implementations of
 preprocess_text (#3170)

This was found via ruff:

> F811 Redefinition of unused `preprocess_text` from line 570

---
 TTS/tts/layers/xtts/tokenizer.py | 25 +++++--------------------
 1 file changed, 5 insertions(+), 20 deletions(-)

diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py
index 4c7ae6e3aa..edb0904277 100644
--- a/TTS/tts/layers/xtts/tokenizer.py
+++ b/TTS/tts/layers/xtts/tokenizer.py
@@ -568,14 +568,16 @@ def check_input_length(self, txt, lang):
             print(f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio.")
 
     def preprocess_text(self, txt, lang):
-        if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "ar", "cs", "ru", "nl", "tr", "zh-cn"]:
+        if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "zh-cn"}:
             txt = multilingual_cleaners(txt, lang)
-            if lang == "zh-cn":
+            if lang in {"zh", "zh-cn"}:
                 txt = chinese_transliterate(txt)
         elif lang == "ja":
             txt = japanese_cleaners(txt, self.katsu)
+        elif lang == "ko":
+            txt = korean_cleaners(txt)
         else:
-            raise NotImplementedError()
+            raise NotImplementedError(f"Language '{lang}' is not supported.")
         return txt
 
     def encode(self, txt, lang):
@@ -594,23 +596,6 @@ def decode(self, seq):
             txt = txt.replace("[UNK]", "")
         return txt
 
-    def preprocess_text(self, txt, lang):
-        if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "zh", "ar", "cs", "ru", "nl", "tr", "hu"]:
-            txt = multilingual_cleaners(txt, lang)
-        elif lang == "ja":
-            if self.katsu is None:
-                import cutlet
-
-                self.katsu = cutlet.Cutlet()
-            txt = japanese_cleaners(txt, self.katsu)
-        elif lang == "zh-cn" or lang == "zh":
-            txt = chinese_transliterate(txt)
-        elif lang == "ko":
-            txt = korean_cleaners(txt)
-        else:
-            raise NotImplementedError()
-        return txt
-
     def __len__(self):
         return self.tokenizer.get_vocab_size()
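Note on the merged preprocess_text above: a short usage sketch, assuming the
method lives on the XTTS VoiceBpeTokenizer and that the class can be
constructed without a vocab file (both assumptions, check the class
definition):

    from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer

    tok = VoiceBpeTokenizer()  # hypothetical no-vocab construction
    tok.preprocess_text("Hello, world!", "en")  # multilingual cleaners
    tok.preprocess_text("你好世界", "zh-cn")  # cleaners + Chinese transliteration
    try:
        tok.preprocess_text("Hej!", "sv")  # outside the supported set
    except NotImplementedError as err:
        print(err)  # Language 'sv' is not supported.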
From 3b1e7038bc36abfa62ae4e4299c9df85bcfa5fdd Mon Sep 17 00:00:00 2001
From: Enno Hermann
Date: Thu, 9 Nov 2023 16:49:52 +0100
Subject: [PATCH 4/5] fix(formatters): set missing root_path attribute (#3182)

Fixes #2778

---
 TTS/tts/datasets/formatters.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py
index fbf6881f04..053444b0c1 100644
--- a/TTS/tts/datasets/formatters.py
+++ b/TTS/tts/datasets/formatters.py
@@ -280,7 +280,7 @@ def css10(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
             cols = line.split("|")
             wav_file = os.path.join(root_path, cols[0])
             text = cols[1]
-            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
+            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
     return items
 
 
@@ -294,7 +294,7 @@ def nancy(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
             utt_id = line.split()[1]
             text = line[line.find('"') + 1 : line.rfind('"') - 1]
             wav_file = os.path.join(root_path, "wavn", utt_id + ".wav")
-            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
+            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
     return items

From 6f1cba2f81de6c97e81c0b2030b631e588047968 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 9 Nov 2023 17:41:37 +0100
Subject: [PATCH 5/5] Update to v0.20.3

---
 TTS/VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/VERSION b/TTS/VERSION
index 727d97b9bb..144996ed2c 100644
--- a/TTS/VERSION
+++ b/TTS/VERSION
@@ -1 +1 @@
-0.20.2
+0.20.3
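Note on the formatter fix in PATCH 4/5: the dataset loader reads
item["root_path"] alongside "text", "audio_file", and "speaker_name", so
formatters that omitted it failed (#2778); css10 and nancy were the ones
still missing it. A sketch of the item shape they now return (the values
here are illustrative):

    item = {
        "text": "in the wild and stormy night",
        "audio_file": "/data/css10/audio/0001.wav",
        "speaker_name": "css10",
        "root_path": "/data/css10",
    }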