Merge branch 'p3_11' into dev
erogol committed Jun 28, 2023
2 parents c844b65 + 4786548 commit 6b9ebf5
Showing 32 changed files with 113 additions and 115 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/aux_tests.yml
@@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- python-version: [3.7, 3.8, 3.9, "3.10"]
+ python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion .github/workflows/data_tests.yml
@@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- python-version: [3.7, 3.8, 3.9, "3.10"]
+ python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion .github/workflows/inference_tests.yml
@@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- python-version: [3.7, 3.8, 3.9, "3.10"]
+ python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
16 changes: 6 additions & 10 deletions .github/workflows/pypi-release.yml
@@ -21,7 +21,7 @@ jobs:
fi
- uses: actions/setup-python@v2
with:
- python-version: 3.8
+ python-version: 3.9
- run: |
python -m pip install -U pip setuptools wheel build
- run: |
@@ -36,7 +36,7 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10"]
python-version: ["3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
@@ -66,19 +66,15 @@ jobs:
path: "dist/"
- - uses: actions/download-artifact@v2
- with:
- name: "wheel-3.7"
- path: "dist/"
- uses: actions/download-artifact@v2
with:
name: "wheel-3.8"
name: "wheel-3.9"
path: "dist/"
- uses: actions/download-artifact@v2
with:
name: "wheel-3.9"
name: "wheel-3.10"
path: "dist/"
- uses: actions/download-artifact@v2
with:
name: "wheel-3.10"
name: "wheel-3.11"
path: "dist/"
- run: |
ls -lh dist/
@@ -91,7 +87,7 @@ jobs:
EOF
- uses: actions/setup-python@v2
with:
- python-version: 3.8
+ python-version: 3.9
- run: |
python -m pip install twine
- run: |
6 changes: 3 additions & 3 deletions .github/workflows/style_check.yml
@@ -42,6 +42,6 @@ jobs:
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- - name: Lint check
- run: |
- make lint
+ # - name: Lint check
+ # run: |
+ # make lint
2 changes: 1 addition & 1 deletion .github/workflows/text_tests.yml
@@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- python-version: [3.7, 3.8, 3.9, "3.10"]
+ python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion .github/workflows/tts_tests.yml
@@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- python-version: [3.7, 3.8, 3.9, "3.10"]
+ python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion .github/workflows/vocoder_tests.yml
@@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- python-version: [3.7, 3.8, 3.9, "3.10"]
+ python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion .github/workflows/zoo_tests0.yml
@@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- python-version: [3.7, 3.8, 3.9, "3.10"]
+ python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
3 changes: 2 additions & 1 deletion .github/workflows/zoo_tests1.yml
@@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- python-version: [3.7, 3.8, 3.9, "3.10"]
+ python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
@@ -43,6 +43,7 @@ jobs:
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
+ sed -i 's/https:\/\/coqui.gateway.scarf.sh\/hf\/bark\//https:\/\/huggingface.co\/erogol\/bark\/resolve\/main\//g' TTS/.models.json
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
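The two sed calls in the zoo-test workflow rewrite model URLs in TTS/.models.json so CI downloads come straight from Hugging Face (for Bark) and from GitHub releases instead of going through the coqui.gateway.scarf.sh gateway; the Bark-specific rule must run before the generic one so its longer prefix is matched first. A rough Python equivalent of those two substitutions (the file path and URL prefixes are taken from the workflow, everything else is illustrative):

```python
# Illustrative stand-in for the workflow's sed commands: rewrite the
# scarf.sh gateway URLs in TTS/.models.json to their underlying hosts.
from pathlib import Path

models_json = Path("TTS/.models.json")
text = models_json.read_text()

# Bark models live on the Hugging Face Hub; replace this prefix first.
text = text.replace(
    "https://coqui.gateway.scarf.sh/hf/bark/",
    "https://huggingface.co/erogol/bark/resolve/main/",
)
# Everything else is hosted as GitHub release assets.
text = text.replace(
    "https://coqui.gateway.scarf.sh/",
    "https://github.com/coqui-ai/TTS/releases/download/",
)

models_json.write_text(text)
```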
2 changes: 1 addition & 1 deletion .github/workflows/zoo_tests2.yml
@@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- python-version: [3.7, 3.8, 3.9, "3.10"]
+ python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion TTS/encoder/utils/visual.py
@@ -23,7 +23,7 @@
[0, 0, 0],
[183, 183, 183],
],
- dtype=np.float,
+ dtype=float,
)
/ 255
)
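np.float was just an alias for the builtin float; NumPy deprecated these builtin aliases in 1.20 and removed them in 1.24, so they now raise AttributeError. The same cleanup shows up again below as np.bool → bool in helpers.py and np.complex → complex in processor.py. A small sketch of the replacements (the array values here are made up):

```python
import numpy as np

# np.float, np.bool and np.complex were thin aliases for the Python builtins
# and are gone in NumPy >= 1.24; the builtins (or explicit sized dtypes) work
# on every NumPy version.
colors = np.array([[0, 0, 0], [183, 183, 183]], dtype=float) / 255  # was dtype=np.float
mask = np.array([1, 0, 1]).astype(bool)                             # was .astype(np.bool)
spec = np.abs(np.array([1 + 2j, 3 - 4j])).astype(complex)           # was .astype(np.complex)

# If a fixed width is wanted, name it explicitly instead:
colors64 = colors.astype(np.float64)
spec128 = spec.astype(np.complex128)

print(colors.dtype, mask.dtype, spec.dtype)  # float64 bool complex128
```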
10 changes: 5 additions & 5 deletions TTS/tts/configs/bark_config.py
@@ -1,5 +1,5 @@
import os
- from dataclasses import dataclass
+ from dataclasses import dataclass, field
from typing import Dict

from TTS.tts.configs.shared_configs import BaseTTSConfig
@@ -46,11 +46,11 @@ class BarkConfig(BaseTTSConfig):
"""

model: str = "bark"
- audio: BarkAudioConfig = BarkAudioConfig()
+ audio: BarkAudioConfig = field(default_factory=BarkAudioConfig)
num_chars: int = 0
- semantic_config: GPTConfig = GPTConfig()
- fine_config: FineGPTConfig = FineGPTConfig()
- coarse_config: GPTConfig = GPTConfig()
+ semantic_config: GPTConfig = field(default_factory=GPTConfig)
+ fine_config: FineGPTConfig = field(default_factory=FineGPTConfig)
+ coarse_config: GPTConfig = field(default_factory=GPTConfig)
CONTEXT_WINDOW_SIZE: int = 1024
SEMANTIC_RATE_HZ: float = 49.9
SEMANTIC_VOCAB_SIZE: int = 10_000
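Python 3.11 is the reason for the field(default_factory=...) changes in this and the following config files: the dataclass machinery now rejects any unhashable default value (which includes instances of ordinary dataclasses such as BarkAudioConfig or GPTConfig) with "ValueError: mutable default ... is not allowed: use default_factory", whereas 3.10 only checked for list, dict and set. A minimal sketch with made-up config classes:

```python
from dataclasses import dataclass, field


@dataclass
class SubConfig:  # stand-in for BarkAudioConfig / GPTConfig / ForwardTTSArgs
    sample_rate: int = 24000


@dataclass
class GoodConfig:
    # audio: SubConfig = SubConfig()   # raises ValueError on Python >= 3.11
    audio: SubConfig = field(default_factory=SubConfig)
    # A lambda is the usual trick when the default needs constructor arguments,
    # as in FastSpeechConfig / Fastspeech2Config / SpeedySpeechConfig below.
    audio_22k: SubConfig = field(default_factory=lambda: SubConfig(sample_rate=22050))


a, b = GoodConfig(), GoodConfig()
assert a.audio is not b.audio  # each instance gets its own sub-config
```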
2 changes: 1 addition & 1 deletion TTS/tts/configs/fast_pitch_config.py
@@ -113,7 +113,7 @@ class FastPitchConfig(BaseTTSConfig):
base_model: str = "forward_tts"

# model specific params
- model_args: ForwardTTSArgs = ForwardTTSArgs()
+ model_args: ForwardTTSArgs = field(default_factory=ForwardTTSArgs)

# multi-speaker settings
num_speakers: int = 0
2 changes: 1 addition & 1 deletion TTS/tts/configs/fast_speech_config.py
@@ -107,7 +107,7 @@ class FastSpeechConfig(BaseTTSConfig):
base_model: str = "forward_tts"

# model specific params
- model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False)
+ model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=False))

# multi-speaker settings
num_speakers: int = 0
2 changes: 1 addition & 1 deletion TTS/tts/configs/fastspeech2_config.py
@@ -123,7 +123,7 @@ class Fastspeech2Config(BaseTTSConfig):
base_model: str = "forward_tts"

# model specific params
- model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=True, use_energy=True)
+ model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=True, use_energy=True))

# multi-speaker settings
num_speakers: int = 0
42 changes: 22 additions & 20 deletions TTS/tts/configs/speedy_speech_config.py
@@ -103,26 +103,28 @@ class SpeedySpeechConfig(BaseTTSConfig):
base_model: str = "forward_tts"

# set model args as SpeedySpeech
- model_args: ForwardTTSArgs = ForwardTTSArgs(
- use_pitch=False,
- encoder_type="residual_conv_bn",
- encoder_params={
- "kernel_size": 4,
- "dilations": 4 * [1, 2, 4] + [1],
- "num_conv_blocks": 2,
- "num_res_blocks": 13,
- },
- decoder_type="residual_conv_bn",
- decoder_params={
- "kernel_size": 4,
- "dilations": 4 * [1, 2, 4, 8] + [1],
- "num_conv_blocks": 2,
- "num_res_blocks": 17,
- },
- out_channels=80,
- hidden_channels=128,
- positional_encoding=True,
- detach_duration_predictor=True,
+ model_args: ForwardTTSArgs = field(
+ default_factory=lambda: ForwardTTSArgs(
+ use_pitch=False,
+ encoder_type="residual_conv_bn",
+ encoder_params={
+ "kernel_size": 4,
+ "dilations": 4 * [1, 2, 4] + [1],
+ "num_conv_blocks": 2,
+ "num_res_blocks": 13,
+ },
+ decoder_type="residual_conv_bn",
+ decoder_params={
+ "kernel_size": 4,
+ "dilations": 4 * [1, 2, 4, 8] + [1],
+ "num_conv_blocks": 2,
+ "num_res_blocks": 17,
+ },
+ out_channels=80,
+ hidden_channels=128,
+ positional_encoding=True,
+ detach_duration_predictor=True,
+ )
)

# multi-speaker settings
2 changes: 1 addition & 1 deletion TTS/tts/configs/tortoise_config.py
@@ -70,7 +70,7 @@ class TortoiseConfig(BaseTTSConfig):
model: str = "tortoise"
# model specific params
model_args: TortoiseArgs = field(default_factory=TortoiseArgs)
- audio: TortoiseAudioConfig = TortoiseAudioConfig()
+ audio: TortoiseAudioConfig = field(default_factory=TortoiseAudioConfig)
model_dir: str = None

# settings
37 changes: 8 additions & 29 deletions TTS/tts/layers/bark/hubert/kmeans_hubert.py
@@ -10,15 +10,11 @@
import logging
from pathlib import Path

import fairseq
import torch
from einops import pack, unpack
from torch import nn
from torchaudio.functional import resample

logging.root.setLevel(logging.ERROR)


from transformers import HubertModel
def round_down_nearest_multiple(num, divisor):
return num // divisor * divisor

@@ -49,22 +45,11 @@ def __init__(self, checkpoint_path, target_sample_hz=16000, seq_len_multiple_of=
self.target_sample_hz = target_sample_hz
self.seq_len_multiple_of = seq_len_multiple_of
self.output_layer = output_layer

if device is not None:
self.to(device)

model_path = Path(checkpoint_path)

assert model_path.exists(), f"path {checkpoint_path} does not exist"

checkpoint = torch.load(checkpoint_path)
load_model_input = {checkpoint_path: checkpoint}
model, *_ = fairseq.checkpoint_utils.load_model_ensemble_and_task(load_model_input)

self.model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
if device is not None:
model[0].to(device)

self.model = model[0]
self.model.to(device)
self.model.eval()

@property
@@ -81,19 +66,13 @@ def forward(self, wav_input, flatten=True, input_sample_hz=None):
if exists(self.seq_len_multiple_of):
wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)

embed = self.model(
outputs = self.model.forward(
wav_input,
features_only=True,
mask=False, # thanks to @maitycyrus for noticing that mask is defaulted to True in the fairseq code
output_layer=self.output_layer,
output_hidden_states=True,
)

embed, packed_shape = pack([embed["x"]], "* d")

# codebook_indices = self.kmeans.predict(embed.cpu().detach().numpy())

codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device) # .long()

embed = outputs["hidden_states"][self.output_layer]
embed, packed_shape = pack([embed], "* d")
codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)
if flatten:
return codebook_indices

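Bark's semantic-token extractor no longer loads a fairseq checkpoint from disk; CustomHubert now wraps transformers.HubertModel loaded from the facebook/hubert-base-ls960 checkpoint and picks one entry of output_hidden_states as the embedding. A stripped-down sketch of that call pattern (the waveform, sample rates and layer index here are illustrative, not taken from the diff):

```python
import torch
from torchaudio.functional import resample
from transformers import HubertModel

model = HubertModel.from_pretrained("facebook/hubert-base-ls960").eval()

wav_24k = torch.randn(1, 48000)                           # fake 2 s clip at 24 kHz
wav = resample(wav_24k, orig_freq=24000, new_freq=16000)  # HuBERT expects 16 kHz audio

with torch.no_grad():
    outputs = model(wav, output_hidden_states=True)

# hidden_states holds the embedding output plus one tensor per transformer layer,
# each of shape (batch, frames, 768) for the base model; the wrapper indexes it
# with its output_layer attribute instead of fairseq's output_layer argument.
embed = outputs.hidden_states[9]
print(embed.shape)  # e.g. torch.Size([1, 99, 768])
```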
2 changes: 1 addition & 1 deletion TTS/tts/layers/bark/inference_funcs.py
@@ -130,7 +130,7 @@ def generate_voice(
# generate semantic tokens
# Load the HuBERT model
hubert_manager = HubertManager()
- hubert_manager.make_sure_hubert_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert"])
+ # hubert_manager.make_sure_hubert_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert"])
hubert_manager.make_sure_tokenizer_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert_tokenizer"])

hubert_model = CustomHubert(checkpoint_path=model.config.LOCAL_MODEL_PATHS["hubert"]).to(model.device)
11 changes: 8 additions & 3 deletions TTS/tts/layers/losses.py
@@ -165,7 +165,7 @@ class BCELossMasked(nn.Module):

def __init__(self, pos_weight: float = None):
super().__init__()
- self.pos_weight = nn.Parameter(torch.tensor([pos_weight]), requires_grad=False)
+ self.register_buffer("pos_weight", torch.tensor([pos_weight]))

def forward(self, x, target, length):
"""
@@ -191,10 +191,15 @@ class for each corresponding step.
mask = sequence_mask(sequence_length=length, max_len=target.size(1))
num_items = mask.sum()
loss = functional.binary_cross_entropy_with_logits(
- x.masked_select(mask), target.masked_select(mask), pos_weight=self.pos_weight, reduction="sum"
+ x.masked_select(mask),
+ target.masked_select(mask),
+ pos_weight=self.pos_weight.to(x.device),
+ reduction="sum",
)
else:
- loss = functional.binary_cross_entropy_with_logits(x, target, pos_weight=self.pos_weight, reduction="sum")
+ loss = functional.binary_cross_entropy_with_logits(
+ x, target, pos_weight=self.pos_weight.to(x.device), reduction="sum"
+ )
num_items = torch.numel(x)
loss = loss / num_items
return loss
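register_buffer keeps pos_weight in the module's state_dict and moves it along with .to(device), but, unlike an nn.Parameter with requires_grad=False, it never appears in model.parameters(), which is the conventional way to carry a fixed, non-trainable tensor; the added .to(x.device) in both branches guards against the inputs living on a different device than the loss module. A small illustrative module (names and values are made up, not the TTS API):

```python
import torch
from torch import nn


class WeightedBCE(nn.Module):
    """Toy loss mirroring the buffer-vs-parameter choice above."""

    def __init__(self, pos_weight: float = 5.0):
        super().__init__()
        # Buffers are saved/restored with state_dict and follow .to()/.cuda(),
        # but optimizers never see them through parameters().
        self.register_buffer("pos_weight", torch.tensor([pos_weight]))

    def forward(self, logits: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        return nn.functional.binary_cross_entropy_with_logits(
            logits, target, pos_weight=self.pos_weight.to(logits.device), reduction="mean"
        )


loss_fn = WeightedBCE()
print(list(loss_fn.parameters()))   # [] -> nothing for an optimizer to update
print(loss_fn.state_dict().keys())  # odict_keys(['pos_weight'])

logits = torch.randn(4, 10)
target = torch.randint(0, 2, (4, 10)).float()
print(loss_fn(logits, target))
```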
2 changes: 1 addition & 1 deletion TTS/tts/utils/helpers.py
@@ -207,7 +207,7 @@ def maximum_path_numpy(value, mask, max_neg_val=None):
device = value.device
dtype = value.dtype
value = value.cpu().detach().numpy()
- mask = mask.cpu().detach().numpy().astype(np.bool)
+ mask = mask.cpu().detach().numpy().astype(bool)

b, t_x, t_y = value.shape
direction = np.zeros(value.shape, dtype=np.int64)
5 changes: 4 additions & 1 deletion TTS/utils/audio/processor.py
@@ -540,7 +540,10 @@ def _istft(self, y: np.ndarray) -> np.ndarray:

def _griffin_lim(self, S):
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
- S_complex = np.abs(S).astype(np.complex)
+ try:
+ S_complex = np.abs(S).astype(np.complex)
+ except AttributeError: # np.complex is deprecated since numpy 1.20.0
+ S_complex = np.abs(S).astype(complex)
y = self._istft(S_complex * angles)
if not np.isfinite(y).all():
print(" [!] Waveform is not finite everywhere. Skipping the GL.")
