
Is there pretrained HIFI gan vocoder? #59

Closed

netuserjun opened this issue Aug 16, 2022 · 10 comments

Comments

@netuserjun

Thank you for sharing this great repository.

I tried converting the voice in some songs, but there is a problem with high pitches.

The pretrained vocoder cannot reproduce high pitches.

So if you have a pretrained HiFi-GAN pkl file with a proper config, please share it.

Thanks.

@yl4579
Owner

yl4579 commented Aug 22, 2022

I have pre-trained a Hifi-GAN model on LibriTTS for my TTS project. Not sure if it works well for VCTK or other datasets. Here is the link: https://drive.google.com/file/d/1odvXt8w_cjHDoYMzJMZCJtwR2r7yRaSY/view?usp=sharing
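
For reference, loading it would follow the usual HiFi-GAN pattern. A minimal sketch, assuming the checkpoint is a regular torch.save dict with a 'generator' state dict and that the jik876/hifi-gan code plus a matching config.json are on hand (the checkpoint filename below is hypothetical). Note it is not a TorchScript archive, so torch.load applies, not torch.jit.load:

import json
import torch
from env import AttrDict        # from the jik876/hifi-gan repo
from models import Generator    # from the jik876/hifi-gan repo

with open('config.json') as f:  # the config matching the checkpoint
    h = AttrDict(json.load(f))

generator = Generator(h)
state = torch.load('g_libritts.pth', map_location='cpu')  # hypothetical filename
generator.load_state_dict(state['generator'])
generator.eval()
generator.remove_weight_norm()

mel = torch.randn(1, h.num_mels, 100)  # placeholder log-mel input
with torch.no_grad():
    wav = generator(mel)  # (batch, 1, samples) waveform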

@yl4579 yl4579 closed this as completed Aug 22, 2022
@yl4579
Owner

yl4579 commented Aug 22, 2022

If you would like to fine-tune using the preprocessing of this repo, try replacing the meldataset.py in the Hifi-GAN repo with the following file:

import math
import os
import random
import torch
import torch.utils.data
import numpy as np
from librosa.util import normalize
from scipy.io.wavfile import read
from librosa.filters import mel as librosa_mel_fn
import torchaudio
import librosa

MAX_WAV_VALUE = 32768.0


def load_wav(full_path):
    sampling_rate, data = read(full_path)
    return data, sampling_rate


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)


def dynamic_range_decompression(x, C=1):
    return np.exp(x) / C


def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression_torch(x, C=1):
    return torch.exp(x) / C


def spectral_normalize_torch(magnitudes):
    output = dynamic_range_compression_torch(magnitudes)
    return output


def spectral_de_normalize_torch(magnitudes):
    output = dynamic_range_decompression_torch(magnitudes)
    return output


mel_basis = {}
hann_window = {}

# StarGANv2-VC preprocessing: 80-bin log-mel spectrogram with fixed normalization.
# Note: torchaudio's MelSpectrogram defaults to sample_rate=16000, which only
# determines the mel filterbank center frequencies.
to_mel = torchaudio.transforms.MelSpectrogram(
    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
mean, std = -4, 4

def preprocess(wave_tensor):
    mel_tensor = to_mel(wave_tensor)
    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
    return mel_tensor


def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global mel_basis, hann_window
    # Cache the mel filterbank and window per (fmax, device) pair. The original
    # code checked `fmax not in mel_basis` but stored under this composite key,
    # so the cache never hit and the filterbank was rebuilt on every call.
    key = str(fmax) + '_' + str(y.device)
    if key not in mel_basis:
        # librosa >= 0.10 requires keyword arguments here:
        # librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
        mel_basis[key] = torch.from_numpy(mel).float().to(y.device)
        hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
    y = y.squeeze(1)

    # Recent PyTorch versions require an explicit return_complex argument; with
    # return_complex=True, the magnitude is spec.abs() instead of the
    # sum-of-squares below.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],
                      center=center, pad_mode='reflect', normalized=False, onesided=True)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-9)

    spec = torch.matmul(mel_basis[key], spec)
    spec = spectral_normalize_torch(spec)

    return spec


def get_dataset_filelist(a):
    with open(a.input_training_file, 'r', encoding='utf-8') as fi:
        training_files = [x.split('|')[0]
                          for x in fi.read().split('\n') if len(x) > 0]

    with open(a.input_validation_file, 'r', encoding='utf-8') as fi:
        validation_files = [x.split('|')[0]
                            for x in fi.read().split('\n') if len(x) > 0]
    return training_files, validation_files


class MelDataset(torch.utils.data.Dataset):
    def __init__(self, training_files, segment_size, n_fft, num_mels,
                 hop_size, win_size, sampling_rate,  fmin, fmax, split=True, shuffle=True, n_cache_reuse=1,
                 device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None):
        self.audio_files = training_files
        random.seed(1234)
        if shuffle:
            random.shuffle(self.audio_files)
        self.segment_size = segment_size
        self.sampling_rate = sampling_rate
        self.split = split
        self.n_fft = n_fft
        self.num_mels = num_mels
        self.hop_size = hop_size
        self.win_size = win_size
        self.fmin = fmin
        self.fmax = fmax
        self.fmax_loss = fmax_loss
        self.cached_wav = None
        self.n_cache_reuse = n_cache_reuse
        self._cache_ref_count = 0
        self.device = device
        self.fine_tuning = fine_tuning
        self.base_mels_path = base_mels_path

    def __getitem__(self, index):
        filename = self.audio_files[index]
        if self._cache_ref_count == 0:
            audio, sampling_rate = load_wav(filename)
            audio = audio / MAX_WAV_VALUE
            if not self.fine_tuning:
                audio = normalize(audio) * 0.95
            if sampling_rate != self.sampling_rate:
                # Resample instead of raising on a sample-rate mismatch.
                # librosa >= 0.10 requires keyword arguments here:
                # librosa.resample(audio, orig_sr=sampling_rate, target_sr=self.sampling_rate)
                audio = librosa.resample(audio, sampling_rate, self.sampling_rate)
#                 raise ValueError("{} SR doesn't match target {} SR, {}".format(
#                     sampling_rate, self.sampling_rate, filename))
            # Cache after resampling so reused audio is already at the target rate.
            self.cached_wav = audio
            self._cache_ref_count = self.n_cache_reuse
        else:
            audio = self.cached_wav
            self._cache_ref_count -= 1

        audio = torch.FloatTensor(audio)
        audio = audio.unsqueeze(0)

        if not self.fine_tuning:
            if self.split:
                if audio.size(1) >= self.segment_size:
                    max_audio_start = audio.size(1) - self.segment_size
                    audio_start = random.randint(0, max_audio_start)
                    audio = audio[:, audio_start:audio_start+self.segment_size]
                else:
                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')

            mel = preprocess(audio)
        else:
            mel = np.load(filename + '.npy')
            mel = torch.from_numpy(mel)

            if len(mel.shape) < 3:
                mel = mel.unsqueeze(0)

            if self.split:
                frames_per_seg = math.ceil(self.segment_size / self.hop_size)

                if audio.size(1) >= self.segment_size:
                    mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
                    mel = mel[:, :, mel_start:mel_start + frames_per_seg]
                    audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size]
                else:
                    mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant')
                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')

        mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels,
                                   self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss,
                                   center=False)
        
        if mel.shape[-1] != mel_loss.shape[-1]:
            mel = mel[..., :mel_loss.shape[-1]]
        return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())

    def __len__(self):
        return len(self.audio_files)
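
With this file swapped in, training runs through the standard HiFi-GAN entry point, e.g. (file names hypothetical; note that get_dataset_filelist above uses the filelist paths directly, so the lists must contain full wav paths rather than names relative to --input_wavs_dir):

python train.py --config config.json --input_training_file train_list.txt --input_validation_file val_list.txt --checkpoint_path cp_hifigan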

@netuserjun
Author

Thank you so much!
I'll try and share the result.

@jupinter

The pretrained vocoder has a problem: "RuntimeError: PytorchStreamReader failed locating file constants.pkl: file not found". constants.pkl is not included in the zip file.

@MuruganR96

MuruganR96 commented Nov 18, 2022

Hi @yl4579

Thanks for sharing pretrained model.

I want to train StarGAN-v2-VC on mel features with the HiFi-GAN settings. Can you share the meldataset.py, or guide me on what changes need to be made?

Thanks

@MuruganR96

MuruganR96 commented Nov 19, 2022

@yl4579 I tried the code posted above; for me the losses go to NaN/Inf.

Can you please tell me whether the HiFi-GAN features passed as input to StarGAN-v2-VC are wrong?

--- epoch 44 ---
train/real     : nan
train/fake     : nan
train/reg      : inf
train/real_adv_cls: 0.0000
train/con_reg  : nan
train/adv      : nan
train/sty      : nan
train/ds       : 0.0005
train/cyc      : nan
train/norm     : inf
train/asr      : 0.2963
train/f0       : 0.6593
train/adv_cls  : 0.0000
eval/real      : nan
eval/fake      : nan
eval/reg       : 0.0000
eval/real_adv_cls: 0.0000
eval/con_reg   : 0.0000
eval/adv       : nan
eval/sty       : nan
eval/ds        : nan
eval/cyc       : nan
eval/norm      : nan
eval/asr       : nan
eval/f0        : nan
eval/adv_cls   : 0.0000

Thanks

@yl4579
Owner

yl4579 commented Nov 22, 2022

@mraj96 please see the meldataset.py I shared above.

As for NaN, please refer to #60 and see if it helps.

@MuruganR96

MuruganR96 commented Nov 22, 2022

@yl4579 With that meldataset.py I am facing this issue:

python3 train.py --fine_tuning False --config config.json

Traceback (most recent call last):
  File "/root/hifigan_v2vc/hifi-gan/train.py", line 271, in <module>
    main()
  File "/root/hifigan_v2vc/hifi-gan/train.py", line 267, in main
    train(0, a, h)
  File "/root/hifigan_v2vc/hifi-gan/train.py", line 149, in train
    loss_fm_f = feature_loss(fmap_f_r, fmap_f_g)
  File "/root/hifigan_v2vc/hifi-gan/models.py", line 255, in feature_loss
    loss += torch.mean(torch.abs(rl - gl))
RuntimeError: The size of tensor a (9600) must match the size of tensor b (9602) at non-singleton dimension 2

When training HiFi-GAN, have you modified the models.py code with these changes? See jik876/hifi-gan#12 (comment).

I fixed it by cutting the output off at (conv_in_size * u) samples after each transposed convolution.
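
A sketch of that fix, with illustrative names patterned on Generator.forward in jik876/hifi-gan (not the repo's exact code):

import torch

def upsample_step(x, ups_layer, u):
    # x: (batch, channels, in_size); ups_layer: an nn.ConvTranspose1d; u: its upsample rate.
    in_size = x.size(-1)
    x = ups_layer(x)
    # Cut the output off at exactly in_size * u samples so length rounding in the
    # transposed convolution cannot desync real and generated feature maps.
    return x[:, :, :in_size * u]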

@1nlplearner

If you would like to fine-tune using the preprocessing of this repo, try replacing the meldataset.py in the Hifi-GAN repo with the following file: [meldataset.py quoted above]

Thanks for sharing the code. I have two questions:

  1. The sampling_rate in config.yaml is 24000, and torchaudio.transforms.MelSpectrogram sets sample_rate=16000 by default. Should I set it to 24000 when initializing to_mel (sketch below)?
  2. Why does mel_loss use a different transform function from mel?
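
For question 1, the change would just be (assuming the repo's 24 kHz audio; whether it is needed is what I'm asking):

import torchaudio

to_mel = torchaudio.transforms.MelSpectrogram(
    sample_rate=24000,  # default is 16000, which only sets the mel filterbank frequencies
    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)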

@yl4579
Owner

yl4579 commented Feb 12, 2023

@1nlplearner For 1, see #10. For 2, it doesn't matter which mel transform you use for the loss; I just used the default mel_loss from HiFi-GAN because you are now working in the waveform domain. You can use any form of mel spectrogram to calculate the loss.
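
As an illustration of that point, a sketch (not the repo's code): the loss could just as well reuse the preprocess() transform from the meldataset.py shared above:

import torch.nn.functional as F
from meldataset import preprocess  # the transform defined in the file shared above

def mel_l1_loss(y_real, y_generated):
    # Any mel transform applied identically to real and generated waveforms gives
    # a valid reconstruction loss; HiFi-GAN's default mel_spectrogram is just one choice.
    return F.l1_loss(preprocess(y_generated), preprocess(y_real))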
