Replies: 3 comments 1 reply
-
|
Beta Was this translation helpful? Give feedback.
-
|
我已经按照文档的要求,把音频采样率调整到了22050 Hz,训练了200个epoch,为什么效果会差呢?微调的是 tts_models--de--thorsten--tacotron2-DDC 模型 |
Beta Was this translation helpful? Give feedback.
-
|
甚至都无法正确地读取文本 |
Beta Was this translation helpful? Give feedback.
Uh oh!
There was an error while loading. Please reload this page.
Uh oh!
There was an error while loading. Please reload this page.
-
我这里有30分钟左右的德语数据集,我的目的是进行微调,让模型克隆数据集里的声音,使其更加自然。我的config配置如下。不知道为什么微调后的模型还没有原模型好,请帮忙解答。下面是我的配置文件:
{ "output_path": "/aia-storage/root/liyingjun/TTS/mix-yinpin/ours/dataset-mixed/output", "logger_uri": null, "run_name": "Thorsten-Dec2021-22k-DDC", "project_name": null, "run_description": "\ud83d\udc38Coqui trainer run.", "print_step": 25, "plot_step": 100, "model_param_stats": false, "wandb_entity": null, "dashboard_logger": "tensorboard", "save_on_interrupt": true, "log_model_step": null, "save_step": 100, "save_n_checkpoints": 5, "save_checkpoints": true, "save_all_best": true, "save_best_after": 0, "target_loss": null, "print_eval": true, "test_delay_epochs": -1, "run_eval": true, "run_eval_steps": null, "distributed_backend": "nccl", "distributed_url": "tcp://localhost:54321", "mixed_precision": false, "precision": "fp16", "epochs": 150, "batch_size": 8, "eval_batch_size": 8, "grad_clip": 5.0, "scheduler_after_epoch": false, "lr": 1e-05, "optimizer": "RAdam", "optimizer_params": { "betas": [ 0.9, 0.998 ], "weight_decay": 1e-06 }, "lr_scheduler": "NoamLR", "lr_scheduler_params": { "warmup_steps": 4000 }, "use_grad_scaler": false, "allow_tf32": false, "cudnn_enable": true, "cudnn_deterministic": false, "cudnn_benchmark": false, "training_seed": 54321, "model": "tacotron2", "num_loader_workers": 4, "num_eval_loader_workers": 4, "use_noise_augment": false, "audio": { "fft_size": 1024, "win_length": 1024, "hop_length": 256, "frame_shift_ms": null, "frame_length_ms": null, "stft_pad_mode": "reflect", "sample_rate": 22050, "resample": false, "preemphasis": 0.0, "ref_level_db": 20, "do_sound_norm": false, "log_func": "np.log", "do_trim_silence": true, "trim_db": 60, "do_rms_norm": false, "db_level": null, "power": 1.5, "griffin_lim_iters": 60, "num_mels": 80, "mel_fmin": 50.0, "mel_fmax": null, "spec_gain": 1, "do_amp_to_db_linear": true, "do_amp_to_db_mel": true, "pitch_fmax": 640.0, "pitch_fmin": 0.0, "signal_norm": false, "min_level_db": -100, "symmetric_norm": true, "max_norm": 4.0, "clip_norm": true, "stats_path": null }, "use_phonemes": true, "phonemizer": 
"espeak", "phoneme_language": "de", "compute_input_seq_cache": false, "text_cleaner": "basic_german_cleaners", "enable_eos_bos_chars": false, "test_sentences_file": "", "phoneme_cache_path": "/ssd/___tts/thorsten-ddc/output/phoneme_cache", "characters": { "characters_class": "TTS.tts.utils.text.characters.IPAPhonemes", "vocab_dict": null, "pad": "<PAD>", "eos": "<EOS>", "bos": "<BOS>", "blank": "<BLNK>", "characters": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b", "punctuations": "!'(),-.:;? 
", "phonemes": null, "is_unique": false, "is_sorted": true }, "add_blank": false, "batch_group_size": 0, "loss_masking": true, "min_audio_len": 1, "max_audio_len": Infinity, "min_text_len": 1, "max_text_len": Infinity, "compute_f0": false, "compute_energy": false, "compute_linear_spec": false, "precompute_num_workers": 8, "start_by_longest": false, "shuffle": false, "drop_last": false, "datasets": [ { "formatter": "", "dataset_name": "", "path": "/aia-storage/root/liyingjun/TTS/mix-yinpin/ours/dataset-mixed", "meta_file_train": "metadata.csv", "ignored_speakers": null, "language": "", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": "" } ], "test_sentences": [ "und \u00fcberzeugen dank feingef\u00fchl f\u00fcr den ganz gro\u00dfen leinwand-stoff.", "zur schadensh\u00f6he gab es keine angaben.", "au\u00dferdem k\u00f6nnen glasscheiben, w\u00e4nde und andere hindernisse das ergebnis beeinflussen.", "ihre lippen m\u00fcssen dennoch nicht zwingend auf farbe verzichten.", "es dauert lange, eine eigene Stimme zu entwickeln, aber jetzt wo ich sie habe, bin ich nie wieder still.", "heute scheint die Sonne, Regen ist nicht zu erwarten.", "die aktuelle Au\u00dfentemperatur betr\u00e4gt zw\u00f6lf Grad Celsius bei einer Luftfeuchtigkeit von achtunddrei\u00dfig Prozent." 
], "eval_split_max_size": null, "eval_split_size": 0.05, "use_speaker_weighted_sampler": false, "speaker_weighted_sampler_alpha": 1.0, "use_language_weighted_sampler": false, "language_weighted_sampler_alpha": 1.0, "use_length_weighted_sampler": false, "length_weighted_sampler_alpha": 1.0, "use_gst": false, "gst": null, "gst_style_input": null, "use_capacitron_vae": false, "capacitron_vae": null, "num_speakers": 1, "num_chars": 131, "r": 6, "gradual_training": [ [ 0, 6, 64 ], [ 10000, 4, 32 ], [ 50000, 3, 32 ], [ 100000, 2, 32 ] ], "memory_size": -1, "prenet_type": "original", "prenet_dropout": true, "prenet_dropout_at_inference": false, "stopnet": true, "separate_stopnet": true, "stopnet_pos_weight": 10.0, "max_decoder_steps": 500, "encoder_in_features": 512, "decoder_in_features": 512, "decoder_output_dim": 80, "out_channels": 80, "attention_type": "original", "attention_heads": null, "attention_norm": "sigmoid", "attention_win": false, "windowing": false, "use_forward_attn": false, "forward_attn_mask": false, "transition_agent": false, "location_attn": true, "bidirectional_decoder": false, "double_decoder_consistency": true, "ddc_r": 6, "speakers_file": null, "use_speaker_embedding": false, "speaker_embedding_dim": 512, "use_d_vector_file": false, "d_vector_file": false, "d_vector_dim": null, "seq_len_norm": false, "decoder_loss_alpha": 0.25, "postnet_loss_alpha": 0.25, "postnet_diff_spec_alpha": 0.25, "decoder_diff_spec_alpha": 0.25, "decoder_ssim_alpha": 0.25, "postnet_ssim_alpha": 0.25, "ga_alpha": 5.0, "github_branch": "* dev" }这是我的train,py```import os
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
# Load the pretrained model's configuration first, then fine-tune on our dataset.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # pin training to GPU 0

# NOTE(review): `TTS.tts.datasets.tokenizer` is not a real Coqui-TTS module and
# the imported name was never used (TTSTokenizer below is used instead), so the
# import is disabled to keep the script runnable.
# from TTS.tts.datasets.tokenizer import Tokenizer

# Where checkpoints, logs and eval artifacts are written.
output_path = '/aia-storage/root/liyingjun/TTS/mix-yinpin/ours/dataset-mixed/output'
def formatter(root_path, manifest_file, **kwargs):
    """Load training samples from a pipe-separated metadata file.

    Each manifest line is expected as ``<clip_id>|<text>[|<normalized_text>]``
    (LJSpeech/Thorsten layout), with audio under ``<root_path>/wavs/``.

    Args:
        root_path: Dataset root directory.
        manifest_file: Manifest filename relative to ``root_path``.
        **kwargs: Ignored; accepted for ``load_tts_samples`` compatibility.

    Returns:
        list[dict]: One dict per sample with the keys Coqui's data loader
        expects: ``text``, ``audio_file``, ``speaker_name``, ``root_path``.
    """
    txt_file = os.path.join(root_path, manifest_file)
    items = []
    speaker_name = "thorsten"
    # NOTE(review): the original paste was truncated after the three lines
    # above (no read loop, no return). Reconstructed below following the
    # standard Coqui LJSpeech-style formatter — confirm the column layout of
    # metadata.csv matches (clip id in col 0, raw text in col 1).
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line in ttf:
            line = line.strip()
            if not line:
                continue  # tolerate trailing blank lines in the manifest
            cols = line.split("|")
            wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
            items.append(
                {
                    "text": cols[1],
                    "audio_file": wav_file,
                    "speaker_name": speaker_name,
                    "root_path": root_path,
                }
            )
    return items
# init configs
dataset_config = BaseDatasetConfig(
    meta_file_train="metadata.csv", path='/aia-storage/root/liyingjun/TTS/mix-yinpin/ours/dataset-mixed'
)

# Start from the pretrained Thorsten-DDC config so the architecture and audio
# settings match the checkpoint we fine-tune from, then override only the
# fine-tuning hyper-parameters.
config = Tacotron2Config()
config.load_json("/aia-storage/root/liyingjun/TTS/etc/tts/tts_models--de--thorsten--tacotron2-DDC/config.json")
config.batch_size = 8  # reduced batch size for fine-tuning (lower further if GPU memory runs out)
config.eval_batch_size = 8
config.lr = 1e-5  # fine-tuning LR (pretraining typically uses ~1e-3; fine-tuning should be smaller)
config.epochs = 150  # fine-tuning does not need many epochs
config.datasets = [dataset_config]
config.eval_split_size = 0.05  # 5% of samples held out for evaluation
# INITIALIZE THE AUDIO PROCESSOR
# The audio processor handles feature extraction and audio I/O; it mainly
# serves the dataloader and the training loggers.
# NOTE(review): the original script built `ap` twice — first via
# AudioProcessor(**config.audio.to_dict()), then via init_from_config — and
# discarded the first instance; only the second construction is kept.
ap = AudioProcessor.init_from_config(config)

# INITIALIZE THE TOKENIZER
# Converts text into sequences of token IDs. If characters are not defined in
# the config, default characters are written back into the config.
tokenizer, config = TTSTokenizer.init_from_config(config)

# LOAD DATA SAMPLES
# Each sample is [text, audio_file_path, speaker_name]. A custom sample loader
# or formatter may be supplied; see TTS.tts.datasets.load_tts_samples for
# details. Here we pass our own `formatter` defined above.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
    formatter=formatter,
)
# INITIALIZE THE MODEL
# Models take a config object (layer counts, embedding sizes, ...) and an
# optional speaker manager; the speaker manager is only needed for
# multi-speaker models, so None is passed here.
model = Tacotron2(config, ap, tokenizer, speaker_manager=None)
# Load the pretrained weights so training continues from the released
# checkpoint instead of random initialization.
model.load_checkpoint(config, "/aia-storage/root/liyingjun/TTS/etc/tts/tts_models--de--thorsten--tacotron2-DDC/model_file.pth")

# init the trainer and 🚀
trainer = Trainer(
    TrainerArgs(), config, output_path=output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()
Beta Was this translation helpful? Give feedback.
All reactions