---
# Model hyperparameters, grouped by component.
# NOTE(review): the nesting below was reconstructed from a flattened copy in
# which every line carried a leading "- " marker. Confirm the set of top-level
# keys against the code that loads this file.

# Encoder/decoder stack settings.
transformer:
  encoder_layer: 4
  encoder_head: 2
  encoder_hidden: 256
  decoder_layer: 6
  decoder_head: 2
  decoder_hidden: 256
  conv_filter_size: 1024
  # Two kernel sizes are given; presumably one per convolution -- verify
  # against the model code.
  conv_kernel_size: [9, 1]
  encoder_dropout: 0.2
  decoder_dropout: 0.2

# Settings for the variance predictor.
variance_predictor:
  filter_size: 256
  kernel_size: 3
  dropout: 0.5

# How pitch and energy values are bucketed into n_bins bins.
variance_embedding:
  # Supports 'linear' or 'log'. 'log' is allowed only if the pitch values are
  # not normalized during preprocessing.
  pitch_quantization: "linear"
  # Supports 'linear' or 'log'. 'log' is allowed only if the energy values are
  # not normalized during preprocessing.
  energy_quantization: "linear"
  n_bins: 256

# Disabled (commented-out) gst settings, kept for reference.
# gst:
#   use_gst: false
#   conv_filters: [32, 32, 64, 64, 128, 128]
#   gru_hidden: 128
#   token_size: 128
#   n_style_token: 10
#   attn_head: 4

multi_speaker: true

max_seq_len: 1000

vocoder:
  # Supports 'HiFi-GAN' or 'MelGAN'.
  model: "HiFi-GAN"
  # Supports 'LJSpeech' or 'universal'.
  speaker: "universal"