
    i{"                     j    d Z ddlmZ ddlmZ ddlmZ  ed      e G d d	e                    Zd	gZy
)zVITS model configuration    )strict   )PreTrainedConfig)auto_docstringzfacebook/mms-tts-eng)
checkpointc                   
   e Zd ZU dZdZdZeed<   dZeed<   dZ	eed<   d	Z
eed
<   dZeed<   dZeed<   dZeed<   dZeez  ed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeez  ed<   dZeez  ed<   dZeez  ed<   dZeed<   dZeed <   dZeed!<   d"Zeed#<   d$Zeed%<   d&Zeed'<   d(Ze e   e!ed)f   z  ed*<   d+Z"e e   e!ed)f   z  ed,<   d-Z#e e   e!ed)f   z  ed.<   d/Z$e e!z  ed0<   dZ%eed1<   d	Z&eed2<   dZ'eed3<   d4Z(eed5<   d6Z)eed7<   dZ*eed8<   d9Z+eez  ed:<   dZ,eed;<   d<Z-eed=<   dZ.eed><   dZ/eed?<   d@Z0eedA<   dBZ1eedC<   d"Z2eedD<   dEZ3eez  edF<   dGZ4eez  edH<   dIZ5eedJ<   dKZ6eedL<   dMZ7eedN<   dOZ8edOz  edP<   dQ Z9yO)R
VitsConfiga  
    window_size (`int`, *optional*, defaults to 4):
        Window size for the relative positional embeddings in the attention layers of the Transformer encoder.
    use_bias (`bool`, *optional*, defaults to `True`):
        Whether to use bias in the key, query, value projection layers in the Transformer encoder.
    ffn_kernel_size (`int`, *optional*, defaults to 3):
        Kernel size of the 1D convolution layers used by the feed-forward network in the Transformer encoder.
    flow_size (`int`, *optional*, defaults to 192):
        Dimensionality of the flow layers.
    spectrogram_bins (`int`, *optional*, defaults to 513):
        Number of frequency bins in the target spectrogram.
    use_stochastic_duration_prediction (`bool`, *optional*, defaults to `True`):
        Whether to use the stochastic duration prediction module or the regular duration predictor.
    num_speakers (`int`, *optional*, defaults to 1):
        Number of speakers if this is a multi-speaker model.
    speaker_embedding_size (`int`, *optional*, defaults to 0):
        Number of channels used by the speaker embeddings. Is zero for single-speaker models.
    upsample_initial_channel (`int`, *optional*, defaults to 512):
        The number of input channels into the HiFi-GAN upsampling network.
    upsample_rates (`tuple[int]` or `list[int]`, *optional*, defaults to `[8, 8, 2, 2]`):
        A tuple of integers defining the stride of each 1D convolutional layer in the HiFi-GAN upsampling network.
        The length of `upsample_rates` defines the number of convolutional layers and has to match the length of
        `upsample_kernel_sizes`.
    upsample_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[16, 16, 4, 4]`):
        A tuple of integers defining the kernel size of each 1D convolutional layer in the HiFi-GAN upsampling
        network. The length of `upsample_kernel_sizes` defines the number of convolutional layers and has to match
        the length of `upsample_rates`.
    resblock_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[3, 7, 11]`):
        A tuple of integers defining the kernel sizes of the 1D convolutional layers in the HiFi-GAN
        multi-receptive field fusion (MRF) module.
    resblock_dilation_sizes (`tuple[tuple[int]]` or `list[list[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`):
        A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the
        HiFi-GAN multi-receptive field fusion (MRF) module.
    leaky_relu_slope (`float`, *optional*, defaults to 0.1):
        The angle of the negative slope used by the leaky ReLU activation.
    depth_separable_channels (`int`, *optional*, defaults to 2):
        Number of channels to use in each depth-separable block.
    depth_separable_num_layers (`int`, *optional*, defaults to 3):
        Number of convolutional layers to use in each depth-separable block.
    duration_predictor_flow_bins (`int`, *optional*, defaults to 10):
        Number of channels to map using the unonstrained rational spline in the duration predictor model.
    duration_predictor_tail_bound (`float`, *optional*, defaults to 5.0):
        Value of the tail bin boundary when computing the unconstrained rational spline in the duration predictor
        model.
    duration_predictor_kernel_size (`int`, *optional*, defaults to 3):
        Kernel size of the 1D convolution layers used in the duration predictor model.
    duration_predictor_dropout (`float`, *optional*, defaults to 0.5):
        The dropout ratio for the duration predictor model.
    duration_predictor_num_flows (`int`, *optional*, defaults to 4):
        Number of flow stages used by the duration predictor model.
    duration_predictor_filter_channels (`int`, *optional*, defaults to 256):
        Number of channels for the convolution layers used in the duration predictor model.
    prior_encoder_num_flows (`int`, *optional*, defaults to 4):
        Number of flow stages used by the prior encoder flow model.
    prior_encoder_num_wavenet_layers (`int`, *optional*, defaults to 4):
        Number of WaveNet layers used by the prior encoder flow model.
    posterior_encoder_num_wavenet_layers (`int`, *optional*, defaults to 16):
        Number of WaveNet layers used by the posterior encoder model.
    wavenet_kernel_size (`int`, *optional*, defaults to 5):
        Kernel size of the 1D convolution layers used in the WaveNet model.
    wavenet_dilation_rate (`int`, *optional*, defaults to 1):
        Dilation rates of the dilated 1D convolutional layers used in the WaveNet model.
    wavenet_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the WaveNet layers.
    speaking_rate (`float`, *optional*, defaults to 1.0):
        Speaking rate. Larger values give faster synthesised speech.
    noise_scale (`float`, *optional*, defaults to 0.667):
        How random the speech prediction is. Larger values create more variation in the predicted speech.
    noise_scale_duration (`float`, *optional*, defaults to 0.8):
        How random the duration prediction is. Larger values create more variation in the predicted durations.

    Example:

    ```python
    >>> from transformers import VitsModel, VitsConfig

    >>> # Initializing a "facebook/mms-tts-eng" style configuration
    >>> configuration = VitsConfig()

    >>> # Initializing a model (with random weights) from the "facebook/mms-tts-eng" style configuration
    >>> model = VitsModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```vits&   
vocab_size   hidden_size   num_hidden_layers   num_attention_heads   window_sizeTuse_biasi   ffn_dimg?	layerdropr   ffn_kernel_size	flow_sizei  spectrogram_binsrelu
hidden_acthidden_dropoutattention_dropoutactivation_dropoutg{Gz?initializer_rangegh㈵>layer_norm_eps"use_stochastic_duration_prediction   num_speakersr   speaker_embedding_sizei   upsample_initial_channel)   r'   r   r   .upsample_rates)   r)   r   r   upsample_kernel_sizes)r         resblock_kernel_sizes)r#   r      r.   r.   resblock_dilation_sizesleaky_relu_slopedepth_separable_channelsdepth_separable_num_layers
   duration_predictor_flow_binsg      @duration_predictor_tail_boundduration_predictor_kernel_sizeg      ?duration_predictor_dropoutduration_predictor_num_flows   "duration_predictor_filter_channelsprior_encoder_num_flows prior_encoder_num_wavenet_layersr)   $posterior_encoder_num_wavenet_layersr/   wavenet_kernel_sizewavenet_dilation_rateg        wavenet_dropoutg      ?speaking_rategMbX?noise_scaleg?noise_scale_durationi>  sampling_rateNpad_token_idc                     t        | j                        t        | j                        k7  r8t        dt        | j                         dt        | j                         d      y)zOPart of `@strict`-powered validation. Validates the architecture of the config.z'The length of `upsample_kernel_sizes` (z-) must match the length of `upsample_rates` ()N)lenr*   r(   
ValueError)selfs    |/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/vits/configuration_vits.pyvalidate_architecturez VitsConfig.validate_architecture   se    t))*c$2E2E.FF9#d>X>X:Y9Z [%%()<)<%=$>aA  G    ):__name__
__module____qualname____doc__
model_typer   int__annotations__r   r   r   r   r   boolr   r   floatr   r   r   r   strr   r   r   r    r!   r"   r$   r%   r&   r(   listtupler*   r-   r0   r1   r2   r3   r5   r6   r7   r8   r9   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rM    rN   rL   r	   r	      sI   Tl JJKs  KHdGS Ius{ OSIscJ"%NECK%%(us{(&))#u# NE /3&3L#"#C#$'c'2>NDIc3h/>9G49uS#X6G9C49uS#X6C,MTE\M!e!$%c%&''(* #*+.!5.*+"C+.11() #).1&1#$S$,-$c-02(#2  !"3"#&OUS[&!$M53;$K"%%%M3#L#*#rN   r	   N)	rR   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r	   __all__r[   rN   rL   <module>r`      sJ     . 3 # 12M! M  3M` .rN   