
    i1!                         d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
  ed	
      e G d de                    Z ed	
      e G d de                    ZddgZy)zMoshi model configuration    )strict   )PreTrainedConfig)RopeParameters)auto_docstring   )
AutoConfigzkmhf/hf-moshiko)
checkpointc                       e Zd ZU dZdZdgZdZeed<   dZ	eed<   dZ
eed	<   d
Zeed<   dZeed<   dZedz  ed<   dZeed<   dZeed<   dZeed<   dZedz  ed<   dZeed<   dZeed<   dZeed<   dZeez  ed<   dZeed <   d!Zeed"<   dZeed#<   d$Zeed%<   dZedz  ed&<   dZedz  ed'<   dZee e   z  dz  ed(<    fd)Z!d* Z" xZ#S )+MoshiDepthConfiga  
    input_size (`int`, *optional*, defaults to 4096):
        Dimensionality of the input hidden states. Used to connect the main decoder to the depth decoder.
    audio_vocab_size (`int`, *optional*, defaults to 2048):
        Vocabulary size of the audio part of model. Defines the number of different tokens that can be
        represented by the `audio_codes` passed when calling the Moshi models.
    ffn_dim (`int`, *optional*, defaults to 5632):
        Dimensionality of the "intermediate" (often named feed-forward) layer in the depth decoder block. Must be even.

    Example:

    ```python
    >>> from transformers import (
    ...     MoshiDepthConfig,
    ...     MoshiDepthDecoder,
    ... )

    >>> configuration = MoshiDepthConfig()

    >>> # Initializing a MoshiDepthDecoder (with random weights) from the kmhf/hf-moshiko style configuration
    >>> model = MoshiDepthDecoder(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```moshi_depthpast_key_values }  
vocab_sizei   hidden_size   
input_size   num_hidden_layers   num_attention_headsNnum_key_value_headsi   audio_vocab_size	   max_position_embeddingssilu
hidden_acthead_dim{Gz?initializer_rangeT	use_cache   sliding_window        attention_dropouti   ffn_dim:0yE>rms_norm_epsnum_codebooksFtie_word_embeddingspad_token_idbos_token_ideos_token_idc                     | j                   | j                   n| j                  | _         | j                  xs | j                  | j                  z  | _        t	        |   di | y )N )r   r   r   r   super__post_init__)selfkwargs	__class__s     ~/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/moshi/configuration_moshi.pyr1   zMoshiDepthConfig.__post_init__N   s[    (,(@(@(LD$$RVRjRj 	  U)9)9T=U=U)U''    c                 Z    | j                   dz  dk(  rt        d| j                    d      y)OPart of `@strict`-powered validation. Validates the architecture of the config.r      	`ffn_dim=` must be even.N)r&   
ValueErrorr2   s    r5   validate_architecturez&MoshiDepthConfig.validate_architectureU   s0    <<!q yoFGG !r6   )$__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencer   int__annotations__r   r   r   r   r   r   r   r   strr   r    floatr!   boolr#   r%   r&   r(   r)   r*   r+   r,   r-   listr1   r>   __classcell__r4   s   @r5   r   r      s   4 J#4"5JKJs!!&*t* c #$S$JHcDj#u#ItNC%(us{(GSL%M3 %%#L#*##L#*#+/L#S	/D(/(Hr6   r   c                        e Zd ZU dZdZdgZeedZdZ	e
ed<   dZe
ed<   d	Ze
ed
<   d	Ze
ed<   dZe
dz  ed<   dZe
dz  ed<   dZe
ed<   dZeez  dz  ed<   dZeed<   dZe
dz  ed<   dZeed<   dZeed<   dZe
ed<   dZee
z  ed<   dZe
ed<   dZeed<   d Z e
ed!<   d"Z!eed#<   dZ"e
dz  ed$<   dZ#e
dz  ed%<   dZ$e
e%e
   z  dz  ed&<   dZ&ee'z  dz  ed'<   dZ(ee'z  dz  ed(<    fd)Z)d* Z*e+d+        Z,e-d'e'fd,       Z. xZ/S )-MoshiConfiga  
    audio_vocab_size (`int`, *optional*):
        Vocabulary size of the audio part of model. Defines the number of different tokens that can be
        represented by the `audio_codes` passed when calling the Moshi models.
    ffn_dim (`int`, *optional*, defaults to 22528):
        Dimensionality of the "intermediate" (often named feed-forward) layer in the main decoder block. Must be even.
    audio_encoder_config (`PreTrainedConfig | dict`, *optional*):
        Configuration for the audio encoder.
    depth_decoder_config (`PreTrainedConfig | dict`, *optional*):
        Configuration for the depth decoder.

    Example:

    ```python
    >>> from transformers import (
    ...     MoshiConfig,
    ...     MoshiForConditionalGeneration,
    ... )

    >>> configuration = MoshiConfig()

    >>> # Initializing a MoshiForConditionalGeneration (with random weights) from the kmhf/hf-moshiko style configuration
    >>> model = MoshiForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # Saving the model, including its configuration
    >>> model.save_pretrained("kmhf/hf-moshiko")

    >>> # loading model and config from pretrained folder
    >>> moshi_config = MoshiConfig.from_pretrained("kmhf/hf-moshiko")
    >>> model = MoshiForConditionalGeneration.from_pretrained("kmhf/hf-moshiko", config=moshi_config)
    ```moshir   )audio_encoder_configdepth_decoder_configr   r   r   r       r   r   Nr   r   i  r   rope_parametersr   r   r   r   r    Tr!   r#   r$   r%   i X  r&   r'   r(   r"   r)   Fr*   r+   r,   r-   rP   rQ   c                 h   | j                   | j                   n| j                  | _         | j                  xs | j                  | j                  z  | _        t	        | j
                  t              rB| j
                  j                  dd      }t        j                  |fi | j
                  | _        n&| j
                  t        j                  d      | _        | j                  | j
                  j                  n| j                  | _
        t	        | j                  t              rc| j                  j                  | j                  | j                  | j                  | j                  d       t!        di | j                  | _        n| j                  t!               | _        t#        | H  di | y )NrC   mimi)r   r   r   r)   r/   )r   r   r   r   
isinstancerP   dictpopr	   	for_modelr   codebook_sizerQ   updater   r)   r   r0   r1   )r2   r3   audio_encoder_model_typer4   s      r5   r1   zMoshiConfig.__post_init__   sm   (,(@(@(LD$$RVRjRj 	  U)9)9T=U=U)Ud//6'+'@'@'D'D\SY'Z$(2(<(<=U(sY]YrYr(sD%&&.(2(<(<V(DD% 8<7L7L7TD%%33Z^ZoZo 	 d//6%%,,(,(=(="&"2"2"&//%)%7%7	 )9(U4;T;T(UD%&&.(8(:D%''r6   c                     | j                   dz  dk(  rt        d| j                    d      | j                  | j                  j                  kD  r0t        d| j                   d| j                  j                   d      y)	r8   r   r9   r:   r;   z`num_codebooks=zX` is greater than the maximum number of codebooks that the audio encoder can deal with (z). Please lower it.N)r&   r<   r)   rP   r=   s    r5   r>   z!MoshiConfig.validate_architecture   s    <<!q yoFGG 9 9 G GG!$"4"4!5  6N  OS  Oh  Oh  Ov  Ov  Nw  wJ  K  Hr6   c                 .    | j                   j                  S )N)rP   sampling_rater=   s    r5   r_   zMoshiConfig.sampling_rate   s    ((666r6   c                 2     | dd|j                         i|S )z
        Instantiate a [`MoshiConfig`] (or a derived class) from an audio encoder configuration.

        Returns:
            [`MoshiConfig`]: An instance of a configuration object
        rP   r/   )to_dict)clsrP   r3   s      r5   from_audio_encoder_configz%MoshiConfig.from_audio_encoder_config   s*      
!5!=!=!?

 	
r6   )0r?   r@   rA   rB   rC   rD   r	   r   sub_configsr   rE   rF   r   r   r   r   r   r   rS   r   rW   r   rG   r   r    rH   r!   rI   r#   r%   r&   r(   r)   r*   r+   r,   r-   rJ   rP   r   rQ   r1   r>   propertyr_   classmethodrc   rK   rL   s   @r5   rN   rN   [   s   !F J#4"5+5O_`KJKs!!&*t*#'cDj'#'S'48O^d*T18JHcDj#u#ItNC%(us{(GSL%M3 %%#L#*##L#*#+/L#S	/D(/;?$!11D8?;?$!11D8?(< 7 7 
.
 
r6   rN   N)rB   huggingface_hub.dataclassesr   configuration_utilsr   modeling_rope_utilsr   utilsr   auto.configuration_autor	   r   rN   __all__r/   r6   r5   <module>rm      s      . 3 1 # 0 ,->H' >H  .>HB ,-|
" |
  .|
~ ,
-r6   