
    i                     j    d Z ddlmZ ddlmZ ddlmZ  ed      e G d d	e                    Zd	gZy
)zmT5 model configuration    )strict   )PreTrainedConfig)auto_docstringzgoogle/mt5-small)
checkpointc                       e Zd ZU dZdZdgZdddddZd	Zee	d
<   dZ
ee	d<   dZee	d<   dZee	d<   dZee	d<   dZedz  e	d<   dZee	d<   dZee	d<   dZee	d<   dZeez  e	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d <   dZee	d!<   dZee	d"<   dZedz  e	d#<   d$Zedz  e	d%<   d&Zeee   z  dz  e	d'<   d$Z edz  e	d(<   d)Z!eez  e	d*<   d+Z"ee	d,<    fd-Z#d. Z$ xZ%S )/	MT5Configa  
    relative_attention_num_buckets (`int`, *optional*, defaults to 32):
        The number of buckets to use for each attention layer.
    relative_attention_max_distance (`int`, *optional*, defaults to 128):
        The maximum distance of the longer sequences for the bucket separation.
    feed_forward_proj (`str`, *optional*, defaults to `"gated-gelu"`):
        Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`.
    mt5past_key_valuesd_model	num_heads
num_layersd_kv)hidden_sizenum_attention_headsnum_hidden_layershead_dimi  
vocab_sizei   @   i   d_ff   Nnum_decoder_layers       relative_attention_num_buckets   relative_attention_max_distanceg?dropout_rategư>layer_norm_epsilong      ?initializer_factor
gated-gelufeed_forward_projTis_encoder_decoder	use_cachetie_word_embeddingsbos_token_idr   pad_token_id   eos_token_iddecoder_start_token_idg        classifier_dropoutF
is_decoderc                 8   | j                   | j                   n| j                  | _         | j                  j                  d      }|d   | _        |d   dk(  | _        | j                  dk(  rd| _        |j                  dd        d| _        t        | $  d	i | y )
N-r   gatedr!   gelu_newr%   T )
r   r   r"   splitdense_act_fnis_gated_actpopr%   super__post_init__)selfkwargsact_info	__class__s      z/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/mt5/configuration_mt5.pyr8   zMT5Config.__post_init__B   s    '+'>'>'JD##PTP_P_ 	 ))//4$RL$QK72!!\1 *D 	

($/#' ''    c                     | j                   j                  d      }t        |      dkD  r|d   dk7  st        |      dkD  rt        d| j                    d      y)	zOPart of `@strict`-powered validation. Validates the architecture of the config.r.   r(   r   r0      z`feed_forward_proj`: z is not a valid activation function of the dense layer. Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. 'gated-gelu' or 'relu'N)r"   r3   len
ValueError)r9   r;   s     r=   validate_architecturezMT5Config.validate_architectureS   sf    ))//4x=1!!73x=1;L'(>(>'? @) )  <Mr>   )&__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapr   int__annotations__r   r   r   r   r   r   r   r   r   floatr   r    r"   strr#   boolr$   r%   r&   r'   r)   listr*   r+   r,   r8   rC   __classcell__)r<   s   @r=   r	   r	      sA    J#4"5 *)	M JGSD#ND#J%)d
)Is*,"C,+.#S. #L%#+# $$ ##)s)##It $$#L#*# L#* +,L#S	/D(,)*C$J*&))J("	r>   r	   N)	rG   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r	   __all__r2   r>   r=   <module>rV      sJ     . 3 # -.D  D  /DN -r>   