
    i                     v    d Z ddlmZ ddlmZ ddlmZ ddlmZ  ed      e G d	 d
e                    Z	d
gZ
y)zMistral4 model configuration    )strict   )PreTrainedConfig)RopeParameters)auto_docstringz#mistralai/Mistral-Small-4-119B-2603)
checkpointc            
           e Zd ZU dZdZdgZdddddddddd	Zd	gd
gfddgdgfdgdgfdZddiZdZ	e
ed<   dZe
ed<   dZe
ed<   dZe
ed<   dZe
ed<   dZe
ed<   dZe
dz  ed<   dZe
ed<   d Ze
ed<   d!Zeed"<   d#Ze
ed$<   d%Ze
dz  ed&<   d'Ze
ed(<   d Ze
dz  ed)<   d'Ze
ed*<   dZe
dz  ed+<   dZe
dz  ed,<   d-Ze
dz  ed.<   d/Ze
dz  ed0<   d1Ze dz  ed2<   d3Z!e"ed4<   d5Z#e
ed6<   d7Z$eed8<   d9Z%eed:<   d1Z&e ed;<   d<Z'e
dz  ed=<   dZ(e
dz  ed><   d?Z)e
e*e
   z  dz  ed@<   dZ+e
dz  edA<   dBZ,e edC<   dZ-e.e/z  dz  edD<   d1Z0e dz  edE<   dBZ1e edF<   dGZ2ee
z  dz  edH<    fdIZ3dLdJe4dz  fdKZ5 xZ6S )MMistral4Configa  
    n_group (`int`, *optional*, defaults to 1):
        Number of groups for routed experts.
    first_k_dense_replace (`int`, *optional*, defaults to 0):
        Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
                                                        \--k dense layers--/
    rope_interleave (`bool`, *optional*, defaults to `True`):
        Whether to interleave the rotary position embeddings.

    Example:

    ```python
    >>> from transformers import Mistral4Model, Mistral4Config

    >>> # Initializing a Mistral4 style configuration
    >>> configuration = Mistral4Config()

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```mistral4past_key_valuespacked_colwiserowwisemoe_tp_expertscolwise)	z!layers.*.mlp.experts.gate_up_projzlayers.*.mlp.experts.down_projzlayers.*.mlp.expertsz%layers.*.mlp.shared_experts.gate_projz#layers.*.mlp.shared_experts.up_projz%layers.*.mlp.shared_experts.down_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormnum_local_expertsn_routed_expertsi   
vocab_sizei   hidden_sizei 0  intermediate_sizei   moe_intermediate_size$   num_hidden_layers    num_attention_headsNnum_key_value_heads   n_shared_experts         ?routed_scaling_factor   kv_lora_ranki   q_lora_rank@   qk_rope_head_dim
v_head_dimqk_nope_head_dimn_group
topk_group   num_experts_per_tokr   first_k_dense_replaceTnorm_topk_probsilu
hidden_acti   max_position_embeddingsg{Gz?initializer_rangegư>rms_norm_eps	use_cache   pad_token_idbos_token_id   eos_token_idpretraining_tpFtie_word_embeddingsrope_parametersrope_interleaveattention_biasg        attention_dropoutc                    | j                   Adddd| j                  ddddd| j                  | j                  | j                  z   z  d| _         | j                  | j
                  | _        | j                  | j                  z   | _        | j                  | j                  z   | _        | j                   j                  d	| j                  | j                  z         t        | (  dd
ddhi| y )Nyarng     @g      `@i    g      @@r&   g?)type
rope_thetafactor original_max_position_embeddingsr7   	beta_fast	beta_slowmscale_all_dimmscalellama_4_scaling_betapartial_rotary_factorrQ   ignore_keys_at_rope_validationrP   r7    )rB   r7   r,   r.   r"   r!   qk_head_dimhead_dim
setdefaultsuper__post_init__)selfkwargs	__class__s     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/mistral4/configuration_mistral4.pyrX   zMistral4Config.__post_init__h   s    '%48+/+G+G! "%(+)-)>)>$BWBWZ^ZoZoBo)p$D  ##+'+'?'?D$0043H3HH--0E0EE''(?AVAVY]YfYfAfg 	
,BD]+^	
bh	
    rR   c                    |j                  dd       }|xs | j                  | _        | j                  | j                  ni | _        | j                  j                  d|j                  d| j                               | j	                          || j
                  |z  | _        | j                          dD ]6  }|| j                  v st        | j                  |         | j                  |<   8 |S )Nrope_scalingrI   )rL   rM   rJ   )poprB   rV   default_thetastandardize_rope_paramsrR   validate_ropefloat)rY   rR   rZ   r_   keys        r\   convert_rope_params_to_dictz*Mistral4Config.convert_rope_params_to_dict   s    zz.$7+Ct/C/C7;7K7K7Wt33]_ 	''fjjtOaOa6bc$$&)5262U2UXv2vD/ 8 	MCd***,1$2F2Fs2K,L$$S)	M r]   )N)7__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planattribute_mapr   int__annotations__r   r   r   r   r!   r"   r$   r   r'   rd   r)   r*   r,   r-   r.   r/   r0   r2   r3   r4   boolr6   strr7   r8   r9   r:   r<   r=   r?   listr@   rA   rB   r   dictrC   rD   rE   rX   setrf   __classcell__)r[   s   @r\   r
   r
      sL   * J#4"5-=*3 01:/81:"+ )"+
 &(9:#%568IJ!"_$56 	/M JK"s"!%3%s!!&(t(cc#&5&L#"Kt"c Jd
 cGS4ZJd
&'t'()3:)"&ND4K&J#*S*#u#L%It!L#*! L#* +,L#S	/D(,!"NC$J" %%48O^d*T18#'OTD[' ND ,/us{T)/
4#PT* r]   r
   N)rj   huggingface_hub.dataclassesr   configuration_utilsr   modeling_rope_utilsr   utilsr   r
   __all__rS   r]   r\   <module>r}      sO    # . 3 1 # @Ay% y  Byx 
r]   