
    i*                     `   d dl mZ d dlZd dlmc mZ d dlmZ ddlmZ	 ddl
mZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,  ejZ                  e.      Z/ G d de#      Z0 G d de$      Z1 G d de*      Z2 G d dejf                        Z4 G d de      Z5 G d de      Z6 G d  d!e      Z7 G d" d#e      Z8 G d$ d%e      Z9 G d& d'e"      Z: G d( d)e!      Z; G d* d+ee9      Z< G d, d-ee9      Z=g d.Z>y)/    )CallableN)nn   )initialization)Cache)FlashAttentionKwargs) GenericForSequenceClassificationGenericForTokenClassification)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)logging)is_flash_attention_requested   )DeepseekV3AttentionDeepseekV3DecoderLayerDeepseekV3MoEDeepseekV3NaiveMoeapply_rotary_pos_emb_interleave)LlamaForCausalLM
LlamaModelLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)get_llama_4_attn_scale)Qwen2MoeMLP   )Mistral4Configc                       e Zd Zy)Mistral4RMSNormN__name__
__module____qualname__     ~/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/mistral4/modular_mistral4.pyr!   r!   3       r'   r!   c                       e Zd Zy)Mistral4RotaryEmbeddingNr"   r&   r'   r(   r+   r+   7   r)   r'   r+   c                       e Zd Zy)Mistral4MLPNr"   r&   r'   r(   r-   r-   ;   r)   r'   r-   c                   $     e Zd Z fdZd Z xZS )Mistral4TopkRouterc                     t         |           || _        |j                  | _        t	        j
                  t        j                  | j                  |j                  f            | _	        y N)
super__init__confign_routed_expertsr   	Parametertorchemptyhidden_sizeweight)selfr4   	__class__s     r(   r3   zMistral4TopkRouter.__init__@   sM     & 7 7ll5;;0E0EvGYGY/Z#[\r'   c                     |j                  d| j                  j                        }t        j                  || j
                        }|S )N)viewr4   r9   Flinearr:   )r;   hidden_statesrouter_logitss      r(   forwardzMistral4TopkRouter.forwardG   s8    %**2t{{/F/FG<r'   )r#   r$   r%   r3   rD   __classcell__r<   s   @r(   r/   r/   ?   s    ]r'   r/   c                       e Zd Zy)Mistral4NaiveMoeNr"   r&   r'   r(   rH   rH   M   r)   r'   rH   c                   b    e Zd Zdej                  deej                  ej                  f   fdZy)Mistral4MoErC   returnc                 P   |j                  d      }|j                  d| j                  | j                  | j                  z        j	                  dd      d   j                  d      }t        j                  || j                  dd      d   }t        j                  |      }|j                  d|d       |j                  d      j                  d| j                  | j                  | j                  z        j                  d| j                        }|j                  |j                          d      }t        j                  || j                  dd      d   }|j!                  d|      }| j"                  r|j                  dd	
      dz   }	||	z  }|| j$                  z  }||fS )Nr>   r   dimr   F)krN   sortedr           T)rN   keepdimg#B;)softmaxr?   n_groupr5   topksumr7   
topk_group
zeros_likescatter_	unsqueezeexpandreshapemasked_fillbooltop_kgathernorm_topk_probrouted_scaling_factor)
r;   rC   group_scores	group_idx
group_mask
score_maskscores_for_choicetopk_indicestopk_weightsdenominators
             r(   route_tokens_to_expertsz#Mistral4MoE.route_tokens_to_expertsR   s   %--b1r4<<1F1F$,,1VW\\]^df\ghijnnsunv 	 JJ|tBuUVWX	%%l3
Ay!,  $VBd&;&;t||&KLWR../ 	
 *55z7H6H#Nzz"3tzzrRWXYZ[$++A|<&**r4*@5HKK'L#d&@&@@\))r'   N)r#   r$   r%   r7   Tensortuplerk   r&   r'   r(   rJ   rJ   Q   s.    *U\\ *eELLZ_ZfZfLfFg *r'   rJ   c                      e Zd ZdedefdZ	 ddej                  deej                  ej                  f   dej                  dz  dej                  d	e	dz  d
e
e   deej                  ej                  dz  eej                     dz  f   fdZy)Mistral4Attentionr4   	layer_idxc                    t         j                  j                  |        || _        || _        |j
                  |j                  z  | _        |j                  | _        |j
                  | _	        |j                  | _
        |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  | _        d| _        | j                  ?t        j"                  |j$                  | j                  | j                  z  d      | _        nt        j"                  |j$                  |j                  |j(                        | _        t-        |j                        | _        t        j"                  |j                  | j                  | j                  z  d      | _        t        j"                  |j$                  | j                  | j                  z   |j(                        | _        t-        | j                        | _        t        j"                  | j                  | j                  | j                  | j                  z   z  d      | _        t        j"                  | j                  | j                  z  |j$                  |j(                        | _        | j                  dz  | _        y )NTF)biasg      )r   Moduler3   r4   rp   num_attention_headsnum_key_value_headsnum_key_value_groupsattention_dropout	num_headsq_lora_rankqk_rope_head_dimkv_lora_rank
v_head_dimqk_nope_head_dimqk_head_dim	is_causalLinearr9   q_projattention_biasq_a_projr!   q_a_layernormq_b_projkv_a_proj_with_mqakv_a_layernorm	kv_b_projo_projscalingr;   r4   rp   s      r(   r3   zMistral4Attention.__init__j   s   
		4 "$*$>$>&B\B\$\!!'!9!933!-- & 7 7"// ++ & 7 7!--#))F$6$6IYIY8Y`efDKIIf&8&8&:L:LSYShShiDM!01C1C!DDIIf&8&8$..4K[K[:[bghDM"$)) 5 55&&#

 .d.?.?@NNd33dooEF
 iiNNT__,&&
 ''D1r'   NrB   position_embeddingsattention_maskposition_idspast_key_valueskwargsrK   c                     |j                   d d \  }}||d| j                  f}	||d| j                  | j                  z   f}
| j                  | j                  |      }n/| j                  | j                  | j                  |                  }|j                  |	      j                  dd      }t        j                  || j                  | j                  gd      \  }}| j                  |      }t        j                  || j                  | j                  gd      \  }}| j!                  | j#                  |            j                  |
      j                  dd      }t        j                  || j                  | j                  gd      \  }}|j                  |d|| j                        }|\  }}| j$                  j&                  rt)        ||||      \  }}nt+        ||||      \  }} |j,                  g |j                   d d d }t        j.                  ||fd      }t        j.                  ||fd      }|t1        || j$                  j2                  j5                  d      | j$                  j2                  j5                  d            j7                  |j8                        z  }| |j;                  ||| j<                        \  }}t?        | j$                        rH| j                  | j                  k7  r/tA        jB                  |d| j                  | j                  z
  g      }tE        jF                  | j$                  jH                  tJ              } || ||||f| jL                  sdn| jN                  | jP                  d	|\  }}t?        | j$                        r4| j                  | j                  k7  r|d d d d d d d | j                  f   }|jS                  ||d      jU                         }| jW                  |      }||fS )
Nr>   r   r   rM   llama_4_scaling_beta original_max_position_embeddingsr   rQ   )dropoutr   ),shaper~   r}   r|   ry   r   r   r   r   r?   	transposer7   splitrz   r   r{   r   r   r4   rope_interleaver   r   r[   catr   rope_parametersgettodtypeupdaterp   r   r@   padr   get_interface_attn_implementationr   trainingrw   r   r\   
contiguousr   )r;   rB   r   r   r   r   r   
batch_size
seq_lengthquery_shape	key_shapeq_statesq_passq_rotcompressed_kvk_passk_rotvalue_statescossinquery_states
key_statesattention_interfaceattn_outputattn_weightss                            r(   rD   zMistral4Attention.forward   s    "/!4!4Sb!9
J!:r43C3CDR1F1F1XY	#{{=1H}}T%7%7m8T%UVH==-771=Ht/D/DdF[F[.\bde//>MD4E4EtG\G\3]cef 3 3F ;<AA)LVVWXZ[\${{6D4I4I4??3[acd

:q*d6K6KL&S;;&&:5%cRLE5/uc3GLE54fll3B/44yy&%b9YYB7
#&<KK''++,BCKK''++,NO'
 "\
 	! &'6'='=j,X\XfXf'g$J'49I9IT__9\5543C3Cdoo3U/VWL(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ (49I9IT__9\%aA/@/@&@AK!))*j"EPPRkk+.L((r'   r1   )r#   r$   r%   r   intr3   r7   rl   rm   r   r   r   rD   r&   r'   r(   ro   ro   i   s    )2~ )2# )2b )-F)||F) #5<<#=>F) t+	F)
 llF) F) -.F) 
u||U\\D0%2E2LL	MF)r'   ro   c                       e Zd ZdedefdZy)Mistral4DecoderLayerr4   rp   c                    t         j                  j                  |        |j                  | _        t	        ||      | _        ||j                  k\  rt        |      | _        nt        |      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )N)r4   rp   )eps)r   rs   r3   r9   ro   	self_attnfirst_k_dense_replacerJ   mlpr-   r!   rms_norm_epsinput_layernormpost_attention_layernormr   s      r(   r3   zMistral4DecoderLayer.__init__   s    
		4 !--*&IN444"6*DH"6*DH.v/A/AvGZGZ[(78J8JPVPcPc(d%r'   N)r#   r$   r%   r   r   r3   r&   r'   r(   r   r      s    e~ e# er'   r   c                        e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZg Zg Z ej(                          fd       Z xZS )Mistral4PreTrainedModelr4   modelTr   r   )rB   
attentionsc                    t         |   |       t        |t              r7t	        j
                  |j                  d| j                  j                         y t        |t              rmt	        j
                  |j                  d| j                  j                         t	        j
                  |j                  d| j                  j                         y y )NrQ   )meanstd)r2   _init_weights
isinstancer/   initnormal_r:   r4   initializer_rangerH   gate_up_proj	down_proj)r;   moduler<   s     r(   r   z%Mistral4PreTrainedModel._init_weights  s    f%f01LLSdkk6S6ST 01LL,,3DKK<Y<YZLL))9V9VW 2r'   )r#   r$   r%   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   ro   _can_record_outputs_keep_in_fp32_modules_strict"_keys_to_ignore_on_load_unexpectedr7   no_gradr   rE   rF   s   @r(   r   r      s|    &*#/0#4"5N!"&-' $& )+&U]]_X Xr'   r   c                       e Zd Zy)Mistral4ModelNr"   r&   r'   r(   r   r     r)   r'   r   c                       e Zd Zy)Mistral4ForCausalLMNr"   r&   r'   r(   r   r     r)   r'   r   c                       e Zd Zy)!Mistral4ForSequenceClassificationNr"   r&   r'   r(   r   r     r)   r'   r   c                       e Zd Zy)Mistral4ForTokenClassificationNr"   r&   r'   r(   r   r     r)   r'   r   )r   r   r   r   r   )?collections.abcr   r7   torch.nn.functionalr   
functionalr@    r   r   cache_utilsr   modeling_flash_attention_utilsr   modeling_layersr	   r
   modeling_utilsr   r   processing_utilsr   utilsr   utils.genericr    deepseek_v3.modeling_deepseek_v3r   r   r   r   r   llama.modeling_llamar   r   r   r   r   r   ministral3.modeling_ministral3r   qwen2_moe.modeling_qwen2_moer   configuration_mistral4r   
get_loggerr#   loggerr!   r+   r-   rs   r/   rH   rJ   ro   r   r   r   r   r   r   __all__r&   r'   r(   <module>r      s   %     &   B ^ F &  9   D 6 2 
		H	%	l 		2 		+ 	 	) 	*- *0r)+ r)je1 e Xo X:	J 		* 		(HJa 		%BD[ 	r'   