
    i'|                     b   d dl mZ d dlmZ d dlZd dlmc mZ d dlmZ ddl	m
Z ddlmZ ddlmZmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5  ed       G d dejl                               Z7 G d dejl                        Z8 G d dejl                        Z9 G d dejl                        Z:e G d  d!ejl                               Z; G d" d#ejl                        Z<d$ Z= ed%      dHd&       Z>d'ej~                  d(e@d)ej~                  fd*ZA	 dId+ejl                  d,ej~                  d-ej~                  d.ej~                  d/ej~                  dz  d0eBd1eBd2e)e+   fd3ZCdJd4ZDd5ej~                  d6eBd7e@d)ej~                  fd8ZE G d9 d:ejl                        ZF G d; d<e      ZG G d= d>e'      ZHe, G d? d@eH             ZIe, G dA dBeHe             ZJ G dC dDeeH      ZK G dE dFeeH      ZLg dGZMy)K    )Callable)OptionalN)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_experts_implementationuse_kernel_forward_from_hubuse_kernel_func_from_hub)create_causal_mask)FlashAttentionKwargs) GenericForSequenceClassificationGenericForTokenClassificationGradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)is_flash_attention_requestedmaybe_autocastmerge_with_config_defaults)capture_outputs   )Mistral4ConfigRMSNormc                   h     e Zd Zddeddf fdZdej                  dej                  fdZd Z xZ	S )	Mistral4RMSNormepsreturnNc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z>
        Mistral4RMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer'   	__class__s      /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/mistral4/modeling_mistral4.pyr+   zMistral4RMSNorm.__init__3   s1     	ll5::k#:; #    hidden_statesc                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )N   T)keepdim)	dtypetor-   float32powmeanrsqrtr0   r/   )r1   r6   input_dtypevariances       r4   forwardzMistral4RMSNorm.forward;   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r5   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler/   shaper0   )r1   s    r4   
extra_reprzMistral4RMSNorm.extra_reprB   s*    ))*+6$2G2G1HIIr5   )gư>)
__name__
__module____qualname__floatr+   r-   TensorrC   rG   __classcell__r3   s   @r4   r&   r&   1   s7    $ $$ $;U\\ ;ell ;Jr5   r&   c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Z xZS )Mistral4RotaryEmbeddinginv_freqNconfigc                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultrQ   F)
persistentoriginal_inv_freq)r*   r+   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrR   rope_parametersrT   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r1   rR   devicerope_init_fnrQ   r3   s        r4   r+   z Mistral4RotaryEmbedding.__init__I   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuUr5   r`   ztorch.deviceseq_lenr(   ztorch.Tensorc                    | j                   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNg      ?r   r8   r;   )r`   r;   )	r[   getattrr2   num_attention_headsr-   arangeint64r<   rK   )rR   r`   rb   basedimattention_factorrQ   s          r4   r\   z7Mistral4RotaryEmbedding.compute_default_rope_parametersY   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r5   c                 N   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r9   r"   mpscpuF)device_typeenabledr8   rl   rf   )rQ   rK   expandrF   r<   r`   
isinstancetypestrr   	transposer-   catcosr]   sinr;   )
r1   xposition_idsinv_freq_expandedposition_ids_expandedrq   freqsembrz   r{   s
             r4   rC   zMistral4RotaryEmbedding.forwardw   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   BFF$N)NNN)rH   rI   rJ   r-   rL   __annotations__r#   r+   staticmethodr   intrE   rK   r\   no_gradr   rC   rM   rN   s   @r4   rP   rP   F   s    llV~ V  (,+/"*%*(* t* 
~u$	%	* *: U]]_<  <r5   rP   c                   &     e Zd Zd fd	Zd Z xZS )Mistral4MLPc                    t         |           || _        |j                  | _        ||j                  n|| _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        y NFbias)r*   r+   rR   r2   intermediate_sizer   Linear	gate_projup_proj	down_projr   
hidden_actact_fn)r1   rR   r   r3   s      r4   r+   zMistral4MLP.__init__   s    !--=N=V!9!9\m4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../r5   c                     | j                  | j                  | j                  |            | j                  |      z        }|S r   )r   r   r   r   )r1   r|   r   s      r4   rC   zMistral4MLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r5   r   rH   rI   rJ   r+   rC   rM   rN   s   @r4   r   r      s    0r5   r   c                   $     e Zd Z fdZd Z xZS )Mistral4TopkRouterc                     t         |           || _        |j                  | _        t	        j
                  t        j                  | j                  |j                  f            | _	        y r   )
r*   r+   rR   n_routed_expertsr   r,   r-   emptyr2   r/   r1   rR   r3   s     r4   r+   zMistral4TopkRouter.__init__   sM     & 7 7ll5;;0E0EvGYGY/Z#[\r5   c                     |j                  d| j                  j                        }t        j                  || j
                        }|S Nr9   )viewrR   r2   Flinearr/   )r1   r6   router_logitss      r4   rC   zMistral4TopkRouter.forward   s8    %**2t{{/F/FG<r5   r   rN   s   @r4   r   r      s    ]r5   r   c                        e Zd ZdZ fdZdej                  dej                  dej                  dej                  fdZ xZS )Mistral4NaiveMoez2Collection of expert weights stored as 3D tensors.c                    t         |           |j                  | _        |j                  | _        |j                  | _        t        j                  t        j                  | j                  d| j                  z  | j
                              | _        t        j                  t        j                  | j                  | j
                  | j                              | _        t        |j                     | _        y )Nr8   )r*   r+   num_local_expertsnum_expertsr2   
hidden_dimmoe_intermediate_sizeintermediate_dimr   r,   r-   r   gate_up_projr   r   r   r   r   s     r4   r+   zMistral4NaiveMoe.__init__   s    !33 ,, & < <LLT5E5Eq4K`K`G`bfbqbq)rsekk$2B2BDOOUYUjUj&klV../r5   r6   top_k_indextop_k_weightsr(   c                 f   t        j                  |      }t        j                         5  t         j                  j                  j                  || j                        }|j                  ddd      }t        j                  |j                  d      d      j                         }d d d        D ]  }|d   }|| j                  k(  rt        j                  |         \  }}	||	   }
t        j                  j                  |
| j                  |         j                  dd      \  }}| j                  |      |z  }t        j                  j                  || j                   |         }|||	|d f   z  }|j#                  d|	|j%                  |j&                                |S # 1 sw Y   xY w)N)num_classesr8   r"   r   )r9   rs   r9   )r-   
zeros_liker   r   
functionalone_hotr   permutegreatersumnonzerowherer   r   chunkr   r   
index_add_r<   r;   )r1   r6   r   r   final_hidden_statesexpert_mask
expert_hit
expert_idx	top_k_pos	token_idxcurrent_stategateupcurrent_hidden_statess                 r4   rC   zMistral4NaiveMoe.forward   s    $..}=]]_ 	S((--55ktO_O_5`K%--aA6K{8'DaHPPRJ	S
 % 
	nJ#AJT---#(;;{:/F#G Iy))4M}}++M4;L;LZ;XY__`agi_jHD"$(KK$5$:!$&MM$8$89NPTP^P^_iPj$k!$9M)U^`dJd<e$e!**1i9N9Q9QReRkRk9lm
	n #"#	S 	Ss   A=F&&F0)	rH   rI   rJ   __doc__r+   r-   rL   rC   rM   rN   s   @r4   r   r      sF    <0#||# \\# ||	#
 
#r5   r   c                   ~     e Zd ZdZ fdZdej                  deej                  ej                  f   fdZd Z	 xZ
S )Mistral4MoEz:
    A mixed expert module containing shared experts.
    c                    t         |           || _        t        |      | _        t        |      | _        t        ||j                  |j                  z        | _
        |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                   | _        y )N)rR   r   )r*   r+   rR   r   expertsr   r   r   r   n_shared_expertsshared_expertsr   n_group
topk_groupnorm_topk_probrouted_scaling_factornum_experts_per_toktop_kr   s     r4   r+   zMistral4MoE.__init__   s    '/&v.	)V-I-IFLcLc-c
 !' 7 7~~ ++$33%+%A%A"//
r5   r   r(   c                 P   |j                  d      }|j                  d| j                  | j                  | j                  z        j	                  dd      d   j                  d      }t        j                  || j                  dd      d   }t        j                  |      }|j                  d|d       |j                  d      j                  d| j                  | j                  | j                  z        j                  d| j                        }|j                  |j                          d      }t        j                  || j                  dd      d   }|j!                  d|      }| j"                  r|j                  dd	
      dz   }	||	z  }|| j$                  z  }||fS )Nr9   r8   rs   r   F)krl   sortedr"           T)rl   r:   g#B;)softmaxr   r   r   topkr   r-   r   r   scatter_	unsqueezert   reshapemasked_fillboolr   gatherr   r   )
r1   r   group_scores	group_idx
group_mask
score_maskscores_for_choicetopk_indicestopk_weightsdenominators
             r4   route_tokens_to_expertsz#Mistral4MoE.route_tokens_to_experts   s   %--b1r4<<1F1F$,,1VW\\]^df\ghijnnsunv 	 JJ|tBuUVWX	%%l3
Ay!,  $VBd&;&;t||&KLWR../ 	
 *55z7H6H#Nzz"3tzzrRWXYZ[$++A|<&**r4*@5HKK'L#d&@&@@\))r5   c                    |}|j                   }| j                  |      }| j                  |      \  }}|j                  d|j                   d         } | j	                  |||      j                  | }|| j                  |      z   }|S r   )rF   r   r   r   r   r   )r1   r6   	residuals
orig_shaper   r   r   s          r4   rC   zMistral4MoE.forward   s    !	"((
		-0%)%A%A-%P"l%**2}/B/B2/FGT]L,OTTV`a%(;(;I(FFr5   )rH   rI   rJ   r   r+   r-   rL   rE   r   rC   rM   rN   s   @r4   r   r      s>    0*U\\ *eELLZ_ZfZfLfFg *,r5   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr9   r8   rs   )rF   r-   ry   )r|   x1x2s      r4   rotate_halfr     sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r5   rotary_pos_embc                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r   r   )qr   rz   r{   unsqueeze_dimq_embedk_embeds          r4   apply_rotary_pos_embr   	  sY    & --
&C
--
&C3w;q>C/0G3w;q>C/0GGr5   r6   n_repr(   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r"   N)rF   rt   r   )r6   r   batchnum_key_value_headsslenre   s         r4   	repeat_kvr   #  so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr5   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
||
|z   }
t
        j                  j                  |
dt        j                        j                  |j                        }
t
        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr8   r   r9   )rl   r;   )ptrainingr"   )r   num_key_value_groupsr-   matmulrx   r   r   r   r=   r<   r;   r   r  
contiguous)r   r   r   r   r   r   r   r  
key_statesvalue_statesattn_weightsattn_outputs               r4   eager_attention_forwardr  /  s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r5   c                    |j                  |      }|j                  |      }| j                  \  }}}}	| j                  ||||	dz  d      j                  dd      j	                  ||||	      } |j                  \  }}}}	|j                  ||||	dz  d      j                  dd      j	                  ||||	      }| |z  t        |       |z  z   }
||z  t        |      |z  z   }|
|fS )a  
    TODO let's just use the original freqcis computation to not have the view
    transpose + reshape! This is not optimized!
    Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    r8      r   )r   rF   r   rx   r   r   )r   r   rz   r{   r}   r   bhsdr   r   s               r4   apply_rotary_pos_emb_interleaver  H  s    0 --
&C
--
&CJAq!Q	q!QQ",,Q2::1aAFAJAq!Q	q!QQ",,Q2::1aAFA3w;q>C/0G3w;q>C/0GGr5   positions_idsbetarX   c           	          d|t        j                  dt        j                  | |z        z         z  z   }|d d d d d d f   S Nr"   )r-   logfloor)r  r  rX   r   s       r4   get_llama_4_attn_scaler  n  sB    $1u{{=CZ3Z'[#[\\\G1dAt#$$r5   c                   .    e Zd ZdZdedef fdZ	 ddej                  de	ej                  ej                  f   dej                  dz  d	ej                  d
e
dz  dee   de	ej                  ej                  dz  e	ej                     dz  f   fdZ xZS )Mistral4Attentionz=Multi-headed attention from 'Attention Is All You Need' paperrR   	layer_idxc                 f   t         |           || _        || _        |j                  |j
                  z  | _        |j                  | _        |j                  | _        |j                  | _	        |j                  | _
        |j                  | _        |j                  | _        |j                  | _        |j                  | _        d| _        | j                  ?t!        j"                  |j$                  | j                  | j                  z  d      | _        nt!        j"                  |j$                  |j                  |j(                        | _        t-        |j                        | _        t!        j"                  |j                  | j                  | j                  z  d      | _        t!        j"                  |j$                  | j                  | j                  z   |j(                        | _        t-        | j                        | _        t!        j"                  | j                  | j                  | j                  | j                  z   z  d      | _        t!        j"                  | j                  | j                  z  |j$                  |j(                        | _        | j                  dz  | _        y )NTFr   g      )r*   r+   rR   r  rh   r   r  attention_dropout	num_headsq_lora_rankqk_rope_head_dimkv_lora_rank
v_head_dimqk_nope_head_dimqk_head_dim	is_causalr   r   r2   q_projattention_biasq_a_projr&   q_a_layernormq_b_projkv_a_proj_with_mqakv_a_layernorm	kv_b_projo_projr   r1   rR   r  r3   s      r4   r+   zMistral4Attention.__init__v  s   "$*$>$>&B\B\$\!!'!9!933!-- & 7 7"// ++ & 7 7!--#))F$6$6IYIY8Y`efDKIIf&8&8&:L:LSYShShiDM!01C1C!DDIIf&8&8$..4K[K[:[bghDM"$)) 5 55&&#

 .d.?.?@NNd33dooEF
 iiNNT__,&&
 ''D1r5   Nr6   position_embeddingsr   r}   past_key_valuesr  r(   c                     |j                   d d \  }}||d| j                  f}	||d| j                  | j                  z   f}
| j                  | j                  |      }n/| j                  | j                  | j                  |                  }|j                  |	      j                  dd      }t        j                  || j                  | j                  gd      \  }}| j                  |      }t        j                  || j                  | j                  gd      \  }}| j!                  | j#                  |            j                  |
      j                  dd      }t        j                  || j                  | j                  gd      \  }}|j                  |d|| j                        }|\  }}| j$                  j&                  rt)        ||||      \  }}nt+        ||||      \  }} |j,                  g |j                   d d d }t        j.                  ||fd      }t        j.                  ||fd      }|t1        || j$                  j2                  j5                  d      | j$                  j2                  j5                  d            j7                  |j8                        z  }| |j;                  ||| j<                        \  }}t?        | j$                        rH| j                  | j                  k7  r/tA        jB                  |d| j                  | j                  z
  g      }tE        jF                  | j$                  jH                  tJ              } || ||||f| jL                  sdn| jN                  | jP                  d	|\  }}t?        | j$                        r4| j                  | j                  k7  r|d d d d d d d | j                  f   }|jS                  ||d      jU                         }| jW                  |      }||fS )
Nr9   r"   r8   rs   llama_4_scaling_beta original_max_position_embeddingsr   r   )r   r   ),rF   r&  r%  r$  r!  r(  r,  r+  r*  r   rx   r-   splitr"  r-  r#  r/  r.  rR   rope_interleaver  r   rt   ry   r  r[   getr<   r;   updater  r   r   padr   get_interface_attn_implementationr  r  r  r   r   r  r0  )r1   r6   r2  r   r}   r3  r  
batch_size
seq_lengthquery_shape	key_shapeq_statesq_passq_rotcompressed_kvk_passk_rotr	  rz   r{   query_statesr  attention_interfacer  r
  s                            r4   rC   zMistral4Attention.forward  s    "/!4!4Sb!9
J!:r43C3CDR1F1F1XY	#{{=1H}}T%7%7m8T%UVH==-771=Ht/D/DdF[F[.\bde//>MD4E4EtG\G\3]cef 3 3F ;<AA)LVVWXZ[\${{6D4I4I4??3[acd

:q*d6K6KL&S;;&&:5%cRLE5/uc3GLE54fll3B/44yy&%b9YYB7
#&<KK''++,BCKK''++,NO'
 "\
 	! &'6'='=j,X\XfXf'g$J'49I9IT__9\5543C3Cdoo3U/VWL(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ (49I9IT__9\%aA/@/@&@AK!))*j"EPPRkk+.L((r5   r   )rH   rI   rJ   r   r#   r   r+   r-   rL   rE   r	   r   r   rC   rM   rN   s   @r4   r  r  s  s    G)2~ )2# )2b )-F)||F) #5<<#=>F) t+	F)
 llF) F) -.F) 
u||U\\D0%2E2LL	MF)r5   r  c                       e Zd Zdedef fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
eej                  ej                  f   dz  dee   dej                  fdZ xZS )Mistral4DecoderLayerrR   r  c                 `   t         |           |j                  | _        t        ||      | _        ||j
                  k\  rt        |      | _        nt        |      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )N)rR   r  r'   )r*   r+   r2   r  	self_attnfirst_k_dense_replacer   mlpr   r&   rms_norm_epsinput_layernormpost_attention_layernormr1  s      r4   r+   zMistral4DecoderLayer.__init__  s    !--*&IN444"6*DH"6*DH.v/A/AvGZGZ[(78J8JPVPcPc(d%r5   Nr6   r   r}   r3  	use_cacher2  r  r(   c           
          |}| j                  |      } | j                  d||||||d|\  }}	||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r6   r   r}   r3  rT  r2   )rR  rN  rS  rP  )
r1   r6   r   r}   r3  rT  r2  r  residual_s
             r4   rC   zMistral4DecoderLayer.forward  s     !,,];)4>> 
')%+ 3
 
q !=0 !55mD/ =0r5   )NNNFN)rH   rI   rJ   r#   r   r+   r-   rL   
LongTensorr	   r   rE   r   r   rC   rM   rN   s   @r4   rK  rK    s    e~ e# e" /304(,!&HL|| t+ &&-	
  $; #5<<#=>E +, 
r5   rK  c                        e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZg Zg Z ej(                          fd       Z xZS )Mistral4PreTrainedModelrR   modelTrK  r3  )r6   
attentionsc                    t         |   |       t        |t              r7t	        j
                  |j                  d| j                  j                         y t        |t              rmt	        j
                  |j                  d| j                  j                         t	        j
                  |j                  d| j                  j                         y y )Nr   )r?   std)r*   _init_weightsru   r   initnormal_r/   rR   initializer_ranger   r   r   )r1   r   r3   s     r4   r`  z%Mistral4PreTrainedModel._init_weights,  s    f%f01LLSdkk6S6ST 01LL,,3DKK<Y<YZLL))9V9VW 2r5   )rH   rI   rJ   r#   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendrK  r  _can_record_outputs_keep_in_fp32_modules_strict"_keys_to_ignore_on_load_unexpectedr-   r   r`  rM   rN   s   @r4   r[  r[    s|    &*#/0#4"5N!"&-' $& )+&U]]_X Xr5   r[  c                        e Zd Zdef fdZeee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	edz  d
ee   defd                     Z xZS )Mistral4ModelrR   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )NrM  rR   F)r*   r+   pad_token_idpadding_idx
vocab_sizer   	Embeddingr2   embed_tokens
ModuleListrangenum_hidden_layersrK  layersr&   rQ  normrP   
rotary_embgradient_checkpointing	post_initr1  s      r4   r+   zMistral4Model.__init__8  s     !.. ++LL):):F<N<NPTP`P`ammFKFLdLdFef!&)4f
 $F$6$6F<O<OP	1@&+# 	 gs   DN	input_idsr   r}   r3  inputs_embedsrT  r  r(   c           
      <   |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|V||j	                         nd}t        j                  |j                  d   |j                        |z   }|j                  d      }t        | j                  ||||      }	|}
| j                  |
|      }| j                  d | j                  j                   D ]  } ||
f|	||||d|}
 | j                  |
      }
t        |
|	      S )
Nz:You must specify exactly one of input_ids or inputs_embedsrs  r   r"   )r`   )rR   r  r   r3  r}   )r}   )r   r2  r}   r3  rT  )last_hidden_stater3  )
ValueErrorrx  r
   rR   get_seq_lengthr-   ri   rF   r`   r   r   r~  r|  r{  r}  r   )r1   r  r   r}   r3  r  rT  r  past_seen_tokenscausal_maskr6   r2  decoder_layers                r4   rC   zMistral4Model.forwardH  sL    -t";<YZZ *.*;*;I*FM0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 &"oom,oW![[)H4;;+H+HI 		M)*$7) /# M		 		-0&++
 	
r5   )NNNNNN)rH   rI   rJ   r#   r+   r    r!   r   r-   rY  rL   r	   FloatTensorr   r   r   r   rC   rM   rN   s   @r4   rq  rq  6  s    ~     .2.204(,26!%2
##d*2
 t+2
 &&-	2

 2
 ((4/2
 $;2
 +,2
 
!2
    2
r5   rq  c                   B    e Zd ZddiZddiZddgdgfiZ fdZee	 	 	 	 	 	 	 	 dd	e	j                  dz  d
e	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  dedz  dee	j                  z  dee   defd              Z xZS )Mistral4ForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr6   logitsc                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y r   )
r*   r+   rq  r\  rv  r   r   r2   r  r  r   s     r4   r+   zMistral4ForCausalLM.__init__  sU     "6*
 ++yy!3!3V5F5FUS 	r5   Nr  r   r}   r3  r  labelsrT  logits_to_keepr  r(   c	           
      x    | j                   d||||||d|	}
|
j                  }t        |t              rt	        | d      n|}| j                  |dd|ddf         }d}|* | j                  d||| j                  j                  d|	}t        |||
j                  |
j                  |
j                        S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Mistral4ForCausalLM

        >>> model = Mistral4ForCausalLM.from_pretrained("meta-mistral4/Mistral4-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-mistral4/Mistral4-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)r  r   r}   r3  r  rT  N)r  r  rv  )lossr  r3  r6   r]  rV  )r\  r  ru   r   slicer  loss_functionrR   rv  r   r3  r6   r]  )r1   r  r   r}   r3  r  r  rT  r  r  outputsr6   slice_indicesr  r  s                  r4   rC   zMistral4ForCausalLM.forward  s    > ,64:: ,
)%+',
 ,
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD%#33!//))
 	
r5   )NNNNNNNr   )rH   rI   rJ   _tied_weights_keys_tp_plan_pp_planr+   r   r   r-   rY  rL   r	   r  r   r   r   r   r   rC   rM   rN   s   @r4   r  r    s   *,GH23H_-z:;H  .2.204(,26*.!%-.6
##d*6
 t+6
 &&-	6

 6
 ((4/6
   4'6
 $;6
 ell*6
 +,6
 
 6
  6
r5   r  c                       e Zd Zy)!Mistral4ForSequenceClassificationNrH   rI   rJ   rV  r5   r4   r  r        r5   r  c                       e Zd Zy)Mistral4ForTokenClassificationNr  rV  r5   r4   r  r    r  r5   r  )r[  rq  r  r  r  )r"   )r   r  )Ncollections.abcr   typingr   r-   torch.nn.functionalr   r   r    r   ra  activationsr   cache_utilsr	   r
   
generationr   integrationsr   r   r   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   r   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   r    utils.output_capturingr!   configuration_mistral4r#   Moduler&   rP   r   r   r   r   r   r   rL   r   r   rK   r  r  r  r  rK  r[  rq  r  r  r  __all__rV  r5   r4   <module>r     sj  ( %      & ! . ) m m / B 
 P K F & I I e e 5 2 Y'Jbii J (J(><bii ><B"))    $#ryy $# $#N2")) 2j( *+ ,2	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2#L%%,, %e %^a %fkfrfr %
t)		 t)n,5 ,^Xo X: F
+ F
 F
R F
1? F
 F
R	(HJa 		%BD[ 	r5   