
    iV                        d Z ddlZddlmc mZ ddlmZ ddlmZ ddlm	Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5  e"jl                  e7      Z8 e!d      e G d de                    Z9 G d de3      Z: G d de      Z; G d dejx                        Z= G d  d!e)      Z> G d" d#e+      Z? G d$ d%e5      Z@ G d& d'e4      ZA G d( d)e,e      ZB G d* d+e2      ZC G d, d-e1      ZD G d. d/e-      ZE G d0 d1e/      ZF G d2 d3e0      ZG G d4 d5e.      ZHg d6ZIy)7zPyTorch MiniMax model.    N)strict)nn   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfig)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)MoeModelOutputWithPast)RopeParameters)Unpack)TransformersKwargsauto_docstringlogging)merge_with_config_defaults)OutputRecordercapture_outputs   )Gemma2RotaryEmbedding)MixtralAttentionMixtralDecoderLayerMixtralForCausalLMMixtralForQuestionAnswering MixtralForSequenceClassificationMixtralForTokenClassificationMixtralModelMixtralPreTrainedModelMixtralRMSNormMixtralSparseMoeBlockMixtralTopKRouterzMiniMaxAI/MiniMax-Text-01-hf)
checkpointc                       e Zd ZU dZdZdgZdZdddddddd	Zd
gdgfddgdgfdgdgfdZddiZ	dZ
eed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZedz  ed<   dZeed<   d Zeed!<   d"Zeed#<   d$Zeed%<   d&Zeed'<   dZedz  ed(<   d)Zedz  ed*<   d+Zeee   z  dz  ed,<   d-Zeed.<   dZ edz  ed/<   d0Z!eez  ed1<   d+Z"eed2<   dZ#eed<   d-Z$eed3<   d4Z%eed5<   d0Z&eed6<   dZ'e(e)z  dz  ed7<   dZ*ee   dz  ed8<   d9Z+eed:<   d)Z,eez  ed;<   d)Z-eez  ed<<   d)Z.eez  ed=<   d)Z/eez  ed><   d)Z0eez  ed?<   d)Z1eez  ed@<    fdAZ2 xZ3S )BMiniMaxConfiga  
    block_size (`int`, *optional*, defaults to 256):
        The length of each attention block, determining how queries, keys, and values
        are grouped and processed for intra- and inter-block attention.
    full_attn_alpha_factor (`float`, *optional*, defaults to 1):
        Weight for residual value in residual connection after normal attention.
    full_attn_beta_factor (`float`, *optional*, defaults to 1):
        Weight for hidden state value in residual connection after normal attention.
    linear_attn_alpha_factor (`float`, *optional*, defaults to 1):
        Weight for residual value in residual connection after lightning attention.
    linear_attn_beta_factor (`float`, *optional*, defaults to 1):
        Weight for hidden state value in residual connection after lightning attention.
    mlp_alpha_factor (`float`, *optional*, defaults to 1):
        Weight for residual value in residual connection after MLP.
    mlp_beta_factor (`float`, *optional*, defaults to 1):
        Weight for hidden state value in residual connection after MLP.

    ```python
    >>> from transformers import MiniMaxModel, MiniMaxConfig

    >>> # Initializing a MiniMax style configuration
    >>> configuration = MiniMaxConfig()

    >>> # Initializing a model from the MiniMax style configuration
    >>> model = MiniMaxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```minimaxpast_key_valuesg    .Acolwiserowwisepacked_colwisemoe_tp_experts)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projz!layers.*.mlp.experts.gate_up_projzlayers.*.mlp.experts.down_projzlayers.*.mlp.experts	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormnum_expertsnum_local_expertsi }  
vocab_sizei   hidden_sizei 8  intermediate_size    num_hidden_layersnum_attention_heads   num_key_value_headsNhead_dimsilu
hidden_acti   max_position_embeddingsg{Gz?initializer_rangeh㈵>rms_norm_epsT	use_cachepad_token_id   bos_token_idr   eos_token_idFtie_word_embeddingssliding_windowg        attention_dropoutnum_experts_per_tokoutput_router_logitsgMbP?router_aux_loss_coefrouter_jitter_noiserope_parameterslayer_types   
block_sizefull_attn_alpha_factorfull_attn_beta_factorlinear_attn_alpha_factorlinear_attn_beta_factormlp_alpha_factormlp_beta_factorc                     | j                   | j                  | _         | j                  ;t        | j                        D cg c]  }t        |dz   dz        rdnd c}| _        t        |   di | y c c}w )NrH   r   full_attentionlinear_attention )r>   r<   rS   ranger;   boolsuper__post_init__)selfkwargsi	__class__s      |/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/minimax/modular_minimax.pyrc   zMiniMaxConfig.__post_init__   sz    ##+'+'?'?D$#W\]a]s]sWt RSD!a%1$5 ;MM D 	''	 s   A6)4__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencedefault_thetabase_model_tp_planbase_model_pp_planattribute_mapr7   int__annotations__r8   r9   r;   r<   r>   r?   rA   strrB   rC   floatrE   rF   ra   rG   rI   rJ   listrK   rL   rM   rN   r6   rO   rP   rQ   rR   r   dictrS   rU   rV   rW   rX   rY   rZ   r[   rc   __classcell__rg   s   @rh   r'   r'   6   s%   < J#4"5M%.%.%.%.-=*3 0 &(9:#%568IJ!"_$56
 #$78MJK"s"s!!  HcDjJ#,S,#u#L%It#L#*# L#* +,L#S	/D(, %%!%NC$J%%(us{(  s!&$&"'%'!$$48O^d*T18$(KcT!(J*+C%K+)*3;*,-cEk-+,S5[,$%cEk%#$OS5[$	( 	(    r'   c                       e Zd Zy)MiniMaxRMSNormNri   rj   rk   r_   r{   rh   r}   r}          r{   r}   c                   r     e Zd Z fdZd ZdefdZ fdZdefdZde	j                  fd	Zd
efdZ xZS )MiniMaxCachec                 0    t         |           g | _        y N)rb   __init__linear_cacherd   rg   s    rh   r   zMiniMaxCache.__init__   s    02r{   c                     t        t        | j                        |dz         D ]  }| j                  j                  g         || j                  |<   y )NrH   )r`   lenr   append)rd   	layer_idxr   _s       rh   set_linear_cachezMiniMaxCache.set_linear_cache   sK    s4,,-y1}= 	)A$$R(	)'3)$r{   r   c                 >    |t        |       k  r| j                  |   S y r   )r   r   )rd   r   s     rh   get_linear_cachezMiniMaxCache.get_linear_cache   s"    s4y $$Y//r{   c                 Z    t        t        | 	         t        | j                              S r   )maxrb   __len__r   r   r   s    rh   r   zMiniMaxCache.__len__   s"    57?$c$*;*;&<==r{   repeatsc                     t        t        |             D ]`  }| j                  |   g k7  r.| j                  |   j                  |d      | j                  |<   C| j                  |   j                  |       b y )Nr   dim)r`   r   r   repeat_interleaver3   batch_repeat_interleave)rd   r   r   s      rh   r   z$MiniMaxCache.batch_repeat_interleave   ss    s4y) 	HI  +r1/3/@/@/K/]/]^ekl/]/m!!),I&>>wG		Hr{   indicesc                     t        t        |             D ]T  }| j                  |   g k7  r"| j                  |   |df   | j                  |<   7| j                  |   j	                  |       V y )N.)r`   r   r   r3   batch_select_indices)rd   r   r   s      rh   r   z!MiniMaxCache.batch_select_indices   sk    s4y) 	EI  +r1/3/@/@/KGUXL/Y!!),I&;;GD		Er{   
max_lengthc                     t        d      )Nz*MiniMaxCache doesnot support `crop` method)RuntimeError)rd   r   s     rh   cropzMiniMaxCache.crop   s    GHHr{   )ri   rj   rk   r   r   rs   r   r   r   torchTensorr   r   ry   rz   s   @rh   r   r      sL    34# 
>Hs HEELL EIs Ir{   r   c                       e Zd Zdedef fdZd Zd Z	 ddej                  de
ej                  ej                  f   d	ej                  dz  d
edz  dee   de
ej                  ej                  dz  e
ej                     dz  f   fdZ xZS )MiniMaxLightningAttentionconfigr   c                    t         |           || _        t        |dd       xs |j                  |j
                  z  | _        |j
                  | _        |j                  | _        |j                  | _        t        |j                     | _        t        | j                  | j
                  z        | _        t        j                  |j                  | j
                  | j                  z  dz  d      | _        t        j                  | j
                  | j                  z  |j                  d      | _        t        j                  |j                  | j
                  | j                  z  d      | _        | j'                         }| j)                  |      \  }}}| j+                  d|       | j+                  d|       | j+                  d|       | j+                  d|       y )	Nr?   r   F)bias
slope_ratequery_decay	key_decaydiagonal_decay)rb   r   r   getattrr8   r<   r?   r;   rU   r   rA   act_fnr}   r4   r   Linearqkv_projout_projoutput_gateget_slope_ratedecay_factorsregister_buffer)rd   r   r   r   r   r   r   rg   s          rh   r   z"MiniMaxLightningAttention.__init__   s   "
D9mV=O=OSYSmSm=m#)#=#= !'!9!9 ++V../"4==43K3K#KL			&"4"4d6N6NQUQ^Q^6^ab6bino		$":":T]]"JFL^L^ejk99V%7%79Q9QTXTaTa9ahmn((*
151C1CJ1O.Y\:6]K8[)4-~>r{   c                     ddd| j                   z  z  z  }t        j                  | j                         dz   }d| j                  | j                  dz
  dz   z  z
  dz   }||z  }||z  }|d d d d f   }|S )NrH   r   r=   rD   )r<   r   aranger   r;   )rd   baseexponentfactorrates        rh   r   z(MiniMaxLightningAttention.get_slope_rate   s    A!d66678<< 8 89A=T^^t'='='AD'HIIDPX~f}AtTM"r{   c                    t        j                  | j                        dz   }t        j                  | |d d d f   z        }t        j                  | | j                  |d d d f   z
  z        }|d d d f   |d d d f   z
  }|d d d d d d f   }||z  }t        j                  |dk\  | t        d            }t        j                  |      }|||fS )NrH   r   z-inf)r   r   rU   expwhererv   )rd   r   block_size_ranger   r   r   s         rh   r   z'MiniMaxLightningAttention.decay_factors   s     <<81<ii.>q$w.G GHIIzkT__?OPQSWPW?X-XYZ	)!T'25EdAg5NN'dAq(89#n4^q%8>/5QW=Y>2I~55r{   Nr0   position_embeddingsr1   r)   re   returnc                    |j                   \  }}}|| j                  z   dz
  | j                  z  }	| j                  | j                  |            }
|
j	                  ||| j
                  d| j                  z        }
t        j                  |
| j                  d      \  }}}|j                  dd      }|j                  dd      }|j                  dd      }d }||j                  | j                        }|t        j                  || j
                  | j                  | j                        j                  |      }|Q|j                  t        j                        }|j                  |j!                  d      j!                  d       d      }g }t#        |	      D ]b  }|| j                  z  }t%        || j                  z   |      }||z
  }|d d d d ||f   }|d d d d ||f   }|d d d d ||f   }| j&                  d d d |f   }| j(                  d d | d f   }| j*                  d d d d d |d |f   }t        j,                  | j.                   |z        }t        j0                  ||j                  dd            }t        j0                  ||z  |      }t        j0                  ||z  |      }||z   }|j3                  |       t        j0                  ||z  j                  dd      |      }||z  |z   }e nt        j,                  | j.                         } g }t#        |      D ]  }|d d d d ||dz   f   }|d d d d ||dz   f   }|d d d d ||dz   f   }t        j0                  |j                  dd      |      }!| |z  |!z   }t        j0                  ||      }|j3                  |        t        j4                  |d      }|j                  dd      }|j	                  ||| j
                  | j                  z        }| j7                  |      }t9        j:                  | j=                  |            |z  }| j?                  |      }||jA                  | j                  |       ||fS )	NrH   r   r   r   )dtyper   )!shaperU   r   r   reshaper<   r?   r   split	transposer   r   zerostora   masked_fill	unsqueezer`   minr   r   r   r   r   matmulr   catr4   Fsigmoidr   r   r   )"rd   r0   r   r1   r)   re   
batch_sizeseq_lenr8   
num_blocks
qkv_statesquery_states
key_statesvalue_statesattn_weights_interattn_outputrf   	start_idxend_idxcurrent_block_sizecurrent_query_statescurrent_key_statescurrent_value_statescurrent_query_decaycurrent_key_decaycurrent_diagonal_decayblock_decayattn_weights_intraattn_output_intraattn_output_intercurrent_attn_outputnext_attn_weights_interratiocurrent_attn_weights_inters"                                     rh   forwardz!MiniMaxLightningAttention.forward   s    ,9+>+>(
G[/!3G
[[}!=>
''
GT=U=UWX[_[h[hWhi
16Z\]1^.j,#--a3))!Q/
#--a3 "&!0!A!A$..!Q%!&Z9Q9QSWS`S`bfbobo!p!s!s"
 )!/!2!2!2!D+779Q9QRS9T9^9^_a9b8bdefK:& `/	i$//97C%,y%8"'3Aq)G:K4K'L$%/1i6G0G%H"'3Aq)G:K4K'L$&*&6&6q:M;M:M7M&N#$(NN17I6I6J3J$K!)-)<)<QCVDVCVXkYkXk=k)l&#ii(8;M(MN &+\\2FHZHdHdegikHl%m"$)LL1CF\1\^r$s! %*LL1EH[1[]o$p! '8:K&K#""#67 +0,,'*;;FFr2NPd+' &8+%EH_%_";`@ IIt./EK7^ 	8'3Aq!a!e)O'D$%/1a!a%i%@"'3Aq!a!e)O'D$-2\\:L:V:VWY[]:^`t-u*%*-?%?B\%\"&+ll3GI[&\#""#67	8 ii4 "++Aq1!))*gt?W?WZ^ZgZg?ghii,ii 0 0 ?@;NmmK0 &,,T^^=OP...r{   r   )ri   rj   rk   r'   rs   r   r   r   r   r   tupler   r   r   r   ry   rz   s   @rh   r   r      s    ?} ? ?,	6& )-_/||_/ #5<<#=>_/ t+	_/
 _/ -._/ 
u||U\\D0%2E2LL	M_/r{   r   c                       e Zd Zy)MiniMaxRotaryEmbeddingNr~   r_   r{   rh   r   r   R  r   r{   r   c                       e Zd Zy)MiniMaxAttentionNr~   r_   r{   rh   r   r   V  r   r{   r   c                       e Zd Zy)MiniMaxTopKRouterNr~   r_   r{   rh   r   r   Z  r   r{   r   c                       e Zd Zy)MiniMaxSparseMoeBlockNr~   r_   r{   rh   r   r   ^  r   r{   r   c                   D    e Zd Zdedef fdZ	 	 	 	 	 ddej                  deej                  ej                  f   dz  dej                  dz  dej                  dz  d	e
dz  d
edz  dee   deej                  eej                  ej                  f   dz  f   fdZ xZS )MiniMaxDecoderLayerr   r   c                    t         |   ||       || _        t        |d      r|j                  |   nd | _        |j                  | _        |j                  | _        | `t        |      | _        | j
                  dk(  r4t        ||      | _        |j                  | _        |j                  | _        y t!        ||      | _        |j"                  | _        |j$                  | _        y )NrS   r^   )rb   r   r   hasattrrS   
layer_typerZ   r[   mlpr   r   	self_attnrX   attn_alpha_factorrY   attn_beta_factorr   rV   rW   )rd   r   r   rg   s      rh   r   zMiniMaxDecoderLayer.__init__c  s    +";B6=;Y&,,Y7_c & 7 7%55H(0??006vyIDN%+%D%DD"$*$B$BD!-fi@DN%+%B%BD"$*$@$@D!r{   Nr0   r   r1   position_idsr)   rF   re   r   c           
      (   | j                  |      }|} | j                  d||||||d|\  }}	|| j                  z  || j                  z  z   }| j	                  |      }|}| j                  |      }|| j                  z  || j                  z  z   }|S )N)r0   r   r1   r   r)   rF   r_   )input_layernormr   r   r   post_attention_layernormr   rZ   r[   )
rd   r0   r   r1   r   r)   rF   re   residualr   s
             rh   r   zMiniMaxDecoderLayer.forwardu  s     ,,]; )4>> 
' 3)%+
 
q !4#9#99MDLaLa<aa55mD / 4#8#88=4K_K_;__r{   )NNNNF)ri   rj   rk   r'   rs   r   r   r   r   
LongTensorr   ra   r   r   FloatTensorr   ry   rz   s   @rh   r   r   b  s    A} A A* IM.204(,!&|| #5<<#=>E t+	
 &&-  $; -. 
u  %(9(95;L;L(L"MPT"TT	Ur{   r   c                   D     e Zd ZdZ eedd      eeegdZ	 fdZ
 xZS )MiniMaxPreTrainedModelFzmlp.gater   )
layer_nameindex)router_logitsr0   
attentionsc                    t         |   |       t        |t              r|j	                         }|j                  |      \  }}}t        j                  |j                  |       t        j                  |j                  |       t        j                  |j                  |       t        j                  |j                  |       y y r   )rb   _init_weights
isinstancer   r   r   initcopy_r   r   r   r   )rd   moduler   r   r   r   rg   s         rh   r  z$MiniMaxPreTrainedModel._init_weights  s    f%f78..0J5;5I5I*5U2KNJJv((*5JJv));7JJv''3JJv,,n= 9r{   )ri   rj   rk   _can_compile_fullgraphr   r   r   r   r   _can_record_outputsr  ry   rz   s   @rh   r  r    s5    "'(9jXYZ,')BC> >r{   r  c                       e Zd Zee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  de
dz  dee   d	eez  fd
              Zy)MiniMaxModelNr.   r1   r   r)   r/   rF   re   r   c           
         |d u |d uz  rt        d      |r|t               }n*|r(t        |t              st        dt        |       d      || j	                  |      }|V||j                         nd}t        j                  |j                  d   |j                        |z   }|j                  d      }| j                  j                  t        nt        }	 |	| j                  ||||      }
|}| j                  ||      }t!        | j"                        D ]5  \  }}| j                  j$                  |   dk(  r|
}n|} ||f|||||d	|}7 | j'                  |      }t)        ||
      S )Nz:You must specify exactly one of input_ids or inputs_embedszSMiniMax uses cache of its own and is not compatible with `past_key_values` of type .r   rH   )device)r   r/   r1   r)   r   r]   )r1   r   r   r)   rF   )last_hidden_stater)   )
ValueErrorr   r  typer2   get_seq_lengthr   r   r   r  r   r   rL   r   r   
rotary_emb	enumerater3   rS   r4   r   )rd   r.   r1   r   r)   r/   rF   re   past_seen_tokensmask_functioncausal_maskr0   r   rf   decoder_layerinput_attention_masks                   rh   r   zMiniMaxModel.forward  s    -t";<YZZ0*nOz/<Hefjkzf{e||}~    --i8MCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L.2kk.H.H.P*Vw#;;')+%
 &"oom\J )$++ 6 	A}{{&&q)-=='2$ (6$)3$7) /# M	" 		-0%++
 	
r{   )NNNNNN)ri   rj   rk   r   r   r   r  r   r   r  ra   r   r   r   r   r   r_   r{   rh   r  r    s     .2.204/326!%>
##d*>
 t+>
 &&-	>

 &,>
 ((4/>
 $;>
 +,>
 
'	'>
   >
r{   r  c                        e Zd Z fdZ xZS )MiniMaxForCausalLMc                 "    t        |   di |S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MiniMaxForCausalLM

        >>> model = MiniMaxForCausalLM.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```r_   )rb   r   )rd   super_kwargsrg   s     rh   r   zMiniMaxForCausalLM.forward  s    . w...r{   )ri   rj   rk   r   ry   rz   s   @rh   r#  r#    s    / /r{   r#  c                       e Zd Zy) MiniMaxForSequenceClassificationNr~   r_   r{   rh   r'  r'    r   r{   r'  c                       e Zd Zy)MiniMaxForTokenClassificationNr~   r_   r{   rh   r)  r)  	  r   r{   r)  c                       e Zd Zy)MiniMaxForQuestionAnsweringNr~   r_   r{   rh   r+  r+    r   r{   r+  )r'   r  r  r#  r'  r)  r+  )Jrl   r   torch.nn.functionalr   
functionalr   huggingface_hub.dataclassesr    r   r  activationsr   cache_utilsr   r	   configuration_utilsr
   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   modeling_rope_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   r   gemma2.modeling_gemma2r   mixtral.modeling_mixtralr   r   r   r   r   r   r    r!   r"   r#   r$   
get_loggerri   loggerr'   r}   r   Moduler   r   r   r   r   r   r  r  r#  r'  r)  r+  __all__r_   r{   rh   <module>rB     sf       .  & ! . 3 R B 9 6 1 & @ @ 7 E :    
		H	% 9:\($ \(  ;\(~	^ 	"I< "IJO/		 O/d	2 		' 		) 		1 	.-/I .b>3 >&A
< A
H/+ /6	'G 		$A 		"= 	r{   