
    iD                     h   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1  ejd                  e3      Z4 G d de*      Z5 G d de&      Z6 G d de/      Z7 G d dejp                        Z9 G d d e.      Z: G d! d"ejp                        Z; G d# d$e(      Z< G d% d&e      Z= G d' d(e      Z>e G d) d*e>             Z? G d+ d,e)e>e      Z@g d-ZAy).zPyTorch AFMoE model.    )CallableN)nn   )initialization)CacheDynamicCache)GenerationMixin)create_causal_mask!create_sliding_window_causal_mask)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_grouped_mm_availablelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )GptOssRMSNorm)LlamaAttentionLlamaForCausalLMLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)Qwen2MoeExpertsQwen2MoeMLP   )AfmoeConfigc                       e Zd Zy)AfmoeRotaryEmbeddingN__name__
__module____qualname__     x/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/afmoe/modular_afmoe.pyr&   r&   /       r,   r&   c                       e Zd Zy)AfmoeRMSNormNr'   r+   r,   r-   r0   r0   3   r.   r,   r0   c                       e Zd Zy)AfmoeMLPNr'   r+   r,   r-   r2   r2   7   r.   r,   r2   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )AfmoeTokenChoiceRouterz
    Token-choice top-K router for MoE routing.

    This router assigns each token to the top-K experts based on sigmoid scores, matching the released checkpoints.
    c                     t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  |j                  |j
                  d      | _
        y NFbias)super__init__confignum_experts_per_toktop_knum_expertsroute_scaler   Linearhidden_sizegateselfr;   	__class__s     r-   r:   zAfmoeTokenChoiceRouter.__init__B   s^    //
!--!--IIf00&2D2D5Q	r,   hidden_statesexpert_biasc                    |j                   \  }}}|j                  d|      }| j                  |      j                  t        j
                        }t	        j                  |      }t	        j                  ||z   | j                  d      \  }}|j                  d|      }|j                  dd      dz   }	||	z  }|| j                  z  }|||fS )Nr#   )kdim)rK   indexT)rK   keepdimg#B;)shapeviewrB   totorchfloat32sigmoidtopkr=   gathersumr?   )
rD   rF   rG   _
hidden_dimrouter_logitsscoresselected_experts
top_scoresdenominators
             r-   forwardzAfmoeTokenChoiceRouter.forwardJ   s    (..1j%**2z:		-033EMMB}-#jj+)=QRS]]q0@]A
 nnTn:UB+-
$"2"22
j*:::r,   )	r(   r)   r*   __doc__r:   rQ   Tensorr^   __classcell__rE   s   @r-   r4   r4   ;   s)    R;U\\ ; ;r,   r4   c                       e Zd Zy)AfmoeExpertsNr'   r+   r,   r-   rd   rd   Y   r.   r,   rd   c                   (     e Zd ZdZ fdZd Z xZS )AfmoeSparseMoeBlockz
    Mixture of Experts (MoE) module for AFMoE.

    This module implements a sparse MoE layer with both shared experts (always active) and
    routed experts (activated based on token-choice routing).
    c                 2   t         |           || _        t        |      | _        t        ||j                  |j                  z        | _        t        |      | _
        t        j                  t        j                  |j                        d      | _        y )NF)requires_grad)r9   r:   r;   r4   routerr2   moe_intermediate_sizenum_shared_expertsshared_expertsrd   expertsr   	ParameterrQ   zerosr>   rG   rC   s     r-   r:   zAfmoeSparseMoeBlock.__init__e   sp    ,V4&vv/K/KfNgNg/gh#F+<<F4F4F(GW\]r,   c                    |j                   \  }}}|j                  d|      }| j                  || j                        \  }}}| j	                  |      j                  |||      }	| j                  |||      j                  |||      }
|	|
z   S )NrI   )rN   rO   ri   rG   rl   rm   )rD   rF   
batch_sizeseq_lenrX   hidden_states_flatrY   r\   r[   shared_outputrouted_outputs              r-   r^   zAfmoeSparseMoeBlock.forwardm   s    *7*=*='
GZ*//J? 7;kk-QUQaQa6b3z#3 ++,>?DDZQXZde%79I:V[[
 },,r,   )r(   r)   r*   r_   r:   r^   ra   rb   s   @r-   rf   rf   ]   s    ^-r,   rf   c                        e Zd ZdZdedef fdZ	 ddej                  de	ej                  ej                  f   dej                  dz  d	e
dz  d
ee   de	ej                  ej                  f   fdZ xZS )AfmoeAttentionaJ  
    Multi-headed attention module with optional sliding window and gating.

    This attention mechanism supports both full attention and sliding window attention,
    and includes Q/K normalization and gating of the output. It inherits from [`LlamaAttention`] to minimize the amount
    of custom logic we need to maintain.
    r;   	layer_idxc                    t         |   ||       |j                  |   dk(  | _        | j                  r|j                  nd | _        t        | j                  |j                        | _        t        | j                  |j                        | _	        t        j                  |j                  |j                  | j                  z  d      | _        y )Nsliding_attentionepsFr7   )r9   r:   layer_typesis_local_attentionsliding_windowr0   head_dimrms_norm_epsq_normk_normr   r@   rA   num_attention_heads	gate_projrD   r;   rx   rE   s      r-   r:   zAfmoeAttention.__init__   s    + #)"4"4Y"?CV"V7;7N7Nf33TX"4==f6I6IJ"4==f6I6IJ6#5#5v7Q7QTXTaTa7ahmnr,   NrF   position_embeddingsattention_maskpast_key_valuekwargsreturnc                    |j                   d d }g |d| j                  }| j                  |      j                  |      }| j	                  |      j                  |      }	| j                  |      j                  |      }
| j                  |      }| j                  |      j                  dd      }| j                  |	      j                  dd      }	|
j                  dd      }
| j                  r|\  }}t        ||	||      \  }}	| |j                  |	|
| j                        \  }	}
t        j                  | j                   j"                  t$              } || ||	|
f|| j&                  sdn| j(                  | j*                  | j,                  d|\  }} |j                  g |d j/                         }|t1        j2                  |      z  }| j5                  |      }||fS )NrI   r#   r           )r   dropoutscalingr   )rN   r   q_projrO   k_projv_projr   r   	transposer   r~   r   updaterx   r   get_interfacer;   _attn_implementationr    trainingattention_dropoutr   r   
contiguousrQ   rS   o_proj)rD   rF   r   r   r   r   input_shapehidden_shapequery_states
key_statesvalue_statesgate_statescossinattention_interfaceoutputattn_weightsattn_outputs                     r-   r^   zAfmoeAttention.forward   s    $))#2.88b8$--8{{=166|D[[/44\B
{{=166|Dnn]3{{<0::1a@[[,66q!<
#--a3""*HC';L*VY[^'_$L*%'5'<'<ZW[WeWe'f$J(?(M(MKK,,.E)
  3	
 

 *#}}C$2H2HLL..
 
 
 
 .k.2.99;%--44kk&)L((r,   )N)r(   r)   r*   r_   r$   intr:   rQ   r`   tupler   r   r   r^   ra   rb   s   @r-   rw   rw   |   s    	o{ 	os 	o  (,.)||.) #5<<#=>.) t+	.)
 .) +,.) 
u||U\\)	*.)r,   rw   c                       e Zd ZdZdedef fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  d	e
dz  d
edz  deej                  ej                  f   dz  dee   dej                  fdZ xZS )AfmoeDecoderLayerz
    AFMoE decoder layer with dual normalization.

    This layer applies self-attention followed by either a dense MLP or MoE block,
    with dual normalization (pre and post) around each component.
    r;   rx   c                 (   t         |           |j                  | _        || _        t	        ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        t        |j                  |j                        | _        ||j                  k\  | _        | j                  rt        |      | _        y t!        |      | _        y )N)r;   rx   r{   )r9   r:   rA   rx   rw   	self_attnr0   r   input_layernormpost_attention_layernormpre_mlp_layernormpost_mlp_layernormnum_dense_layersmoe_enabledrf   mlpr2   r   s      r-   r:   zAfmoeDecoderLayer.__init__   s    !--"'vK  ,F,>,>FDWDWX(4V5G5GVM`M`(a% ".f.@.@fFYFY!Z".v/A/AvGZGZ"[ %(?(??*62DH'DHr,   NrF   r   position_idsr   	use_cacher   r   r   c           
         |}| j                  |      } | j                  d||||||d|\  }}	| j                  |      }||z   }|}| j                  |      }| j	                  |      }| j                  |      }||z   }|S )N)rF   r   r   r   r   r   r+   )r   r   r   r   r   r   )
rD   rF   r   r   r   r   r   r   residualrW   s
             r-   r^   zAfmoeDecoderLayer.forward   s     ! ,,];)4>> 
')%) 3
 
q 55mD =0 !..}=///> =0r,   )NNNNN)r(   r)   r*   r_   r$   r   r:   rQ   r`   
LongTensorr   boolr   r   r   FloatTensorr^   ra   rb   s   @r-   r   r      s    ({ (s (2 /304'+!%HL!||! t+! &&-	!
 ! $;! #5<<#=>E! +,! 
		!r,   r   c                        e Zd ZU dZeed<   dZdgZdgZ e	e
d      eedZg d	Zd
Zd
Zd
Z e       Zd
Zd
Z fdZ xZS )AfmoePreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    r;   modelr   past_key_valuesr   )rL   )rY   rF   
attentions)r   r   r   r   r   r   normrG   Tc                    t         |   |       | j                  j                  }t	        |t
              rEt        j                  |j                  d|       t        j                  |j                  d|       yt	        |t              r*t        j                  |j                  j                         yt	        |t              r t        j                  |j                         yy)zInitialize the weightsr   )meanstdN)r9   _init_weightsr;   initializer_range
isinstancerd   initnormal_gate_up_proj	down_projr4   zeros_rB   weightrf   rG   )rD   moduler   rE   s      r-   r   z"AfmoePreTrainedModel._init_weights%  s    f%kk++fl+LL,,3C@LL))= 67KK**+ 34KK**+ 5r,   )r(   r)   r*   r_   r$   __annotations__base_model_prefix_no_split_modules_skip_keys_device_placementr   r4   r   rw   _can_record_outputs_keep_in_fp32_modules_supports_sdpa_supports_flash_attn_supports_flex_attnr   _can_compile_fullgraph_supports_attention_backendsupports_gradient_checkpointingr   ra   rb   s   @r-   r   r     s    
 ,-#4"5'(>aH*$
	 N!  #'&*#
, 
,r,   r   c                        e Zd ZdZdef fdZeee	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  d	edz  d
edz  dee   deez  fd                     Z xZS )
AfmoeModelz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`AfmoeDecoderLayer`]

    Args:
        config: AfmoeConfig
    r;   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )Nr{   r;   F)r9   r:   pad_token_idpadding_idx
vocab_sizer   	EmbeddingrA   embed_tokens
ModuleListrangenum_hidden_layersr   layersr0   r   r   r&   
rotary_embgradient_checkpointing	post_initr   s      r-   r:   zAfmoeModel.__init__;  s     !.. ++LL):):F<N<NPTP`P`ammCHIaIaCbcivy1c
 !!3!39L9LM	.f=&+# ds   DN	input_idsr   inputs_embedsr   r   r   r   r   c           
         |d u |d uz  rt        d      |r|t        | j                        }|| j                  |      }|V||j	                         nd}t        j                  |j                  d   |j                        |z   }|j                  d      }t        |x}	t              s(| j                  |||d}
t        di |
t        di |
d}	|}| j                  j                  r|| j                  j                  dz  z  }| j!                  ||      }t#        | j$                        D ].  \  }} ||f|	| j                  j&                  |      ||||d	|}0 | j)                  |      }t+        ||r|
      S d 
      S )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r#   )device)r;   r   r   r   )full_attentionrz   g      ?)r   r   r   r   r   )last_hidden_stater   r+   )
ValueErrorr   r;   r   get_seq_lengthrQ   arangerN   r   	unsqueezer   dictr
   r   mup_enabledrA   r   	enumerater   r}   r   r   )rD   r   r   r   r   r   r   r   past_seen_tokenscausal_mask_mappingmask_kwargsrF   r   idecoder_layers                  r-   r^   zAfmoeModel.forwardJ  s    -t";<YZZ0*$++>O  --i8MCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-F++!."0#2	K #5"C{"C%F%U%U#
 & ;;"")T[[-D-Dc-IJM"oom\J )$++ 6 		A})24;;3J3J13MN).#$7 M		 		-0%+/8O
 	
>B
 	
r,   )NNNNNN)r(   r)   r*   r_   r$   r:   r   r   r   rQ   r   r`   r   r   r   r   r   r   r   r^   ra   rb   s   @r-   r   r   2  s    {   .2.22604(,!%<
##d*<
 t+<
 ((4/	<

 &&-<
 <
 $;<
 +,<
 
'	'<
    <
r,   r   c                   B   e Zd ZddiZddiZddgdgfiZd Zee	 	 	 	 	 	 	 	 	 dd	e	j                  dz  d
e	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  dedz  dedz  dee	j                  z  dee   defd              Zy)AfmoeForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputrF   logitsc                     t         j                  | |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y r6   )
r   r:   r   r   r   r   r@   rA   r   r   )rD   r;   s     r-   r:   zAfmoeForCausalLM.__init__  sU    %%dF3'
 ++yy!3!3V5F5FUSr,   Nr   r   r   r   r   labelsr   output_router_logitslogits_to_keepr   r   c
                    ||n| j                   j                  } | j                  d|||||||d|
}|j                  }t	        |	t
              rt        |	 d       n|	}| j                  |d d |d d f         }d }| | j                  ||| j                  fi |
}t        |||j                  |j                  |j                  |j                        S )N)r   r   r   r   r   r   r  )lossr   r   rF   r   rY   r+   )r;   r  r   r   r   r   slicer   loss_functionr   r   r   rF   r   rY   )rD   r   r   r   r   r   r  r   r  r  r   outputsrF   slice_indicesr   r  s                   r-   r^   zAfmoeForCausalLM.forward  s      %9$D $++JjJj 	 +5$** 	+
)%+'!5	+
 	+
  118B>SV8W~ot4]kmA}a,?@A%4%%ffdooPPD(#33!//))!//
 	
r,   )	NNNNNNNNr   )r(   r)   r*   _tied_weights_keys_tp_plan_pp_planr:   r   r   rQ   r   r`   r   r   r   r   r   r   r   r^   r+   r,   r-   r   r     s'   *,GH23H_-z:;H  .2.204(,26*.!%,0-.+
##d*+
 t++
 &&-	+

 +
 ((4/+
   4'+
 $;+
 #Tk+
 ell*+
 +,+
 
#+
  +
r,   r   )r   r   r   )Br_   collections.abcr   rQ   r    r   r   cache_utilsr   r   
generationr	   masking_utilsr
   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   utils.output_capturingr   r   gpt_oss.modeling_gpt_ossr   llama.modeling_llamar   r   r   r   r    qwen2_moe.modeling_qwen2_moer!   r"   configuration_afmoer$   
get_loggerr(   loggerr&   r0   r2   Moduler4   rd   rf   rw   r   r   r   r   __all__r+   r,   r-   <module>r"     s    $   & . ) R 9 Q F & k k 7 E 4  H , 
		H	%	/ 		= 		{ 	;RYY ;<	? 	-")) ->B)^ B)J?2 ?D,,? ,,^ V
% V
 V
r9
')= 9
xr,   