
    ic                        d dl Z d dlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZmZ d	d
lmZmZmZmZmZ d	dlmZmZmZmZmZmZ  ej6                  e      Z ed      e G d de                    Z G d de      Z G d de      Z  G d de      Z! G d de      Z" G d de      Z# G d de      Z$ G d de      Z% G d de      Z& G d  d!e      Z' G d" d#e      Z(g d$Z)y)%    N)strict   )PreTrainedConfig)CausalLMOutputWithPast)RopeParameters)Unpack)auto_docstringlogging   )DeepseekV3DecoderLayerDeepseekV3MLPDeepseekV3MoEDeepseekV3PreTrainedModelDeepseekV3TopkRouter)Qwen3AttentionQwen3ForCausalLM
Qwen3ModelQwen3RMSNormQwen3RotaryEmbeddingTransformersKwargszrednote-hilab/dots.llm1.base)
checkpointc                       e Zd ZU dZdZdgZdddddddddddddddd	Zd
gdgfddgdgfdgdgfdZddiZdZ	e
ed<   dZe
ed<   dZe
ed<   dZe
ed<   dZe
ed<   dZe
ed<   dZe
dz  ed<   dZe
dz  ed<   dZe
dz  ed<   d Ze
dz  ed!<   d Ze
dz  ed"<   dZe
dz  ed#<   d$Ze
dz  ed%<   d&Zedz  ed'<   d(Zeed)<   d*Ze
ed+<   d,Zeed-<   d.Zeed/<   d0Z eed1<   d&Z!eed2<   dZ"e#e$z  dz  ed3<   d&Z%eed4<   d5Z&ee
z  dz  ed6<   d7Z'eed8<   d9Z(e
dz  ed:<   dZ)e
dz  ed;<   dZ*e+e   dz  ed<<   dZ,e
dz  ed=<   dZ-e
dz  ed><   dZ.e
e+e
   z  dz  ed?<    fd@Z/ xZ0S )ADots1Configa  
    n_group (`int`, *optional*, defaults to 1):
        Number of groups for routed experts.
    first_k_dense_replace (`int`, *optional*, defaults to 0):
        Number of dense layers at the beginning of the model before the first MoE layer.

    Examples:

    ```python
    >>> from transformers import Dots1Model, Dots1Config
    >>> # Initializing a Dots1 style configuration
    >>> configuration = Dots1Config()
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    dots1past_key_valuescolwiserowwisereplicated_with_grad_allreducepacked_colwisemoe_tp_experts)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.self_attn.q_normzlayers.*.self_attn.k_normz!layers.*.mlp.experts.gate_up_projzlayers.*.mlp.experts.down_projzlayers.*.mlp.expertsz%layers.*.mlp.shared_experts.gate_projz#layers.*.mlp.shared_experts.up_projz%layers.*.mlp.shared_experts.down_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormnum_local_expertsn_routed_expertsi R 
vocab_sizei   hidden_sizei*  intermediate_sizei  moe_intermediate_size>   num_hidden_layers    num_attention_headsNnum_key_value_headsn_shared_experts   n_group
topk_groupnum_experts_per_tokr   first_k_dense_replaceFnorm_topk_probsilu
hidden_acti   max_position_embeddingsg{Gz?initializer_rangegư>rms_norm_epsT	use_cachetie_word_embeddingsrope_parametersattention_bias        attention_dropoutg      ?routed_scaling_factori   sliding_windowmax_window_layerslayer_typespad_token_idbos_token_ideos_token_idc                 
   | j                   | j                  | _         | j                  Et        | j                        D cg c]!  }| j
                  || j                  k\  rdnd# c}| _        t        |    di | y c c}w )Nsliding_attentionfull_attention )	r2   r1   rH   ranger/   rF   rG   super__post_init__)selfkwargsi	__class__s      x/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/dots1/modular_dots1.pyrR   zDots1Config.__post_init__{   s    ##+'+'?'?D$#
 t556	   &&2qD<R<R7R $%& D 	'' s   &B )1__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planattribute_mapr*   int__annotations__r+   r,   r-   r/   r1   r2   r3   r)   r5   r6   r7   r8   r9   boolr;   strr<   r=   floatr>   r?   r@   rA   r   dictrB   rD   rE   rF   rG   rH   listrI   rJ   rK   rR   __classcell__rV   s   @rW   r   r   *   s4   " J#4"5 &/%.%.%.%E%E-=*3 01:/81:"+ )"+& &(9:#%568IJ!"_$56 	/M JK"s"!%3%s!!&(t(#'cDj'#'cDj'GS4ZJd
&*t*()3:)"'ND4K'J#'S'#u#L%It %%48O^d*T18 ND ,/us{T)/#&5&!%NC$J%$&sTz&$(KcT!(#L#*##L#*#+/L#S	/D(/( (    r   c                       e Zd Zy)Dots1RMSNormNrX   rY   rZ   rO   rj   rW   rl   rl          rj   rl   c                       e Zd Zy)Dots1RotaryEmbeddingNrm   rO   rj   rW   rp   rp      rn   rj   rp   c                       e Zd Zy)Dots1AttentionNrm   rO   rj   rW   rr   rr      rn   rj   rr   c                       e Zd Zy)Dots1MLPNrm   rO   rj   rW   rt   rt      rn   rj   rt   c                       e Zd Zy)Dots1TopkRouterNrm   rO   rj   rW   rv   rv      rn   rj   rv   c                       e Zd Zd Zy)Dots1MoEc                    |j                         }|| j                  j                  z   }|j                  d| j                  | j
                  | j                  z        j                  dd      d   j                  d      }t        j                  || j                  dd      d   }t        j                  |      }|j                  d|d       |j                  d      j                  d| j                  | j
                  | j                  z        j                  d| j
                        }|j                  |j!                          d      }t        j                  || j"                  dd      d   }|j%                  d|      }	| j&                  r|	j                  dd	
      dz   }
|	|
z  }	|	| j(                  z  }	||	fS )Nr   )dimr   F)kr{   sortedr4   rC   T)r{   keepdimg#B;)sigmoidgatee_score_correction_biasviewr5   r)   topksumtorchr6   
zeros_likescatter_	unsqueezeexpandreshapemasked_fillrc   top_kgatherr9   rE   )rS   router_logitsrouter_logits_for_choicegroup_scores	group_idx
group_mask
score_maskscores_for_choicetopk_indicestopk_weightsdenominators              rW   route_tokens_to_expertsz Dots1MoE.route_tokens_to_experts   s   %--/#04993T3T#T $))"dllD<Q<QUYUaUa<abT!T_Q SRS[ 	
 JJ|tBuUVWX	%%l3
Ay!,  $VBd&;&;t||&KLWR../ 	
 5@@*//BSASUXYzz"3tzzrRWXYZ[$++A|<&**r4*@5HKK'L#d&@&@@\))rj   N)rX   rY   rZ   r   rO   rj   rW   rx   rx      s    *rj   rx   c                       e Zd Zy)Dots1DecoderLayerNrm   rO   rj   rW   r   r      rn   rj   r   c                       e Zd ZdZy)Dots1PreTrainedModelN)rX   rY   rZ   "_keys_to_ignore_on_load_unexpectedrO   rj   rW   r   r      s    )-&rj   r   c                       e Zd Zy)
Dots1ModelNrm   rO   rj   rW   r   r      rn   rj   r   c                   .     e Zd Zdee   def fdZ xZS )Dots1ForCausalLMsuper_kwargsreturnc                 "    t        |   di |S )a~  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Dots1ForCausalLM

        >>> model = Dots1ForCausalLM.from_pretrained("rednote-hilab/dots1.llm1.inst")
        >>> tokenizer = AutoTokenizer.from_pretrained("rednote-hilab/dots1.llm1.inst")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```rO   )rQ   forward)rS   r   rV   s     rW   r   zDots1ForCausalLM.forward   s    4 w...rj   )rX   rY   rZ   r   r   r   r   rh   ri   s   @rW   r   r      s%    /12/ 
 / /rj   r   )r   r   r   r   )*r   huggingface_hub.dataclassesr   configuration_utilsr   modeling_outputsr   modeling_rope_utilsr   processing_utilsr   utilsr	   r
    deepseek_v3.modeling_deepseek_v3r   r   r   r   r   qwen3.modeling_qwen3r   r   r   r   r   r   
get_loggerrX   loggerr   rl   rp   rr   rt   rv   rx   r   r   r   r   __all__rO   rj   rW   <module>r      s     . 3 6 1 & ,   
		H	% 9:[(" [(  ;[(|	< 		/ 		^ 		} 		* 	*} *6	. 	.4 .	 	/' /<rj   