
    i                        d dl Z d dlmZ d dl mZ ddlmZ ddlmZ ddl	m
Z
mZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZ ddlmZ  ej2                  e      Z e
d      e G d de                    Z G d de      Z G d de      Z G d de      Z G d de      Z  G d de      Z! G d dee      Z" G d de      Z# G d d e      Z$g d!Z%y)"    N)strict)nn   )initialization)PreTrainedModel)auto_docstringlogging   )DeepseekV3Config)DeepseekV3Attention)LlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaPreTrainedModelLlamaRMSNormLlamaRotaryEmbedding)Qwen3MLPztencent/Youtu-LLM-2B)
checkpointc                       e Zd ZU dZdZddddZi ZdZee	d<   dZ
ee	d	<   d
Zee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZedz  e	d<   dZedz  e	d<   dZedz  e	d<   dZedz  e	d<   dZeee   z  dz  e	d<   dZee	d<    e       Z e       Z e       Z e       Z e       Z e       Z e       Z  e       Z! e       Z" e       Z# fdZ$d Z% xZ&S )YoutuConfiga   
    rope_interleave (`bool`, *optional*, defaults to `True`):
        Whether to interleave the rotary position embeddings.
    embedding_initializer_range (`float`, *optional*):
        The standard deviation of the truncated_normal_initializer for initializing all embedding matrices.

    ```python
    >>> from transformers import YoutuModel, YoutuConfig
    >>> # Initializing a Youtu-LLM-2B style configuration
    >>> configuration = YoutuConfig()
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```youtucolwiserowwise)zlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proji  
vocab_sizei   hidden_sizei   intermediate_size    num_hidden_layers   num_attention_headsnum_key_value_headsi   max_position_embeddingsNinitializer_rangeembedding_initializer_rangepad_token_idi  bos_token_idi eos_token_idTtie_word_embeddingsc                     | j                   1| j                  dk7  rdd| j                  z  dz  z  | _         nd| _         | j                  xs d| j                   z  | _        t        |   di | y )Nr   g       @g      @g      ?{Gz? )r#   r   r$   super__post_init__)selfkwargs	__class__s     x/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/youtu/modular_youtu.pyr-   zYoutuConfig.__post_init___   sq    !!)1$),d6F6F0F3/N)N&)-&+/+K+K+ksUYUkUkOk(''    c                     t        d      )Nz$Not overwritten for the Youtu model!)AttributeError)r.   r/   s     r1   convert_rope_params_to_dictz'YoutuConfig.convert_rope_params_to_dicti   s    CDDr2   )'__name__
__module____qualname____doc__
model_typebase_model_tp_planattribute_mapr   int__annotations__r   r   r   r    r!   r"   r#   floatr$   r%   r&   r'   listr(   boolr4   n_shared_expertsn_routed_expertsrouted_scaling_factorn_group
topk_groupnum_experts_per_tokfirst_k_dense_replacenorm_topk_probpretraining_tpmoe_intermediate_sizer-   r5   __classcell__)r0   s   @r1   r   r   ,   s1    J"+ )"+
 MJK!s!s!!!!#)S)&*ut|*044#L#*#%L#*%+1L#S	/D(1 $$ &'%'*,G!J(**,#%N#%N*,(Er2   r   c                       e Zd Zy)YoutuRMSNormNr6   r7   r8   r+   r2   r1   rN   rN   m       r2   rN   c                       e Zd Zy)YoutuRotaryEmbeddingNrO   r+   r2   r1   rR   rR   q   rP   r2   rR   c                       e Zd Zy)YoutuMLPNrO   r+   r2   r1   rT   rT   u   rP   r2   rT   c                       e Zd Zy)YoutuAttentionNrO   r+   r2   r1   rV   rV   y   rP   r2   rV   c                       e Zd Zy)YoutuDecoderLayerNrO   r+   r2   r1   rX   rX   }   rP   r2   rX   c                   :    e Zd Z ej                         d        Zy)YoutuPreTrainedModelc                    t        j                  | |       t        | j                  dd      }t        | j                  dd|z        }t	        |t
        j                        rft        j                  |j                  d|       |j                  7t        j                  |j                  j                  |j                            y y y )Nr#   r*   r$   r
   g        )meanstd)r   _init_weightsgetattrconfig
isinstancer   	Embeddinginitnormal_weightpadding_idxzeros_data)r.   moduler]   	embed_stds       r1   r^   z"YoutuPreTrainedModel._init_weights   s    %%dF3dkk#6=DKK)FCP	fbll+LLSi@!!-FMM..v/A/ABC . ,r2   N)r6   r7   r8   torchno_gradr^   r+   r2   r1   rZ   rZ      s    U]]_D Dr2   rZ   c                       e Zd Zy)
YoutuModelNrO   r+   r2   r1   rn   rn      rP   r2   rn   c                       e Zd Zy)YoutuForCausalLMNrO   r+   r2   r1   rp   rp      rP   r2   rp   )r   rZ   rn   rp   )&rk   huggingface_hub.dataclassesr   r    r   rc   modeling_utilsr   utilsr   r	   %deepseek_v3.configuration_deepseek_v3r    deepseek_v3.modeling_deepseek_v3r   llama.modeling_llamar   r   r   r   r   r   qwen3.modeling_qwen3r   
get_loggerr6   loggerr   rN   rR   rT   rV   rX   rZ   rn   rp   __all__r+   r2   r1   <module>r|      s   *  .  & - , D B  , 
		H	% 12<E" <E  3<E~	< 		/ 		x 		( 		) 		D/ 	D	 		' 	r2   