
    i                        d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZmZmZmZmZmZmZmZ ddlmZm Z   ejB                  e"      Z# ed      e G d de                    Z$ G d de       Z% G d de      Z& G d de      Z' G d de      Z( G d de      Z) G d de      Z* G d d e      Z+ G d! d"e      Z, G d# d$e      Z-g d%Z.y)&    )CallableN)strict   )Cache)PreTrainedConfig)FlashAttentionKwargs)RopeParameters)ALL_ATTENTION_FUNCTIONS)Unpack)auto_docstringlogging   )	LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaForQuestionAnsweringLlamaForSequenceClassificationLlamaForTokenClassificationLlamaPreTrainedModelapply_rotary_pos_embeager_attention_forward)
Qwen2ModelQwen2RotaryEmbeddingzHuggingFaceTB/SmolLM3-3B)
checkpointc                       e Zd ZU dZdZdgZdZddddddddZdgd	gfd
dgd
gfd
gd
gfdZdZ	e
ed<   dZe
ed<   dZe
ed<   dZe
ed<   dZe
ed<   dZe
dz  ed<   dZeed<   dZe
ed<   dZeed<   d Zeed!<   d"Zeed#<   d$Ze
dz  ed%<   d&Ze
dz  ed'<   d(Ze
ee
   z  dz  ed)<   dZeez  dz  ed*<   d+Z eed,<   dZ!e
dz  ed-<   dZ"ee
   dz  ed.<   dZ#e
ed/<   dZ$ee   dz  ed0<   d+Z%eed1<   d2Z&ee
z  ed3<   d+Z'eed4<   d"Z(eed5<    fd6Z) xZ*S )7SmolLM3Configa>  
    no_rope_layers (`List[int]`, *optional*):
        List with at least the same length as the number of layers in the model.
        A `1` at an index position indicates that the corresponding layer will use RoPE,
        while a `0` indicates that it's a NoPE layer.
    no_rope_layer_interval (`int`, *optional*, defaults to 4):
        If `no_rope_layers` is `None`, it will be created using a NoPE layer every
        `no_rope_layer_interval` layers.

    ```python
    >>> from transformers import SmolLM3Model, SmolLM3Config

    >>> # Initializing a SmolLM3 style configuration
    >>> configuration = SmolLM3Config()

    >>> # Initializing a model from the SmolLM3 style configuration
    >>> model = SmolLM3Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```smollm3past_key_valuesg    >Acolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi  
vocab_sizei   hidden_sizei +  intermediate_size$   num_hidden_layers   num_attention_heads   Nnum_key_value_headssilu
hidden_acti   max_position_embeddingsg{Gz?initializer_rangegư>rms_norm_epsT	use_cachei pad_token_idi  bos_token_idi eos_token_idrope_parametersFuse_sliding_windowsliding_windowno_rope_layersno_rope_layer_intervallayer_typesattention_bias        attention_dropoutmlp_biastie_word_embeddingsc                 "   | j                   | j                  | _         | j                  Dt        | j                        D cg c]   }t        |dz   | j                  z  dk7        " c}| _        | j                  g | _        t        | j                        D ]b  }| j                  |   }| j                  r*| j                  |s| j                  j                  d       H| j                  j                  d       d t        | 0  di | y c c}w )N   r   sliding_attentionfull_attention )r0   r.   r=   ranger,   intr>   r?   r;   r<   appendsuper__post_init__)selfkwargs	layer_idxhas_rope	__class__s       |/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/smollm3/modular_smollm3.pyrN   zSmolLM3Config.__post_init__q   s    ##+'+'?'?D$&Y^_c_u_uYv#LUY]d&A&AAQFG#D #!D"4#9#9: >	..y9**t/B/B/NW_$$++,?@$$++,<=> 	''#s   %D)+__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencedefault_thetabase_model_tp_planbase_model_pp_planr(   rK   __annotations__r)   r*   r,   r.   r0   r2   strr3   r4   floatr5   r6   boolr7   r8   r9   listr:   r	   dictr;   r<   r=   r>   r?   r@   rB   rC   rD   rN   __classcell__rS   s   @rT   r   r   ,   s   , J#4"5M &/%.%.%."+ )"+ &(9:#%568IJ!"_$56 JK"s"s!!&'t'J#(S(#u#L%It%L#*%%L#*%+1L#S	/D(148O^d*T18$$!%NC$J%'+NDI$+"#C#$(KcT!( ND %(us{(Hd $$( (    r   c                       e Zd Zy)SmolLM3RotaryEmbeddingNrU   rV   rW   rI   rf   rT   rh   rh          rf   rh   c                        e Zd Zdedef fdZ	 ddej                  deej                  ej                  f   dej                  dz  de	dz  d	e
e   d
eej                  ej                  dz  f   fdZ xZS )SmolLM3AttentionconfigrQ   c                     t         |   ||       |j                  |   | _        |j                  r$|j
                  |   dk(  r|j                  | _        y d | _        y )NrG   )rM   __init__r=   use_roper;   r?   r<   )rO   rm   rQ   rS   s      rT   ro   zSmolLM3Attention.__init__   sb    +--i8 ((V-?-?	-JNa-a !! 	  	rf   Nr#   position_embeddingsr$   r   rP   returnc                 8   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  r|\  }}t        ||	||      \  }}	| |j                  |	|
| j                        \  }	}
t        j                  | j                  j                  t              } || ||	|
|f| j                   sdn| j"                  | j$                  | j&                  d|\  }} |j(                  g |d j+                         }| j-                  |      }||fS )NrF   r   rA   )dropoutscalingr<   )shapehead_dimq_projview	transposek_projv_projrp   r   updaterQ   r
   get_interfacerm   _attn_implementationr   trainingrB   rv   r<   reshape
contiguouso_proj)rO   r#   rq   r$   r   rP   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                   rT   forwardzSmolLM3Attention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST==*HC';L*VY[^'_$L*&'6'='=j,X\XfXf'g$J(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HLL..
%
 
%
!\ *k));;;;FFHkk+.L((rf   )N)rU   rV   rW   r   rK   ro   torchTensortupler   r   r   r   rd   re   s   @rT   rl   rl      s    
} 
 
 )-()||() #5<<#=>() t+	()
 () -.() 
u||U\\D00	1()rf   rl   c                       e Zd Zy)SmolLM3DecoderLayerNri   rI   rf   rT   r   r      rj   rf   r   c                       e Zd Zy)SmolLM3PreTrainedModelNri   rI   rf   rT   r   r      rj   rf   r   c                       e Zd Zy)SmolLM3ModelNri   rI   rf   rT   r   r      rj   rf   r   c                       e Zd Zy)SmolLM3ForCausalLMNri   rI   rf   rT   r   r      rj   rf   r   c                       e Zd Zy) SmolLM3ForSequenceClassificationNri   rI   rf   rT   r   r      rj   rf   r   c                       e Zd Zy)SmolLM3ForTokenClassificationNri   rI   rf   rT   r   r      rj   rf   r   c                       e Zd Zy)SmolLM3ForQuestionAnsweringNri   rI   rf   rT   r   r      rj   rf   r   )r   r   r   r   r   r   r   )/collections.abcr   r   huggingface_hub.dataclassesr   cache_utilsr   configuration_utilsr   modeling_flash_attention_utilsr   modeling_rope_utilsr	   modeling_utilsr
   processing_utilsr   utilsr   r   llama.modeling_llamar   r   r   r   r   r   r   r   r   qwen2.modeling_qwen2r   r   
get_loggerrU   loggerr   rh   rl   r   r   r   r   r   r   r   __all__rI   rf   rT   <module>r      s    %  .   3 B 1 5 & ,
 
 
 D 
		H	% 56U($ U(  7U(p	1 	3)~ 3)l	+ 		1 		: 		) 		'E 		$? 		"; 	rf   