
    i>+                        d dl mZ ddlmZ ddlmZ ddlmZmZ  ej                  e
      Z ed      e G d d	e                    Z ed      e G d
 de                    Z ed      e G d de                    Zg dZy)    )strict   )PreTrainedConfig)RopeParameters)auto_docstringloggingz meta-llama/Llama-4-Scout-17B-16E)
checkpointc                      e Zd ZU dZddddddddZdZdZdZee	d	<   d
Z
ee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZeee   z  eeef   z  e	d<   dZeee   z  eeef   z  e	d<   dZee	d<   dZee	d<   dZee	d<   d Zee	d!<   d"Zee	d#<   d"Zee	d$<   d%Zee	d&<   d'Zeez  e	d(<   d'Zeez  e	d)<   d*Z e!e"z  d*z  e	d+<   y*),Llama4VisionConfigaw  
    vision_output_dim (`int`, *optional*, defaults to 7680):
        Dimensionality of the vision model output. Includes output of transformer
        encoder with intermediate layers and global transformer encoder.
    pixel_shuffle_ratio (`float`, *optional*, defaults to 0.5):
        Pixel-shuffle ratio for downsampling patch tokens. Smaller values produce fewer tokens (more downsampling).
    projector_input_dim (`int`, *optional*, defaults to 4096):
        Width of the vision adapter MLP before pixel shuffle. Larger value increases capacity and compute.
    projector_output_dim (`int`, *optional*, defaults to 4096):
        Output width of the vision adapter. Larger value yields higher-dimensional image features.
    projector_dropout (`float`, *optional*, defaults to 0.0):
        Dropout rate inside the vision adapter MLP. Higher value adds more regularization.
    colwiserowwisecolwise_gather_output)zmodel.layers.*.self_attn.q_projzmodel.layers.*.self_attn.k_projzmodel.layers.*.self_attn.v_projzmodel.layers.*.self_attn.o_projzvision_adapter.mlp.fc1zvision_adapter.mlp.fc2zpatch_embedding.linearllama4_vision_modelvision_configi   hidden_sizegelu
hidden_act"   num_hidden_layers   num_attention_headsr   num_channelsi   intermediate_sizei   vision_output_dimi  
image_size   
patch_sizeh㈵>norm_epsdefaultvision_feature_select_strategy{Gz?initializer_rangeg      ?pixel_shuffle_ratioi   projector_input_dimprojector_output_dimFmulti_modal_projector_bias        projector_dropoutattention_dropoutNrope_parameters)#__name__
__module____qualname____doc__base_model_tp_plan
model_typebase_config_keyr   int__annotations__r   strr   r   r   r   r   r   listtupler   r   floatr!   r#   r$   r%   r&   r'   boolr)   r*   r+   r   dict     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/llama4/configuration_llama4.pyr   r      s8    ,5+4+4+4"+"+"9 'J%OKJs!!L#!s!!s!47Jd3i%S/1746Jd3i%S/16He*3"C3#u#!$$## $#$',,%(us{(%(us{(48O^d*T18r<   r   c                       e Zd ZU dZdZdgZdZdddddddddddddZddddd	d	dddd
d
ZdZ	e
ed<   dZe
ed<   dZe
ed<   dZe
ed<   dZe
ed<   dZe
ed<   dZe
ed<   dZe
ed<   dZeed<   dZe
ed<   d Zeed!<   d"Zeed#<   d$Zeed%<   d&Ze
d&z  ed'<   d(Ze
d&z  ed)<   d*Ze
ee
   z  d&z  ed+<   d,Zeed-<   d.Z ee
z  ed/<   d(Z!e
ed0<   d1Z"e
ed2<   d&Z#ee
   d&z  ed3<   d(Z$e
ed4<   d$Z%eed5<   d,Z&eed6<   d7Z'eed8<   d.Z(eed9<   d&Z)e*e+z  d&z  ed:<   d&Z,ee
   d&z  ed;<   d<Z-e
ed=<   dZ.e
d&z  ed><   d&Z/ee   d&z  ed?<   d$Z0eed@<   dZ1e
edA<   dBZ2eedC<   d,Z3eedD<    fdEZ4 xZ5S )FLlama4TextConfigaO  
    intermediate_size_mlp (`int`, *optional*, defaults to 16384):
        Intermediate size of dense MLP layers. Larger value increases FFN capacity and compute.
    moe_layers (`list[int]`, *optional*):
        List of layer indices that use MoE. Overrides `interleave_moe_layer_step` when set.
    interleave_moe_layer_step (`int`, *optional*, defaults to 1):
        Spacing between MoE layers when `moe_layers` is `None`. Larger value means fewer MoE layers.
    use_qk_norm (`bool`, *optional*, defaults to `True`):
        Whether to L2-normalize queries/keys on RoPE layers. Can stabilize attention when enabled.
    no_rope_layers (`list[int]`, *optional*):
        List with at least the same length as the number of layers in the model.
        A `1` at an index position indicates that the corresponding layer will use RoPE,
        while a `0` indicates that it's a NoPE layer.
    no_rope_layer_interval (`int`, *optional*, defaults to 4):
        If `no_rope_layers` is `None`, it will be created using a NoPE layer every
        `no_rope_layer_interval` layers.
    attention_chunk_size (`int`, *optional*, defaults to 8192):
        Chunk size for the attention computation. Smaller value enforces more local attention and lowers memory.
    attn_temperature_tuning (`bool`, *optional*, defaults to `True`):
        Whether to dynamically scale the attention temperature for each query token based on sequence length.
        Recommended for long sequences (e.g., >32k tokens) to maintain stable output results.
    floor_scale (`int`, *optional*, defaults to 8192):
        Base scale (in tokens) for attention temperature tuning. Larger value delays scaling to longer positions.
    attn_scale (`float`, *optional*, defaults to 0.1):
        Strength of attention temperature tuning. Larger value increases scaling at long positions.

    Example:
    llama4_textpast_key_valuesg    Ar   r   packed_rowwise)layers.*.self_attn.q_projlayers.*.self_attn.k_projlayers.*.self_attn.v_projlayers.*.self_attn.o_projz-layers.*.feed_forward.shared_expert.gate_projz+layers.*.feed_forward.shared_expert.up_projz-layers.*.feed_forward.shared_expert.down_proj*layers.*.feed_forward.experts.gate_up_proj'layers.*.feed_forward.experts.down_projlayers.*.feed_forward.gate_projlayers.*.feed_forward.up_projlayers.*.feed_forward.down_projgrouped_gemm	ep_router)
rC   rD   rE   rF   rG   rH   rI   rJ   rK   zlayers.*.feed_forward.routeri@ 
vocab_sizei   r   i    r   i @  intermediate_size_mlp0   r   (   r      num_key_value_heads   head_dimsilur   i   max_position_embeddingsr"   r#   r   rms_norm_epsT	use_cacheNpad_token_id   bos_token_id   eos_token_idFtie_word_embeddingsr(   r*   num_experts_per_tokr   num_local_experts
moe_layersinterleave_moe_layer_stepuse_qk_normoutput_router_logitsgMbP?router_aux_loss_coefrouter_jitter_noiser+   no_rope_layers   no_rope_layer_intervalattention_chunk_sizelayer_typesattn_temperature_tuningfloor_scaleg?
attn_scaleattention_biasc                    | j                   | j                  | _         t        | j                        D cg c]   }t	        |dz   | j
                  z  dk7        " }}| j                  r| j                  n|| _        | j                  | j                  n| j                  | j                  z  | _        | j                  | j                  n6t        t        | j                  dz
  | j                  | j                              | _	        | j                  #| j                  D cg c]  }|rdnd
 c}| _        t        | 8  di | y c c}w c c}w )Nr[   r   chunked_attentionfull_attentionr;   )rS   r   ranger   r3   rj   rh   rU   r   rb   r6   rc   rl   super__post_init__)selfkwargs	layer_idxdefault_no_rope_layersno_rope	__class__s        r=   rv   zLlama4TextConfig.__post_init__   sA   ##+'+'?'?D$ V[[_[q[qUr"
HQCQ$"="==BC"
 "
 6:5H5Hd11Nd)-)BHXHX\`\t\tHt * OO22Q6**22 	 #TXTgTg IPw#4DD D 	''/"
& s   %E E
)6r,   r-   r.   r/   r1   keys_to_ignore_at_inferencedefault_thetar0   base_model_ep_planrN   r3   r4   r   r   rO   r   r   rS   rU   r   r5   rW   r#   r8   rX   rY   r9   rZ   r\   r^   r6   r_   r*   r`   ra   rb   rc   rd   re   rf   rg   r+   r   r:   rh   rj   rk   rl   rm   rn   ro   rp   rv   __classcell__r|   s   @r=   r?   r?   M   s/   : J#4"5M%.%.%.%.9B7@9B6F3<+4)2+4 &/%.%.%.6D3A+4)2+4(3 JK!s!!&3&s!!  HcJ#,S,#u#L%It#L#*# L#* +,L#S	/D(, %%%(us{(  s#'JS	D '%&s&K!&$&"'%'!$$48O^d*T18'+NDI$+"#C#'+#*+$(KcT!($(T(KJ ND ( (r<   r?   c                        e Zd ZU dZdZddddZeedZdd	iZ	d
Z
eez  d
z  ed<   d
Zeez  d
z  ed<   dZeed<   dZeed<   dZeed<   dZeed<    fdZ xZS )Llama4Configat  
    boi_token_index (`int`, *optional*, defaults to 200080):
        The begin-of-image token index to wrap the image prompt.
    eoi_token_index (`int`, *optional*, defaults to 200081):
        The end-of-image token index to wrap the image prompt.

    ```python
    >>> from transformers import Llama4Model, Llama4Config

    >>> # Initializing a Llama4 7B style configuration
    >>> configuration = Llama4Config()

    >>> # Initializing a model from the Llama4 7B style configuration
    >>> model = Llama4Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    llama4image_token_indexboi_token_indexeoi_token_index)image_token_idboi_token_ideoi_token_id)text_configr   zmulti_modal_projector.linear_1colwise_repNr   r   i i i Fr_   c                    | j                   %t               | _         t        j                  d       n4t	        | j                   t
              rt        di | j                   | _         | j                  %t               | _        t        j                  d       n4t	        | j                  t
              rt        di | j                  | _        t        | $  di | y )Nz9vision_config is None, using default llama4 vision configz5text_config is None, using default llama4 text configr;   )
r   r   loggerinfo
isinstancer:   r   r?   ru   rv   )rw   rx   r|   s     r=   rv   zLlama4Config.__post_init__   s    %!3!5DKKST**D1!3!Id6H6H!ID#/1DKKOP(($//C$2B2BCD''r<   )r,   r-   r.   r/   r1   attribute_mapr?   r   sub_configsr0   r   r:   r   r4   r   r   r3   r   r   r_   r9   rv   r   r   s   @r=   r   r      s    ( J-))M
 #3EWXK(- 59M4**T1826K((4/6!OS!!OS!#s# %%( (r<   r   )r   r?   r   N)huggingface_hub.dataclassesr   configuration_utilsr   modeling_rope_utilsr   utilsr   r   
get_loggerr,   r   r   r?   r   __all__r;   r<   r=   <module>r      s   " / 3 1 , 
		H	% =>-9) -9  ?-9` =>{(' {(  ?{(| =>3(# 3(  ?3(l Er<   