
    i9                     T   d dl mZmZ d dlmZ ddlmZ ddlmZm	Z	 ddl
mZ  e	j                  e      Z ed      e G d	 d
e                    Z ed      e G d de                    Z ed      e G d de                    Z ed      e G d de                    Zg dZy)    )AnyLiteral)strict   )PreTrainedConfig)auto_docstringlogging)intervalzgoogle/gemma-4-e2b-it)
checkpointc                   J    e Zd ZU dZdZdZeed<   dZeed<   dZ	eed<   d	Z
eed
<   dZee   eeef   z  ed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   d Zeed!<     ed"d#$      d%&      Zeed'<    fd(Z xZS ))Gemma4AudioConfiga  
    subsampling_conv_channels (`list[int]`, defaults to `[128, 32]`):
        Channel sizes for the convolutional layers in the Sub-sample Convolution Projection.
    residual_weight (`float`, defaults to `0.5`):
        Scaling applied to hidden_states prior to combining with the residual in the feedforward.
    attention_chunk_size (`int`, defaults to `12`):
        The sub-sequence size for attention processing.
    attention_context_left (`int`, defaults to `13`):
        The leftward context size for the attention chunk.
    attention_context_right (`int`, defaults to `0`):
        The rightward context size for the attention chunk.
    attention_logit_cap (`float`, defaults to `50.0`):
        Cap applied to attention weights.
    attention_invalid_logits_value (`float`, defaults to `1e-9`):
        Value to use for invalid logits in attention.
    use_clipped_linears (`bool`, defaults to `True`):
        If true, apply clipping to the Linear layers, drawing bounds from the model checkpoint.
    gradient_clipping (`float`, defaults to `1e10`):
        Clipping value used to stabilize extremely large gradient values.
    output_proj_dims (`int`, defaults to `1536`):
        Dimension of the final linear projection from `hidden_size` to the model's output.
    gemma4_audioi   hidden_size   num_hidden_layers   num_attention_headssilu
hidden_act)       subsampling_conv_channels   conv_kernel_sizeg      ?residual_weightattention_chunk_size   attention_context_leftr   attention_context_rightg      I@attention_logit_capg    eattention_invalid_logits_valueTuse_clipped_linearsư>rms_norm_epsg    _Bgradient_clippingi   output_proj_dims        g      ?)minmax{Gz?)defaultinitializer_rangec                     t        | j                  t              rt        | j                        | _        t	        |   di | y )N )
isinstancer   tuplelistsuper__post_init__selfkwargs	__class__s     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/gemma4/configuration_gemma4.pyr3   zGemma4AudioConfig.__post_init__N   s6    d44e<-1$2P2P-QD*''    ) __name__
__module____qualname____doc__
model_typer   int__annotations__r   r   r   strr   r1   r0   r   r   floatr   r   r   r    r!   r"   boolr$   r%   r&   r
   r,   r3   __classcell__r7   s   @r8   r   r      s    .  JKs  J >GtCy5c?:F c OU  "#""$C$#$S$!%%,2"E2 $$L%#u# c 9xCS9$GuG( (r9   r   c                       e Zd ZU dZdZdgZddddddddddddd	Zd
gdgfddgdgfdgdgfdZdZe	e
d<   dZe	e
d<   dZe	e
d<   dZe	e
d<   dZe	e
d<   dZe	e
d<   dZe	e
d<   dZee
d<   dZe	e
d <   d!Zee
d"<   d#Zee
d$<   d%Zee
d&<   d'Ze	d(z  e
d)<   d*Ze	ee	   z  d(z  e
d+<   d,Ze	d(z  e
d-<   d%Zee
d.<   d(Zed(z  e
d/<   d0Z ee
d1<   d2Z!e	ez  d(z  e
d3<   d4Z"e	e
d5<   d(Z#ee   d(z  e
d6<   d(Z$ed(z  e
d7<   d(Z%e&d8   d(z  e
d9<   dZ'e	e
d:<   dZ(e	e
d;<   d(Z)e	d(z  e
d<<   d4Z*e	e
d=<   d0Z+ee
d><   d'Z,e	e
d?<   d0Z-ee
d@<   d0Z.ee
dA<   d(Z/e	d(z  e
dB<   d(Z0e	d(z  e
dC<   d(Z1e	d(z  e
dD<    fdEZ2dF Z3 xZ4S )GGemma4TextConfiga<  
    use_bidirectional_attention (`str`, *optional*):
        Controls bidirectional attention behavior. When set to `"vision"`, vision tokens
        attend bidirectionally while text tokens use causal attention. When set to `"all"`,
        all tokens use bidirectional attention.
    vocab_size_per_layer_input (`int`, defaults to 262144):
        Vocabulary size for the per-layer input embeddings. Used by models with per-layer
        residual streams where a smaller embedding is added at each decoder layer.
    hidden_size_per_layer_input (`int`, defaults to 256):
        Hidden dimension for the per-layer input embeddings. Controls the width of the
        per-layer residual embedding vectors.
    num_global_key_value_heads (`int`, *optional*):
        Number of key-value heads for global (full) attention layers. If `None`, defaults
        to `num_key_value_heads`.
    global_head_dim (`int`, defaults to 512):
        Dimension of each attention head in global (full) attention layers.
    attention_k_eq_v (`bool`, defaults to `False`):
        Whether keys and values share the same projection weights. When `True`, the key
        projection output is reused as the value projection.
    num_kv_shared_layers (`int`, defaults to 0):
        Number of consecutive decoder layers that share the same key-value projections.
        A value of 0 means no sharing (each layer has independent KV projections).
    enable_moe_block (`bool`, defaults to `False`):
        Whether to enable Mixture-of-Experts (MoE) blocks in the decoder layers. When
        `True`, eligible layers will use a sparse MoE feed-forward network.
    use_double_wide_mlp (`bool`, defaults to `False`):
        Whether to use a double-width MLP with fused gate and up projections.
    top_k_experts (`int`, *optional*):
        Number of experts activated per token in MoE layers. Only used when
        `enable_moe_block=True`.
    moe_intermediate_size (`int`, *optional*):
        Intermediate (hidden) size of each expert's feed-forward network in MoE layers.
        Only used when `enable_moe_block=True`.
    gemma4_textpast_key_valuescolwisereplicated_with_grad_allreducerowwisepacked_colwisemoe_tp_experts)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.q_normzlayers.*.self_attn.k_normzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projzlayers.*.experts.gate_up_projzlayers.*.experts.down_projzlayers.*.experts	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi   
vocab_sizei 	  r   i $  intermediate_size   r   r   r      num_key_value_heads   head_dimgelu_pytorch_tanhhidden_activation   max_position_embeddingsr*   r,   r#   r$   T	use_cacher   Npad_token_id   eos_token_id   bos_token_idtie_word_embeddingsrope_parametersFattention_biasr'   attention_dropouti   sliding_windowlayer_typesfinal_logit_softcapping)allvisionuse_bidirectional_attentionvocab_size_per_layer_inputhidden_size_per_layer_inputnum_global_key_value_headsglobal_head_dimattention_k_eq_vnum_kv_shared_layersenable_moe_blockuse_double_wide_mlpnum_expertstop_k_expertsmoe_intermediate_sizec                    | j                   dk(  r| j                  dz  dz   | _        | j                  =d}t        | j                        D cg c]  }t        |dz   |z        rdnd c}| _        | j                  r<| j                  d   x}dk7  r(t        j                  d| d	       d| j                  d<   d
ddddddd}| j                  || _        t        | (  di | y c c}w )Nrn   re   rc      sliding_attentionfull_attentionz/Last layer must use `full_attention`, but got `z*`. Forcing last layer to `full_attention`.r+   g     @	rope_type
rope_thetaproportionalg      ?g    .A)r   partial_rotary_factorr   )r~   r   r.   )rp   rk   rl   ranger   rC   loggerwarningrh   r2   r3   )r5   r6   sliding_window_patternilast_layer_typedefault_rope_paramsr7   s         r8   r3   zGemma4TextConfig.__post_init__   s   ++u4#'#6#6!#;q"@D#%&" t556  (,QU6L,L'M#Scc D
 D4D4DR4H!HM] ]NNA/ARR|} $4DR  09!Q,:UYituf
 '#6D ''% s   C(c                     |S )Nr.   )r5   r6   s     r8   convert_rope_params_to_dictz,Gemma4TextConfig.convert_rope_params_to_dict   s    r9   )5r:   r;   r<   r=   r>   keys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planrV   r?   r@   r   rW   r   r   rZ   r\   r^   rA   r`   r,   rB   r$   ra   rC   rb   rd   r1   rf   rg   rh   dictri   rj   rk   rl   rm   rp   r   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r3   r   rD   rE   s   @r8   rG   rG   U   s7   !F J#4"5%.%.%.%E%E%."+ )"+)9&/, &(9:#%568IJ!"_$56 JK!s!s    Hc0s0#*S*#u#L%It L#* +,L#S	/D(, L#*  $$#'OTD[' ND ,/sU{T)/NC$(KcT!(,0UT\0CG!9D!@G&--'**-1d
1OS"d" !#!"d" %%"Kt" $M3:$(,3:,(4r9   rG   c            
       R    e Zd ZU dZdZdddddddddd	ZdZdZee	d	<   d
Z
ee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZedz  e	d<   dZedz  e	d<   dZedz  e	d<   dZee	d <   dZee	d!<   d"Zee	d#<   dZee	d$<   dZee	d%<   d&Zee	d'<    fd(Z xZ S ))Gemma4VisionConfiga  
    pooling_kernel_size (`int`, *optional*):
        Spatial pooling kernel size applied after patchification.
    position_embedding_size (`int`, defaults to 10240):
        Maximum number of position embeddings for the vision encoder. Controls the size of
        the learned 2D position embedding table used by the patch embedder.
    use_clipped_linears (`bool`, defaults to `False`):
        Whether to use weight-clipped linear layers. When enabled, linear layer weights are
        clamped to a fixed range during the forward pass to improve numerical stability.
    standardize (`bool`, defaults to `False`):
        If true, applies a bias and scale to the soft tokens returned from the pooler.
    gemma4_visionrJ   rK   rL   )	z!encoder.layers.*.self_attn.q_projz!encoder.layers.*.self_attn.k_projz!encoder.layers.*.self_attn.v_projz!encoder.layers.*.self_attn.q_normz!encoder.layers.*.self_attn.k_normz!encoder.layers.*.self_attn.o_projzencoder.layers.*.mlp.gate_projzencoder.layers.*.mlp.up_projzencoder.layers.*.mlp.down_proj      Y@i   r   i   rW      r   r   r   rZ   @   r\   r]   r^   r#   r$   r_   r`   FNri   r'   rj   rh   r   pooling_kernel_size
patch_sizei (  position_embedding_sizer"   standardizer*   r,   c                 P    | j                   
ddd| _         t        |   di | y )Nr+   r   r   r.   )rh   r2   r3   r4   s     r8   r3   z Gemma4VisionConfig.__post_init__  s,    '1:%#PD ''r9   )!r:   r;   r<   r=   r>   r   default_thetar   r?   r@   rW   r   r   rZ   r\   r^   rA   r$   rB   r`   ri   rC   rj   rh   r   r   r   r   r"   r   r,   r3   rD   rE   s   @r8   r   r      s    !J-6-6-6-M-M-6*3(1*3
 MK!s!s!!!!Hc0s0L%#*S*"'ND4K'&)ut|)#'OTD['  J#,S, %%K#u#( (r9   r   c                   N    e Zd ZU dZdZeeedZdZ	ee
eef   z  dz  ed<   dZee
eef   z  dz  ed<   dZee
eef   z  dz  ed<   dZedz  ed	<   d
Zedz  ed<   dZedz  ed<   dZedz  ed<   dZedz  ed<   dZedz  ed<   dZedz  ed<   dZedz  ed<   dZeed<    fdZ xZS )Gemma4Configag  
    boi_token_id (`int`, *optional*, defaults to 255999):
        The begin-of-image token index to wrap the image prompt.
    eoi_token_id (`int`, *optional*, defaults to 258882):
        The end-of-image token index to wrap the image prompt.
    boa_token_id (`int`, *optional*, defaults to 256000):
        The begin-of-audio token index to wrap the audio prompt.
    eoa_token_index (`int`, *optional*, defaults to 258883):
        The end-of-audio token index to wrap the audio prompt.

    Example:

    ```python
    >>> from transformers import (
    >>>     Gemma4AudioConfig,
    >>>     Gemma4Config,
    >>>     Gemma4ForConditionalGeneration,
    >>>     Gemma4TextConfig,
    >>>     Gemma4VisionConfig,
    >>> )

    >>> # Initializing a Gemma 4 Audio config.
    >>> audio_config = Gemma4AudioConfig()

    >>> # Initializing a Gemma 4 Text config.
    >>> text_config = Gemma4TextConfig()

    >>> # Initializing a Gemma 4 vision config.
    >>> vision_config = Gemma4VisionConfig()

    >>> # Initializing a Gemma 4 config similar to google/gemma-4-e2b-it
    >>> configuration = Gemma4Config(text_config, vision_config, audio_config)

    >>> # Initializing a model from the google/gemma-4-e2b-it configuration
    >>> model = Gemma4ForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```gemma4)text_configvision_configaudio_configNr   r   r   i boi_token_idiB eoi_token_idi@ image_token_idiD video_token_idi  boa_token_idiC eoa_token_indexiA audio_token_idr*   r,   Trg   c                 B   | j                   %t               | _         t        j                  d       n4t	        | j                   t
              rt        di | j                   | _         | j                  t        j                  d       t	        | j                  t
              rt        di | j                  | _        | j                  t        j                  d       t	        | j                  t
              rt        di | j                  | _        t        | ,  di | y )Nz4text_config is None. Using default Gemma4TextConfig.zHvision_config is None. Gemma4Model.vision_tower will not be initialized.zFaudio_config is None. Gemma4Model.audio_tower will not be initialized.r.   )r   rG   r   infor/   r   r   r   r   r   r2   r3   r4   s     r8   r3   zGemma4Config.__post_init__J  s    #/1DKKNO(($//C$2B2BCD%KKbcd(($/!3!Id6H6H!ID$KK`ad''. 1 FD4E4E FD''r9   )r:   r;   r<   r=   r>   rG   r   r   sub_configsr   r   rA   r   r@   r   r   r   r?   r   r   r   r   r   r   r,   rB   rg   rC   r3   rD   rE   s   @r8   r   r     s    &P J'+)K =AK!DcN2T9@@DM%S#X6=D>BL#d38n4t;B&L#*&&L#*&!(NC$J(!(NC$J(&L#*&")OS4Z)!(NC$J(&*ut|* $$( (r9   r   )r   r   rG   r   N)typingr   r   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r	   utils.type_validatorsr
   
get_loggerr:   r   r   rG   r   r   __all__r.   r9   r8   <module>r      s      . 3 , - 
		H	% 235(( 5(  45(p 23y' y  4yx 233() 3(  43(l 23N(# N(  4N(b Zr9   