
    iV                        d dl mZ d dlmZmZ d dlZd dlmZ d dlm	Z	 ddl
mZ ddlmZmZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZmZmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z;m<Z<m=Z=m>Z>m?Z? ddl@mAZA  e)j                  eC      ZD e'd      e	 G d de/e                    ZE e'd      e	 G d de                    ZF G d de>      ZG G d d e;      ZH G d! d"ej                        ZJ G d# d$e3      ZK G d% d&e6      ZL G d' d(e7      ZM G d) d*e1      ZN G d+ d,e      ZOdZP G d- d.e5      ZQd/eRd0eeReReReRgeSf   fd1ZT G d2 d3e4      ZU G d4 d5e2      ZV G d6 d7ej                        ZX e+d8d9d:;      	 	 	 	 dNd<ed:ej                  d=ej                  dz  d>edz  d?ej                  dz  d@ej                  dz  dAej                  dz  dBeSdCeSdz  d0e[fdD       Z\ G dE dFe=      Z] G dG dHe<      Z^ G dI dJeQ      Z_ G dK dLeeQ      Z`g dMZay)O    )Callable)AnyOptionalN)strict   )initialization)CacheDynamicCache)PreTrainedConfig)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPooling SequenceClassifierOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)maybe_autocast   )Gemma2Config)	Gemma2AttentionGemma2ForCausalLM	Gemma2MLPGemma2ModelGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingapply_rotary_pos_embeager_attention_forward)PaliGemmaCausalLMOutputWithPast!PaliGemmaForConditionalGenerationPaliGemmaModelPaligemmaModelOutputWithPasttoken_type_ids_mask_function)SiglipVisionConfigzgoogle/gemma-3-4b-it)
checkpointc            
           e Zd ZU dZdZdddddddddd	Zddd	Zd
Zee	d<   dZ
ee	d<   dZee   dz  e	d<   dZedz  e	d<   dZedz  e	d<   dZedz  e	d<   dZedz  e	d<   d Zd Zy)Gemma3TextConfiga  
    query_pre_attn_scalar (`float`, *optional*, defaults to 256):
        scaling factor used on the attention scores
    final_logit_softcapping (`float`, *optional*):
        Scaling factor when applying tanh softcapping on the logits.
    attn_logit_softcapping (`float`, *optional*):
        Scaling factor when applying tanh softcapping on the attention scores.
    use_bidirectional_attention (`bool`, *optional*, defaults to `False`):
        If True, the model will attend to all text tokens instead of using a causal mask. This does not change
        behavior for vision tokens.

    ```python
    >>> from transformers import Gemma3TextModel, Gemma3TextConfig
    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
    >>> configuration = Gemma3TextConfig()
    >>> # Initializing a model from the gemma3_text-7b style configuration
    >>> model = Gemma3TextModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    gemma3_textcolwisereplicated_with_grad_allreducerowwise)	zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.q_normzlayers.*.self_attn.k_normzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projg    .Ag     @)globallocali@  
vocab_sizei   max_position_embeddingsNlayer_typesfinal_logit_softcappingattn_logit_softcappingrope_parametersFuse_bidirectional_attentionc                 N   | j                   r| j                  dz  dz   | _        |j                  dd      | _        | j                  Et        | j                        D cg c]!  }t        |dz   | j                  z        rdnd# c}| _        t        j                  di | y c c}w )Nr      sliding_window_pattern   sliding_attentionfull_attention )
r?   sliding_windowget_sliding_window_patternr;   rangenum_hidden_layersboolr   __post_init__)selfkwargsis      z/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/gemma3/modular_gemma3.pyrM   zGemma3TextConfig.__post_init__m   s    ++#'#6#6!#;q"@D (.zz2JA'N$# t556  (,QUd6R6R,R'S#Yii D
 	&&00 s   &B"c                 z   |j                  dd       }ddiddid}| j                  | j                  n|| _        || j                  d   j                  |       | j                  j                  d      ddi| j                  d<   | j                  d   j	                  d|j                  d| j
                  d                | j                  j                  d      ddi| j                  d<   | j                  d   j	                  d|j                  d	| j
                  d
                | j                          |S )Nrope_scaling	rope_typedefault)rD   rE   rE   
rope_thetar7   rD   rope_local_base_freqr8   )popr>   updaterH   
setdefaultdefault_thetastandardize_rope_params)rN   rO   rS   default_rope_paramss       rQ   convert_rope_params_to_dictz,Gemma3TextConfig.convert_rope_params_to_dict|   sI   zz.$7
 #.y!9*I6
 8<7K7K7Wt33]p#  !1299,G ##$45=6A95MD  !12-.99&**\43E3Eh3OP	
 ##$78@9Di8PD  !4501<<&**%;T=O=OPW=XY	

 	$$&    )__name__
__module____qualname____doc__
model_typebase_model_tp_planr[   r9   int__annotations__r:   r;   liststrr<   floatr=   r>   dictr?   rL   rM   r^   rF   r_   rQ   r2   r2   >   s    , J%.%.%.%E%E%."+ )"+
  +X>MJ#*S*$(KcT!(,0UT\0+/EDL/#'OTD['/441r_   r2   c                        e Zd ZU dZdZddddZeedZdZ	ee
eef   z  dz  ed	<   dZee
eef   z  dz  ed
<   dZedz  ed<   dZedz  ed<   dZedz  ed<   dZedz  ed<   dZedz  ed<   dZedz  ed<    fdZ xZS )Gemma3Configa  
    mm_tokens_per_image (`int`, *optional*, defaults to 256):
        The number of tokens per image embedding.
    boi_token_index (`int`, *optional*, defaults to 255999):
        The begin-of-image token index to wrap the image prompt.
    eoi_token_index (`int`, *optional*, defaults to 256000):
        The end-of-image token index to wrap the image prompt.

    Example:

    ```python
    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

    >>> # Initializing a Siglip-like vision config
    >>> vision_config = SiglipVisionConfig()

    >>> # Initializing a Gemma3 Text config
    >>> text_config = Gemma3TextConfig()

    >>> # Initializing a Gemma3 gemma-3-4b style configuration
    >>> configuration = Gemma3Config(vision_config, text_config)

    >>> # Initializing a model from the gemma-3-4b style configuration
    >>> model = Gemma3TextConfig(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```gemma3image_token_indexboi_token_indexeoi_token_index)image_token_idboi_token_ideoi_token_id)text_configvision_configNru   rv      mm_tokens_per_imagei i  i   g{Gz?initializer_rangeTtie_word_embeddingsc                    | j                   %t               | _         t        j                  d       n4t	        | j                   t
              rt        di | j                   | _         t	        | j                  t
              rt        di | j                  | _        n0| j                  $t               | _        t        j                  d       t        | $  di | y )Nz@text_config is None, using default Gemma3TextConfig text config.zFvision_config is None, using default SiglipVisionConfig vision config.rF   )
ru   r2   loggerinfo
isinstancerk   rv   r/   superrM   )rN   rO   	__class__s     rQ   rM   zGemma3Config.__post_init__   s    #/1DKKZ[(($//C$2B2BCDd(($/!3!Id6H6H!ID'!3!5DKK`a''r_   )r`   ra   rb   rc   rd   attribute_mapr2   r/   sub_configsru   rk   ri   r   rg   rv   rx   rf   rp   rq   ro   ry   rj   rz   rL   rM   __classcell__r   s   @rQ   rm   rm      s    : J-))M (+K
 =AK!DcN2T9@@DM%S#X6=D&)t)")OS4Z)")OS4Z)$+sTz+&*ut|*'++( (r_   rm   c                       e Zd Zy)Gemma3ModelOutputWithPastNr`   ra   rb   rF   r_   rQ   r   r          r_   r   c                       e Zd Zy)Gemma3CausalLMOutputWithPastNr   rF   r_   rQ   r   r      r   r_   r   c            	       Z     e Zd ZdZd	dedededef fdZdej                  f fdZ	 xZ
S )
Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    num_embeddingsembedding_dimpadding_idxembed_scalec                     t         |   |||       || _        | j                  dt	        j
                  |      d       y )Nr   F
persistent)r   __init__scalar_embed_scaleregister_buffertorchtensor)rN   r   r   r   r   r   s        rQ   r   z&Gemma3TextScaledWordEmbedding.__init__   s;    D"-]ELL,ERWXr_   	input_idsc                     t         |   |      | j                  j                  | j                  j
                        z  S N)r   forwardr   toweightdtype)rN   r   r   s     rQ   r   z%Gemma3TextScaledWordEmbedding.forward   s2    wy)D,<,<,?,?@Q@Q,RRRr_   )      ?)r`   ra   rb   rc   rf   rj   r   r   Tensorr   r   r   s   @rQ   r   r      sG    Ys Y3 YS Y_d Y
S S Sr_   r   c                   $     e Zd Zdef fdZ xZS )	Gemma3MLPconfigc                 $    t         |   |       y r   r   r   rN   r   r   s     rQ   r   zGemma3MLP.__init__   s     r_   )r`   ra   rb   r2   r   r   r   s   @rQ   r   r      s    !/ ! !r_   r   c                   *     e Zd Zddedef fdZ xZS )Gemma3RMSNormdimepsc                 (    t         |   ||       y )Nr   r   r   )rN   r   r   r   s      rQ   r   zGemma3RMSNorm.__init__   s    Sc*r_   )gư>)r`   ra   rb   rf   rj   r   r   r   s   @rQ   r   r      s    +C +e + +r_   r   c                       e Zd ZddefdZe	 	 	 	 ddedz  ded   dedz  dedz  de	d	e
f   f
d
       Z ej                         edd              Zy)Gemma3RotaryEmbeddingNr   c                    t         j                  j                          |j                  | _        |j                  | _        || _        t        t        |j                              | _	        i | _
        | j                  D ]  }| j                  j                  |   }||d   | j                  |<   | j                  }| j                  |   dk7  rt        | j                  |      } || j                  ||      \  }}| j                  | d|d       | j                  | d|j                         d       t!        | | d|        y )	NrT   rU   
layer_type	_inv_freqFr   _original_inv_freq_attention_scaling)nnModuler   r:   max_seq_len_cachedoriginal_max_seq_lenr   rh   setr;   rT   r>   compute_default_rope_parametersr   r   clonesetattr)rN   r   devicer   rope_paramsrope_init_fncurr_inv_freqcurr_attention_scalings           rQ   r   zGemma3RotaryEmbedding.__init__   s<   
		"("@"@$*$B$B!F$6$6 78** 	UJ++55jAK")4[)ADNN:&%)%I%IL~~j)Y624>>*3MN4@fak4l1M1  J<y!9=UZ [  J</A!BMDWDWDYfk lDZL(:;=ST	Ur_   r   ztorch.deviceseq_lenr   returnztorch.Tensorc                     | j                   |   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a|  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
            layer_type (`str`, *optional*):
                The current layer type if the model has different RoPE parameters per type.
                Should not be used unless `config.layer_types is not None`

        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        rV   head_dimNr   r   r   r   r   r   )	r>   getattrhidden_sizenum_attention_headsr   arangeint64r   rj   )r   r   r   r   baser   attention_factorinv_freqs           rQ   r   z5Gemma3RotaryEmbedding.compute_default_rope_parameters  s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r_   c                 N   t        | | d      }t        | | d      }|d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d	      5  |j                         |j                         z  j                  dd
      }	t        j                  |	|	fd      }
|
j                         |z  }|
j                         |z  }d d d        j	                  |j                        j	                  |j                        fS # 1 sw Y   AxY w)Nr   r   r   rA   mpscpuF)device_typeenabledr   r   r   )r   rj   expandshaper   r   r~   typeri   r   	transposer   catcossinr   )rN   xposition_idsr   r   attention_scalinginv_freq_expandedposition_ids_expandedr   freqsembr   r   s                rQ   r   zGemma3RotaryEmbedding.forward:  sl    4J<y!9:#DZL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	0&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')//C'')//C		0 vvAGGv$cff177f&;;;	0 	0s   *A1FF$)NNNNNNr   )r`   ra   rb   r2   r   staticmethodr   rf   ri   tuplerj   r   r   no_gradr   r   rF   r_   rQ   r   r      s    U/ U. *.+/"!%	!* 4'!*(!* t!* $J	!*
 
~u$	%!* !*F U]]_<  <r_   r   c                        e Zd Zdedef fdZ	 	 	 ddej                  dej                  dej                  dz  dedz  d	e	e
   d
eej                  ej                  dz  eej                     dz  f   fdZ xZS )Gemma3Attentionr   	layer_idxc                 b   t         |   ||       | j                  dk(  r|j                  nd | _        | j                  dk(  | _        | j
                  j                   | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )NrD   r   )r   r   r   rG   
is_slidingr   r?   	is_causalr   r   rms_norm_epsq_normk_normrN   r   r   r   s      rQ   r   zGemma3Attention.__init__O  s    +7;J]7]f33cg//-@@![[DDD#V=P=PQ#V=P=PQr_   Nhidden_statesposition_embeddingsattention_maskpast_key_valuesrO   r   c                 d   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      }| j                  |	      }	|\  }}t        ||	||      \  }}	| |j                  |	|
| j                        \  }	}
t        j                  | j                  j                  t               } || ||	|
|f| j"                  r| j$                  nd| j&                  | j(                  d|\  }} |j*                  g |d j-                         }| j/                  |      }||fS )Nr   rA   r   g        )dropoutscalingrG   )r   r   q_projviewr   k_projv_projr   r   r(   rY   r   r   get_interfacer   _attn_implementationr)   trainingattention_dropoutr   rG   reshape
contiguouso_proj)rN   r   r   r   r   rO   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   attention_interfaceattn_outputattn_weightss                   rQ   r   zGemma3Attention.forwardX  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&S#7jRUWZ#[ j&'6'='=j,X\XfXf'g$J(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
!\ *k));;;;FFHkk+.L((r_   )NNN)r`   ra   rb   r2   rf   r   r   r   r	   r   r   r   r   r   r   s   @rQ   r   r   N  s    R/ RC R -1.2(,*)||*) #\\*) t+	*)
 *) +,*) 
u||U\\D0%2E2LL	M*)r_   r   c                       e Zd Zdedef fdZ	 	 	 	 ddej                  dej                  dej                  dz  dej                  dz  d	e	dz  d
e
e   deej                  eej                  ej                  f   dz  f   fdZ xZS )Gemma3DecoderLayerr   r   c                    t         |           || _        |j                  | _        || _        t        ||      | _        t        |      | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        y )N)r   r   r   )r   r   r   r   r   r   	self_attnr   mlpr   r   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   s      rQ   r   zGemma3DecoderLayer.__init__  s    !--"()LV$,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'r_   Nr   r   r   r   r   rO   r   c           	         |}| j                  |      } | j                  d|||||d|\  }}| j                  |      }||z   }|}| j                  |      }| j	                  |      }| j                  |      }||z   }|S )N)r   r   r   r   r   rF   )r  r  r  r  r  r  )	rN   r   r   r   r   r   rO   residual_s	            rQ   r   zGemma3DecoderLayer.forward  s     !,,];)4>> 
' 3)%+
 
q 55mD =0 66}E/77F =0r_   r   )r`   ra   rb   r2   rf   r   r   r   
LongTensorr	   r   r   r   FloatTensorr   r   r   s   @rQ   r  r    s    
c/ 
cC 
c -1.204(,|| #\\ t+	
 &&-  +, 
u  %(9(95;L;L(L"MPT"TT	Ur_   r  c                   J    e Zd ZdZdZg dZ ej                         d        Zy)Gemma3PreTrainedModelmodel)imagetext)r  SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadc                    t        j                  | |       t        |t              r t	        j
                  |j                         y d|j                  j                  v r t	        j
                  |j                         y t        |t              r+t	        j                  |j                  |j                         y t        |t              r|j                  D ]  }|j                   }|j"                  |   dk7  rt$        |j"                  |      } ||j&                  |      \  }}t	        j(                  t+        || d      |       t	        j(                  t+        || d      |        y y )NRMSNormrU   r   r   r   )r   _init_weightsr~   Gemma3MultiModalProjectorinitzeros_mm_input_projection_weightr   r`   r   r   	constant_r   r   r   r;   r   rT   r   r   copy_r   )rN   moduler   r   r   r  s         rQ   r'  z#Gemma3PreTrainedModel._init_weights  s    %%dF3f78KK99:&**333KK& =>NN6--v/H/HI 56$00 ^
%EE##J/9<#6v7G7G
7S#TL#/*#U q

76j\+CDmT

76j\9K+LM}]^ 7r_   N)	r`   ra   rb   base_model_prefixinput_modalities_no_split_modulesr   r   r'  rF   r_   rQ   r  r    s4    ( U]]_^ ^r_   r  rG   r   c           
      P     dt         dt         dt         dt         dt        f
 fd}|S )zA
    Enables a bidirectional mask within the sliding window.
    	batch_idxhead_idxq_idxkv_idxr   c                 &    t        ||z
        k  S )zA token can attend to any other token if their absolute distance is within
        the (exclusive) sliding window size (distance < sliding_window).)abs)r3  r4  r5  r6  rG   s       rQ   
inner_maskz1_bidirectional_window_overlay.<locals>.inner_mask  s     56>"^33r_   )rf   rL   )rG   r9  s   ` rQ   _bidirectional_window_overlayr:    s3    
4c 4S 4 4c 4d 4
 r_   c                        e Zd ZU eed<   dZdef fdZ	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  d	ej                  dz  d
edz  dee   defdZ xZS )Gemma3TextModelr   r!  c                     t         |   |       t        |j                  |j                  | j
                  | j                  j                  dz        | _        y )N      ?)r   )r   r   r   r9   r   r   r   embed_tokensr   s     rQ   r   zGemma3TextModel.__init__  sM      :v1143C3CQUQ\Q\QhQhjmQm
r_   Nr   r   r   r   inputs_embeds	use_cacherO   r   c           	         |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|V||j	                         nd}t        j                  |j                  d   |j                        |z   }|j                  d      }t        |x}	t              sw| j                  ||||d}
|
j                         }| j                  j                  r(d |
d<   t        | j                  j                        |d<   t!        di |
t#        di |d	}	|}i }| j                  j$                  D ]  }| j'                  |||      ||<    t)        | j*                  d | j                  j,                         D ]G  \  }} ||f|	| j                  j$                  |      || j                  j$                  |      ||d
|}I | j/                  |      }t1        ||      S )N:You must specify exactly one of input_ids or inputs_embeds)r   r   rA   r   r   rA  r   r   r   c                  L    t        j                  dt         j                        S )NTr   )r   r   rL   )argss    rQ   <lambda>z)Gemma3TextModel.forward.<locals>.<lambda>  s    TY^YcYc@d r_   or_mask_function)rE   rD   )r   r   r   r   )last_hidden_stater   rF   )
ValueErrorr@  r
   r   get_seq_lengthr   r   r   r   	unsqueezer~   rk   copyr?   r:  rG   r   r   r;   
rotary_emb	enumeratelayersrK   normr   )rN   r   r   r   r   rA  rB  rO   past_seen_tokenscausal_mask_mappingmask_kwargssliding_mask_kwargsr   r   r   rP   decoder_layers                    rQ   r   zGemma3TextModel.forward  s    -t";<YZZ  --i8M0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-F ++!."0#2 ,K #."2"2"4{{662d./:WX\XcXcXrXr:s#$67 #5"C{"C%F%]I\%]# & ++11 	gJ.2oom\[e.f
+	g !*$++6U8U8U*V W 	A})24;;3J3J13MN$78O8OPQ8R$S) / M	 		-0&++
 	
r_   )NNNNNN)r`   ra   rb   r2   rg   r0  r   r   r  r   r	   r  rL   r   r   r   r   r   r   s   @rQ   r<  r<    s     
/ 
 .2.204(,26!%C
##d*C
 t+C
 &&-	C

 C
 ((4/C
 $;C
 +,C
 
!C
r_   r<  c                   0     e Zd ZU eed<   def fdZ xZS )Gemma3ForCausalLMr   c                 D    t         |   |       t        |      | _        y r   )r   r   r<  r  r   s     rQ   r   zGemma3ForCausalLM.__init__6  s     $V,
r_   )r`   ra   rb   r2   rg   r   r   r   s   @rQ   rZ  rZ  3  s    -/ - -r_   rZ  c                   D     e Zd Zdef fdZdej                  fdZ xZS )r(  r   c                    t         |           t        j                  t	        j
                  |j                  j                  |j                  j                              | _	        t        |j                  j                  |j                  j                        | _        t        |j                  j                  |j                  j                  z        | _        t        |j"                  dz        | _        | j                   | j$                  z  | _        t        j(                  | j&                  | j&                        | _        y )Nr  r?  )kernel_sizestride)r   r   r   	Parameterr   zerosrv   r   ru   r+  r   layer_norm_epsmm_soft_emb_normrf   
image_size
patch_sizepatches_per_imagerx   tokens_per_sider^  	AvgPool2davg_poolr   s     rQ   r   z"Gemma3MultiModalProjector.__init__<  s    *,,,KK,,88&:L:L:X:XY+
' !.  ,,&2F2F2U2U!
 "%V%9%9%D%DH\H\HgHg%g!h"6#=#=s#BC11T5I5II1A1A$JZJZ[r_   vision_outputsc                    |j                   \  }}}|j                  dd      }|j                  ||| j                  | j                        }|j	                         }| j                  |      }|j                  d      }|j                  dd      }| j                  |      }t        j                  || j                        }|j                  |      S )NrA   r   )r   r   r  rf  r  ri  flattenrc  r   matmulr+  type_as)	rN   rj  
batch_sizer  r   reshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            rQ   r   z!Gemma3MultiModalProjector.forwardL  s    %3%9%9"
A{"0":":1a"@"9"A"AT%;%;T=S=S#
 #:"D"D"F $.E F 5 = =a @ 5 ? ?1 E $ 5 56K L#(<<0EtGfGf#g '//??r_   )	r`   ra   rb   rm   r   r   r   r   r   r   s   @rQ   r(  r(  ;  s#    \| \ @ell @r_   r(  input_embedsz5.6.0rA  )versionnew_namer   r   r   r   token_type_idspixel_valuesis_trainingis_first_iterationc	                    |r|t        d      | j                         ||||d}
||n|du xs |j                   xs |du}||r|dk(  j                  |j                        }t
        j                  j                  |dd      ddddf   }|| z  }t        j                  |j                         d	      dz
  }t        j                  ||d      }t        |      |
d
<   t        di |
S )a  
    Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
    for all kinds of forward passes. Gemma3 uses a bidirectional mask for images.

    Uses `pixel_values` as an optional input to disambiguate edge cases.
    Nz;`token_type_ids` is required as a model input when trainingrF  rA   )rA   r   r   )valuer   r   rJ  rF   )rL  get_text_configis_initializedr   r   r   
functionalpadr   cumsumrf   wherer.   r   )r   rA  r   r   r   rw  rx  ry  rz  rO   rV  is_imageis_previous_imagenew_image_start	group_idss                  rQ   create_causal_mask_mappingr  _  s   & ~-VWW ((*&(*$K ) 	%g_-K-K)Kg|cgOg 
 !&8 #a'++M,@,@AMM--ha-HCRCP"&7%77LL!4!4!6A>B	KK)R8	*Fy*Q&'$3{33r_   c                       e Zd ZdZdef fdZe ed      dej                  de
e   deez  fd	              Zee	 	 	 	 	 	 	 	 	 ddej                  d
z  dej                  d
z  dej                   d
z  dej                  d
z  ded
z  dej                  d
z  dej                  d
z  dej                  d
z  ded
z  de
e   deez  fd              Z xZS )Gemma3ModelFr   c                 (    t         |   |       | `y r   )r   r   text_config_dtyper   s     rQ   r   zGemma3Model.__init__  s     "r_   zOProjects the last hidden state from the vision model into language model space.)custom_introrx  rO   r   c                 t     | j                   d|dd|}|j                  }| j                  |      |_        |S )NT)rx  return_dictrF   )vision_towerrK  multi_modal_projectorpooler_output)rN   rx  rO   rj  rK  s        rQ   get_image_featureszGemma3Model.get_image_features  sH    
 +**aRVaZ`a*<<'+'A'ABS'T$r_   Nr   r   r   r   rw  rA  labelsrB  	lm_kwargsc
           
         |d u |d uz  rt        d      |R| j                  j                  | j                  k\  r/|| j                  j                  k(  }|j	                         }d||<   n|}| | j                         |      }|i| j                  |d      j                  }|j                  |j                  |j                        }| j                  |||      }|j                  ||      }t        |x}t              s't        | j                  ||||||| j                         } | j"                  d	|||||	dd|
}t%        |j&                  |j(                  |j*                  |j,                  |      S d       S )
NrD  r   T)r  )rA  image_features)ry  )r   r   r   rA  rB  r  )rK  r   r   
attentionsimage_hidden_statesrF   )rL  r   rr   r9   r   get_input_embeddingsr  r  r   r   r   get_placeholder_maskmasked_scatterr~   rk   r  r  language_modelr   rK  r   r   r  )rN   r   rx  r   r   r   rw  rA  r  rB  r  special_image_maskllm_input_idsr  rU  outputss                   rQ   r   zGemma3Model.forward  s    -t";<YZZ  T[[%?%?4??%R!*dkk.H.H!H%OO-M01M,-%M 7D557FM #!44\t4TbbN+..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M ?-F"< MM	# &$%% 
.%+'
 
 )%77#33!//))2>2J
 	

 QU
 	
r_   	NNNNNNNNN)r`   ra   rb   accepts_loss_kwargsrm   r   r   r   r   r  r   r   r   r   r  r  r   r	   rL   r   r   r   r   s   @rQ   r  r    ss   #| # !rs!--9?@R9S	+	+ t   .215.204(,2626*.!%A
##d*A
 ''$.A
 t+	A

 &&-A
 A
 ((4/A
 ((4/A
   4'A
 $;A
 ./A
 
*	*A
  A
r_   r  c                       e Zd ZdZee	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  de
dz  dej                  dz  d	ej                  dz  d
ej                  dz  dedz  deej                  z  dee   deez  fd              Z	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )Gemma3ForConditionalGenerationFNr   rx  r   r   r   rw  rA  r  rB  logits_to_keepr  r   c                     | j                   d||||||||	|d	|}|d   }t        |
t              rt        |
 d      n|
}| j	                  |dd|ddf         }d}|O|j                         }|dddddf   }|dddf   }||dd|j                  d    df   j                  |j                        }||j                  |j                        dk7     j                         }||j                  |j                        dk7     j                         }n |j                         }|j                         }t        j                         }|j                  d| j                  j                  j                        }|j                  d      j                  |j                        } |||      }t!        |||j"                  |j$                  |j&                  |j(                        S )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        )	r   rx  rw  r   r   r   rA  rB  r  r   N.r   rA   )losslogitsr   r   r  r  rF   )r  r~   rf   slicelm_headrj   r   r   r   r  r   CrossEntropyLossr   r   ru   r9   r   r   r   r  r  )rN   r   rx  r   r   r   rw  rA  r  rB  r  r  r  r   slice_indicesr  r  shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelss                          rQ   r   z&Gemma3ForConditionalGeneration.forward  s   z $** 
%))%+'
 
  
8B>SV8W~ot4]kmA}a,?@A\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5D+#33!//)) ' ; ;
 	
r_   c                 L    t        |   |f||||||	||d|}|s|s||d<   |S )N)r   rA  r   r   rB  r  rw  rz  rx  )r   prepare_inputs_for_generation)rN   r   r   rA  r   rx  r   rw  rB  r  r  rz  rO   model_inputsr   s                 rQ   r  z<Gemma3ForConditionalGeneration.prepare_inputs_for_generation`  sU      w<
+')%))1
 
" Y+7L(r_   )
NNNNNNNNNr   )
NNNNNNTNNF)r`   ra   rb   r  r   r   r   r  r  r   r	   rL   rf   r   r   r   r   r   r  r   r   s   @rQ   r  r    s\      .215.204(,2626*.!%-.j
##d*j
 ''$.j
 t+	j

 &&-j
 j
 ((4/j
 ((4/j
   4'j
 $;j
 ell*j
 ./j
 
-	-j
  j
^  $ $r_   r  c                   N    e Zd Z fdZd Zd Zee	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
ej                  dz  dej                  dz  dej                  dz  dedz  dee   defd              Z xZS )Gemma3ForSequenceClassificationc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  j                  | j                  d      | _	        | j                          y )NF)bias)r   r   
num_labelsr  r  r   Linearru   r   score	post_initr   s     rQ   r   z(Gemma3ForSequenceClassification.__init__  sZ      ++ (
YYv11==tUZ[
 	r_   c                 6    | j                   j                         S r   )r  r  )rN   s    rQ   r  z4Gemma3ForSequenceClassification.get_input_embeddings  s    zz..00r_   c                 :    | j                   j                  |       y r   )r  set_input_embeddings)rN   r|  s     rQ   r  z4Gemma3ForSequenceClassification.set_input_embeddings  s    

''.r_   Nr   rx  r   r   r   rA  rw  r  rB  rO   r   c
                     | j                   |f|||||||	d|
}|j                  }| j                  |      }||j                  d   }n|j                  d   }| j                  j
                  j                  |dk7  rt        d      | j                  j
                  j                  d}n||| j                  j
                  j                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                   j"                   d       |t        j                  ||j                  	      |f   }d}|| j%                  |||| j                  
      }t'        |||j(                  |j*                  |j,                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )r   rx  r   r   rA  rw  rB  Nr   rA   z=Cannot handle batch sizes > 1 if no padding token is defined.r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rE  )r  r  pooled_logitsr   )r  r  r   r   r  )r  rK  r  r   r   ru   pad_token_idrL  r   r   r   int32r   argmaxr|   warning_oncer   r`   loss_functionr   r   r   r  )rN   r   rx  r   r   r   rA  rw  r  rB  rO   transformer_outputsr   r  ro  last_non_pad_tokennon_pad_masktoken_indicesr  r  s                       rQ   r   z'Gemma3ForSequenceClassification.forward  s   , )djj

)%%+')

 

 ,==M* "+J&,,Q/J;;""//7J!O\]];;""//7!#"%)@)@)M)MMQQRXR_R_afalalmL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab%%VFR_hlhshs%tD/ /??-;;*55
 	
r_   r  )r`   ra   rb   r   r  r  r   r   r   r  r  r   r	   rL   r   r   r   r   r   r   s   @rQ   r  r    s   1/  .215.204(,2626*.!%C
##d*C
 ''$.C
 t+	C

 &&-C
 C
 ((4/C
 ((4/C
   4'C
 $;C
 +,C
 
*C
  C
r_   r  c                        e Zd ZU dZeed<   dZy)#Gemma3TextForSequenceClassificationz
    Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig.
    It uses the generic sequence classification implementation for efficiency and consistency.
    r   r=  N)r`   ra   rb   rc   r2   rg   r0  rF   r_   rQ   r  r    s    
  r_   r  )	rm   r2   r  r<  rZ  r  r  r  r  )NNFN)bcollections.abcr   typingr   r   r   torch.nnr   huggingface_hub.dataclassesr    r   r)  cache_utilsr	   r
   configuration_utilsr   masking_utilsr   r   r   modeling_layersr   r   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   gemma2.configuration_gemma2r    gemma2.modeling_gemma2r!   r"   r#   r$   r%   r&   r'   r(   r)   paligemma.modeling_paligemmar*   r+   r,   r-   r.   siglipr/   
get_loggerr`   r|   r2   rm   r   r   	Embeddingr   r   r   r   r   r  GEMMA3_START_DOCSTRINGr  rf   rL   r:  r<  rZ  r   r(  r   r  rk   r  r  r  r  r  __all__rF   r_   rQ   <module>r     s   %     . & . 3 m m [ u u G & R R 0 + 6
 
 
  ( 
		H	% 12W|%5 W  3Wt 12?(# ?(  3?(D	 < 		#B 	SBLL S!	 !
+M +
L<1 L<`4)o 4)n+3 +\  ^1 ^<
# 
(CcSVCWY]C]:^ 
O
k O
d-) -!@		 !@H ?K +/-1&*1414<<14 LL4'14 T\	14
 ,,%14 LL4'14 ##d*14 14 t14 
14 L14hV
. V
rW%F WtU
&; U
p!*JLa !
r_   