
    i+                     p   d dl mZ d dlmZ d dlmZ d dlZd dlmZ ddl	m
Z ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZmZ ddlmZmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7 ddl8m9Z9 ddl:m;Z; ddl<m=Z=m>Z>  e1j~                  e@      ZAe e/d       G d de                     ZBe e/d       G d d e-                    ZC G d! d"ej                        ZE G d# d$ej                        ZG G d% d&ej                        ZH G d' d(ej                        ZId) ZJ ed*      d_d+       ZKd,ej                  d-eMd.ej                  fd/ZN	 	 	 d`d0ej                  d1ej                  d2ej                  d3ej                  d4ej                  dz  d5eOeMz  d6eOdz  d7eOdz  d.ePej                  ej                  f   fd8ZQ eeK       G d9 d:ej                               ZR G d; d<e      ZSe/ G d= d>e)             ZTd?eMd.eeMeMeMeMgeUf   fd@ZVe/ G dA dBeT             ZWe/ G dC dDeTe             ZX G dE dFej                        ZYdGej                  d.efdHZZ e4dIdJdKL      	 	 	 	 dadMedKej                  d4ej                  dz  dNedz  dOej                  dz  dPej                  dz  dQej                  dz  dReUdSeUdz  d.e\fdT       Z] e/dU       G dV dWeT             Z^ e/dU       G dX dYeTe             Z_ G dZ d[eT      Z` G d\ d]eeT      Zag d^Zby)b    )Callable)	dataclass)OptionalN   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfig)GenerationMixin)use_kernel_func_from_hubuse_kernelized_func)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast SequenceClassifierOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)deprecate_kwarg)maybe_autocastmerge_with_config_defaults)capture_outputs   )	AutoModel   )Gemma3ConfigGemma3TextConfigzK
    Base class for Gemma3 outputs, with hidden states and attentions.
    custom_introc                   :    e Zd ZU dZdZej                  dz  ed<   y)Gemma3ModelOutputWithPasta  
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)__name__
__module____qualname____doc__r0   torchFloatTensor__annotations__     {/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/gemma3/modeling_gemma3.pyr/   r/   8   s     59**T18r9   r/   zR
    Base class for Gemma3 causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   y)	Gemma3CausalLMOutputWithPasta8  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr0   )r1   r2   r3   r4   r=   r5   r6   r7   r>   r?   r	   r@   tuplerA   r0   r8   r9   r:   r<   r<   H   s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r9   r<   c            	       Z     e Zd ZdZd	dedededef fdZdej                  f fdZ	 xZ
S )
Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    num_embeddingsembedding_dimpadding_idxembed_scalec                     t         |   |||       || _        | j                  dt	        j
                  |      d       y )NrH   F
persistent)super__init__scalar_embed_scaleregister_bufferr5   tensor)selfrE   rF   rG   rH   	__class__s        r:   rM   z&Gemma3TextScaledWordEmbedding.__init__k   s;    D"-]ELL,ERWXr9   	input_idsc                     t         |   |      | j                  j                  | j                  j
                        z  S N)rL   forwardrH   toweightdtype)rQ   rS   rR   s     r:   rV   z%Gemma3TextScaledWordEmbedding.forwardp   s2    wy)D,<,<,?,?@Q@Q,RRRr9   )      ?)r1   r2   r3   r4   intfloatrM   r5   TensorrV   __classcell__rR   s   @r:   rD   rD   f   sG    Ys Y3 YS Y_d Y
S S Sr9   rD   c                   *     e Zd Zdef fdZd Z xZS )	Gemma3MLPconfigc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        y NFbias)rL   rM   rb   hidden_sizeintermediate_sizennLinear	gate_projup_proj	down_projr   hidden_activationact_fnrQ   rb   rR   s     r:   rM   zGemma3MLP.__init__u   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556r9   c                     | j                  | j                  | j                  |            | j                  |      z        }|S rU   )rm   ro   rk   rl   )rQ   xrm   s      r:   rV   zGemma3MLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r9   )r1   r2   r3   r+   rM   rV   r^   r_   s   @r:   ra   ra   t   s    7/ 7r9   ra   c                   <     e Zd Zddedef fdZd Zd Zd Z xZ	S )Gemma3RMSNormdimepsc                     t         |           || _        t        j                  t        j                  |            | _        y rU   )rL   rM   rv   ri   	Parameterr5   zerosrX   )rQ   ru   rv   rR   s      r:   rM   zGemma3RMSNorm.__init__   s.    ll5;;s#34r9   c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S )Nr'   T)keepdim)r5   rsqrtpowmeanrv   )rQ   rr   s     r:   _normzGemma3RMSNorm._norm   s4    5;;quuQx}}R}>IJJJr9   c                     | j                  |j                               }|d| j                  j                         z   z  }|j                  |      S )NrZ   )r   r\   rX   type_as)rQ   rr   outputs      r:   rV   zGemma3RMSNorm.forward   sC    AGGI& 3!2!2!445~~a  r9   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)rB   rX   shaperv   rQ   s    r:   
extra_reprzGemma3RMSNorm.extra_repr   s'    ))*+6$((<<r9   )gư>)
r1   r2   r3   r[   r\   rM   r   rV   r   r^   r_   s   @r:   rt   rt      s&    5C 5e 5
K!=r9   rt   c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 	 ddedz  de	d   de
dz  dedz  d	ed
ef   f
d       Z ej                         edd              Z xZS )Gemma3RotaryEmbeddinginv_freqNrb   c                 v   t         |           |j                  | _        |j                  | _        || _        t        t        |j                              | _        i | _	        | j                  D ]  }| j
                  j                  |   }||d   | j                  |<   | j                  }| j                  |   dk7  rt        | j                  |      } || j
                  ||      \  }}| j                  | d|d       | j                  | d|j                         d       t        | | d|        y )	N	rope_typedefault
layer_type	_inv_freqFrJ   _original_inv_freq_attention_scaling)rL   rM   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrb   listsetlayer_typesr   rope_parameterscompute_default_rope_parametersr   rO   clonesetattr)	rQ   rb   devicer   rope_paramsrope_init_fncurr_inv_freqcurr_attention_scalingrR   s	           r:   rM   zGemma3RotaryEmbedding.__init__   s8   "("@"@$*$B$B!F$6$6 78** 	UJ++55jAK")4[)ADNN:&%)%I%IL~~j)Y624>>*3MN4@fak4l1M1  J<y!9=UZ [  J</A!BMDWDWDYfk lDZL(:;=ST	Ur9   r   ztorch.deviceseq_lenr   returnztorch.Tensorc                     | j                   |   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a|  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
            layer_type (`str`, *optional*):
                The current layer type if the model has different RoPE parameters per type.
                Should not be used unless `config.layer_types is not None`

        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNrZ   r   r'   rY   r   rY   )	r   getattrrg   num_attention_headsr5   arangeint64rW   r\   )rb   r   r   r   baseru   attention_factorr   s           r:   r   z5Gemma3RotaryEmbedding.compute_default_rope_parameters   s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r9   c                 N   t        | | d      }t        | | d      }|d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d	      5  |j                         |j                         z  j                  dd
      }	t        j                  |	|	fd      }
|
j                         |z  }|
j                         |z  }d d d        j	                  |j                        j	                  |j                        fS # 1 sw Y   AxY w)Nr   r   r   r{   r)   mpscpuF)device_typeenabledr'   ru   r   )r   r\   expandr   rW   r   
isinstancetypestrr$   	transposer5   catcossinrY   )rQ   rr   position_idsr   r   attention_scalinginv_freq_expandedposition_ids_expandedr   freqsembr   r   s                r:   rV   zGemma3RotaryEmbedding.forward   sl    4J<y!9:#DZL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	0&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')//C'')//C		0 vvAGGv$cff177f&;;;	0 	0s   *A1FF$)NNNNNNrU   )r1   r2   r3   r5   r]   r7   r+   rM   staticmethodr   r[   r   rB   r\   r   no_gradr   rV   r^   r_   s   @r:   r   r      s    llU/ U. *.+/"!%	!* 4'!*(!* t!* $J	!*
 
~u$	%!* !*F U]]_<  <r9   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr{   r'   r   )r   r5   r   )rr   x1x2s      r:   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r9   rotary_pos_embc                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   unsqueeze_dimq_embedk_embeds          r:   apply_rotary_pos_embr      sY    & --
&C
--
&C3w;q>C/0G3w;q>C/0GGr9   r@   n_repr   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r)   N)r   r   reshape)r@   r   batchnum_key_value_headsslenr   s         r:   	repeat_kvr   
  so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr9   modulequerykeyvalueattention_maskdropoutscalingsoftcapc                 |   || j                   dz  }t        || j                        }	t        || j                        }
t        j                  ||	j                  dd            |z  }|||z  }t        j                  |      }||z  }|||z   }t        j                  j                  |dt        j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||
      }|j                  dd      j                         }||fS )N      r'   r   r{   )ru   rY   )ptrainingr)   )r   r   num_key_value_groupsr5   matmulr   tanhri   
functionalsoftmaxfloat32rW   rY   r   r   
contiguous)r   r   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightsattn_outputs                r:   eager_attention_forwardr     s    //4'3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL#g-zz,/#g-!#n4 ==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r9   c                        e Zd ZdZdedef fdZ	 	 	 ddej                  dej                  dej                  dz  d	e	dz  d
e
e   deej                  ej                  dz  eej                     dz  f   fdZ xZS )Gemma3Attentionz=Multi-headed attention from 'Attention Is All You Need' paperrb   	layer_idxc                     t         |           t        |d      r|j                  |   nd | _        || _        || _        t        |d|j                  |j                  z        | _
        |j                  |j                  z  | _        |j                  dz  | _        | j
                  j                  | _        | j
                  j                    | _        t%        j&                  |j                  |j                  | j                  z  |j(                        | _        t%        j&                  |j                  |j                  | j                  z  |j(                        | _        t%        j&                  |j                  |j                  | j                  z  |j(                        | _        t%        j&                  |j                  | j                  z  |j                  |j(                        | _        | j
                  j2                  | _        | j                  dk(  r|j4                  nd | _        | j                  dk(  | _        t9        |j                  |j:                        | _        t9        |j                  |j:                        | _        y )Nr   r   r   re   sliding_attention)ru   rv   ) rL   rM   hasattrr   r   rb   r   r   rg   r   r   r   r   query_pre_attn_scalarr   attention_dropoutuse_bidirectional_attention	is_causalri   rj   attention_biasq_projk_projv_projo_projattn_logit_softcappingsliding_window
is_slidingrt   rms_norm_epsq_normk_normrQ   rb   r   rR   s      r:   rM   zGemma3Attention.__init__<  s   ;B6=;Y&,,Y7_c"
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>![[DDDii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 '+kk&H&H#7;J]7]f33cg//-@@#V=P=PQ#V=P=PQr9   Nr@   position_embeddingsr   r?   r   r   c                 d   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      }| j                  |	      }	|\  }}t        ||	||      \  }}	| |j                  |	|
| j                        \  }	}
t        j                  | j                  j                  t               } || ||	|
|f| j"                  r| j$                  nd| j&                  | j(                  d|\  }} |j*                  g |d j-                         }| j/                  |      }||fS )Nr{   r)   r'           )r   r   r  )r   r   r   viewr   r   r   r  r  r   updater   r   get_interfacerb   _attn_implementationr   r   r   r   r  r   r   r  )rQ   r@   r	  r   r?   r   input_shapehidden_shapequery_statesr   r   r   r   attention_interfacer   r   s                   r:   rV   zGemma3Attention.forwardZ  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&S#7jRUWZ#[ j&'6'='=j,X\XfXf'g$J(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
!\ *k));;;;FFHkk+.L((r9   )NNN)r1   r2   r3   r4   r+   r[   rM   r5   r]   r	   r   r   rB   rV   r^   r_   s   @r:   r   r   8  s    GR/ RC RB -1.2(,*)||*) #\\*) t+	*)
 *) +,*) 
u||U\\D0%2E2LL	M*)r9   r   c                       e Zd Zdedef fdZ	 	 	 	 ddej                  dej                  dej                  dz  dej                  dz  d	e	dz  d
e
e   deej                  eej                  ej                  f   dz  f   fdZ xZS )Gemma3DecoderLayerrb   r   c                    t         |           || _        |j                  | _        || _        t        ||      | _        t        |      | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        y )N)rb   r   rv   )rL   rM   rb   rg   r   r   	self_attnra   mlprt   r  input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr  s      r:   rM   zGemma3DecoderLayer.__init__  s    !--"()LV$,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'r9   Nr@   r	  r   r   r?   r   r   c           	         |}| j                  |      } | j                  d|||||d|\  }}| j                  |      }||z   }|}| j                  |      }| j	                  |      }| j                  |      }||z   }|S )N)r@   r	  r   r   r?   r8   )r  r  r  r  r  r  )	rQ   r@   r	  r   r   r?   r   residual_s	            r:   rV   zGemma3DecoderLayer.forward  s     !,,];)4>> 
' 3)%+
 
q 55mD =0 66}E/77F =0r9   r   )r1   r2   r3   r+   r[   rM   r5   r]   
LongTensorr	   r   r   rB   r6   rV   r^   r_   s   @r:   r  r    s    
c/ 
cC 
c -1.204(,|| #\\ t+	
 &&-  +, 
u  %(9(95;L;L(L"MPT"TT	Ur9   r  c                        e Zd ZU eed<   dZdZg dZdgZdZ	dZ
dZdZdZeedZdZ ej&                          fd       Z xZS )	Gemma3PreTrainedModelrb   modelT)r  SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadr?   )r@   rA   )imagetextc                    t         |   |       t        |t              r t	        j
                  |j                         y d|j                  j                  v r t	        j
                  |j                         y t        |t              r+t	        j                  |j                  |j                         y t        |t              r|j                  D ]  }|j                   }|j"                  |   dk7  rt$        |j"                  |      } ||j&                  |      \  }}t	        j(                  t+        || d      |       t	        j(                  t+        || d      |        y y )NRMSNormr   r   r   r   )rL   _init_weightsr   Gemma3MultiModalProjectorinitzeros_mm_input_projection_weightrR   r1   rX   rD   	constant_rH   rN   r   r   r   r   r   rb   copy_r   )rQ   r   r   r   r   r   rR   s         r:   r,  z#Gemma3PreTrainedModel._init_weights  s   f%f78KK99:&**333KK& =>NN6--v/H/HI 56$00 ^
%EE##J/9<#6v7G7G
7S#TL#/*#U q

76j\+CDmT

76j\9K+LM}]^ 7r9   )r1   r2   r3   r*   r7   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr  r   _can_record_outputsinput_modalitiesr5   r   r,  r^   r_   s   @r:   r#  r#    sw    &*# $5"5N!"&+% )U]]_^ ^r9   r#  r  c           
      P     dt         dt         dt         dt         dt        f
 fd}|S )zA
    Enables a bidirectional mask within the sliding window.
    	batch_idxhead_idxq_idxkv_idxr   c                 &    t        ||z
        k  S )zA token can attend to any other token if their absolute distance is within
        the (exclusive) sliding window size (distance < sliding_window).)abs)r?  r@  rA  rB  r  s       r:   
inner_maskz1_bidirectional_window_overlay.<locals>.inner_mask  s     56>"^33r9   r[   bool)r  rE  s   ` r:   _bidirectional_window_overlayrH    s3    
4c 4S 4 4c 4d 4
 r9   c                       e Zd ZU eed<   dZdef fdZeee		 	 	 	 	 	 dde
j                  dz  de
j                  dz  de
j                  dz  dedz  d	e
j                  dz  d
edz  dee   defd                     Z xZS )Gemma3TextModelrb   r)  c           	      (   t         |   |       |j                  | _        |j                  | _        t        |j                  |j                  | j                  | j                  j                  dz        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                         | _        t%        |      | _        d| _        | j+                          y c c}w )N      ?)rH   r  F)rL   rM   pad_token_idrG   
vocab_sizerD   rg   rb   embed_tokensri   
ModuleListrangenum_hidden_layersr  layersrt   r  normr   
rotary_embgradient_checkpointing	post_initr  s      r:   rM   zGemma3TextModel.__init__  s     !.. ++ :v1143C3CQUQ\Q\QhQhjmQm
 mmDI&JbJbDcdy	2d
 "&"4"4&:M:MN	/7&+# 	 es   "DNrS   r   r   r?   inputs_embeds	use_cacher   r   c           	         |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|V||j	                         nd}t        j                  |j                  d   |j                        |z   }|j                  d      }t        |x}	t              sw| j                  ||||d}
|
j                         }| j                  j                  r(d |
d<   t        | j                  j                        |d<   t!        di |
t#        di |d	}	|}i }| j                  j$                  D ]  }| j'                  |||      ||<    t)        | j*                  d | j                  j,                         D ]G  \  }} ||f|	| j                  j$                  |      || j                  j$                  |      ||d
|}I | j/                  |      }t1        ||      S )N:You must specify exactly one of input_ids or inputs_embedsrb   r   r)   r   rb   rY  r   r?   r   c                  L    t        j                  dt         j                        S )NTr   )r5   rP   rG  )argss    r:   <lambda>z)Gemma3TextModel.forward.<locals>.<lambda>.  s    TY^YcYc@d r9   or_mask_function)full_attentionr   )r   r	  r   r?   )last_hidden_stater?   r8   )
ValueErrorrP  r
   rb   get_seq_lengthr5   r   r   r   r   r   dictcopyr   rH  r  r   r   r   rV  	enumeraterT  rS  rU  r   )rQ   rS   r   r   r?   rY  rZ  r   past_seen_tokenscausal_mask_mappingmask_kwargssliding_mask_kwargsr@   r	  r   idecoder_layers                    r:   rV   zGemma3TextModel.forward  s    -t";<YZZ  --i8M0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-F ++!."0#2 ,K #."2"2"4{{662d./:WX\XcXcXrXr:s#$67 #5"C{"C%F%]I\%]# & ++11 	gJ.2oom\[e.f
+	g !*$++6U8U8U*V W 	A})24;;3J3J13MN$78O8OPQ8R$S) / M	 		-0&++
 	
r9   )NNNNNN)r1   r2   r3   r+   r7   r=  rM   r%   r&   r   r5   r!  r]   r	   r6   rG  r   r   r   rV   r^   r_   s   @r:   rJ  rJ    s     / &   .2.204(,26!%C
##d*C
 t+C
 &&-	C

 C
 ((4/C
 $;C
 +,C
 
!C
    C
r9   rJ  c                   T    e Zd ZU ddiZddiZddgdgfiZeed<   def fdZe	e
	 	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  ded	z  dej                  d	z  dej                  d	z  ded	z  deej                  z  dee   defd              Z xZS )Gemma3ForCausalLMlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr@   r>   rb   c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y rd   )
rL   rM   rJ  r$  rO  ri   rj   rg   rt  rX  rp   s     r:   rM   zGemma3ForCausalLM.__init__V  sU     $V,
 ++yy!3!3V5F5FUS 	r9   NrS   r   r   r?   rY  labelsrZ  logits_to_keepr   r   c	           
          | j                   d||||||d|	}
|
j                  }t        |t              rt	        | d      n|}| j                  |dd|ddf         }| j                  j                  G|| j                  j                  z  }t        j                  |      }|| j                  j                  z  }d}| | j                  ||| j                  fi |	}t        |||
j                  |
j                  |
j                        S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma3ForCausalLM

        >>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```)rS   r   r   r?   rY  rZ  Nr=   r>   r?   r@   rA   r8   )r$  re  r   r[   slicert  rb   final_logit_softcappingr5   r   loss_functionrO  r   r?   r@   rA   )rQ   rS   r   r   r?   rY  rw  rZ  rx  r   outputsr@   slice_indicesr>   r=   s                  r:   rV   zGemma3ForCausalLM.forward_  s   @ ,64:: ,
)%+',
 ,
  118B>SV8W~ot4]kmA}a,?@A;;..:dkkAAAFZZ'FdkkAAAF%4%%ffdooPPD%#33!//))
 	
r9   )NNNNNNNr   )r1   r2   r3   _tied_weights_keys_tp_plan_pp_planr+   r7   rM   r    r   r5   r!  r]   r	   r6   rG  r[   r   r   r   rV   r^   r_   s   @r:   rr  rr  O  s%   *,GH23H_-z:;H/   .2.204(,26*.!%-.;
##d*;
 t+;
 &&-	;

 ;
 ((4/;
   4';
 $;;
 ell*;
 +,;
 
 ;
  ;
r9   rr  c                   D     e Zd Zdef fdZdej                  fdZ xZS )r-  rb   c                    t         |           t        j                  t	        j
                  |j                  j                  |j                  j                              | _	        t        |j                  j                  |j                  j                        | _        t        |j                  j                  |j                  j                  z        | _        t        |j"                  dz        | _        | j                   | j$                  z  | _        t        j(                  | j&                  | j&                        | _        y )Nr  rM  )kernel_sizestride)rL   rM   ri   rx   r5   ry   vision_configrg   text_configr0  rt   layer_norm_epsmm_soft_emb_normr[   
image_size
patch_sizepatches_per_imagemm_tokens_per_imagetokens_per_sider  	AvgPool2davg_poolrp   s     r:   rM   z"Gemma3MultiModalProjector.__init__  s    *,,,KK,,88&:L:L:X:XY+
' !.  ,,&2F2F2U2U!
 "%V%9%9%D%DH\H\HgHg%g!h"6#=#=s#BC11T5I5II1A1A$JZJZ[r9   vision_outputsc                    |j                   \  }}}|j                  dd      }|j                  ||| j                  | j                        }|j	                         }| j                  |      }|j                  d      }|j                  dd      }| j                  |      }t        j                  || j                        }|j                  |      S )Nr)   r'   )r   r   r   r  r   r  flattenr  r5   r   r0  r   )	rQ   r  
batch_sizer   rg   reshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            r:   rV   z!Gemma3MultiModalProjector.forward  s    %3%9%9"
A{"0":":1a"@"9"A"AT%;%;T=S=S#
 #:"D"D"F $.E F 5 = =a @ 5 ? ?1 E $ 5 56K L#(<<0EtGfGf#g '//??r9   )	r1   r2   r3   r*   rM   r5   r]   rV   r^   r_   s   @r:   r-  r-    s#    \| \ @ell @r9   r-  	group_idsc           
      P     dt         dt         dt         dt         dt        f
 fd}|S )au  
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    Args:
        group_ids (`torch.Tensor`):
            A tensor of shape `(bs, len)` assigning each token to a vision group. Tokens with the same group
            come from the same input image. Text is denoted by `-1`.
    r?  r@  rA  rB  r   c                    	j                   d   }|j                  |dz
        }|j                  |dz
        }	| |f   }	| |f   }t        j                  ||k  |d      }t        j                  ||k  |d      }||k(  |dk\  z  S )Nr{   r)   )maxr   )r   clampr5   where)
r?  r@  rA  rB  
seq_lengthq_idx_clampedkv_idx_clampedq_groupkv_groupr  s
            r:   rE  z0token_type_ids_mask_function.<locals>.inner_mask  s    __R(
 
Q7*q.9 I}45Y67++ej0'2>;;v
2HbA8#155r9   rF  )r  rE  s   ` r:   token_type_ids_mask_functionr    s3    6c 6S 6 6c 6d 6 r9   input_embeds5.6.0rY  versionnew_namerb   r?   r   token_type_idspixel_valuesis_trainingis_first_iterationc	                    |r|t        d      | j                         ||||d}
||n|du xs |j                   xs |du}||r|dk(  j                  |j                        }t
        j                  j                  |dd      ddddf   }|| z  }t        j                  |j                         d	      dz
  }t        j                  ||d      }t        |      |
d
<   t        di |
S )a  
    Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
    for all kinds of forward passes. Gemma3 uses a bidirectional mask for images.

    Uses `pixel_values` as an optional input to disambiguate edge cases.
    Nz;`token_type_ids` is required as a model input when trainingr_  r)   )r)   r   r   )r   r{   r   rc  r8   )rf  get_text_configis_initializedrW   r   ri   r   padr5   cumsumr[   r  r  r   )rb   rY  r   r?   r   r  r  r  r  r   rm  is_imageis_previous_imagenew_image_startr  s                  r:   create_causal_mask_mappingr    s   & ~-VWW ((*&(*$K ) 	%g_-K-K)Kg|cgOg 
 !&8 #a'++M,@,@AMM--ha-HCRCP"&7%77LL!4!4!6A>B	KK)R8	*Fy*Q&'$3{33r9   zy
    The Base Gemma3 model which consists of a vision backbone and a language model without language modeling head.,
    c                       e Zd ZdZdef fdZd Zd Ze e	d      de
j                  d	ee   d
eez  fd              Zde
j"                  de
j                  de
j                  fdZee		 	 	 	 	 	 	 	 	 dde
j"                  dz  de
j                  dz  de
j&                  dz  de
j"                  dz  dedz  de
j"                  dz  de
j                  dz  de
j"                  dz  dedz  dee   d
eez  fd              Z xZS )Gemma3ModelFrb   c                 2   t         |   |       t        j                  |j                        | _        t        |      | _        |j                  j                  | _	        t        j                  |j                        }|| _
        | j                          y )Nr]  )rL   rM   r(   from_configr  vision_towerr-  multi_modal_projectorr  rO  language_modelrX  )rQ   rb   r  rR   s      r:   rM   zGemma3Model.__init__  sq     %119M9MN%>v%F" ,,77"..f6H6HI,r9   c                 6    | j                   j                         S rU   )r  get_input_embeddingsr   s    r:   r  z Gemma3Model.get_input_embeddings&  s    ""7799r9   c                 :    | j                   j                  |       y rU   )r  set_input_embeddingsrQ   r   s     r:   r  z Gemma3Model.set_input_embeddings)  s    007r9   zOProjects the last hidden state from the vision model into language model space.r,   r  r   r   c                 t     | j                   d|dd|}|j                  }| j                  |      |_        |S )NT)r  return_dictr8   )r  re  r  pooler_output)rQ   r  r   r  re  s        r:   get_image_featureszGemma3Model.get_image_features,  sH    
 +**aRVaZ`a*<<'+'A'ABS'T$r9   rS   rY  image_featuresc                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        )rY   r   r{   r   r)   z6Image features and image tokens do not match, tokens: z, features: )r  r5   rP   rb   image_token_idlongr   allsumr   r   	expand_asrW   r"   numel)rQ   rS   rY  r  special_image_maskn_image_tokensn_image_featuress          r:   get_placeholder_maskz Gemma3Model.get_placeholder_mask7  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!r9   Nr   r   r?   r  rw  rZ  	lm_kwargsc
           
         |du |duz  rt        d      |R| j                  j                  | j                  k\  r/|| j                  j                  k(  }|j	                         }d||<   n|}| | j                         |      }|i| j                  |d      j                  }|j                  |j                  |j                        }| j                  |||      }|j                  ||      }t        |x}t              s't        | j                  ||||||| j                         } | j"                  d
|||||	dd|
}t%        |j&                  |j(                  |j*                  |j,                  |	      S d	      S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma32-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/gemma32-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```Nr\  r   T)r  )rY  r  )r  )r   r   r?   rY  rZ  r  )re  r?   r@   rA   r0   r8   )rf  rb   r  rO  r   r  r  r  rW   r   rY   r  masked_scatterr   rh  r  r   r  r/   re  r?   r@   rA   )rQ   rS   r  r   r   r?   r  rY  rw  rZ  r  r  llm_input_idsr  rl  r~  s                   r:   rV   zGemma3Model.forwardO  s   X -t";<YZZ  T[[%?%?4??%R!*dkk.H.H!H%OO-M01M,-%M 7D557FM #!44\t4TbbN+..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M ?-F"< MM	# &$%% 
.%+'
 
 )%77#33!//))2>2J
 	

 QU
 	
r9   	NNNNNNNNN)r1   r2   r3   accepts_loss_kwargsr*   rM   r  r  r    r   r5   r6   r   r   rB   r   r  r!  r  r]   r	   rG  r/   rV   r^   r_   s   @r:   r  r    s     | :8 !rs!--9?@R9S	+	+ t "))":?:K:K"]b]n]n"0  .215.204(,2626*.!%^
##d*^
 ''$.^
 t+	^

 &&-^
 ^
 ((4/^
 ((4/^
   4'^
 $;^
 ./^
 
*	*^
  ^
r9   r  c                       e Zd ZddiZdZdef fdZd Zd Ze	de
j                  d	ee   fd
       Zee		 	 	 	 	 	 	 	 	 	 dde
j                   dz  de
j                  dz  de
j"                  dz  de
j                   dz  dedz  de
j                   dz  de
j                  dz  de
j                   dz  dedz  dee
j"                  z  dee   deez  fd              Z	 	 	 	 	 	 	 	 	 	 d fd	Ze eddd      	 	 d dede
j"                  de
j"                  dz  dedz  de
j"                  dz  de
j"                  dz  dedz  defd              Z xZS )!Gemma3ForConditionalGenerationrs  z(model.language_model.embed_tokens.weightFrb   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y rd   )rL   rM   r  r$  ri   rj   r  rg   rO  rt  rX  rp   s     r:   rM   z'Gemma3ForConditionalGeneration.__init__  sS      (
yy!3!3!?!?ASASA^A^ejkr9   c                 6    | j                   j                         S rU   r$  r  r   s    r:   r  z3Gemma3ForConditionalGeneration.get_input_embeddings      zz..00r9   c                 :    | j                   j                  |       y rU   r$  r  r  s     r:   r  z3Gemma3ForConditionalGeneration.set_input_embeddings      

''.r9   r  r   c                 <     | j                   j                  |fi |S rU   )r$  r  )rQ   r  r   s      r:   r  z1Gemma3ForConditionalGeneration.get_image_features  s    ,tzz,,\DVDDr9   NrS   r   r   r?   r  rY  rw  rZ  rx  r  r   c                     | j                   d||||||||	|d	|}|d   }t        |
t              rt        |
 d      n|
}| j	                  |dd|ddf         }d}|O|j                         }|dddddf   }|dddf   }||dd|j                  d    df   j                  |j                        }||j                  |j                        dk7     j                         }||j                  |j                        dk7     j                         }n |j                         }|j                         }t        j                         }|j                  d| j                  j                  j                        }|j                  d      j                  |j                        } |||      }t!        |||j"                  |j$                  |j&                  |j(                        S )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        )	rS   r  r  r   r   r?   rY  rZ  rw  r   N.r{   r)   )r=   r>   r?   r@   rA   r0   r8   )r$  r   r[   r{  rt  r\   r   rW   r   r   ri   CrossEntropyLossr  rb   r  rO  r<   r?   r@   rA   r0   )rQ   rS   r  r   r   r?   r  rY  rw  rZ  rx  r  r~  r@   r  r>   r=   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelss                          r:   rV   z&Gemma3ForConditionalGeneration.forward  s   z $** 
%))%+'
 
  
8B>SV8W~ot4]kmA}a,?@A\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5D+#33!//)) ' ; ;
 	
r9   c                 L    t        |   |f||||||	||d|}|s|s||d<   |S )N)r?   rY  r   r   rZ  rx  r  r  r  )rL   prepare_inputs_for_generation)rQ   rS   r?   rY  r   r  r   r  rZ  rx  rw  r  r   model_inputsrR   s                 r:   r  z<Gemma3ForConditionalGeneration.prepare_inputs_for_generation;  sU      w<
+')%))1
 
" Y+7L(r9   r  r  r  r  c           
          t        | |||||fd|i|j                         D 	ci c]  \  }}	|dk7  s||	 c}	}S c c}	}w )Nr  r  )r  items)
rb   rY  r   r?   r   r  r  r   r   vs
             r:   r   z8Gemma3ForConditionalGeneration.create_masks_for_generatea  s]     *	
  2	
 !'F1!~2Eq!tF	
 		
 Gs   <<)
NNNNNNNNNr   )
NNNNNNTNNF)NF)r1   r2   r3   r  r  r*   rM   r  r  r   r5   r6   r   r   r  r    r!  r]   r	   rG  r[   rB   r<   rV   r  r   r#   r   rh  r   r^   r_   s   @r:   r  r    sQ    +,VW  | 1/ Eu/@/@ EFSeLf E E  .215.204(,2626*.!%-.j
##d*j
 ''$.j
 t+	j

 &&-j
 j
 ((4/j
 ((4/j
   4'j
 $;j
 ell*j
 ./j
 
-	-j
  j
^  $L ^WO /3*/
 
||
 t+
 	

 llT)
 t+
 !4K
 

 P 
r9   r  c                   N    e Zd Z fdZd Zd Zee	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
ej                  dz  dej                  dz  dej                  dz  dedz  dee   defd              Z xZS )Gemma3ForSequenceClassificationc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  j                  | j                  d      | _	        | j                          y rd   )rL   rM   
num_labelsr  r$  ri   rj   r  rg   scorerX  rp   s     r:   rM   z(Gemma3ForSequenceClassification.__init__{  sZ      ++ (
YYv11==tUZ[
 	r9   c                 6    | j                   j                         S rU   r  r   s    r:   r  z4Gemma3ForSequenceClassification.get_input_embeddings  r  r9   c                 :    | j                   j                  |       y rU   r  r  s     r:   r  z4Gemma3ForSequenceClassification.set_input_embeddings  r  r9   NrS   r  r   r   r?   rY  r  rw  rZ  r   r   c
                     | j                   |f|||||||	d|
}|j                  }| j                  |      }||j                  d   }n|j                  d   }| j                  j
                  j                  |dk7  rt        d      | j                  j
                  j                  d}n||| j                  j
                  j                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                   j"                   d       |t        j                  ||j                  	      |f   }d}|| j%                  |||| j                  
      }t'        |||j(                  |j*                  |j,                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )r   r  r   r?   rY  r  rZ  Nr   r)   z=Cannot handle batch sizes > 1 if no padding token is defined.r{   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r^  )r>   rw  pooled_logitsrb   rz  )r$  re  r  r   rb   r  rN  rf  rW   r   r5   int32r   argmaxloggerwarning_oncerR   r1   r}  r   r?   r@   rA   )rQ   rS   r  r   r   r?   rY  r  rw  rZ  r   transformer_outputsr@   r>   r  last_non_pad_tokennon_pad_masktoken_indicesr  r=   s                       r:   rV   z'Gemma3ForSequenceClassification.forward  s   , )djj

)%%+')

 

 ,==M* "+J&,,Q/J;;""//7J!O\]];;""//7!#"%)@)@)M)MMQQRXR_R_afalalmL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab%%VFR_hlhshs%tD/ /??-;;*55
 	
r9   r  )r1   r2   r3   rM   r  r  r    r   r5   r!  r6   r]   r	   rG  r   r   r   rV   r^   r_   s   @r:   r  r  z  s   1/  .215.204(,2626*.!%C
##d*C
 ''$.C
 t+	C

 &&-C
 C
 ((4/C
 ((4/C
   4'C
 $;C
 +,C
 
*C
  C
r9   r  c                        e Zd ZU dZeed<   dZy)#Gemma3TextForSequenceClassificationz
    Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig.
    It uses the generic sequence classification implementation for efficiency and consistency.
    rb   rK  N)r1   r2   r3   r4   r+   r7   r=  r8   r9   r:   r  r    s    
  r9   r  )r#  rJ  rr  r  r  r  r  )r)   )r  NN)NNFN)ccollections.abcr   dataclassesr   typingr   r5   torch.nnri    r   r.  activationsr   cache_utilsr	   r
   configuration_utilsr   
generationr   integrationsr   r   masking_utilsr   r   r   modeling_layersr   r   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r    r!   r"   utils.deprecationr#   utils.genericr$   r%   utils.output_capturingr&   autor(   configuration_gemma3r*   r+   
get_loggerr1   r  r/   r<   	EmbeddingrD   Modulera   rt   r   r   r   r]   r[   r   r\   rB   r   r   r  r#  rG  rH  rJ  rr  r-  r  r6   rh  r  r  r  r  r  __all__r8   r9   r:   <module>r     s  * % !    & ! . 3 ) I m m [  L F & w w 0 G 5  @ 
		H	% 
9 7 9 9 
9; 9 90SBLL S		  =BII =(N<BII N<b( *+ ,2	UU\\ 	U# 	U%,, 	U$   %II%<<% 
% <<	%
 LL4'% S[% T\% T\% 5<<%&%D )*K)bii K) +K)\+3 +\ (^O (^ (^V
# 
(CcSVCWY]C]:^ 
 ]
+ ]
 ]
@ L
- L
 L
^!@		 !@HELL X 6 ?K +/-1&*1414<<14 LL4'14 T\	14
 ,,%14 LL4'14 ##d*14 14 t14 
14 L14h 
W
' W

W
t 
@
%:O @

@
FU
&; U
p!*JLa !r9   