
    i                    p
   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	Z	d dl	m
Z
 d dlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9m:Z: ddl;m<Z<m=Z= ddl>m?Z? ddl@mAZAmBZBmCZCmDZD e e5d       G d d e'                    ZEe e5d!       G d" d#e3                    ZFee5 G d$ d%e(                    ZG G d& d'e
j                        ZI G d( d)e
j                        ZJ G d* d+e
j                        ZK G d, d-e
j                        ZL G d. d/e
j                        ZM G d0 d1e
j                        ZN G d2 d3e
j                        ZO G d4 d5e
j                        ZQ G d6 d7e
j                        ZR G d8 d9e
j                        ZS G d: d;e
j                        ZT G d< d=e
j                        ZU G d> d?e
j                        ZV G d@ dAe
j                        ZWdB ZXddCe	j                  dDe	j                  dEe	j                  dFeZfdGZ[dHe	j                  dIeZdJe	j                  fdKZ\	 	 	 ddLe
j                  dMe	j                  dNe	j                  dOe	j                  dPe	j                  dz  dQe]eZz  dRe]dz  dSe]dz  dJe^e	j                  e	j                  f   fdTZ_	 ddCe	j                  dDe	j                  dEe	j                  dUe	j                  dFeZdJe	j                  fdVZ` ee[       G dW dXe
j                               Za G dY dZe%      Zb G d[ d\e
j                        Zc G d] d^e
j                        Zd G d_ d`e
j                        Ze ee[       G da dbe
j                               Zfe G dc dde
j                               Zg G de dfe
j                        Zh G dg dhe%      Zi G di dje
j                        Zk G dk dle/      Zl e5dm       G dn doel             Zm e5dp       G dq drele             Zndse^eZeZf   dJefdtZo G du dvel      Zp G dw dxel      Zq G dy dze
j                        Zrd{e	j                  dz  d|e	j                  dz  dJedz  fd}Zs	 	 	 	 dd~ede	j                  dPe	j                  dz  dedz  dUe	j                  dz  de	j                  dz  de	j                  dz  deudeudz  dJevfdZw e5d       G d del             Zx e5d       G d dele             Zyg dZzy)    N)Callable)	dataclass)cached_property)Optional)nn)
functional   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfig)GenerationMixin)use_experts_implementationuse_kernelized_func)create_bidirectional_maskcreate_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )	AutoModel   )Gemma4AudioConfigGemma4ConfigGemma4TextConfigGemma4VisionConfigzK
    Base class for Gemma4 outputs, with hidden states and attentions.
    custom_introc                   b    e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   y)Gemma4ModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
    Nimage_hidden_statesaudio_hidden_states)	__name__
__module____qualname____doc__r4   torchFloatTensor__annotations__r5        {/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/gemma4/modeling_gemma4.pyr3   r3   8   s5     59**T1848**T18r>   r3   zR
    Base class for Gemma4 causal language model (or autoregressive) outputs.
    c                   "   e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   dZej                  dz  ed	<   y)
Gemma4CausalLMOutputWithPastaF  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr4   r5   )r6   r7   r8   r9   rB   r:   r;   r<   rC   rD   r   rE   tuplerF   r4   r5   r=   r>   r?   rA   rA   R   s    $ &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T1848**T18r>   rA   c                   :    e Zd ZU dZdZej                  dz  ed<   y)Gemma4AudioModelOutputz
    attention_mask (`torch.BoolTensor`, *optional*):
        A torch.BoolTensor of shape `(batch_size, num_frames)`. True for valid positions, False for padding.
    Nattention_mask)r6   r7   r8   r9   rJ   r:   
BoolTensorr<   r=   r>   r?   rI   rI   u   s    
 /3NE$$t+2r>   rI   c                   n     e Zd Zdeez  dededdf fdZdej                  dej                  fdZ	 xZ
S )	Gemma4ClippableLinearconfigin_featuresout_featuresreturnNc                    t         |           |j                  | _        t        j                  ||d      | _        | j                  r| j                  dt        j                  t        d                    | j                  dt        j                  t        d                   | j                  dt        j                  t        d                    | j                  dt        j                  t        d                   y y )NFbias	input_mininf	input_max
output_min
output_max)
super__init__use_clipped_linearsr   Linearlinearregister_bufferr:   tensorfloat)selfrN   rO   rP   	__class__s       r?   r[   zGemma4ClippableLinear.__init__   s     	#)#=#= ii\F##  ellE%L=.IJ  ell5<.HI  u||U5\M/JK  u||E%L/IJ	 $r>   rE   c                    | j                   r+t        j                  || j                  | j                        }| j                  |      }| j                   r+t        j                  || j                  | j                        }|S N)r\   r:   clamprU   rW   r^   rX   rY   )rb   rE   s     r?   forwardzGemma4ClippableLinear.forward   s\    ##!KKt~~t~~VMM2##!KKtXMr>   )r6   r7   r8   r/   r,   intr[   r:   Tensorrg   __classcell__rc   s   @r?   rM   rM      sT    K"%66K K 	K
 
K 	U\\ 	ell 	r>   rM   c                        e Zd Zd	dededef fdZdej                  fdZ	dej                  dej                  fdZ
 xZS )
Gemma4RMSNormdimeps
with_scalec                     t         |           || _        || _        | j                  r0t	        j
                  t        j                  |      d      | _        y y )NT)requires_grad)	rZ   r[   ro   rp   r   	Parameterr:   onesweight)rb   rn   ro   rp   rc   s       r?   r[   zGemma4RMSNorm.__init__   sB    $??,,uzz#dKDK r>   rE   c                     |j                  d      j                  dd      | j                  z   }|t        j                   |d      z  S )Nr)   T)keepdim      )powmeanro   r:   )rb   rE   mean_squareds      r?   _normzGemma4RMSNorm._norm   sA    $((+00T0BTXXMuyyt<<<r>   rQ   c                     | j                  |j                               }| j                  r|| j                  j                         z  }|j	                  |      S re   )r}   ra   rp   ru   type_as)rb   rE   normed_outputs      r?   rg   zGemma4RMSNorm.forward   sH    

=#6#6#89??)DKK,=,=,??M$$]33r>   )gư>T)r6   r7   r8   rh   ra   boolr[   r:   ri   r}   rg   rj   rk   s   @r?   rm   rm      sL    LC Le L L=5<< =
4U\\ 4ell 4r>   rm   c                        e Zd ZU dZej
                  ed<   def fdZ ej                         dej
                  dej
                  fd       Z
 xZS ) Gemma4AudioRelPositionalEncodingzSinusoidal relative positional encoding for the audio encoder.

    Produces position embeddings of shape [1, 2*context_size - 1, hidden_size] with
    concatenated [sin..., cos...] layout matching the original Gemma4 convention.
    inv_timescalesrN   c                    t         |           |j                  | _        |j                  |j                  z   dz
  |j
                  z   | _        d}d}| j                  dz  }t        j                  ||z        t        |dz
  d      z  }|t        j                  t        j                  |      | z        z  }| j                  d|j                  d      j                  d      d       y )	Nr+         ?     @r)   r   r   F
persistent)rZ   r[   hidden_sizeattention_chunk_sizeattention_context_leftattention_context_rightcontext_sizemathlogmaxr:   exparanger_   	unsqueeze)rb   rN   min_timescalemax_timescalenum_timescaleslog_timescale_incrementr   rc   s          r?   r[   z)Gemma4AudioRelPositionalEncoding.__init__   s    !--''&*G*GG!KfNlNll 	 ))Q."&((==+H"ICP^abPbdeLf"f&5<<3OSjRj3j)kk-~/G/G/J/T/TUV/Wdijr>   rE   rQ   c                 Z   t        j                  ddd|j                        }|d   }|| j                  j	                  |j                        z  }t        j
                  t        j                  |      t        j                  |      gd      }|j	                  |j                        S )N   rw   device.Nrn   dtype)	r:   r   r   r   tocatsincosr   )rb   rE   position_idsscaled_time	pos_embeds        r?   rg   z(Gemma4AudioRelPositionalEncoding.forward   s    ||BB}7K7KL#I."T%8%8%;%;=CWCW%;%XXIIuyy5uyy7MNTVW	||-"5"5|66r>   )r6   r7   r8   r9   r:   ri   r<   r,   r[   no_gradrg   rj   rk   s   @r?   r   r      sU     LL k0 k U]]_7U\\ 7ell 7 7r>   r   c                   P    e Zd ZdZdedef fdZdej                  dej                  fdZ	dej                  dej                  fdZ
d	ej                  dej                  fd
Z	 ddej                  dej                  dej                  dz  deej                  df   fdZ xZS )Gemma4AudioAttentionz3Chunked local attention with relative position biasrN   	layer_idxc                     t         |           || _        || _        |j                  | _        |j                  |j                  z  | _        |j                  | _	        | j                  dz  t        j                  d      z  | _        t        j                  dt        j                  z         t        j                  d      z  | _        |j                  | _        |j"                  dz
  | _        |j&                  | _        | j                   | j$                  z   | j(                  z   | _        t-        ||j                  | j                  | j                  z        | _        t-        ||j                  | j                  | j                  z        | _        t-        ||j                  | j                  | j                  z        | _        t-        ||j                  |j                        | _        t7        j8                  |j                  | j                  | j                  z  d      | _        t7        j<                  t?        j@                  | j                              | _!        | jE                  dt?        jF                  | j
                        d       y )Nry   r)   r+   FrS   softcapr   )$rZ   r[   rN   r   attention_logit_capattention_logits_soft_capr   num_attention_headshead_dim	num_headsr   r   q_scaleek_scaler   
chunk_sizer   max_past_horizonr   max_future_horizonr   rM   q_projk_projv_projpostr   r]   relative_k_projrs   r:   zerosper_dim_scaler_   r`   rb   rN   r   rc   s      r?   r[   zGemma4AudioAttention.__init__   s   ")/)C)C&**f.H.HH33t+txx{:xxDFF
+dhhqk9 55 & = = A"("@"@ OOd.C.CCdF]F]]+FF4F4FY]YfYfHfg+FF4F4FY]YfYfHfg+FF4F4FY]YfYfHfg)&&2D2DfFXFXY	!yy););T^^dmm=[bgh\\%++dmm*DEYT5S5S(Tafgr>   rE   rQ   c           	         |j                   \  }}}}|| j                  z   dz
  | j                  z  }|| j                  z  |z
  }t        j                  |ddddd|f      }|j	                  ||| j                  ||      j                         S )zSplits a `(batch_size, seq_len, num_heads, head_dim)` tensor into non-overlapping blocks of `chunk_size` along the sequence dim.r+   r   )shaper   Fpadreshape
contiguous)rb   rE   
batch_sizeseq_lenr   r   
num_blocksr   s           r?   _convert_to_blockz&Gemma4AudioAttention._convert_to_block   s    3@3F3F0
GY/!3G
4??*W4maAq!S-AB$$ZT__iYabmmoor>   c           
      @   |j                   \  }}}}t        j                  |dddd| j                  | j                  | j
                  z   dz
  f      }|j                  d| j                  | j
                        }t        j                  |dd      }|j                         S )z`Extracts overlapping context windows of `context_size` for every block, strided by `chunk_size`.r   r+   rw   r)   )r   r   r   r   r   r   unfoldr   r:   movedimr   )rb   rE   r   r   r   r   s         r?   _extract_block_contextz+Gemma4AudioAttention._extract_block_context   s    3@3F3F0
GYAq!Q(=(=t?V?VY]YhYh?hkl?lm
 &,,Q0A0A4??SmR;''))r>   xc                     |j                   \  }}}}}| j                  }t        j                  |d|dz   |z
  f      }|j	                  |||||dz   z        }|dd||z  f   }|j	                  |||||      S )zjRelative position shift for blocked attention. See appendix B of https://huggingface.co/papers/1901.02860.r   r+   .N)r   r   r   r   view)rb   r   r   r   r   
block_sizeposition_lengthr   s           r?   
_rel_shiftzGemma4AudioAttention._rel_shift   s    IJF
Iz:((EE!a)O;<=FF:y*jLSTDT6UVc.Z,.../vvj)Z\RRr>   Nposition_embeddingsrJ   c                    |j                   \  }}}||| j                  | j                  f}| j                  |      j	                         j                  |      }| j                  |      j	                         j                  |      }	| j                  |      j	                         j                  |      }
|| j                  z  t        j                  | j                        z  }|	| j                  z  }	| j                  |      }| j                  |	      }	| j                  |
      }
|j                   d   }| j                  |      }|j                  d| j                  | j                        }|j!                  |j"                        }|j%                  ddddd      }||	j%                  ddddd      z  }|j'                  || j                  d| j                        }||j%                  ddd      z  }|j'                  || j                  || j(                  d      }| j+                  |      }||z   }|| j,                  z  }t/        j0                  |      }|| j,                  z  }|4|j3                  |j5                         | j6                  j8                        }t        j:                  |dt.        j<                        j!                  |
j"                        }||
j%                  ddddd      z  }|j%                  ddddd      j'                  ||| j(                  z  d      }|d d d |f   j?                         }| jA                  |j!                  | j@                  jB                  jD                  j"                              }||fS )	Nr+   rw   r   r   r	   r)      rn   r   )#r   r   r   r   ra   r   r   r   r   r   softplusr   r   r   r   r   r   r   permuter   r   r   r   r:   tanhmasked_filllogical_notrN   attention_invalid_logits_valuesoftmaxfloat32r   r   r^   ru   )rb   rE   r   rJ   r   
seq_length_hidden_shapequery_states
key_statesvalue_statesr   relative_key_statesqueries	matrix_acqueries_flat	matrix_bdattn_weightsattn_outputs                      r?   rg   zGemma4AudioAttention.forward	  s    %2$7$7!
J"JN{{=1779>>|L[[/557<<\J
{{=1779>>|L#dll2QZZ@R@R5SS$,,.
--l;00<
22<@!''*
"223FG166r4>>4==Y144<;M;M4N&&q!Q15j00Aq!Q??	z4>>2t}}U #6#>#>q!Q#GG	%%j$..*doo_ab	OOI.	 9,#dll2zz,/#dll2%'33**,dkk.X.XL yy2U]]KNN|OaOab"\%9%9!Q1a%HH!))!Q1a8@@ZZ^ZiZiMikmn!![j[.1<<>iiTYY5E5E5L5L5R5R STL((r>   re   )r6   r7   r8   r9   r,   rh   r[   r:   ri   r   r   r   rK   rG   rg   rj   rk   s   @r?   r   r      s    =h0 hS h4pu|| p p*ELL *U\\ *SELL SU\\ S 37	1)||1) #\\1) ((4/	1)
 
u||T!	"1)r>   r   c                   ^     e Zd Z fdZddej
                  dej
                  dz  fdZ xZS )'Gemma4AudioSubSampleConvProjectionLayerc                     t         |           t        j                  ||dddd      | _        t        j
                  ||dd      | _        t        j                         | _        y )N)r	   r	   )r)   r)   r+   F)in_channelsout_channelskernel_sizestridepaddingrT   T)ro   elementwise_affinerT   )	rZ   r[   r   Conv2dconv	LayerNormnormReLUact)rb   r   r   norm_epsrc   s       r?   r[   z0Gemma4AudioSubSampleConvProjectionLayer.__init__>  sW    II#%
	 LL8PT[`a	779r>   NrE   maskc           
         |,|j                  |j                        }||d d d d d d f   z  }| j                  |j                  | j                  j                  j                              }| j                  | j                  |j                  dddd            j                  dddd      j                               }||d d d d df   }||fS )Nr   r   r)   r	   r+   )	r   r   r   ru   r   r   r   r   r   )rb   rE   r   s      r?   rg   z/Gemma4AudioSubSampleConvProjectionLayer.forwardK  s    77-"6"677D)DD!T1A,BBM		-"2"24993C3C3I3I"JK=+@+@Aq!+L!M!U!UVWYZ\]_`!a!l!l!no3Q3<Dd""r>   re   )r6   r7   r8   r[   r:   ri   rg   rj   rk   s   @r?   r   r   =  s(    #U\\ #9L #r>   r   c            	            e Zd Zdef fdZ	 ddej                  dej                  dz  deej                  ej                  f   fdZ xZ	S )	"Gemma4AudioSubSampleConvProjectionrN   c                 v   t         |           t        d|j                  d   |j                        | _        t        |j                  d   |j                  d   |j                        | _        |j                  d   dz  |j                  d   z  }t        j                  ||j                  d      | _
        y )Nr+   r   )r   r   r   r   FrS   )rZ   r[   r   subsampling_conv_channelsrms_norm_epslayer0layer1r   r]   r   input_proj_linear)rb   rN   proj_input_dimrc   s      r?   r[   z+Gemma4AudioSubSampleConvProjection.__init__Z  s    =99!<((

 >88;99!<((

 !::1=BfFfFfghFii!#>6;M;MTY!Zr>   Ninput_featuresinput_features_maskrQ   c                 &   |j                  d      }| j                  ||      \  }}| j                  ||      \  }}|j                  \  }}}}|j	                  dddd      j                         j                  ||d      }| j                  |      |fS )Nr+   r   r)   r	   rw   )r   r  r  r   r   r   r   r  )rb   r	  r
  rE   r   r   r   r   s           r?   rg   z*Gemma4AudioSubSampleConvProjection.forwardi  s    
 '003"kk-9LMt"kk->t$1$7$7!
Aw%--aAq9DDFNNz[bdfg%%m4d::r>   re   )
r6   r7   r8   r,   r[   r:   ri   rG   rg   rj   rk   s   @r?   r  r  Y  sW    [0 [$ 48;; #\\D0; 
u||U\\)	*	;r>   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Gemma4AudioFeedForwardrN   c                    t         |           || _        t        ||j                  |j                  dz        | _        t        ||j                  dz  |j                        | _        t        |j                        | _        t        |j                        | _	        t        |j                     | _        |j                  | _        |j                  | _        y )Nr   )rZ   r[   rN   rM   r   ffw_layer_1ffw_layer_2rm   pre_layer_normpost_layer_normr   
hidden_actact_fngradient_clippingresidual_weightpost_layer_scalerb   rN   rc   s     r?   r[   zGemma4AudioFeedForward.__init__x  s    09K9KVM_M_bcMcd09K9Ka9OQWQcQcd+F,>,>?,V-?-?@V../!'!9!9 & 6 6r>   rE   rQ   c                    t        | j                  t        j                  | j                  j
                  j                  j                        j                        }|}t        j                  || |      }| j                  |      }| j	                  |      }| j                  |      }| j                  |      }t        j                  || |      }| j                  |      }|| j                  z  }||z  }|S re   )minr  r:   finfor  r^   ru   r   r   rf   r  r  r  r  r  )rb   rE   r  residuals       r?   rg   zGemma4AudioFeedForward.forward  s     6 6DDTDTD[D[DbDbDhDh8i8m8mn M4E3EGXY++M:((7M2((7M4E3EGXY,,];...!r>   	r6   r7   r8   r,   r[   r:   ri   rg   rj   rk   s   @r?   r  r  w  s+    70 7U\\ ell r>   r  c                   `     e Zd Zed        Zdej                  dej                  f fdZ xZS )Gemma4AudioCausalConv1dc                 p    | j                   d   dz
  | j                  d   z  dz   }|| j                  d   z
  S )Nr   r+   )r   dilationr   )rb   effective_kernel_sizes     r?   left_padz Gemma4AudioCausalConv1d.left_pad  s>    !%!1!1!!4q!8DMM!<L Lq P$t{{1~55r>   r   rQ   c                 z    t         j                  j                  || j                  df      }t        |   |      S Nr   )r   r   r   r#  rZ   rg   )rb   r   rc   s     r?   rg   zGemma4AudioCausalConv1d.forward  s3     MMa$--!34wq!!r>   )	r6   r7   r8   r   r#  r:   ri   rg   rj   rk   s   @r?   r  r    s;     6 6"<<" 
	" "r>   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Gemma4AudioLightConv1drN   c                 6   t         |           || _        t        ||j                  |j                  dz        | _        t        ||j                  |j                        | _        t        |j                  |j                  |j                  |j                  d      | _	        t        |j                  |j                  d      | _        t        |j                  |j                  d      | _        t        |j                     | _        |j"                  | _        y )Nr)   F)r   r   r   groupsrT   Tro   rp   )rZ   r[   rN   rM   r   linear_start
linear_endr  conv_kernel_sizedepthwise_conv1drm   r  r  	conv_normr   r  r  r  r  s     r?   r[   zGemma4AudioLightConv1d.__init__  s    1&&:L:LfN`N`cdNde/8J8JFL^L^_ 7**++//%%!
 ,F,>,>FDWDWdhi&v'9'9v?R?R_cdV../!'!9!9r>   rE   rQ   c                 H   |}| j                  |      }| j                  |      }t        j                  j	                  |d      }| j                  |j                  dd            j                  dd      }t        | j                  t        j                  | j                  j                  j                  j                        j                        }t        j                  || |      }| j!                  |      }| j#                  |      }| j%                  |      }||z  }|S )Nrw   r   r+   r)   )r  r+  r   r   glur.  	transposer  r  r:   r  r^   ru   r   r   rf   r/  r  r,  )rb   rE   r  r  s       r?   rg   zGemma4AudioLightConv1d.forward  s     ++M:))-8))-R)@--m.E.Ea.KLVVWXZ[\   6 6DDUDUD\D\DcDcDiDi8j8n8noM4E3EGXY}5M26!r>   r  rk   s   @r?   r'  r'    s+    :0 :(U\\ ell r>   r'  c            
            e Zd Zdedef fdZdej                  dej                  dz  dej                  de	e
   d	ej                  f
d
Z xZS )Gemma4AudioLayerrN   r   c                 p   t         |           || _        t        |      | _        t        |      | _        t        ||      | _        t        |      | _	        t        |j                        | _        t        |j                        | _        t        |j                        | _        |j                  | _        y re   )rZ   r[   rN   r  feed_forward1feed_forward2r   	self_attnr'  lconv1drm   r   norm_pre_attnnorm_post_attnnorm_outr  r   s      r?   r[   zGemma4AudioLayer.__init__  s    3F;3F;-fi@-f5*6+=+=>+F,>,>?%f&8&89!'!9!9r>   rE   rJ   Nr   kwargsrQ   c                 @   t        | j                  t        j                  | j                  j
                  j                        j                        }| j                  |      }|}t        j                  || |      }| j	                  |      }| j                  |||      \  }}t        j                  || |      }| j                  |      }||z  }| j                  |      }| j                  |      }t        j                  || |      }| j                  |      }|S )N)rE   r   rJ   )r  r  r:   r  r:  ru   r   r   r6  rf   r8  r;  r9  r7  r<  )rb   rE   rJ   r   r=  r  r  r   s           r?   rg   zGemma4AudioLayer.forward  s      6 6DDVDVD]D]DcDc8d8h8hi**=9 M4E3EGXY**=9>>' 3) * 
q M4E3EGXY++M:!]3**=9M4E3EGXYm4r>   )r6   r7   r8   r,   rh   r[   r:   ri   rK   r   r!   rg   rj   rk   s   @r?   r4  r4    si    :0 :S : ||  ((4/  #\\	 
 +,  
 r>   r4  c                        e Zd Zdef fdZdej                  dej                  dej                  fdZdej                  dej                  dej                  dej                  fdZ xZ	S )	Gemma4VisionPatchEmbedderrN   c                    t         |           || _        |j                  | _        |j                  | _        |j
                  | _        t        j                  d| j                  dz  z  | j                  d      | _        t        j                  t        j                  d| j
                  | j                              | _        y )Nr	   r)   FrS   )rZ   r[   rN   r   
patch_sizeposition_embedding_sizer   r]   
input_projrs   r:   rt   position_embedding_tabler  s     r?   r[   z"Gemma4VisionPatchEmbedder.__init__  s    !-- ++'-'E'E$))A(:$:D<L<LSXY(*UZZ4C_C_aeaqaq5r(s%r>   pixel_position_idspadding_positionsrQ   c                 T   |j                  d      }t        j                  || j                        }|j	                  dddd      j                  | j                        }|| j                  z  }|j                  d      }t        j                  |j                  d      d	|      }|S )
zDPrepare patch positions map for matmul with positon embedding table.r   r  num_classesr)   r+   r	   r   rw           )rf   r   one_hotrC  r   r   rE  sumr:   wherer   )rb   rF  rG  clamped_positionsrM  r   s         r?   _position_embeddingsz.Gemma4VisionPatchEmbedder._position_embeddings&  s     /444;))-4;W;WX//!Q1-001N1NO%(E(EE155!5<#kk*;*E*Eb*I3Pcd""r>   pixel_valuesc                     d|dz
  z  }| j                  |j                  | j                   j                  j                              }| j	                  ||      }||z   S )Nr)         ?)rD  r   ru   r   rQ  )rb   rR  rF  rG  rE   r   s         r?   rg   z!Gemma4VisionPatchEmbedder.forward3  s[     L3./8N8N8T8T(UV"778JL]^222r>   )
r6   r7   r8   r/   r[   r:   ri   rQ  rg   rj   rk   s   @r?   r@  r@    su    t1 t#u|| #X]XdXd #iniuiu #3!LL3>Cll3_d_k_k3	3r>   r@  c                   .    e Zd ZdZdef fdZdej                  dej                  dede	ej                  ej                  f   fdZ
	 ddej                  dej                  d
ej                  ded	z  de	ej                  ej                  f   f
dZ xZS )Gemma4VisionPoolerz9Scaling and optional spatial pooling for vision encodingsrN   c                 l    t         |           |j                  | _        | j                  dz  | _        y )NrT  )rZ   r[   r   root_hidden_sizer  s     r?   r[   zGemma4VisionPooler.__init__@  s/    !-- $ 0 0# 5r>   rE   rF  lengthrQ   c                    |j                   d   }t        ||z  dz        }|dz  }||z  |k7  r%t        d|j                    d| d|d|d| d	      |j                  d
      }|d   j	                  dd      d
   dz   }t        j                  ||d      }	|	d   ||z  |	d   z  z   }	t        j                  |	j                         |      j                         |z  }
|
j                  dd      |j                         z  }t        j                  |
d
k(  j                  d            }|j                  |j                        |fS )z
        2D spatial pooling according to patch positions.
        Pools the input tokens by averaging patches within a `k^2` grid, where `k` is determined by the ratio between
        input and output lengths
        r+   rT  r)   zCannot pool z to z: k=z^2 times length=z	 must be .r   rI  .r   rw   Trn   rx   floor)rounding_mode).r+   r   )r   rh   
ValueErrorrf   r   r:   divr   rM  longra   r2  r   allr   r   )rb   rE   rF  rY  input_seq_lenk	k_squaredrP  max_xkernel_idxsweightsoutputr   s                r?   _avg_pool_by_positionsz)Gemma4VisionPooler._avg_pool_by_positionsE  sh    &++A.&(S01qD	v.}2234xu!EVviW`an`oopq  /444;!&)--"d-CAFJii 11GL!&)UaZ;v;N,NN))K,,.7==?)K""1a(=+>+>+@@  'Q,!3!3!3!:;yy,,-t33r>   NrG  output_lengthc                    ||j                   d   kD  rt        d| d|j                   d    d      |j                  |j                  d      d      }|j                   d   |k7  r| j	                  |||      \  }}|| j
                  z  }||fS )Nr+   z*Cannot output more soft tokens (requested z) than there are patches (z9). Change the value of `num_soft_tokens` when processing.rw   rL  )r   r`  r   r   rk  rX  )rb   rE   rF  rG  rl  s        r?   rg   zGemma4VisionPooler.forward`  s     =..q11<]O L"((+,,eg 
 &112C2M2Mb2QSVWq!]2/3/J/J1=0,M, 	...///r>   re   )r6   r7   r8   r9   r/   r[   r:   ri   rh   rG   rk  rg   rj   rk   s   @r?   rV  rV  =  s    C61 6
4"\\4?D||4UX4	u||U\\)	*4@ %)0||0 "LL0 !<<	0
 Tz0 
u||U\\)	*0r>   rV  c                   *     e Zd Zdef fdZd Z xZS )Gemma4VisionMLPrN   c                    t         |           || _        |j                  | _        |j                  | _        t        || j                  | j                        | _        t        || j                  | j                        | _        t        || j                  | j                        | _        t        |j                     | _        y re   )rZ   r[   rN   r   intermediate_sizerM   	gate_projup_proj	down_projr   hidden_activationr  r  s     r?   r[   zGemma4VisionMLP.__init__y  s    !--!'!9!9.vt7G7GI_I_`,VT5E5EtG]G]^.vt7M7MtO_O_`V556r>   c                     | j                  | j                  | j                  |            | j                  |      z        }|S re   rt  r  rr  rs  rb   r   rt  s      r?   rg   zGemma4VisionMLP.forward  6    NN4;;t~~a/@#ADLLQRO#ST	r>   )r6   r7   r8   r/   r[   rg   rj   rk   s   @r?   ro  ro  x  s    71 7r>   ro  c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  dej                  dz  de
dz  dedef   fd	       Z ej                         ed
               Z xZS )Gemma4VisionRotaryEmbeddinginv_freqNrN   c                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultr|  Fr   original_inv_freq)rZ   r[   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrN   rope_parametersr~  compute_default_rope_parametersr   attention_scalingr_   clone)rb   rN   r   rope_init_fnr|  rc   s        r?   r[   z$Gemma4VisionRotaryEmbedding.__init__  s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuUr>   r   r   rQ   torch.Tensorc                 $   | j                   d   }t        | dd      xs | j                  | j                  z  }|dz  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetar   Nr)   r   r   r   r   r   	r  getattrr   r   r:   r   int64r   ra   )rN   r   r   basern   spatial_dimattention_factorr|  s           r?   r  z;Gemma4VisionRotaryEmbedding.compute_default_rope_parameters  s    & %%l3fj$/c63E3EIcIc3c QhQQekkBEEV[`[f[fEgjuuw
 )))r>   c                 .   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}g g }}t        d      D ]  }|d d d d |f   }|d d d d d f   j                         }	t        |d      5  |j                         |	j                         z  j                  dd      }
t        j                  |
|
fd	      }|j                         | j                  z  }|j!                         | j                  z  }d d d        |j#                         |j#                          t        j                  |d	      j	                  |j$                  
      }t        j                  |d	      j	                  |j$                  
      }||fS # 1 sw Y   xY w)Nr   rw   r+   mpscpur)   Fdevice_typeenabledr   r   )r|  ra   expandr   r   r   
isinstancetypestrranger%   r2  r:   r   r   r  r   appendr   )rb   r   r   inv_freq_expandedr  all_cosall_sinidim_position_idsdim_position_ids_expandedfreqsembr   r   s                 r?   rg   z#Gemma4VisionRotaryEmbedding.forward  s    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr'1!((--'E!((--[`J`ahhmmfk rq 
	 A+Aq!G4(8D!(D(J(J(L%KG 9*0025N5T5T5VVaabcefgiiB7ggi$"8"88ggi$"8"88	9
 NN3NN3
	  iiR(++!''+:iiR(++!''+:Cx9 9s   4BHH	re   NNN)r6   r7   r8   r:   ri   r<   r/   r[   staticmethodr   rh   rG   ra   r  r   r   rg   rj   rk   s   @r?   r{  r{    s    llV1 V  ,0&*" *"T) *t# * t * 
~u$	%	 *  *D U]]_  r>   r{  c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nrw   r)   r   )r   r:   r   )r   x1x2s      r?   rotate_halfr    sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r>   r   r   r   unsqueeze_dimc                 n    |j                  |      }|j                  |      }| |z  t        |       |z  z   S )a\  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        x (`torch.Tensor`): The tensor to embed.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r   r  r   r   r   r  s       r?   apply_rotary_pos_embr    s8    " --
&C
--
&CGA,--r>   rE   n_reprQ   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r+   N)r   r  r   )rE   r  batchnum_key_value_headsslenr   s         r?   	repeat_kvr    so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr>   modulequerykeyvaluerJ   dropoutscalingr   c                 |   || j                   dz  }t        || j                        }	t        || j                        }
t        j                  ||	j                  dd            |z  }|||z  }t        j                  |      }||z  }|||z   }t        j                  j                  |dt        j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||
      }|j                  dd      j                         }||fS )Nry   r)   r	   rw   r   )ptrainingr+   )r   r  num_key_value_groupsr:   matmulr2  r   r   r   r   r   r   r   r  r  r   )r  r  r  r  rJ   r  r  r   r=  r   r   r   r   s                r?   eager_attention_forwardr     s    //4'3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL#g-zz,/#g-!#n4 ==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r>   r   c           	         |j                   d   }| j                   d   }d|d|z  z  z  }|dk  rt        d| d| d| d      |g|z  }t        j                  | |d      }	t        j                  ||d      }
t        j                  ||d      }t	        |      D cg c]  }t        |	|   |
|   ||   |	       }}t        j                  |d      S c c}w )
ak  Applies multidimensional RoPE to inputs.

    Args:
        x (`torch.Tensor`): The tensor to embed.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            If position_ids.ndim + 2 == x.ndim, then this function passes through to `apply_rotary_pos_emb()`.
            Otherwise, position_ids is used to split the inputs, x, into multiple pieces, where each piece is fed to
            `apply_rotary_pos_emb()`, and then concatenated back together.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.

    Returns:
      Tensor of shape [B, L, N, H] with RoPE applied.
    rw   r)   r   zEInvalid configuration: num_rotated_channels_per_dim must be > 0, got z (num_input_channels=z, ndim=)r   r  )r   r`  r:   splitr  r  r   )r   r   r   r   r  ndimnum_input_channelsnum_rotated_channels_per_dimsplit_sizesx_parts	cos_parts	sin_partsre  y_partss                 r?   apply_multidimensional_roper  "  s   8 b!D#$(:q4x(H#I #q(,--BCUBV WF!
 	
 0047Kkk![b1GC"5IC"5I t  	aj!!'		
G  99W"%%s   Cc                       e Zd ZdZdedef fdZ	 	 	 ddej                  dej                  dej                  dz  d	ej                  dz  d
e
e   deej                  ej                  dz  eej                     dz  f   fdZ xZS )Gemma4VisionAttention=Multi-headed attention from 'Attention Is All You Need' paperrN   r   c                    t         |           t        |d      r|j                  |   nd | _        || _        || _        t        |d|j                  |j                  z        | _
        |j                  |j                  z  | _        d| _        | j
                  j                  | _        d| _        t!        ||j                  |j                  | j                  z        | _        t!        ||j                  |j                  | j                  z        | _        t!        ||j                  |j                  | j                  z        | _        t!        ||j                  | j                  z  |j                        | _        t+        |j                  |j,                        | _        t+        |j                  |j,                        | _        t+        | j                  |j,                  d      | _        y )Nlayer_typesr   r   Frn   ro   r*  )rZ   r[   hasattrr  
layer_typerN   r   r  r   r   r   r  r  r  attention_dropout	is_causalrM   r   r   r   o_projrm   r  q_normk_normv_normr   s      r?   r[   zGemma4VisionAttention.__init__^  s   ;B6=;Y&,,Y7_c"
F4F4F&JdJd4de$*$>$>&B\B\$\!!%!>!>+FF4F4FHbHbeiererHrs+FF4F4FHbHbeiererHrs+FF4F4FHbHbeiererHrs+FF4N4NQUQ^Q^4^`f`r`rs#V=P=PQ#V=P=PQ#DMMv7J7JW\]r>   NrE   r   rJ   r   r=  rQ   c                 N   |j                   d d }g |d| j                  }|\  }}	| j                  |      j                  |      }
| j	                  |
      }
t        |
||	|      }
|
j                  dd      }
| j                  |      j                  |      }| j                  |      }t        |||	|      }|j                  dd      }| j                  |      j                  |      }| j                  |      }|j                  dd      }t        j                  | j                  j                  t              } || |
|||f| j                   r| j"                  nd| j$                  d|\  }} |j&                  g |d j)                         }| j+                  |      }||fS )Nrw   r+   r)   rL  )r  r  )r   r   r   r   r  r  r2  r   r  r   r  r   get_interfacerN   _attn_implementationr  r  r  r  r   r   r  )rb   rE   r   rJ   r   r=  input_shaper   r   r   r   r   r   attention_interfacer   r   s                   r?   rg   zGemma4VisionAttention.forwardq  s    $))#2.88b8$--8&S{{=166|D{{<02<c<X#--a3[[/44\B
[[,
0S#|T
))!Q/
{{=166|D{{<0#--a3(?(M(MKK,,.E)
 %8	%
 /3mmD**LL	%
 	%
!\ *k));;;;FFHkk+.L((r>   r  )r6   r7   r8   r9   r/   rh   r[   r:   ri   
LongTensorr   r!   rG   rg   rj   rk   s   @r?   r  r  Z  s    G^1 ^c ^, -1.204,)||,) #\\,) t+	,)
 &&-,) +,,) 
u||U\\D0%2E2LL	M,)r>   r  c                       e Zd Zdedef fdZ	 	 	 ddej                  dej                  dej                  dz  dej                  dz  d	e	e
   d
eej                  eej                  ej                  f   dz  f   fdZ xZS )Gemma4VisionEncoderLayerrN   r   c                    t         |           || _        |j                  | _        || _        t        ||      | _        t        |      | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        y )NrN   r   ro   )rZ   r[   rN   r   r   r  r8  ro  mlprm   r  input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   s      r?   r[   z!Gemma4VisionEncoderLayer.__init__  s    !--".f	R"6*,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'r>   NrE   r   rJ   r   r=  rQ   c                     |}| j                  |      } | j                  d||||d|\  }}| j                  |      }||z   }|}| j                  |      }| j	                  |      }| j                  |      }||z   }|S )N)rE   r   rJ   r   r=   )r  r8  r  r  r  r  )rb   rE   r   rJ   r   r=  r  r   s           r?   rg   z Gemma4VisionEncoderLayer.forward  s     !,,];)4>> 
' 3)%	

 
q 55mD =0 66}E/77F =0r>   r  )r6   r7   r8   r/   rh   r[   r:   ri   r  r   r!   rG   r;   rg   rj   rk   s   @r?   r  r    s    
c1 
cc 
c -1.204|| #\\ t+	
 &&- +, 
u  %(9(95;L;L(L"MPT"TT	Ur>   r  c                        e Zd Zdef fdZ	 d
dej                  dej                  dej                  dz  dee	   de
f
d	Z xZS )Gemma4VisionEncoderrN   c           	         t         |           || _        |j                  | _        t        |      | _        t        j                  t        | j                        D cg c]  }t        ||       c}      | _        y c c}w )Nr  )rZ   r[   rN   num_hidden_layers
num_layersr{  
rotary_embr   
ModuleListr  r  layers)rb   rN   r  rc   s      r?   r[   zGemma4VisionEncoder.__init__  sc     225f=mmKPQUQ`Q`Kaba%VqAb
bs   A?Ninputs_embedsrJ   rF  r=  rQ   c                     t        | j                  ||      }|}| j                  ||      }| j                  d| j                  j                   D ]  } ||f|||d|} t        |      S )z
        pixel_position_ids (torch.Tensor):
            Patch positions as (x, y) coordinates in the image as [batch, num_patches, 2].
        )rN   r  rJ   N)rJ   r   r   last_hidden_state)r   rN   r  r  r  r   )rb   r  rJ   rF  r=  rE   r   decoder_layers           r?   rg   zGemma4VisionEncoder.forward  s     3;;')
 &"oom=OP "[[)H4;;+H+HI 	M)-$7/	
 M	 'GGr>   re   )r6   r7   r8   r/   r[   r:   ri   r  r   r!   r   rg   rj   rk   s   @r?   r  r    si    
1 
 7;	H||H H ",,t3	H
 +,H 
!Hr>   r  c                   .     e Zd Zdedef fdZd Z xZS )Gemma4TextMLPrN   r   c                 J   t         |           |j                  |j                  z
  }||cxk\  xr dkD  nc }|j                  xr |}|| _        |j                  | _        |j                  |rdndz  | _        t        j                  | j                  | j                  d      | _
        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        |j                     | _        y )Nr   r)   r+   FrS   )rZ   r[   r  num_kv_shared_layersuse_double_wide_mlprN   r   rq  r   r]   rr  rs  rt  r   ru  r  )rb   rN   r   first_kv_shared_layer_idxis_kv_shared_layerr  rc   s         r?   r[   zGemma4TextMLP.__init__  s    $*$<$<v?Z?Z$Z!&*CGaG$88O=O!--!'!9!9BUQ[\!]4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556r>   c                     | j                  | j                  | j                  |            | j                  |      z        }|S re   rw  rx  s      r?   rg   zGemma4TextMLP.forward  ry  r>   )r6   r7   r8   r.   rh   r[   rg   rj   rk   s   @r?   r  r    s    7/ 7C 7r>   r  c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 	 ddedz  de	d   de
dz  dedz  d	ed
ef   f
d       Z ej                         edd              Z xZS )Gemma4TextRotaryEmbeddingr|  NrN   c                    t         
|           |j                  | _        |j                  | _        || _        t        |j                        | _        i | _        i | _	        | j                  D ]  }| j
                  j                  |   }||d   x}dk7  r
t        |   }n| j                  }|| j                  |<   || j                  |<   ||d}|dk(  r
|dk(  rd|d<    || j
                  fi |\  }}	| j                  | d|d	
       | j                  | d|j                         d	
       t        | | d|	        y )Nr~  r  )r   r  full_attentionproportionalglobal_head_dimhead_dim_key	_inv_freqFr   _original_inv_freq_attention_scaling)rZ   r[   r  r  r  rN   setr  rope_init_fnsr~  r  r   r  r_   r  setattr)rb   rN   r   r  rope_paramsr~  r  rope_init_fn_kwargscurr_inv_freqcurr_attention_scalingrc   s             r?   r[   z"Gemma4TextRotaryEmbedding.__init__  se   "("@"@$*$B$B!v112SU)+** 	UJ++55jAK"(55	)C29=#CC-9Dz*)2DNN:&-3:"N--)~2M6G#N34@4dPc4d1M1  J<y!9=UZ [  J</A!BMDWDWDYfk lDZL(:;=ST)	Ur>   r   ztorch.devicer   r  rQ   r  c                     | j                   |   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a|  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
            layer_type (`str`, *optional*):
                The current layer type if the model has different RoPE parameters per type.
                Should not be used unless `config.layer_types is not None`

        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        r  r   Nr   r   r)   r   r  r  )rN   r   r   r  r  rn   r  r|  s           r?   r  z9Gemma4TextRotaryEmbedding.compute_default_rope_parameters.  s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r>   c                 N   t        | | d      }t        | | d      }|d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d	      5  |j                         |j                         z  j                  dd
      }	t        j                  |	|	fd      }
|
j                         |z  }|
j                         |z  }d d d        j	                  |j                        j	                  |j                        fS # 1 sw Y   AxY w)Nr
  r  r   rw   r+   r  r  Fr  r)   r   r   )r  ra   r  r   r   r   r  r  r  r%   r2  r:   r   r   r   r   )rb   r   r   r  r|  r  r  position_ids_expandedr  r  r  r   r   s                r?   rg   z!Gemma4TextRotaryEmbedding.forwardR  sl    4J<y!9:#DZL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	0&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')//C'')//C		0 vvAGGv$cff177f&;;;	0 	0s   *A1FF$NN)NNNNre   )r6   r7   r8   r:   ri   r<   r.   r[   r  r   rh   r  rG   ra   r  r   r   rg   rj   rk   s   @r?   r  r    s    llU/ U@ *.+/"!%	!* 4'!*(!* t!* $J	!*
 
~u$	%!* !*F U]]_<  <r>   r  c                       e Zd ZdZdedef fdZ	 ddej                  dej                  dej                  dz  d	e	ee
ej                  ej                  f   f   d
edz  dee   de
ej                  ej                  dz  f   fdZ xZS )Gemma4TextAttentionr  rN   r   c                 `   t         |           t        |d      r|j                  |   nd | _        || _        || _        | j                  dk(  | _        | j                  r|j                  nd | _        | j                  s|j                  r|j                  n|j                  | _
        |j                  xr | j                   | _        | j                  r|j                  n|j                  }|j                  |z  | _        d| _        | j
                  j$                  | _        |j&                  dk7  | _        | j
                  j*                  t-        | j
                  dd      z
  }||cxk\  xr dkD  nc | _        |j                  d | }| j.                  r@t1        |      dz
  |d d d   j3                  |j                  |         z
  | _        d	| _        nBd | _        |t1        |      dz
  |d d d   j3                  |j                  |         z
  k(  | _        t9        j:                  |j<                  |j                  | j                  z  |j>                  
      | _         tC        | j                  |jD                        | _#        | j.                  stC        | j                  |jD                        | _$        tC        | j                  |jD                  d	      | _%        t9        j:                  |j<                  || j                  z  |j>                  
      | _&        | j                  s9t9        j:                  |j<                  || j                  z  |j>                  
      nd | _'        t9        j:                  |j                  | j                  z  |j<                  |j>                  
      | _(        y )Nr  sliding_attentionr   rc  r  r   r+   rw   FrS   r  r*  ))rZ   r[   r  r  r  rN   r   
is_slidingsliding_windowr  r   attention_k_eq_vuse_alternative_attentionnum_global_key_value_headsr  r   r  r  r  use_bidirectional_attentionr  r  r  r  lenindexkv_shared_layer_indexstore_full_length_kvr   r]   r   attention_biasr   rm   r  r  r  r  r   r   r  )rb   rN   r   r  r   prev_layersrc   s         r?   r[   zGemma4TextAttention.__init__i  s   ;B6=;Y&,,Y7_c"//-@@7;f33D6:oo&J`J`..flfufu)/)@)@)XEX&151O1OF--U[UoUo 	 %+$>$>BU$U!!%!>!>;;uD %)KK$A$AGDKKYoqrDs$s!"+/H"L1"L(()C*CD""),[)9A)=DbD@Q@W@WX^XjXjktXu@v)vD&(-D%)-D&(1S5E5IKX\Z\X\L]LcLc""9-M 6 )D% ii : :T]] JQWQfQf
 $6;N;NO &&'DMMv?R?RSDK'6;N;N[`aDK))""$7$--$GfNcNcDK
 55 		&,,.ADMM.QX^XmXmn K ii&&68J8JQWQfQf
r>   NrE   r   rJ   shared_kv_statesrD   r=  rQ   c                    |j                   d d }g |d| j                  }|\  }	}
| j                  |      j                  |      }| j	                  |      }t        ||	|
d      }|j                  dd      }| j                  rI|| j                     \  }}|j                  |j                        }|j                  |j                        }n| j                  |      j                  |      }| j                   | j                  |      j                  |      n|}| j                  |      }t        ||	|
d      }|j                  dd      }| j                  |      }|j                  dd      }|,| j                  s |j                  ||| j                         \  }}| j"                  r||f|| j                   <   t$        }| j&                  j(                  dk7  rt*        | j&                  j(                     } || ||||f| j,                  r| j.                  nd| j0                  | j2                  d|\  }} |j4                  g |d j7                         }| j9                  |      }||fS )Nrw   r)   )r  r+   eagerrL  )r  r  r  )r   r   r   r   r  r  r2  r  r$  r   r   r   r   r  r  updater   r%  r  rN   r  r   r  r  r  r  r   r   r  )rb   rE   r   rJ   r(  rD   r=  r  r   r   r   r   r   r   r  r   r   s                    r?   rg   zGemma4TextAttention.forward  s^    $))#2.88b8$--8&S{{=166|D{{<0+L#sRST#--a3
 ""'78R8R'S$J#|':':;J'??<+>+>?L]388FJLPKKLc4;;}5::<HisLZ0J-j#sRSTJ#--a3J;;|4L'11!Q7L&t/F/F'6'='=j,X\XfXf'g$J$$/9</GT^^,(?;;++w6"9$++:Z:Z"[$7
%
 /3mmD**LL..
%
 
%
!\ *k));;;;FFHkk+.L((r>   re   )r6   r7   r8   r9   r.   rh   r[   r:   ri   dictrG   r   r   r   rg   rj   rk   s   @r?   r  r  e  s    G6
/ 6
C 6
| )-=)||=) #\\=) t+	=)
 sE%,,*D$EEF=) =) -.=) 
u||U\\D00	1=)r>   r  c                        e Zd ZdZdef fdZdej                  dej                  dej                  dej                  fdZ xZ	S )	Gemma4TextExpertsz2Collection of expert weights stored as 3D tensors.rN   c                    t         |           |j                  | _        |j                  | _        |j
                  | _        t        j                  t        j                  | j                  d| j                  z  | j                              | _        t        j                  t        j                  | j                  | j                  | j                              | _        t        |j                     | _        y )Nr)   )rZ   r[   num_expertsr   
hidden_dimmoe_intermediate_sizeintermediate_dimr   rs   r:   emptygate_up_projrt  r   ru  r  r  s     r?   r[   zGemma4TextExperts.__init__  s    !-- ,, & < <LLT5E5Eq4K`K`G`bfbqbq)rsekk$2B2BDOOUYUjUj&klV556r>   rE   top_k_indextop_k_weightsrQ   c                 f   t        j                  |      }t        j                         5  t         j                  j                  j                  || j                        }|j                  ddd      }t        j                  |j                  d      d      j                         }d d d        D ]  }|d   }|| j                  k(  rt        j                  |         \  }}	||	   }
t        j                  j                  |
| j                  |         j                  dd      \  }}| j                  |      |z  }t        j                  j                  || j                   |         }|||	|d f   z  }|j#                  d|	|j%                  |j&                                |S # 1 sw Y   xY w)NrJ  r)   r+   r   )rw   r   rw   )r:   
zeros_liker   r   r   rM  r0  r   greaterrN  nonzerorO  r^   r5  chunkr  rt  
index_add_r   r   )rb   rE   r6  r7  final_hidden_statesexpert_mask
expert_hit
expert_idx	top_k_pos	token_idxcurrent_stategateupcurrent_hidden_statess                 r?   rg   zGemma4TextExperts.forward  s    $..}=]]_ 	S((--55ktO_O_5`K%--aA6K{8'DaHPPRJ	S
 % 
	nJ#AJT---#(;;{:/F#G Iy))4M}}++M4;L;LZ;XY__`agi_jHD"$(KK$5$:!$&MM$8$89NPTP^P^_iPj$k!$9M)U^`dJd<e$e!**1i9N9Q9QReRkRk9lm
	n #"#	S 	Ss   A=F&&F0)
r6   r7   r8   r9   r.   r[   r:   ri   rg   rj   rk   s   @r?   r.  r.    sN    <7/ 7#||# \\# ||	#
 
#r>   r.  c                   z     e Zd Zdef fdZdej                  deej                  ej                  f   fdZ xZ	S )Gemma4TextRouterrN   c                 (   t         |           || _        |j                  | _        | j                  dz  | _        |j
                  | _        t        | j                  | j                  d      | _        t        j                  |j                  |j                  d      | _        t        j                  t        j                  | j                              | _        t        j                  t        j                  |j                              | _        y )Nry   Fr*  rS   )rZ   r[   rN   r   scalar_root_sizer  ro   rm   r   r   r]   r0  projrs   r:   rt   scaleper_expert_scaler  s     r?   r[   zGemma4TextRouter.__init__
  s    !-- $ 0 0$ 6&&!$"2"2US	IIf00&2D2D5Q	\\%**T-=-=">?
 "UZZ8J8J-K Lr>   rE   rQ   c                 x   | j                  |      }|| j                  z  | j                  z  }| j                  |      }t        j
                  j                  |d      }t        j                  || j                  j                  d      \  }}||j                  dd      z  }|| j                  |   z  }|||fS )Nrw   r   )re  rn   Tr]  )r   rN  rL  rM  r   r   r   r:   topkrN   top_k_expertsrN  rO  )rb   rE   expert_scoresrouter_probabilitiesr7  r6  s         r?   rg   zGemma4TextRouter.forward  s    		-0%

2T5J5JJ		-0!}}44]4K &+ZZ kk''&
"{ 	**r4*@@ &(=(=k(JJ#]K??r>   )
r6   r7   r8   r.   r[   r:   ri   rG   rg   rj   rk   s   @r?   rJ  rJ  	  s>    
M/ 
M@U\\ @eELL%,,<V6W @r>   rJ  c                   0    e Zd Zdeez  def fdZ	 	 	 	 	 	 ddej                  dej                  de	ee
ej                  ej                  f   f   dz  dej                  d	ej                  dz  d
ej                  dz  dedz  dej                  fdZ xZS )Gemma4TextDecoderLayerrN   r   c                    t         |           || _        |j                  | _        || _        t        ||      | _        t        ||      | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        | j                  dt!        j"                  d             |j$                  | _        | j$                  rt&        |j(                     | _        t-        j.                  | j                  | j$                  d      | _        t-        j.                  | j$                  | j                  d      | _        t        | j                  |j                        | _        |j6                  | _        | j6                  rt9        |      | _        t=        |      | _        t        | j                  |j                        | _         t        | j                  |j                        | _!        t        | j                  |j                        | _"        y y )Nr  r  layer_scalarr+   FrS   )#rZ   r[   rN   r   r   r  r8  r  r  rm   r  r  r  r  r  r_   r:   rt   hidden_size_per_layer_inputr   ru  r  r   r]   per_layer_input_gateper_layer_projectionpost_per_layer_input_normenable_moe_blockrJ  routerr.  expertspost_feedforward_layernorm_1post_feedforward_layernorm_2pre_feedforward_layernorm_2r   s      r?   r[   zGemma4TextDecoderLayer.__init__.  s   !--",FiP 3,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'^UZZ];+1+M+M(++ !9!9:DK(*		$2B2BDDdDdkp(qD%(*		$2R2RTXTdTdkp(qD%-:4;K;KQWQdQd-eD* & 7 7  *62DK,V4DL0=d>N>NTZTgTg0hD-0=d>N>NTZTgTg0hD-/<T=M=MSYSfSf/gD, !r>   NrE   per_layer_inputr(  r   rJ   r   rD   rQ   c           
      &   |}	| j                  |      } | j                  d||||||d|\  }}
| j                  |      }|	|z   }|}	| j                  |      }| j	                  |      }| j
                  r| j                  |      }|	j                  d|	j                  d         }| j                  |      \  }
}}| j                  |      }| j                  |||      }|j                  |	j                        }| j                  |      }||z   }| j                  |      }|	|z   }| j                  rP|}	| j                  |      }| j!                  |      }||z  }| j#                  |      }| j%                  |      }|	|z   }|| j&                  z  }|S )N)rE   r   rJ   r(  r   rD   rw   r=   )r  r8  r  r  r  r]  r`  r   r   r^  rb  r_  ra  r  rY  rZ  r  r[  r\  rX  )rb   rE   rc  r(  r   rJ   r   rD   r=  r  r   hidden_states_1hidden_states_flatr7  r6  hidden_states_2s                   r?   rg   zGemma4TextDecoderLayer.forwardJ  s    !,,];)4>> 
' 3)-%+
 
q 55mD =0 66}E/  "??NO "*!1!1"hnnR6H!I,0KK8J,K)A}k">>?QRO"ll?KWO-55hnnEO"??PO ,o=M77F =0++$H 55mDM KK6M)O;M 55mDM ::=IM$}4M***r>   )NNNNNN)r6   r7   r8   r.   r/   rh   r[   r:   ri   r,  rG   r  r   rg   rj   rk   s   @r?   rV  rV  -  s    h/2DD hQT h> )-PT,0.204(,9||9 9 sE%,,*D$EEFM	9
 #\\9 t+9 &&-9 9 
9r>   rV  c            	       Z     e Zd ZdZd	dedededef fdZdej                  f fdZ	 xZ
S )
Gemma4TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    num_embeddingsembedding_dimpadding_idxembed_scalec                     t         |   |||       || _        | j                  dt	        j
                  |      d       y )Nrm  Fr   )rZ   r[   scalar_embed_scaler_   r:   r`   )rb   rj  rk  rl  rm  rc   s        r?   r[   z&Gemma4TextScaledWordEmbedding.__init__  s;    D"-]ELL,ERWXr>   	input_idsc                     t         |   |      | j                  j                  | j                  j
                        z  S re   )rZ   rg   rm  r   ru   r   )rb   rp  rc   s     r?   rg   z%Gemma4TextScaledWordEmbedding.forward  s2    wy)D,<,<,?,?@Q@Q,RRRr>   )r   )r6   r7   r8   r9   rh   ra   r[   r:   ri   rg   rj   rk   s   @r?   ri  ri    sG    Ys Y3 YS Y_d Y
S S Sr>   ri  c                   ~     e Zd ZU eed<   dZdZdZdZdZ	dZ
g dZddgZdZ ej                          fd       Z xZS )Gemma4PreTrainedModelrN   T)rV  r  r4  rD   r(  )imagetextvideoaudioc                 	   t         |   |       t        |t              r t	        j
                  |j                         y t        |t              rd}d}|j                  dz  }t        j                  ||z        t        |dz
  d      z  }|t        j                  t        j                  |      | z        z  }t	        j                  |j                   |j#                  d      j#                  d             y t        |t$              rJt	        j&                  |j(                  |j*                         t	        j,                  |j.                         y t        |t0              r|j2                  j5                         D ]  \  }}d|i}	|dk(  r|j6                  |   dk(  rd	|	d
<    ||j8                  fi |	\  }
}t	        j                  t;        || d      |
       t	        j                  t;        || d      |
        y t        |t<              r|j6                  dk7  rt>        |j6                     n|j@                  } ||j8                        \  }}t	        j                  |jB                  |       t	        j                  |jD                  |       y t        |tF              r+t	        j&                  |jH                  |jJ                         y t        |tL              r?t	        j
                  |jN                         t	        j
                  |jP                         y t        |tR              r[| j8                  jT                  }t	        jV                  |jX                  d|       t	        jV                  |jZ                  d|       y t        |t\              r t	        j
                  |j^                         y t        |t`              r|jb                  rt	        j&                  |jd                  tg        d              t	        j&                  |jh                  tg        d             t	        j&                  |jj                  tg        d              t	        j&                  |jl                  tg        d             y t        |tn              rV|j8                  jp                  r?t	        j,                  |jr                         t	        j
                  |jt                         y y y )Nr   r   r)   r+   r   r  r  r  r  r	  r
  r  r  rL  )r{   stdrV   );rZ   _init_weightsr  r@  initones_rE  r   r   r   r   r   r:   r   r   copy_r   r   r   	constant_r   r   zeros_r   r  r  itemsr~  rN   r  r{  r   r  r|  r  ri  rm  ro  rJ  rN  rO  r.  initializer_rangenormal_r5  rt  rV  rX  rM   r\   rU   ra   rW   rX   rY   Gemma4VisionModelstandardizestd_bias	std_scale)rb   r  r   r   r   r   r   r  r  r  r  r   rope_fnbuffer_valuery  rc   s                  r?   rz  z#Gemma4PreTrainedModel._init_weights  s   f%f78JJv667 @AM#M#//14N&*hh}}/L&MPSTbefTfhiPj&j#*UYYu||N7SWnVn7n-ooNJJv,,n.F.Fq.I.S.STU.VW 45NN6>>6+K+KLKK,,- 9:,2,@,@,F,F,H ^(
L'3Z&@#!11f6F6Fz6RVd6d:K'7#/#UAT#U q

76j\+CDmT

76j\9K+LM}]^  ;< ##y0 $F$4$45;; 
 &fmm4OL!JJv5JJv//> =>NN6--v/H/HI 01JJv||$JJv../ 12++//CLL,,3C@LL))= 67JJv**+ 566;U;UNN6++eEl];NN6++U5\:NN6,,uU|m<NN6,,eEl; 12v}}7P7PKK(JJv''( 8Q2r>   )r6   r7   r8   r-   r<   supports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backend_no_split_modules_skip_keys_device_placementinput_modalitiesr:   r   rz  rj   rk   s   @r?   rs  rs    s^    &*#N!"&b#46H"I:U]]_2) 2)r>   rs  zAThe base Gemma 4 language model without a language modeling head.c                       e Zd ZU eed<   dZ eed      ee	dZ
def fdZeee	 	 	 	 	 	 	 ddej                   dz  d	ej"                  dz  d
ej                   dz  dedz  dej&                  dz  dej"                  dz  dedz  dee   defd                     Zdej"                  dz  dej"                  dz  dej"                  fdZ	 ddej"                  dej"                  dz  dej"                  fdZ xZS )Gemma4TextModelrN   )ru  r   )r#  )router_logitsrE   rF   c           
         t         |   |       |j                  | _        |j                  | _        t        |j                  |j                  | j                  | j                  j                  dz        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                         | _        t%        |      | _        d| _        t+        | j                  j,                        | _        |j0                  | _        | j0                  rt        |j2                  |j                  |j0                  z  | j                  |j0                  dz        | _        d| _        t        j8                  |j                  |j                  |j0                  z  d      | _        |j                  dz  | _        t        |j0                  |j                         | _        g | _         tC        | j                        D ]K  \  }}|jD                  jF                  s| j@                  jI                  dD cg c]
  }d	| d
|  c}       M | jK                          y c c}w c c}w )NrT  )rm  r  Fg;f?rS   ry   )r   r   r  r  zlayers.z.self_attn.)&rZ   r[   pad_token_idrl  
vocab_sizeri  r   rN   embed_tokensr   r  r  r  rV  r  rm   r  r   r  r  gradient_checkpointingr  r  unique_layer_typesrY  vocab_size_per_layer_inputembed_tokens_per_layerper_layer_input_scaler]   per_layer_model_projection per_layer_model_projection_scaleper_layer_projection_norm"_keys_to_ignore_on_load_unexpected	enumerater8  r  extend	post_init)rb   rN   r   r  layernamerc   s         r?   r[   zGemma4TextModel.__init__  s!    !.. ++ :v1143C3CQUQ\Q\QhQhjmQm
 mmHMfNfNfHgh9#FI6h
 "&"4"4&:M:MN	3F;&+#"%dkk&=&=">+1+M+M(++*G11((6+M+MM  ">>C	+D' *3D&.0ii""((6+M+MM/D+
 5;4F4F4LD1-:6;];]cicvcv-wD* 35/!$++. 	HAu1177>>@hiwqcTF3i	 	C i: js   "JJ

Nrp  rJ   r   rD   r  per_layer_inputs	use_cacher=  rQ   c           
         |du |duz  rt        d      || j                  |      }| j                  r&|| j                  ||      }| j	                  ||      }|r|t        | j                        }|V||j                         nd}	t        j                  |j                  d   |j                        |	z   }|j                  d      }t        |x}
t              s)| j                  ||||d}t        di |t!        di |d}
|}i }| j"                  D ]  }| j%                  |||      ||<    i }t'        | j(                  d| j                  j*                         D ]\  \  }}||dddd|ddf   nd} |||f||| j                  j,                  |      |
| j                  j,                  |      ||d	|}^ | j/                  |      }t1        ||
      S )uq  
        per_layer_inputs (`torch.Tensor` of shape `(batch_size, sequence_length, num_hidden_layers, hidden_size_per_layer_input)`, *optional*):
            Pre-computed per-layer input embeddings. When provided, these are used directly instead of being
            computed from `input_ids` via `get_per_layer_inputs()`. This is primarily used by the multimodal
            model (`Gemma4Model`) which pre-computes per-layer inputs from the original `input_ids` *before*
            merging multimodal soft tokens into `inputs_embeds` — at which point the original token ids are
            no longer recoverable.
        N:You must specify exactly one of input_ids or inputs_embedsrN   r   r+   r   rN   r  rJ   rD   r   r  r  )r(  r   rJ   r   rD   )r  rD   r=   )r`  r  rY  get_per_layer_inputsproject_per_layer_inputsr   rN   get_seq_lengthr:   r   r   r   r   r  r,  r   r   r  r  r  r  r  r  r   r   )rb   rp  rJ   r   rD   r  r  r  r=  past_seen_tokenscausal_mask_mappingmask_kwargsrE   r   r  r(  r  r  rc  s                      r?   rg   zGemma4TextModel.forward  s3   , -t";<YZZ  --i8M++'#'#<#<Y#V #<<]L\]0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-F ++!."0#2 ,K #5"C{"C%F%U%U# & 11 	gJ.2oom\[e.f
+	g  !*$++6U8U8U*V W 	A}>N>Z.q!Qz:`dO)	 "2$78O8OPQ8R$S24;;3J3J13MN) /	 	M	 		-0&++
 	
r>   c                    | j                   st        d| j                         |t        j                         5  |d d d d d d d f   | j
                  j                  d d d d d d f   | j                  j                  dz  z  k(  j                  d      j                         d d df   }	 |j                  |j                  d d       }	 d d d         | j                  |      j                  g |j                  | j                  j                  | j                    S # t        $ r t        d      w xY w# 1 sw Y   nxY w)Nz}Attempting to call get_per_layer_inputs() from a model initialized with a config that does not support per-layer embeddings. rT  r	   r   r)   a)  It seems like you tried to call `forward` from `inputs_embeds` without providing `input_ids`, and that the `inputs_embeds` you provided do not exactly match the embedding weights. Since Gemma4 needs to reverse the embedding to compute another embedding, make sure you provide exact `inputs_embeds`)rY  RuntimeErrorrN   r:   r   r  ru   r   rc  r<  r   r   r  r   r  )rb   rp  r  s      r?   r  z$Gemma4TextModel.get_per_layer_inputsh  sS   //**.++8    &aD!m4,,33D$14DEH_H_adHdde SQSZWYq!t%  )}/B/B2A/F GI$ >t**95== 
__
KK))
 ,,
 	
 $ &r  s   A1D9-D!!D66D99Ec                 V   | j                   st        d| j                         | j                  |      | j                  z  } |j
                  g |j                  d d | j                  j                  | j                    }| j                  |      }||S ||z   | j                  z  S )NzAttempting to call project_per_layer_inputs() from a model initialized with a config that does not support per-layer embeddings. rw   )
rY  r  rN   r  r  r   r   r  r  r  )rb   r  r  r[  s       r?   r  z(Gemma4TextModel.project_per_layer_inputs  s    
 //226++@ 
  $>>}MPTPuPuu;3;;  
  "% 
KK)) 
 ,, 

  $==>RS#''$'774;U;UUUr>   )NNNNNNNre   )r6   r7   r8   r.   r<   r  r'   rJ  rV  r  _can_record_outputsr[   r&   r(   r"   r:   r  ri   r   r;   r   r   r!   r   rg   r  r  rj   rk   s   @r?   r  r    s    '(8B/)+/ +Z   .2.204(,2604!%S
##d*S
 t+S
 &&-	S

 S
 ((4/S
  ,,-S
 $;S
 +,S
 
!S
    S
j 
ellT.A  
RWR^R^aeRe  
jojvjv  
J 15V||V  ,,-V 
	Vr>   r  z>The base Gemma 4 language model with a language modeling head.c                   X    e Zd ZU ddiZddiZddgdgfiZeed<   dZdef fd	Z	e
e	 	 	 	 	 	 	 	 ddej                  d
z  dej                  d
z  dej                  d
z  ded
z  dej                   d
z  dej                  d
z  ded
z  deej                  z  dee   defd              Z xZS )Gemma4ForCausalLMlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputrE   rC   rN   modelc                 J   t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                  j                  D cg c]  }d| 	 c}| _	        | j                          y c c}w NFrS   zmodel.)rZ   r[   r  r  r  r   r]   r   r  r  r  rb   rN   r  rc   s      r?   r[   zGemma4ForCausalLM.__init__  s     $V,
 ++yy!3!3V5F5FUS )-

(U(U3
 $fTFO3
/
 	3
s   ;B Nrp  rJ   r   rD   r  labelsr  logits_to_keepr=  rQ   c	           
          | j                   d||||||d|	}
|
j                  }t        |t              rt	        | d      n|}| j                  |dd|ddf         }| j                  j                  G|| j                  j                  z  }t        j                  |      }|| j                  j                  z  }d}| | j                  ||| j                  fi |	}t        |||
j                  |
j                  |
j                        S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma4ForCausalLM

        >>> model = Gemma4ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```)rp  rJ   r   rD   r  r  N)rB   rC   rD   rE   rF   r=   )r  r  r  rh   slicer  rN   final_logit_softcappingr:   r   loss_functionr  r   rD   rE   rF   )rb   rp  rJ   r   rD   r  r  r  r  r=  outputsrE   slice_indicesrC   rB   s                  r?   rg   zGemma4ForCausalLM.forward  s   @ ,64:: ,
)%+',
 ,
  118B>SV8W~ot4]kmA}a,?@A;;..:dkkAAAFZZ'FdkkAAAF%4%%ffdooPPD%#33!//))
 	
r>   )NNNNNNNr   )r6   r7   r8   _tied_weights_keys_tp_plan_pp_planr.   r<   base_model_prefixr[   r#   r"   r:   r  ri   r   r;   r   rh   r   r!   r   rg   rj   rk   s   @r?   r  r    s+   *,GH23H_-z:;H/   .2.204(,26*.!%-.;
##d*;
 t+;
 &&-	;

 ;
 ((4/;
   4';
 $;;
 ell*;
 +,;
 
 ;
  ;
r>   r  r  c           
      P     dt         dt         dt         dt         dt        f
 fd}|S )zL
    This creates uni/bidirectional attention mask with sliding window.
    	batch_idxhead_idxq_idxkv_idxrQ   c                 P    	\  }}||z
  }|dk\  ||k  z  }|dk  | |k  z  }||z  S r%  r=   )
r  r  r  r  left_window_sizeright_window_sizedist	left_mask
right_maskr  s
            r?   
inner_maskz0sliding_window_mask_function.<locals>.inner_mask  sM    .<++v~QY4*:#:;	QhD5+<#<=
:%%r>   rh   r   )r  r  s   ` r?   sliding_window_mask_functionr    s3    
&c &S & &c &d & r>   c                   ,    e Zd ZU dZeed<   dZdZee	dZ
def fdZdej                  dej                  fd	Zee ed
      	 ddej                  dej                  dz  dee   deej                  ej*                  f   fd                     Z xZS )Gemma4AudioModelznAn audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture.rN   r	  zmodel.audio_towerrE   rF   c           	         t         |   |       || _        t        |      | _        t        |      | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                  |j                  |j                  d      | _        | j#                          y c c}w )NTrS   )rZ   r[   rN   r  subsample_conv_projectionr   rel_pos_encr   r  r  r  r4  r  r]   r   output_proj_dimsoutput_projr  r   s      r?   r[   zGemma4AudioModel.__init__  s     )KF)S&;FCmmBGH`H`BabYfi0b
 99V%7%79P9PW[\	 cs   B?mask_4drQ   c                    |j                   \  }}}}|j                  }| j                  j                  }| j                  j                  dz
  }| j                  j
                  }||z   dz
  |z  }	|	|z  }
|
|z
  }t        j                  |d|d|fd      }|j                  |d|	||
      }t        j                  |||fd      }t        j                  |	|      |z  }t        j                  ||z   |z   |      }|dddf   |dddf   z   }|dddddddf   j                  |dd|d      }|j                  d|      S )z
        Convert a standard 4D attention mask `[batch_size, 1, seq_len, seq_len]` to the 5D blocked format
        `[batch_size, 1, num_blocks, chunk_size, context_size]` expected by the chunked local attention,
        r+   r   F)r  r   Nrw   )r   r   rN   r   r   r   r   r   r   r:   r   r  gather)rb   r  r   r   r   r   r   r   r   r   padded_seq_len
pad_amountmask_5dblock_startsoffsets
kv_indicess                   r?   _convert_4d_mask_to_blocked_5dz/Gemma4AudioModel._convert_4d_mask_to_blocked_5d   sN   
 %,MM!
Aw[[55
;;==A![[@@
*Q.:=
#j0#g-
%%!ZJ!?uM//*aZX%%"24F!GuU||Jv>K,,z,<<?QQZ`a!!T'*WT1W-==
dAtQ 67>>z1bR\^`a
~~b*--r>   z&Encodes audio features to soft tokens.r0   NrJ   r=  c           	         | j                  ||      \  }}| j                  |      }t        | j                  ||t	        | j                  j
                  dz
  | j                  j                  f            }| j                  |      }| j                  d | j                  j                   D ]  } ||f||d|} | j                  |      }t        ||      S )Nr+   )rN   r  rJ   and_mask_function)rJ   r   )r  rJ   )r  r  r   rN   r  r   r   r  r  r  r  rI   )rb   r	  rJ   r=  rE   output_maskr   encoder_layers           r?   rg   zGemma4AudioModel.forward;  s     &*%C%CNTb%c"{"..}=2;;'&:33a79\9\]	
 <<^L![[)H4;;+H+HI 	M)-$7 	M	 ((7%Vabbr>   re   )r6   r7   r8   r9   r,   r<   main_input_namer  r4  r   r  r[   r:   ri   r  r&   r(   r"   r   r!   rG   rK   rg   rj   rk   s   @r?   r  r    s    x&O+)*
0 .ell .u|| .6  !IJ /3cc t+c +,	c
 
u||U---	.c K   cr>   r  c                        e Zd ZdZeZeedZdef fdZ	e
e ed      dej                  dej                  d	ee   d
efd                     Z xZS )r  zThe Gemma 4 Vision Encoder.r  rN   c                    t         |   |       t        |      | _        t	        |      | _        t        |      | _        | j                  j                  rr| j                  dt        j                  | j                  j                               | j                  dt        j                  | j                  j                               | j                          y )Nr  r  )rZ   r[   r@  patch_embedderr  encoderrV  poolerrN   r  r_   r:   r4  r   r  r  s     r?   r[   zGemma4VisionModel.__init__f  s     7?*62(0;;""  U[[9P9P-QR  ekk$++:Q:Q.RSr>   z1Encodes image pixels to soft tokens from patches.r0   rR  rF  r=  rQ   c                    | j                   j                  }|j                  d   ||z  z  }|dk(  j                  d      }| j	                  |||      } | j
                  d|| |d|}| j                  |j                  |||      \  }	}
|	|
   }	| j                   j                  r|	| j                  z
  | j                  z  }	t        |	      S )a  
        pixel_values (`torch.FloatTensor` or `list[torch.FloatTensor]`):
            The images to encode. Either a single `[batch, channels, height, width]` tensor
            (all images same size) or a list of `[1, channels, height, width]` tensors (different sizes).
        pixel_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`):
            The patch positions as (x, y) coordinates in the image. Padding patches are indicated by (-1, -1).
        r9  rw   r   )r  rJ   rF  )rE   rF  rG  rl  r  r=   )rN   pooling_kernel_sizer   rc  r  r  r  r  r  r  r  r   )rb   rR  rF  r=  r  rl  rG  r  rj  rE   pooler_masks              r?   rg   zGemma4VisionModel.forwardr  s      #kk==$**2.3FI\3\]/25::r:B++L:LN_` 
'--1
 	
 &*[[ 221/'	 &1 &
"{ &k2;;""*T]]:dnnLM&GGr>   )r6   r7   r8   r9   r/   rN   r  r  r  r[   r&   r(   r"   r:   r;   r  r   r!   r   rg   rj   rk   s   @r?   r  r  ]  s    %F1+

1 
  !TU&H''&H ",,&H +,	&H
 
!&H V   &Hr>   r  c                   j     e Zd ZdZdeez  def fdZdej                  dej                  fdZ
 xZS )Gemma4MultimodalEmbedderzQEmbeds token ids or soft tokens for multimodal content into language model space.multimodal_configtext_configc                 N   t         |           t        |d|j                        | _        |j
                  | _        |j                  | _        t        j                  | j                  | j                  d      | _
        t        | j                  | j                  d      | _        y )Nr  FrS   r*  )rZ   r[   r  r   multimodal_hidden_sizer  ro   text_hidden_sizer   r]   embedding_projectionrm   embedding_pre_projection_norm)rb   r  r  rc   s      r?   r[   z!Gemma4MultimodalEmbedder.__init__  s    
 	&-.?ASUfUrUr&s#$11 + 7 7$&IId.I.I4K`K`gl$m!-:4;V;V\`\d\dqv-w*r>   r  rQ   c                 F    | j                  |      }| j                  |      S )a:  Embeds token ids or soft tokens for multimodal content into language model space.
        Args:
            inputs_embeds: A torch.Tensor containing the soft tokens to embed.
        Returns:
            A torch.Tensor of embeddings with shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
        )r  r  )rb   r  embs_normeds      r?   rg   z Gemma4MultimodalEmbedder.forward  s%     88G((55r>   )r6   r7   r8   r9   r,   r/   r.   r[   r:   ri   rg   rj   rk   s   @r?   r  r    sA    [x,/AAx &x6U\\ 6ell 6r>   r  token_type_idsimage_group_idsc           
      V    | ydt         dt         dt         dt         dt        f
fd}|S )z
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    Nr  r  r  r  rQ   c                    	j                   d   }|j                  |dz
        }|j                  |dz
        }	| |f   }	| |f   }t        j                  ||k  |d      }t        j                  ||k  |d      }||k(  |dk\  z  S )Nrw   r+   )r   r   )r   rf   r:   rO  )
r  r  r  r  r   q_idx_clampedkv_idx_clampedq_groupkv_groupr  s
            r?   r  z0token_type_ids_mask_function.<locals>.inner_mask  s    $**2.
 
Q7*q.9 ")]":;"9n#<=++ej0'2>;;v
2HbA8#155r>   r  )r  r  r  s    ` r?   token_type_ids_mask_functionr    s>     6c 6S 6 6c 6d 6 r>   rN   r  rD   mm_token_type_idsrR  is_trainingis_first_iterationc	                    |r|t        d      | j                         ||||d}
|
j                         }||n|du xs |j                   xs |du}||r|dk(  |dk(  z  }t	        j
                  |dd      }d|d	<   || z  }t	        j                  |j                         d
      dz
  }t	        j                  ||d      }t        |j                  |j                        |      |d<   t        di |
t        di |dS )a  
    Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
    for all kinds of forward passes. Gemma4 uses a bidirectional mask for images.

    Uses `pixel_values` as an optional input to disambiguate edge cases.
    Nz>`mm_token_type_ids` is required as a model input when trainingr  r+   r)   rw   )shiftsdimsFr\  r   or_mask_functionr  r=   )r`  get_text_configcopyis_initializedr:   rollcumsumrh   rO  r  r   r   r   r   )rN   r  rJ   rD   r   r  rR  r  r  r=  r  sliding_mask_kwargs	is_visionis_prev_visionnew_vision_startsvision_group_idss                   r?   create_causal_mask_mappingr    sH   $ (0YZZ ((*&(*$K &**, ) 	%g_-K-K)Kg|cgOg 
 $); '!+0AQ0FG	IabA!&v%7 <<(9(=(=(?QG!K ;;y2BBG2N  !5!568H3
./
 -;{;>UATU r>   z
    The base Gemma 4 model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    c            "           e Zd ZdZdef fdZd Zd Ze e	d      	 dd	e
j                  d
e
j                  dz  dee   defd              Z	 	 d de
j                  dz  de
j                  dz  dee
j$                  e
j$                  e
j$                  f   fdZeee		 	 	 	 	 	 	 	 	 	 	 	 	 d!de
j                  dz  d	e
j                  dz  de
j                  dz  de
j                  dz  de
j*                  dz  de
j*                  dz  de
j                  dz  dedz  de
j                  dz  de
j                  dz  dedz  d
e
j                  dz  de
j                  dz  dee   defd                     Ze e	d      de
j*                  de
j*                  dee   deez  fd              Ze e	d      	 dde
j                  de
j                  dz  dee   defd              Z xZS )"Gemma4ModelFrN   c                    t         |   |       |j                  j                  | _        t	        j
                  |j                        }|| _        |j                  j                  | _        |j                  t	        j
                  |j                        nd | _	        |j                   t        |j                  |j                        nd | _        |j                  t	        j
                  |j                        nd | _        |j                   t        |j                  |j                        nd | _        | j                  j                  D cg c]  }d| 	 c}| _        | j!                          y c c}w )Nr  zlanguage_model.)rZ   r[   r  r  r*   from_configlanguage_modelr  vision_configvision_towerr  embed_visionaudio_configaudio_towerembed_audior  r  )rb   rN   r$  r  rc   s       r?   r[   zGemma4Model.__init__  sG     ,,77"..f6H6HI,*0*<*<*W*W'KQK_K_KkI11&2F2FGqu ##/ %V%9%96;M;MN 	
 JPI\I\Ih9001D1DEnr "". %V%8%8&:L:LM 	 261D1D1g1g3
)-odV$3
/ 	3
s   E5c                 6    | j                   j                         S re   )r$  get_input_embeddingsrb   s    r?   r,  z Gemma4Model.get_input_embeddings7  s    ""7799r>   c                 :    | j                   j                  |       y re   )r$  set_input_embeddingsrb   r  s     r?   r/  z Gemma4Model.set_input_embeddings:  s    007r>   zOProjects the last hidden state from the vision model into language model space.r0   NrR  image_position_idsr=  rQ   c                 v     | j                   d||d|}|j                  }| j                  |      |_        |S )z
        image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
            The patch positions as (x, y) coordinates in the image. Padding patches are indicated by (-1, -1).
        rR  rF  r  r=   )r&  r  r'  pooler_output)rb   rR  r1  r=  vision_outputsr  s         r?   get_image_featureszGemma4Model.get_image_features=  sV     +** 
%1
 

 +<<'+'8'8GX'8'Y$r>   rp  r  c                 &   |M|| j                   j                  k(  }|| j                   j                  k(  }|| j                   j                  k(  }n>| | j	                         t        j                  | j                   j                  t
        j                  |j                              k(  j                  d      }| | j	                         t        j                  | j                   j                  t
        j                  |j                              k(  j                  d      }| | j	                         t        j                  | j                   j                  t
        j                  |j                              k(  j                  d      }|||fS )a  
        Obtains mask for multimodal placeholders (replaced by soft tokens) and hard text tokens.

        Masks will be obtained from `mm_token_type_ids`, `input_ids`, or `inputs_embeds` as available and in that
        precedence order. If passing `input_ids` or `inputs_embeds`, the image mask will be derived using
        `config.image_token_id`. Same goes for audio and video masks

        Args:
            input_ids: A tensor containing the hard token IDs from the text tokenizer.
            inputs_embeds: A tensor containing the embeddings for all hard text tokens.

        Returns:
            image_mask, video_mask, audio_mask
        )r   r   rw   )
rN   image_token_idvideo_token_idaudio_token_idr,  r:   r`   rb  r   rc  )rb   rp  r  special_image_maskspecial_video_maskspecial_audio_masks         r?   get_placeholder_maskz Gemma4Model.get_placeholder_maskR  sP   &  !*dkk.H.H!H!*dkk.H.H!H!*dkk.H.H!H .4,,.LL!;!;5::VcVjVjk c"g  .4,,.LL!;!;5::VcVjVjk c"g  .4,,.LL!;!;5::VcVjVjk c"g  "#57IIIr>   pixel_values_videosr	  rJ   r
  r   rD   r  r  video_position_idsc                 H   |du |
duz  rt        d      | j                  ||
      \  }}}||z  |z  }d}|
I|j                         }| j                  j                  j
                  ||<    | j                         |      }
| j                  j                         j                  r| j                  j                  j                  | j                  j                  j
                  ddf   }t        j                  |d   |j                  ddd      |
      }| j                  j                  ||      }nd}|| j!                  ||d      j"                  }|j%                  |
j&                  |
j(                        }|j+                         }|j-                  d      j/                  |
      j%                  |
j&                        }t1        |
|   j3                         |j3                         k(  d| d	|j4                  d
           |
j7                  |j%                  |
j&                        |j%                  |
j&                              }
|| j9                  ||d      j"                  }|j%                  |
j&                  |
j(                        }|j+                         }|j-                  d      j/                  |
      j%                  |
j&                        }t1        |
|   j3                         |j3                         k(  d| d	|j4                  d
           |
j7                  |j%                  |
j&                        |j%                  |
j&                              }
||| j;                  ||d      }|j"                  }|j<                  }||   }|j+                         }|j-                  d      j/                  |
      j%                  |
j&                        }t1        |
|   j3                         |j3                         k(  d| d	|j4                  d
   |j4                  d   z          |
j7                  |j%                  |
j&                        |j%                  |
j&                              }
|V||j?                         nd
}t        j@                  |
j4                  d   |
j&                        |z   }|j-                  d
      }tC        |x} tD              sh| j                  j                         jF                  dk(  r(tI        | j                  |
||||	|| jJ                        } ntM        | j                  |
|||      }  | j                  d|| |||
|dd|}!tO        |!jP                  |!jR                  |!jT                  |!jV                  |nd|      S d      S )  
        input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
            The attention mask for the input audio.
        image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
            2D patch position coordinates from the image processor, with `(-1, -1)` indicating padding.
            Passed through to the vision encoder for positional embedding computation.
        video_position_ids (`torch.LongTensor` of shape `(num_videos, num_frames, max_patches, 2)`, *optional*):
            2D patch position coordinates from the video processor, with `(-1, -1)` indicating padding.
            Passed through to the vision encoder for positional embedding computation.
        Nr  r   r+   rw   T)return_dictz6Image features and image tokens do not match, tokens: z, features: r   z6Video features and video tokens do not match, tokens: z6Audio features and audio tokens do not match, tokens: r   vision)r  )r  rJ   r   rD   r  r  rD  )r  rD   rE   rF   r4   r5   r=   ),r`  r?  r  rN   r  r  r,  r  rY  r$  r  ru   r:   rO  r   r  r7  r5  r   r   r   rN  r   	expand_asr$   numelr   masked_scatterget_video_featuresget_audio_featuresrJ   r  r   r  r,  r!  r  r  r   r3   r  rD   rE   rF   )"rb   rp  rR  r@  r	  rJ   r
  r   rD   r  r  r  r1  rA  r=  
image_mask
video_mask
audio_maskmultimodal_maskllm_input_idspad_embeddingllm_inputs_embedsr  image_featuresn_image_tokensvideo_featuresn_video_tokensaudio_outputaudio_featuresaudio_mask_from_encodern_audio_tokensr  r  r  s"                                     r?   rg   zGemma4Model.forward  s   < -t";<YZZ-1-F-FyR_-`*
J
$z1J>  %OO-M-1[[-D-D-Q-QM/*7D557FM;;&&(DD //<<CCDKKD[D[DhDhjkDklM %OI,FHZHZ[\^_acHdfs t#22GGWhi# #!44\CUcg4hvvN+..}/C/C]EXEXYN (^^-N#--b1;;MJMMmNbNbcJ"j)//1^5I5I5KKHHX Y"((+,. *88m223^5F5F}G[G[5\M *!44#%7T 5 m  ,..}/C/C]EXEXYN (^^-N#--b1;;MJMMmNbNbcJ"j)//1^5I5I5KKHHX Y"((+,. *88m223^5F5F}G[G[5\M
 %*=*I22>CVdh2iL)77N&2&A&A#
 ,,CDN'^^-N#--b1;;MJMMmNbNbcJ"j)//1^5I5I5KKHHX Y"((+n.B.B1.EEFH *88m223^5F5F}G[G[5\M
 CRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L?-F{{**,HHHT&@KK!"# %  $	'# '@KK!"# '# &$%% 	
-.%+'	
 	
 )%77#33!//))2>2JPT2@2L
 	
 SW
 	
r>   zPProjects the last hidden state from the audio encoder into language model space.c                     | j                   t        d       | j                   ||fddi|}| j                  |j                        |_        |S )a0  
        input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`):
            The tensors corresponding to the input audio.
        input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
            The attention mask for the input audio.
        zAudio features were requested, but the model was initialized without an audio_config. Cannot process audio without an audio tower and audio embedder.rD  Tr4  )r)  r`  r*  r  r5  )rb   r	  r
  r=  audio_outputss        r?   rJ  zGemma4Model.get_audio_features	  sh     #R 
 )((9LiZ^ibhi&*&6&6]EdEd&6&e#r>   zQProjects the last hidden state from the vision encoder into language model space.c                     |j                  dd      }|j                  dd      } | j                  d||d|}|j                  }| j                  |      |_        |S )a9  
        video_position_ids (`torch.LongTensor` of shape `(num_videos, num_frames, max_patches, 2)`, *optional*):
            2D patch position coordinates from the video processor, with `(-1, -1)` indicating padding.
            Passed through to the vision encoder for positional embedding computation.
        r   r+   r3  r4  r=   )flattenr&  r  r'  r5  )rb   r@  rA  r=  r6  r  s         r?   rI  zGemma4Model.get_video_features7	  s|     299!Q?/771=*** 
,1
 

 +<<'+'8'8GX'8'Y$r>   re   r  )NNNNNNNNNNNNN)r6   r7   r8   accepts_loss_kwargsr-   r[   r,  r/  r#   r"   r:   r;   r  r   r!   r   r7  rG   rK   r?  r&   ri   r   r   r3   rg   rI   rJ  rI  rj   rk   s   @r?   r!  r!    s      | 2:8 !rs 7;'' ",,t3 +,	
 
$ t * .226+J##d*+J ((4/+J 
u!1!153C3CC	D	+JZ   .2158<37.23704(,5926!%6:6:Z
##d*Z
 ''$.Z
 #..5	Z

 ))D0Z
 t+Z
 #\\D0Z
 &&-Z
 Z
 !++d2Z
 ((4/Z
 $;Z
 ",,t3Z
 ",,t3Z
 +,Z
  
#!Z
    Z
x !st #\\ +,	
 
'	' u . !tu 7;".. ",,t3 +,	
 
$ v r>   r!  z
    The base Gemma 4 model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    c            %       B    e Zd ZddiZdZdef fdZd Zd Ze		 dd	e
j                  d
e
j                  dz  dee   fd       Zee		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d de
j                  dz  d	e
j                  dz  de
j                  dz  de
j                  dz  de
j"                  dz  de
j"                  dz  de
j                  dz  d
e
j                  dz  de
j                  dz  dedz  de
j                  dz  de
j                  dz  de
j                  dz  dedz  dee
j"                  z  dee   def"d              Z	 	 	 	 	 	 	 	 	 	 	 	 	 d! fd	Ze	 	 d"dede
j"                  de
j"                  dz  dedz  de
j"                  dz  de
j"                  dz  dedz  defd       Z xZS )#Gemma4ForConditionalGenerationr  z(model.language_model.embed_tokens.weightr  rN   c                 P   t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                  j                  D cg c]  }d| 	 c}| _
        | j                          y c c}w r  )rZ   r[   r!  r  r   r]   r  r   r  r  r  r  r  s      r?   r[   z'Gemma4ForConditionalGeneration.__init__Z	  s      (
yy!3!3!?!?ASASA^A^ejk )-

(U(U3
 $fTFO3
/ 	3
s   >B#c                 6    | j                   j                         S re   )r  r,  r-  s    r?   r,  z3Gemma4ForConditionalGeneration.get_input_embeddingsd	  s    zz..00r>   c                 :    | j                   j                  |       y re   )r  r/  r0  s     r?   r/  z3Gemma4ForConditionalGeneration.set_input_embeddingsg	  s    

''.r>   NrR  r1  r=  c                 >     | j                   j                  ||fi |S )a-  
        image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
            2D patch position coordinates from the image processor, with `(-1, -1)` indicating padding.
            Passed through to the vision encoder for positional embedding computation.
        )r  r7  )rb   rR  r1  r=  s       r?   r7  z1Gemma4ForConditionalGeneration.get_image_featuresj	  s$     -tzz,,\;MXQWXXr>   rp  r@  r	  rJ   r
  r   rA  rD   r  r  r  r  r  rQ   c                     | j                   d	||||||||
||||||	dd|}|j                  }t        |t              rt	        | d      n|}| j                  |dd|ddf         }| j                  j                         j                  x}||z  }t        j                  |      }||z  }d}|S|j                         }|dddddf   }|dddf   }||dd|j                  d    df   j                  |j                        }||j                  |j                        dk7     j                         }||j                  |j                        dk7     j                         }n |j                         }|j                         }t!        j"                         }|j%                  d| j                  j                         j&                        }|j%                  d      j                  |j                        } |||      }t)        |||j*                  |j,                  |j.                  |j0                  |j2                        S )
rC  T)rp  rR  r@  r	  rJ   r
  r   rD   r  r  r  r  r1  rA  rD  N.rw   r+   r   )rB   rC   rD   rE   rF   r4   r5   r=   )r  r  r  rh   r  r  rN   r  r  r:   r   ra   r   r   r   r   r   CrossEntropyLossr   r  rA   rD   rE   rF   r4   r5   )rb   rp  rR  r@  r	  rJ   r
  r   r1  rA  rD   r  r  r  r  r  r=  r  rE   r  rC   r  rB   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelss                                r?   rg   z&Gemma4ForConditionalGeneration.forwardx	  sY   > $** 
% 3)) 3%+/'11
  !
&  118B>SV8W~ot4]kmA}a,?@A'+{{'B'B'D'\'\\#i55FZZ'F55F\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0K0K0M0X0XYK&++B/22<3F3FGKK5D+#33!//)) ' ; ; ' ; ;
 	
r>   c                 j    t        |   |f|||||||
|d|}|s|s||d<   ||d<   ||d<   |	|d<   |S )N)rD   r  rJ   r   r  r  r  r  rR  r@  r	  r
  )rZ   prepare_inputs_for_generation)rb   rp  rD   r  r   rR  r@  r	  rJ   r
  r  r  r  r  r  r=  model_inputsrc   s                    r?   rn  z<Gemma4ForConditionalGeneration.prepare_inputs_for_generation	  sv    & w<
+')%))1
 
 Y+7L(2EL./-;L)*2EL./r>   r  c           
          t        | j                         dd       dk(  r;t        | |||||fd|i|j                         D 	ci c]  \  }}	|dk7  s||	 c}	}S t	        | ||||fi |S c c}	}w )Nr!  rE  r  rR  )r  r  r  r  r   )
rN   r  rJ   rD   r   r  r  r=  re  vs
             r?   r   z8Gemma4ForConditionalGeneration.create_masks_for_generate	  s     6))+-JDQU]]-!	 $6	 %+LLNJDAqa>6I1a4J	 	 -~X^ 	 Ks   A*A*re   )NNNNNNNNNNNNNNr   )NNNNNNNNNTNNF)NF)r6   r7   r8   r  r  r-   r[   r,  r/  r"   r:   r;   r  r   r!   r7  r#   ri   r   r   rh   rA   rg   rn  r  r   r,  r   rj   rk   s   @r?   r`  r`  P	  s    +,VW| 1/  7;Y''Y ",,t3Y +,	Y Y  .2158<37.237046:6:(,5926*.!%-.!W
##d*W
 ''$.W
 #..5	W

 ))D0W
 t+W
 #\\D0W
 &&-W
 ",,t3W
 ",,t3W
 W
 !++d2W
 ((4/W
   4'W
 $;W
  ell*!W
" +,#W
$ 
&%W
  W
x    'R  26*/ || t+ 	
 llT) !<<$. !4K 
 r>   r`  )r  r  r`  r!  rs  r  r  )r+   )rL  NN)r)   )NNFN){r   collections.abcr   dataclassesr   	functoolsr   typingr   r:   r   torch.nnr   r    r
   r{  activationsr   cache_utilsr   r   configuration_utilsr   
generationr   integrationsr   r   masking_utilsr   r   r   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr    r!   r"   r#   r$   utils.genericr%   r&   utils.output_capturingr'   r(   auto.modeling_autor*   configuration_gemma4r,   r-   r.   r/   r3   rA   rI   ModulerM   rm   r   r   r   r  r  Conv1dr  r'  r4  r@  rV  ro  r{  r  ri   rh   r  r  ra   rG   r  r  r  r  r  r  r  r  r.  rJ  rV  	Embeddingri  rs  r  r  r  r  r  r  r  r;   r   r,  r  r!  r`  __all__r=   r>   r?   <module>r     s  *  $ ! %    $ & ! . 3 ) K  C 9 k k K F & n n G E * g g 
9 7 9 9( 
9; 9 9: 37 3  3BII :4BII 4*7ryy 7>i)299 i)X#bii #8; ;< RYY  H"bii "B&RYY &R0ryy 0l3		 3D80 80vbii  L")) L^(.ELL .u|| .%,, ._b .,	UU\\ 	U# 	U%,, 	U$   %II%<<% 
% <<	%
 LL4'% S[% T\% T\% 5<<%&%N 5&||5&	5& 
5& ,,	5&
 5& \\5&p )*B)BII B) +B)J)9 )X)H")) )HXBII &W<		 W<t )*x)")) x) +x)v $#		 $# $#N!@ryy !@HV7 VrSBLL S"?)O ?)D `aFV+ FV bFVR ]^Q
- Q
 _Q
hsCx X  Rc, Rcj>H- >HB6ryy 68LL4'\\D( _H .2-1&*99<<9 LL4'9 T\	9
 ,,%9 ||d*9 ##d*9 9 t9 
9x s' ssl	 A%:O AAHr>   