
    i                     (   d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlmZ d dl	m
Z ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z-m.Z.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z? ddl@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZH ddlImJZJ ddlKmLZL ddlMmNZN ddlOmPZPmQZQmRZRmSZS  e.j                  eU      ZV G d d eD      ZW G d! d"eA      ZXee, G d# d$e!                    ZY G d% d&ej                        Z[ G d' d(eF      Z\ G d) d*ej                        Z] G d+ d,ej                        Z^ G d- d.ej                        Z_ G d/ d0ej                        Z` G d1 d2ej                        Za G d3 d4ej                        Zc G d5 d6ej                        Zd G d7 d8ej                        Ze G d9 d:ej                        Zf G d; d<ej                        Zg G d= d>e<      Zh	 d~d?ej                  d@ej                  dAej                  dBej                  dCejdDej                  fdEZk G dF dGeJ      Zl G dH dIe9      Zm G dJ dKe:      Zn G dL dMej                        Zo G dN dOe<      Zp G dP dQe=      Zq eeG       G dR dSej                               Zr G dT dUeL      Zs G dV dWej                        Zt G dX dYe:      Zu G dZ d[e?      Zv G d\ d]e'      Zw e,d^_       G d` dae>             Zx e,db_       G dc dde;             Zy G de dfew      Zz G dg dhew      Z{ G di djeE      Z|dkej                  dz  dlej                  dz  dDedz  fdmZ}	 	 	 	 ddnedoej                  dpej                  dz  dqedz  dBej                  dz  drej                  dz  dsej                  dz  dteduedz  dDefdvZ e,dw_       G dx dyeC             Z e,dz_       G d{ d|eB             Zg d}Zy)    N)Callable)	dataclass)cached_property)nn)
functional   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfig)use_kernelized_func)create_bidirectional_maskcreate_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask)FlashAttentionKwargs)BaseModelOutputWithPastBaseModelOutputWithPooling)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )	AutoModel)Gemma3AttentionGemma3DecoderLayerGemma3ForCausalLM	Gemma3MLPGemma3RotaryEmbeddingGemma3TextModelGemma3TextScaledWordEmbedding)Gemma3nCausalLMOutputWithPastGemma3nForConditionalGenerationGemma3nModelGemma3nModelOutputWithPastGemma3nMultimodalEmbedderGemma3nRMSNormapply_rotary_pos_embeager_attention_forward)LlamaRotaryEmbedding)MixtralExperts)sliding_window_mask_function   )Gemma4AudioConfigGemma4ConfigGemma4TextConfigGemma4VisionConfigc                       e Zd Zy)Gemma4ModelOutputWithPastN__name__
__module____qualname__     z/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/gemma4/modular_gemma4.pyr>   r>   N       rD   r>   c                       e Zd Zy)Gemma4CausalLMOutputWithPastNr?   rC   rD   rE   rH   rH   R   rF   rD   rH   c                   :    e Zd ZU dZdZej                  dz  ed<   y)Gemma4AudioModelOutputz
    attention_mask (`torch.BoolTensor`, *optional*):
        A torch.BoolTensor of shape `(batch_size, num_frames)`. True for valid positions, False for padding.
    Nattention_mask)r@   rA   rB   __doc__rK   torch
BoolTensor__annotations__rC   rD   rE   rJ   rJ   V   s    
 /3NE$$t+2rD   rJ   c                   n     e Zd Zdeez  dededdf fdZdej                  dej                  fdZ	 xZ
S )	Gemma4ClippableLinearconfigin_featuresout_featuresreturnNc                    t         |           |j                  | _        t        j                  ||d      | _        | j                  r| j                  dt        j                  t        d                    | j                  dt        j                  t        d                   | j                  dt        j                  t        d                    | j                  dt        j                  t        d                   y y )NFbias	input_mininf	input_max
output_min
output_max)
super__init__use_clipped_linearsr   Linearlinearregister_bufferrM   tensorfloat)selfrR   rS   rT   	__class__s       rE   r_   zGemma4ClippableLinear.__init__b   s     	#)#=#= ii\F##  ellE%L=.IJ  ell5<.HI  u||U5\M/JK  u||E%L/IJ	 $rD   hidden_statesc                    | j                   r+t        j                  || j                  | j                        }| j                  |      }| j                   r+t        j                  || j                  | j                        }|S N)r`   rM   clamprY   r[   rb   r\   r]   )rf   rh   s     rE   forwardzGemma4ClippableLinear.forwardr   s\    ##!KKt~~t~~VMM2##!KKtXMrD   )r@   rA   rB   r<   r9   intr_   rM   Tensorrl   __classcell__rg   s   @rE   rQ   rQ   a   sT    K"%66K K 	K
 
K 	U\\ 	ell 	rD   rQ   c                       e Zd Zy)Gemma4RMSNormNr?   rC   rD   rE   rr   rr   ~   rF   rD   rr   c                        e Zd ZU dZej
                  ed<   def fdZ ej                         dej
                  dej
                  fd       Z
 xZS ) Gemma4AudioRelPositionalEncodingzSinusoidal relative positional encoding for the audio encoder.

    Produces position embeddings of shape [1, 2*context_size - 1, hidden_size] with
    concatenated [sin..., cos...] layout matching the original Gemma4 convention.
    inv_timescalesrR   c                    t         |           |j                  | _        |j                  |j                  z   dz
  |j
                  z   | _        d}d}| j                  dz  }t        j                  ||z        t        |dz
  d      z  }|t        j                  t        j                  |      | z        z  }| j                  d|j                  d      j                  d      d       y )	Nr8         ?     @r$   ru   r   F
persistent)r^   r_   hidden_sizeattention_chunk_sizeattention_context_leftattention_context_rightcontext_sizemathlogmaxrM   exparangerc   	unsqueeze)rf   rR   min_timescalemax_timescalenum_timescaleslog_timescale_incrementru   rg   s          rE   r_   z)Gemma4AudioRelPositionalEncoding.__init__   s    !--''&*G*GG!KfNlNll 	 ))Q."&((==+H"ICP^abPbdeLf"f&5<<3OSjRj3j)kk-~/G/G/J/T/TUV/WdijrD   rh   rU   c                 Z   t        j                  ddd|j                        }|d   }|| j                  j	                  |j                        z  }t        j
                  t        j                  |      t        j                  |      gd      }|j	                  |j                        S )N   device.Ndimdtype)	rM   r   r   ru   tocatsincosr   )rf   rh   position_idsscaled_time	pos_embeds        rE   rl   z(Gemma4AudioRelPositionalEncoding.forward   s    ||BB}7K7KL#I."T%8%8%;%;=CWCW%;%XXIIuyy5uyy7MNTVW	||-"5"5|66rD   )r@   rA   rB   rL   rM   rn   rO   r9   r_   no_gradrl   ro   rp   s   @rE   rt   rt      sU     LL k0 k U]]_7U\\ 7ell 7 7rD   rt   c                   P    e Zd ZdZdedef fdZdej                  dej                  fdZ	dej                  dej                  fdZ
d	ej                  dej                  fd
Z	 ddej                  dej                  dej                  dz  deej                  df   fdZ xZS )Gemma4AudioAttentionz3Chunked local attention with relative position biasrR   	layer_idxc                     t         |           || _        || _        |j                  | _        |j                  |j                  z  | _        |j                  | _	        | j                  dz  t        j                  d      z  | _        t        j                  dt        j                  z         t        j                  d      z  | _        |j                  | _        |j"                  dz
  | _        |j&                  | _        | j                   | j$                  z   | j(                  z   | _        t-        ||j                  | j                  | j                  z        | _        t-        ||j                  | j                  | j                  z        | _        t-        ||j                  | j                  | j                  z        | _        t-        ||j                  |j                        | _        t7        j8                  |j                  | j                  | j                  z  d      | _        t7        j<                  t?        j@                  | j                              | _!        | jE                  dt?        jF                  | j
                        d       y )N      r$   r8   FrW   softcapry   )$r^   r_   rR   r   attention_logit_capattention_logits_soft_capr{   num_attention_headshead_dim	num_headsr   r   q_scaleek_scaler|   
chunk_sizer}   max_past_horizonr~   max_future_horizonr   rQ   q_projk_projv_projpostr   ra   relative_k_proj	ParameterrM   zerosper_dim_scalerc   rd   rf   rR   r   rg   s      rE   r_   zGemma4AudioAttention.__init__   s   ")/)C)C&**f.H.HH33t+txx{:xxDFF
+dhhqk9 55 & = = A"("@"@ OOd.C.CCdF]F]]+FF4F4FY]YfYfHfg+FF4F4FY]YfYfHfg+FF4F4FY]YfYfHfg)&&2D2DfFXFXY	!yy););T^^dmm=[bgh\\%++dmm*DEYT5S5S(TafgrD   rh   rU   c           	         |j                   \  }}}}|| j                  z   dz
  | j                  z  }|| j                  z  |z
  }t        j                  |ddddd|f      }|j	                  ||| j                  ||      j                         S )zSplits a `(batch_size, seq_len, num_heads, head_dim)` tensor into non-overlapping blocks of `chunk_size` along the sequence dim.r8   r   )shaper   Fpadreshape
contiguous)rf   rh   
batch_sizeseq_lenr   r   
num_blocksr   s           rE   _convert_to_blockz&Gemma4AudioAttention._convert_to_block   s    3@3F3F0
GY/!3G
4??*W4maAq!S-AB$$ZT__iYabmmoorD   c           
      @   |j                   \  }}}}t        j                  |dddd| j                  | j                  | j
                  z   dz
  f      }|j                  d| j                  | j
                        }t        j                  |dd      }|j                         S )z`Extracts overlapping context windows of `context_size` for every block, strided by `chunk_size`.r   r8   r   r$   )r   r   r   r   r   r   unfoldr   rM   movedimr   )rf   rh   r   r   r   r   s         rE   _extract_block_contextz+Gemma4AudioAttention._extract_block_context   s    3@3F3F0
GYAq!Q(=(=t?V?VY]YhYh?hkl?lm
 &,,Q0A0A4??SmR;''))rD   xc                     |j                   \  }}}}}| j                  }t        j                  |d|dz   |z
  f      }|j	                  |||||dz   z        }|dd||z  f   }|j	                  |||||      S )zjRelative position shift for blocked attention. See appendix B of https://huggingface.co/papers/1901.02860.r   r8   .N)r   r   r   r   view)rf   r   r   r   r   
block_sizeposition_lengthr   s           rE   
_rel_shiftzGemma4AudioAttention._rel_shift   s    IJF
Iz:((EE!a)O;<=FF:y*jLSTDT6UVc.Z,.../vvj)Z\RRrD   Nposition_embeddingsrK   c                    |j                   \  }}}||| j                  | j                  f}| j                  |      j	                         j                  |      }| j                  |      j	                         j                  |      }	| j                  |      j	                         j                  |      }
|| j                  z  t        j                  | j                        z  }|	| j                  z  }	| j                  |      }| j                  |	      }	| j                  |
      }
|j                   d   }| j                  |      }|j                  d| j                  | j                        }|j!                  |j"                        }|j%                  ddddd      }||	j%                  ddddd      z  }|j'                  || j                  d| j                        }||j%                  ddd      z  }|j'                  || j                  || j(                  d      }| j+                  |      }||z   }|| j,                  z  }t/        j0                  |      }|| j,                  z  }|4|j3                  |j5                         | j6                  j8                        }t        j:                  |dt.        j<                        j!                  |
j"                        }||
j%                  ddddd      z  }|j%                  ddddd      j'                  ||| j(                  z  d      }|d d d |f   j?                         }| jA                  |j!                  | j@                  jB                  jD                  j"                              }||fS )	Nr8   r   r   r   r   r$      )r   r   )#r   r   r   r   re   r   r   r   r   r   softplusr   r   r   r   r   r   r   permuter   r   r   r   rM   tanhmasked_filllogical_notrR   attention_invalid_logits_valuesoftmaxfloat32r   r   rb   weight)rf   rh   r   rK   r   
seq_length_hidden_shapequery_states
key_statesvalue_statesr   relative_key_statesqueries	matrix_acqueries_flat	matrix_bdattn_weightsattn_outputs                      rE   rl   zGemma4AudioAttention.forward   s    %2$7$7!
J"JN{{=1779>>|L[[/557<<\J
{{=1779>>|L#dll2QZZ@R@R5SS$,,.
--l;00<
22<@!''*
"223FG166r4>>4==Y144<;M;M4N&&q!Q15j00Aq!Q??	z4>>2t}}U #6#>#>q!Q#GG	%%j$..*doo_ab	OOI.	 9,#dll2zz,/#dll2%'33**,dkk.X.XL yy2U]]KNN|OaOab"\%9%9!Q1a%HH!))!Q1a8@@ZZ^ZiZiMikmn!![j[.1<<>iiTYY5E5E5L5L5R5R STL((rD   rj   )r@   rA   rB   rL   r9   rm   r_   rM   rn   r   r   r   rN   tuplerl   ro   rp   s   @rE   r   r      s    =h0 hS h4pu|| p p*ELL *U\\ *SELL SU\\ S 37	1)||1) #\\1) ((4/	1)
 
u||T!	"1)rD   r   c                   ^     e Zd Z fdZddej
                  dej
                  dz  fdZ xZS )'Gemma4AudioSubSampleConvProjectionLayerc                     t         |           t        j                  ||dddd      | _        t        j
                  ||dd      | _        t        j                         | _        y )N)r   r   )r$   r$   r8   F)in_channelsout_channelskernel_sizestridepaddingrX   T)epselementwise_affinerX   )	r^   r_   r   Conv2dconv	LayerNormnormReLUact)rf   r   r   norm_epsrg   s       rE   r_   z0Gemma4AudioSubSampleConvProjectionLayer.__init__  sW    II#%
	 LL8PT[`a	779rD   Nrh   maskc           
         |,|j                  |j                        }||d d d d d d f   z  }| j                  |j                  | j                  j                  j                              }| j                  | j                  |j                  dddd            j                  dddd      j                               }||d d d d df   }||fS )Nr   r   r$   r   r8   )	r   r   r   r   r   r   r   r   r   )rf   rh   r   s      rE   rl   z/Gemma4AudioSubSampleConvProjectionLayer.forward  s    77-"6"677D)DD!T1A,BBM		-"2"24993C3C3I3I"JK=+@+@Aq!+L!M!U!UVWYZ\]_`!a!l!l!no3Q3<Dd""rD   rj   )r@   rA   rB   r_   rM   rn   rl   ro   rp   s   @rE   r   r     s(    #U\\ #9L #rD   r   c            	            e Zd Zdef fdZ	 ddej                  dej                  dz  deej                  ej                  f   fdZ xZ	S )	"Gemma4AudioSubSampleConvProjectionrR   c                 v   t         |           t        d|j                  d   |j                        | _        t        |j                  d   |j                  d   |j                        | _        |j                  d   dz  |j                  d   z  }t        j                  ||j                  d      | _
        y )Nr8   r   )r   r   r   r   FrW   )r^   r_   r   subsampling_conv_channelsrms_norm_epslayer0layer1r   ra   r{   input_proj_linear)rf   rR   proj_input_dimrg   s      rE   r_   z+Gemma4AudioSubSampleConvProjection.__init__*  s    =99!<((

 >88;99!<((

 !::1=BfFfFfghFii!#>6;M;MTY!ZrD   Ninput_featuresinput_features_maskrU   c                 &   |j                  d      }| j                  ||      \  }}| j                  ||      \  }}|j                  \  }}}}|j	                  dddd      j                         j                  ||d      }| j                  |      |fS )Nr8   r   r$   r   r   )r   r   r   r   r   r   r   r   )rf   r   r  rh   r   r   r   r   s           rE   rl   z*Gemma4AudioSubSampleConvProjection.forward9  s    
 '003"kk-9LMt"kk->t$1$7$7!
Aw%--aAq9DDFNNz[bdfg%%m4d::rD   rj   )
r@   rA   rB   r9   r_   rM   rn   r   rl   ro   rp   s   @rE   r   r   )  sW    [0 [$ 48;; #\\D0; 
u||U\\)	*	;rD   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Gemma4AudioFeedForwardrR   c                    t         |           || _        t        ||j                  |j                  dz        | _        t        ||j                  dz  |j                        | _        t        |j                        | _        t        |j                        | _	        t        |j                     | _        |j                  | _        |j                  | _        y )Nr   )r^   r_   rR   rQ   r{   ffw_layer_1ffw_layer_2rr   pre_layer_normpost_layer_normr
   
hidden_actact_fngradient_clippingresidual_weightpost_layer_scalerf   rR   rg   s     rE   r_   zGemma4AudioFeedForward.__init__H  s    09K9KVM_M_bcMcd09K9Ka9OQWQcQcd+F,>,>?,V-?-?@V../!'!9!9 & 6 6rD   rh   rU   c                    t        | j                  t        j                  | j                  j
                  j                  j                        j                        }|}t        j                  || |      }| j                  |      }| j	                  |      }| j                  |      }| j                  |      }t        j                  || |      }| j                  |      }|| j                  z  }||z  }|S rj   )minr  rM   finfor  rb   r   r   r   rk   r  r  r  r	  r  )rf   rh   r  residuals       rE   rl   zGemma4AudioFeedForward.forwardV  s     6 6DDTDTD[D[DbDbDhDh8i8m8mn M4E3EGXY++M:((7M2((7M4E3EGXY,,];...!rD   	r@   rA   rB   r9   r_   rM   rn   rl   ro   rp   s   @rE   r  r  G  s+    70 7U\\ ell rD   r  c                   `     e Zd Zed        Zdej                  dej                  f fdZ xZS )Gemma4AudioCausalConv1dc                 p    | j                   d   dz
  | j                  d   z  dz   }|| j                  d   z
  S )Nr   r8   )r   dilationr   )rf   effective_kernel_sizes     rE   left_padz Gemma4AudioCausalConv1d.left_pady  s>    !%!1!1!!4q!8DMM!<L Lq P$t{{1~55rD   r   rU   c                 z    t         j                  j                  || j                  df      }t        |   |      S )Nr   )r   r   r   r  r^   rl   )rf   r   rg   s     rE   rl   zGemma4AudioCausalConv1d.forward~  s3     MMa$--!34wq!!rD   )	r@   rA   rB   r   r  rM   rn   rl   ro   rp   s   @rE   r  r  k  s;     6 6"<<" 
	" "rD   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Gemma4AudioLightConv1drR   c                 6   t         |           || _        t        ||j                  |j                  dz        | _        t        ||j                  |j                        | _        t        |j                  |j                  |j                  |j                  d      | _	        t        |j                  |j                  d      | _        t        |j                  |j                  d      | _        t        |j                     | _        |j"                  | _        y )Nr$   F)r   r   r   groupsrX   Tr   
with_scale)r^   r_   rR   rQ   r{   linear_start
linear_endr  conv_kernel_sizedepthwise_conv1drr   r   r  	conv_normr
   r
  r  r  r  s     rE   r_   zGemma4AudioLightConv1d.__init__  s    1&&:L:LfN`N`cdNde/8J8JFL^L^_ 7**++//%%!
 ,F,>,>FDWDWdhi&v'9'9v?R?R_cdV../!'!9!9rD   rh   rU   c                 H   |}| j                  |      }| j                  |      }t        j                  j	                  |d      }| j                  |j                  dd            j                  dd      }t        | j                  t        j                  | j                  j                  j                  j                        j                        }t        j                  || |      }| j!                  |      }| j#                  |      }| j%                  |      }||z  }|S )Nr   r   r8   r$   )r  r"  r   r   glur%  	transposer  r  rM   r  rb   r   r   r   rk   r&  r  r#  )rf   rh   r  r  s       rE   rl   zGemma4AudioLightConv1d.forward  s     ++M:))-8))-R)@--m.E.Ea.KLVVWXZ[\   6 6DDUDUD\D\DcDcDiDi8j8n8noM4E3EGXY}5M26!rD   r  rp   s   @rE   r  r    s+    :0 :(U\\ ell rD   r  c            
            e Zd Zdedef fdZdej                  dej                  dz  dej                  de	e
   d	ej                  f
d
Z xZS )Gemma4AudioLayerrR   r   c                 p   t         |           || _        t        |      | _        t        |      | _        t        ||      | _        t        |      | _	        t        |j                        | _        t        |j                        | _        t        |j                        | _        |j                  | _        y rj   )r^   r_   rR   r  feed_forward1feed_forward2r   	self_attnr  lconv1drr   r{   norm_pre_attnnorm_post_attnnorm_outr  r   s      rE   r_   zGemma4AudioLayer.__init__  s    3F;3F;-fi@-f5*6+=+=>+F,>,>?%f&8&89!'!9!9rD   rh   rK   Nr   kwargsrU   c                 @   t        | j                  t        j                  | j                  j
                  j                        j                        }| j                  |      }|}t        j                  || |      }| j	                  |      }| j                  |||      \  }}t        j                  || |      }| j                  |      }||z  }| j                  |      }| j                  |      }t        j                  || |      }| j                  |      }|S )N)rh   r   rK   )r  r  rM   r  r1  r   r   r   r-  rk   r/  r2  r0  r.  r3  )rf   rh   rK   r   r4  r  r  r   s           rE   rl   zGemma4AudioLayer.forward  s      6 6DDVDVD]D]DcDc8d8h8hi**=9 M4E3EGXY**=9>>' 3) * 
q M4E3EGXY++M:!]3**=9M4E3EGXYm4rD   )r@   rA   rB   r9   rm   r_   rM   rn   rN   r   r   rl   ro   rp   s   @rE   r+  r+    si    :0 :S : ||  ((4/  #\\	 
 +,  
 rD   r+  c                        e Zd Zdef fdZdej                  dej                  dej                  fdZdej                  dej                  dej                  dej                  fdZ xZ	S )	Gemma4VisionPatchEmbedderrR   c                    t         |           || _        |j                  | _        |j                  | _        |j
                  | _        t        j                  d| j                  dz  z  | j                  d      | _        t        j                  t        j                  d| j
                  | j                              | _        y )Nr   r$   FrW   )r^   r_   rR   r{   
patch_sizeposition_embedding_sizer   ra   
input_projr   rM   onesposition_embedding_tabler  s     rE   r_   z"Gemma4VisionPatchEmbedder.__init__  s    !-- ++'-'E'E$))A(:$:D<L<LSXY(*UZZ4C_C_aeaqaq5r(s%rD   pixel_position_idspadding_positionsrU   c                 T   |j                  d      }t        j                  || j                        }|j	                  dddd      j                  | j                        }|| j                  z  }|j                  d      }t        j                  |j                  d      d	|      }|S )
zDPrepare patch positions map for matmul with positon embedding table.r   r  )num_classesr$   r8   r   r   r           )rk   r   one_hotr:  r   r   r=  sumrM   wherer   )rf   r>  r?  clamped_positionsrD  r   s         rE   _position_embeddingsz.Gemma4VisionPatchEmbedder._position_embeddings  s     /444;))-4;W;WX//!Q1-001N1NO%(E(EE155!5<#kk*;*E*Eb*I3Pcd""rD   pixel_valuesc                     d|dz
  z  }| j                  |j                  | j                   j                  j                              }| j	                  ||      }||z   S )Nr$         ?)r;  r   r   r   rH  )rf   rI  r>  r?  rh   r   s         rE   rl   z!Gemma4VisionPatchEmbedder.forward  s[     L3./8N8N8T8T(UV"778JL]^222rD   )
r@   rA   rB   r<   r_   rM   rn   rH  rl   ro   rp   s   @rE   r7  r7    su    t1 t#u|| #X]XdXd #iniuiu #3!LL3>Cll3_d_k_k3	3rD   r7  c                   .    e Zd ZdZdef fdZdej                  dej                  dede	ej                  ej                  f   fdZ
	 ddej                  dej                  d
ej                  ded	z  de	ej                  ej                  f   f
dZ xZS )Gemma4VisionPoolerz9Scaling and optional spatial pooling for vision encodingsrR   c                 l    t         |           |j                  | _        | j                  dz  | _        y )NrK  )r^   r_   r{   root_hidden_sizer  s     rE   r_   zGemma4VisionPooler.__init__  s/    !-- $ 0 0# 5rD   rh   r>  lengthrU   c                    |j                   d   }t        ||z  dz        }|dz  }||z  |k7  r%t        d|j                    d| d|d|d| d	      |j                  d
      }|d   j	                  dd      d
   dz   }t        j                  ||d      }	|	d   ||z  |	d   z  z   }	t        j                  |	j                         |      j                         |z  }
|
j                  dd      |j                         z  }t        j                  |
d
k(  j                  d            }|j                  |j                        |fS )z
        2D spatial pooling according to patch positions.
        Pools the input tokens by averaging patches within a `k^2` grid, where `k` is determined by the ratio between
        input and output lengths
        r8   rK  r$   zCannot pool z to z: k=z^2 times length=z	 must be .r   rA  .r   r   Tr   keepdimfloor)rounding_mode).r8   r   )r   rm   
ValueErrorrk   r   rM   divr   rD  longre   r)  r   allr   r   )rf   rh   r>  rP  input_seq_lenk	k_squaredrG  max_xkernel_idxsweightsoutputr   s                rE   _avg_pool_by_positionsz)Gemma4VisionPooler._avg_pool_by_positions  sh    &++A.&(S01qD	v.}2234xu!EVviW`an`oopq  /444;!&)--"d-CAFJii 11GL!&)UaZ;v;N,NN))K,,.7==?)K""1a(=+>+>+@@  'Q,!3!3!3!:;yy,,-t33rD   Nr?  output_lengthc                    ||j                   d   kD  rt        d| d|j                   d    d      |j                  |j                  d      d      }|j                   d   |k7  r| j	                  |||      \  }}|| j
                  z  }||fS )Nr8   z*Cannot output more soft tokens (requested z) than there are patches (z9). Change the value of `num_soft_tokens` when processing.r   rC  )r   rX  r   r   rc  rO  )rf   rh   r>  r?  rd  s        rE   rl   zGemma4VisionPooler.forward0  s     =..q11<]O L"((+,,eg 
 &112C2M2Mb2QSVWq!]2/3/J/J1=0,M, 	...///rD   rj   )r@   rA   rB   rL   r<   r_   rM   rn   rm   r   rc  rl   ro   rp   s   @rE   rM  rM    s    C61 6
4"\\4?D||4UX4	u||U\\)	*4@ %)0||0 "LL0 !<<	0
 Tz0 
u||U\\)	*0rD   rM  c                   $     e Zd Zdef fdZ xZS )Gemma4VisionMLPrR   c                 
   t         |   | |       t        || j                  | j                        | _        t        || j                  | j                        | _        t        || j                  | j                        | _        y rj   )r^   r_   rQ   r{   intermediate_size	gate_projup_proj	down_projr  s     rE   r_   zGemma4VisionMLP.__init__I  sf    v&.vt7G7GI_I_`,VT5E5EtG]G]^.vt7M7MtO_O_`rD   )r@   rA   rB   r<   r_   ro   rp   s   @rE   rg  rg  H  s    a1 a arD   rg  r   r   r   r   unsqueeze_dimrU   c           	         |j                   d   }| j                   d   }d|d|z  z  z  }|dk  rt        d| d| d| d      |g|z  }t        j                  | |d      }	t        j                  ||d      }
t        j                  ||d      }t	        |      D cg c]  }t        |	|   |
|   ||   |	       }}t        j                  |d      S c c}w )
ak  Applies multidimensional RoPE to inputs.

    Args:
        x (`torch.Tensor`): The tensor to embed.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            If position_ids.ndim + 2 == x.ndim, then this function passes through to `apply_rotary_pos_emb()`.
            Otherwise, position_ids is used to split the inputs, x, into multiple pieces, where each piece is fed to
            `apply_rotary_pos_emb()`, and then concatenated back together.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.

    Returns:
      Tensor of shape [B, L, N, H] with RoPE applied.
    r   r$   r   zEInvalid configuration: num_rotated_channels_per_dim must be > 0, got z (num_input_channels=z, ndim=)r   )r   r   r   rm  )r   rX  rM   splitranger3   r   )r   r   r   r   rm  ndimnum_input_channelsnum_rotated_channels_per_dimsplit_sizesx_parts	cos_parts	sin_partsr]  y_partss                 rE   apply_multidimensional_roperz  P  s   8 b!D#$(:q4x(H#I #q(,--BCUBV WF!
 	
 0047Kkk![b1GC"5IC"5I t  	aj!!'		
G  99W"%%s   Cc                       e Zd Ze	 	 	 d	dedz  dej                  dz  dedz  dede	f   fd       Z
 ej                         ed               Zy)
Gemma4VisionRotaryEmbeddingNrR   r   r   rU   ztorch.Tensorc                 $   | j                   d   }t        | dd      xs | j                  | j                  z  }|dz  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetar   Nr$   rw   r   r   )r   r   )	rope_parametersgetattrr{   r   rM   r   int64r   re   )rR   r   r   baser   spatial_dimattention_factorinv_freqs           rE   compute_default_rope_parametersz;Gemma4VisionRotaryEmbedding.compute_default_rope_parameters  s    & %%l3fj$/c63E3EIcIc3c QhQQekkBEEV[`[f[fEgjuuw
 )))rD   c                 .   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}g g }}t        d      D ]  }|d d d d |f   }|d d d d d f   j                         }	t        |d      5  |j                         |	j                         z  j                  dd      }
t        j                  |
|
fd	      }|j                         | j                  z  }|j!                         | j                  z  }d d d        |j#                         |j#                          t        j                  |d	      j	                  |j$                  
      }t        j                  |d	      j	                  |j$                  
      }||fS # 1 sw Y   xY w)Nr   r   r8   mpscpur$   F)device_typeenabledr   r   )r  re   expandr   r   r   
isinstancetypestrrq  r    r)  rM   r   r   attention_scalingr   appendr   )rf   r   r   inv_freq_expandedr  all_cosall_sinidim_position_idsdim_position_ids_expandedfreqsembr   r   s                 rE   rl   z#Gemma4VisionRotaryEmbedding.forward  s    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr'1!((--'E!((--[`J`ahhmmfk rq 
	 A+Aq!G4(8D!(D(J(J(L%KG 9*0025N5T5T5VVaabcefgiiB7ggi$"8"88ggi$"8"88	9
 NN3NN3
	  iiR(++!''+:iiR(++!''+:Cx9 9s   4BHH	NNN)r@   rA   rB   staticmethodr<   rM   r   rm   r   re   r  r   r   rl   rC   rD   rE   r|  r|    s    ,0&*" *"T) *t# * t * 
~u$	%	 *  *D U]]_  rD   r|  c                       e Zd Zdedef fdZ	 	 	 ddej                  dej                  dej                  dz  dej                  dz  d	e	e
   d
eej                  ej                  dz  eej                     dz  f   fdZ xZS )Gemma4VisionAttentionrR   r   c                 6   t         |   | ||       | `| `| `d| _        d| _        t        ||j                  |j                  | j                  z        | _        t        ||j                  |j                  | j                  z        | _        t        ||j                  |j                  | j                  z        | _        t        ||j                  | j                  z  |j                        | _        t!        | j                  |j"                  d      | _        y )Nrw   Fr   )r^   r_   attn_logit_softcappingsliding_window
is_slidingscaling	is_causalrQ   r{   num_key_value_headsr   r   r   r   r   o_projrr   r   v_normr   s      rE   r_   zGemma4VisionAttention.__init__  s    vy1'O+FF4F4FHbHbeiererHrs+FF4F4FHbHbeiererHrs+FF4F4FHbHbeiererHrs+FF4N4NQUQ^Q^4^`f`r`rs#DMMv7J7JW\]rD   Nrh   r   rK   r   r4  rU   c                 N   |j                   d d }g |d| j                  }|\  }}	| j                  |      j                  |      }
| j	                  |
      }
t        |
||	|      }
|
j                  dd      }
| j                  |      j                  |      }| j                  |      }t        |||	|      }|j                  dd      }| j                  |      j                  |      }| j                  |      }|j                  dd      }t        j                  | j                  j                  t              } || |
|||f| j                   r| j"                  nd| j$                  d|\  }} |j&                  g |d j)                         }| j+                  |      }||fS )Nr   r8   r$   rC  )dropoutr  )r   r   r   r   q_normrz  r)  r   k_normr   r  r   get_interfacerR   _attn_implementationr4   trainingattention_dropoutr  r   r   r  )rf   rh   r   rK   r   r4  input_shaper   r   r   r   r   r   attention_interfacer   r   s                   rE   rl   zGemma4VisionAttention.forward  s    $))#2.88b8$--8&S{{=166|D{{<02<c<X#--a3[[/44\B
[[,
0S#|T
))!Q/
{{=166|D{{<0#--a3(?(M(MKK,,.E)
 %8	%
 /3mmD**LL	%
 	%
!\ *k));;;;FFHkk+.L((rD   r  )r@   rA   rB   r<   rm   r_   rM   rn   
LongTensorr   r   r   rl   ro   rp   s   @rE   r  r    s    ^1 ^c ^  -1.204,)||,) #\\,) t+	,)
 &&-,) +,,) 
u||U\\D0%2E2LL	M,)rD   r  c                       e Zd Zdedef fdZ	 	 	 ddej                  dej                  dej                  dz  dej                  dz  d	e	e
   d
eej                  eej                  ej                  f   dz  f   fdZ xZS )Gemma4VisionEncoderLayerrR   r   c                 l    t         |   | ||       t        ||      | _        t	        |      | _        y NrR   r   )r^   r_   r  r/  rg  mlpr   s      rE   r_   z!Gemma4VisionEncoderLayer.__init__  s.    vy1.f	R"6*rD   Nrh   r   rK   r   r4  rU   c                     |}| j                  |      } | j                  d||||d|\  }}| j                  |      }||z   }|}| j                  |      }| j	                  |      }| j                  |      }||z   }|S )N)rh   r   rK   r   rC   )input_layernormr/  post_attention_layernormpre_feedforward_layernormr  post_feedforward_layernorm)rf   rh   r   rK   r   r4  r  r   s           rE   rl   z Gemma4VisionEncoderLayer.forward	  s     !,,];)4>> 
' 3)%	

 
q 55mD =0 66}E/77F =0rD   r  )r@   rA   rB   r<   rm   r_   rM   rn   r  r   r   r   FloatTensorrl   ro   rp   s   @rE   r  r    s    +1 +c + -1.204|| #\\ t+	
 &&- +, 
u  %(9(95;L;L(L"MPT"TT	UrD   r  c                        e Zd Zdef fdZ	 d
dej                  dej                  dej                  dz  dee	   de
f
d	Z xZS )Gemma4VisionEncoderrR   c           	         t         |           || _        |j                  | _        t        |      | _        t        j                  t        | j                        D cg c]  }t        ||       c}      | _        y c c}w r  )r^   r_   rR   num_hidden_layers
num_layersr|  
rotary_embr   
ModuleListrq  r  layers)rf   rR   r  rg   s      rE   r_   zGemma4VisionEncoder.__init__)  sc     225f=mmKPQUQ`Q`Kaba%VqAb
bs   A?Ninputs_embedsrK   r>  r4  rU   c                     t        | j                  ||      }|}| j                  ||      }| j                  d| j                  j                   D ]  } ||f|||d|} t        |      S )z
        pixel_position_ids (torch.Tensor):
            Patch positions as (x, y) coordinates in the image as [batch, num_patches, 2].
        )rR   r  rK   N)rK   r   r   last_hidden_state)r   rR   r  r  r  r   )rf   r  rK   r>  r4  rh   r   decoder_layers           rE   rl   zGemma4VisionEncoder.forward2  s     3;;')
 &"oom=OP "[[)H4;;+H+HI 	M)-$7/	
 M	 'GGrD   rj   )r@   rA   rB   r<   r_   rM   rn   r  r   r   r   rl   ro   rp   s   @rE   r  r  (  si    
1 
 7;	H||H H ",,t3	H
 +,H 
!HrD   r  c                   (     e Zd Zdedef fdZ xZS )Gemma4TextMLPrR   r   c                     |j                   |j                  z
  }||cxk\  xr dkD  nc }|j                  xr |}t        |           |j
                  |r
dz  | _        y dz  | _        y )Nr   r$   r8   )r  num_kv_shared_layersuse_double_wide_mlpr^   r_   ri  )rf   rR   r   first_kv_shared_layer_idxis_kv_shared_layerr  rg   s         rE   r_   zGemma4TextMLP.__init__X  si    $*$<$<v?Z?Z$Z!&*CGaG$88O=O!'!9!9BUQ!][\!]rD   )r@   rA   rB   r;   rm   r_   ro   rp   s   @rE   r  r  W  s     ^/ ^C ^ ^rD   r  c                       e Zd ZddefdZy)Gemma4TextRotaryEmbeddingNrR   c                    t         j                  j                  |        |j                  | _        |j                  | _        || _        t        |j                        | _        i | _	        i | _
        | j                  D ]  }| j                  j                  |   }||d   x}dk7  r
t        |   }n| j                  }|| j                  |<   || j                  |<   ||d}|dk(  r
|dk(  rd|d<    || j                  fi |\  }}	| j                  | d|d	
       | j                  | d|j                         d	
       t!        | | d|	        y )N	rope_typedefault)r   
layer_typefull_attentionproportionalglobal_head_dimhead_dim_key	_inv_freqFry   _original_inv_freq_attention_scaling)r   Moduler_   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrR   setlayer_typesrope_init_fnsr  r  r   r  rc   clonesetattr)
rf   rR   r   r  rope_paramsr  rope_init_fnrope_init_fn_kwargscurr_inv_freqcurr_attention_scalings
             rE   r_   z"Gemma4TextRotaryEmbedding.__init__a  sk   
		4 "("@"@$*$B$B!v112SU)+** 	UJ++55jAK"(55	)C29=#CC-9Dz*)2DNN:&-3:"N--)~2M6G#N34@4dPc4d1M1  J<y!9=UZ [  J</A!BMDWDWDYfk lDZL(:;=ST)	UrD   NN)r@   rA   rB   r;   r_   rC   rD   rE   r  r  `  s    U/ UrD   r  c                       e Zd ZdZdedef fdZ	 ddej                  dej                  dej                  dz  d	e	ee
ej                  ej                  f   f   d
edz  dee   de
ej                  ej                  dz  f   fdZ xZS )Gemma4TextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrR   r   c                 `   t         |           t        |d      r|j                  |   nd | _        || _        || _        | j                  dk(  | _        | j                  r|j                  nd | _        | j                  s|j                  r|j                  n|j                  | _
        |j                  xr | j                   | _        | j                  r|j                  n|j                  }|j                  |z  | _        d| _        | j
                  j$                  | _        |j&                  dk7  | _        | j
                  j*                  t-        | j
                  dd      z
  }||cxk\  xr dkD  nc | _        |j                  d | }| j.                  r@t1        |      dz
  |d d d   j3                  |j                  |         z
  | _        d	| _        nBd | _        |t1        |      dz
  |d d d   j3                  |j                  |         z
  k(  | _        t9        j:                  |j<                  |j                  | j                  z  |j>                  
      | _         tC        | j                  |jD                        | _#        | j.                  stC        | j                  |jD                        | _$        tC        | j                  |jD                  d	      | _%        t9        j:                  |j<                  || j                  z  |j>                  
      | _&        | j                  s9t9        j:                  |j<                  || j                  z  |j>                  
      nd | _'        t9        j:                  |j                  | j                  z  |j<                  |j>                  
      | _(        y )Nr  sliding_attentionrw   r[  r  r   r8   r   FrW   )r   r   r   ))r^   r_   hasattrr  r  rR   r   r  r  r  r   attention_k_eq_vuse_alternative_attentionnum_global_key_value_headsr  r   num_key_value_groupsr  r  use_bidirectional_attentionr  r  r  r  lenindexkv_shared_layer_indexstore_full_length_kvr   ra   r{   attention_biasr   rr   r   r  r  r  r   r   r  )rf   rR   r   r  r  prev_layersrg   s         rE   r_   zGemma4TextAttention.__init__  s   ;B6=;Y&,,Y7_c"//-@@7;f33D6:oo&J`J`..flfufu)/)@)@)XEX&151O1OF--U[UoUo 	 %+$>$>BU$U!!%!>!>;;uD %)KK$A$AGDKKYoqrDs$s!"+/H"L1"L(()C*CD""),[)9A)=DbD@Q@W@WX^XjXjktXu@v)vD&(-D%)-D&(1S5E5IKX\Z\X\L]LcLc""9-M 6 )D% ii : :T]] JQWQfQf
 $6;N;NO &&'DMMv?R?RSDK'6;N;N[`aDK))""$7$--$GfNcNcDK
 55 		&,,.ADMM.QX^XmXmn K ii&&68J8JQWQfQf
rD   Nrh   r   rK   shared_kv_statespast_key_valuesr4  rU   c                    |j                   d d }g |d| j                  }|\  }	}
| j                  |      j                  |      }| j	                  |      }t        ||	|
d      }|j                  dd      }| j                  rI|| j                     \  }}|j                  |j                        }|j                  |j                        }n| j                  |      j                  |      }| j                   | j                  |      j                  |      n|}| j                  |      }t        ||	|
d      }|j                  dd      }| j                  |      }|j                  dd      }|,| j                  s |j                  ||| j                         \  }}| j"                  r||f|| j                   <   t$        }| j&                  j(                  dk7  rt*        | j&                  j(                     } || ||||f| j,                  r| j.                  nd| j0                  | j2                  d|\  }} |j4                  g |d j7                         }| j9                  |      }||fS )Nr   r$   )rm  r8   eagerrC  )r  r  r  )r   r   r   r   r  r3   r)  r  r  r   r   r   r   r  r  updater   r  r4   rR   r  r   r  r  r  r  r   r   r  )rf   rh   r   rK   r  r  r4  r  r   r   r   r   r   r   r  r   r   s                    rE   rl   zGemma4TextAttention.forward  s^    $))#2.88b8$--8&S{{=166|D{{<0+L#sRST#--a3
 ""'78R8R'S$J#|':':;J'??<+>+>?L]388FJLPKKLc4;;}5::<HisLZ0J-j#sRSTJ#--a3J;;|4L'11!Q7L&t/F/F'6'='=j,X\XfXf'g$J$$/9</GT^^,(?;;++w6"9$++:Z:Z"[$7
%
 /3mmD**LL..
%
 
%
!\ *k));;;;FFHkk+.L((rD   rj   )r@   rA   rB   rL   r;   rm   r_   rM   rn   dictr   r   r   r   rl   ro   rp   s   @rE   r  r    s    G6
/ 6
C 6
| )-=)||=) #\\=) t+	=)
 sE%,,*D$EEF=) =) -.=) 
u||U\\D00	1=)rD   r  c                   $     e Zd Zdef fdZ xZS )Gemma4TextExpertsrR   c                     t         |           |j                  | _        |j                  | _        t
        |j                     | _        y rj   )r^   r_   num_expertsmoe_intermediate_sizeintermediate_dimr
   hidden_activationr  r  s     rE   r_   zGemma4TextExperts.__init__  s<    !-- & < <V556rD   )r@   rA   rB   r;   r_   ro   rp   s   @rE   r  r    s    7/ 7 7rD   r  c                   z     e Zd Zdef fdZdej                  deej                  ej                  f   fdZ xZ	S )Gemma4TextRouterrR   c                 (   t         |           || _        |j                  | _        | j                  dz  | _        |j
                  | _        t        | j                  | j                  d      | _        t        j                  |j                  |j                  d      | _        t        j                  t        j                  | j                              | _        t        j                  t        j                  |j                              | _        y )Nr   Fr   rW   )r^   r_   rR   r{   scalar_root_sizer   r   rr   r   r   ra   r  projr   rM   r<  scaleper_expert_scaler  s     rE   r_   zGemma4TextRouter.__init__  s    !-- $ 0 0$ 6&&!$"2"2US	IIf00&2D2D5Q	\\%**T-=-=">?
 "UZZ8J8J-K LrD   rh   rU   c                 x   | j                  |      }|| j                  z  | j                  z  }| j                  |      }t        j
                  j                  |d      }t        j                  || j                  j                  d      \  }}||j                  dd      z  }|| j                  |   z  }|||fS )Nr   r   )r]  r   TrT  )r   r  r  r  r   r   r   rM   topkrR   top_k_expertsrE  r  )rf   rh   expert_scoresrouter_probabilitiestop_k_weightstop_k_indexs         rE   rl   zGemma4TextRouter.forward  s    		-0%

2T5J5JJ		-0!}}44]4K &+ZZ kk''&
"{ 	**r4*@@ &(=(=k(JJ#]K??rD   )
r@   rA   rB   r;   r_   rM   rn   r   rl   ro   rp   s   @rE   r  r    s>    
M/ 
M@U\\ @eELL%,,<V6W @rD   r  c                   0    e Zd Zdeez  def fdZ	 	 	 	 	 	 ddej                  dej                  de	ee
ej                  ej                  f   f   dz  dej                  d	ej                  dz  d
ej                  dz  dedz  dej                  fdZ xZS )Gemma4TextDecoderLayerrR   r   c                    t         |   ||       t        ||      | _        t	        ||      | _        | j                  dt        j                  d             |j                  | _	        | j                  rt        |j                     | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t%        | j                  |j&                        | _        |j*                  | _        | j*                  rt-        |      | _        t1        |      | _        t%        | j                  |j&                        | _        t%        | j                  |j&                        | _        t%        | j                  |j&                        | _        y y )Nr  layer_scalarr8   FrW   r   )r^   r_   r  r/  r  r  rc   rM   r<  hidden_size_per_layer_inputr
   r
  r  r   ra   r{   per_layer_input_gateper_layer_projectionrr   r   post_per_layer_input_normenable_moe_blockr  routerr  expertspost_feedforward_layernorm_1post_feedforward_layernorm_2pre_feedforward_layernorm_2r   s      rE   r_   zGemma4TextDecoderLayer.__init__+  sX   +,FiP 3^UZZ];+1+M+M(++ !9!9:DK(*		$2B2BDDdDdkp(qD%(*		$2R2RTXTdTdkp(qD%-:4;K;KQWQdQd-eD* & 7 7  *62DK,V4DL0=d>N>NTZTgTg0hD-0=d>N>NTZTgTg0hD-/<T=M=MSYSfSf/gD, !rD   Nrh   per_layer_inputr  r   rK   r   r  rU   c           
      &   |}	| j                  |      } | j                  d||||||d|\  }}
| j                  |      }|	|z   }|}	| j                  |      }| j	                  |      }| j
                  r| j                  |      }|	j                  d|	j                  d         }| j                  |      \  }
}}| j                  |      }| j                  |||      }|j                  |	j                        }| j                  |      }||z   }| j                  |      }|	|z   }| j                  rP|}	| j                  |      }| j!                  |      }||z  }| j#                  |      }| j%                  |      }|	|z   }|| j&                  z  }|S )N)rh   r   rK   r  r   r  r   rC   )r  r/  r  r  r  r"  r%  r   r   r#  r'  r$  r&  r  r  r  r  r   r!  r  )rf   rh   r(  r  r   rK   r   r  r4  r  r   hidden_states_1hidden_states_flatr  r  hidden_states_2s                   rE   rl   zGemma4TextDecoderLayer.forward@  s    !,,];)4>> 
' 3)-%+
 
q 55mD =0 66}E/  "??NO "*!1!1"hnnR6H!I,0KK8J,K)A}k">>?QRO"ll?KWO-55hnnEO"??PO ,o=M77F =0++$H 55mDM KK6M)O;M 55mDM ::=IM$}4M***rD   )NNNNNN)r@   rA   rB   r;   r<   rm   r_   rM   rn   r  r   r  r   rl   ro   rp   s   @rE   r  r  *  s    h/2DD hQT h0 )-PT,0.204(,9||9 9 sE%,,*D$EEFM	9
 #\\9 t+9 &&-9 9 
9rD   r  c                       e Zd Zy)Gemma4TextScaledWordEmbeddingNr?   rC   rD   rE   r.  r.  |  rF   rD   r.  c                   ~     e Zd ZU eed<   dZdZdZdZdZ	dZ
g dZddgZdZ ej                          fd       Z xZS )Gemma4PreTrainedModelrR   T)r  r  r+  r  r  )imagetextvideoaudioc                 	   t         |   |       t        |t              r t	        j
                  |j                         y t        |t              rd}d}|j                  dz  }t        j                  ||z        t        |dz
  d      z  }|t        j                  t        j                  |      | z        z  }t	        j                  |j                   |j#                  d      j#                  d             y t        |t$              rJt	        j&                  |j(                  |j*                         t	        j,                  |j.                         y t        |t0              r|j2                  j5                         D ]  \  }}d|i}	|dk(  r|j6                  |   dk(  rd	|	d
<    ||j8                  fi |	\  }
}t	        j                  t;        || d      |
       t	        j                  t;        || d      |
        y t        |t<              r|j6                  dk7  rt>        |j6                     n|j@                  } ||j8                        \  }}t	        j                  |jB                  |       t	        j                  |jD                  |       y t        |tF              r+t	        j&                  |jH                  |jJ                         y t        |tL              r?t	        j
                  |jN                         t	        j
                  |jP                         y t        |tR              r[| j8                  jT                  }t	        jV                  |jX                  d|       t	        jV                  |jZ                  d|       y t        |t\              r t	        j
                  |j^                         y t        |t`              r|jb                  rt	        j&                  |jd                  tg        d              t	        j&                  |jh                  tg        d             t	        j&                  |jj                  tg        d              t	        j&                  |jl                  tg        d             y t        |tn              rV|j8                  jp                  r?t	        j,                  |jr                         t	        j
                  |jt                         y y y )Nrw   rx   r$   r8   r   r  r  r  r  r  r  r  r  rC  )meanstdrZ   );r^   _init_weightsr  r7  initones_r=  rt   r{   r   r   r   rM   r   r   copy_ru   r   r   	constant_r   r   zeros_r   r  r  itemsr  rR   r  r|  r   r  r  original_inv_freqr.  embed_scalescalar_embed_scaler  r  r  r  initializer_rangenormal_gate_up_projrl  r  r  rQ   r`   rY   re   r[   r\   r]   Gemma4VisionModelstandardizestd_bias	std_scale)rf   moduler   r   r   r   ru   r  r  r  r  r   rope_fnbuffer_valuer7  rg   s                  rE   r8  z#Gemma4PreTrainedModel._init_weights  s   f%f78JJv667 @AM#M#//14N&*hh}}/L&MPSTbefTfhiPj&j#*UYYu||N7SWnVn7n-ooNJJv,,n.F.Fq.I.S.STU.VW 45NN6>>6+K+KLKK,,- 9:,2,@,@,F,F,H ^(
L'3Z&@#!11f6F6Fz6RVd6d:K'7#/#UAT#U q

76j\+CDmT

76j\9K+LM}]^  ;< ##y0 $F$4$45;; 
 &fmm4OL!JJv5JJv//> =>NN6--v/H/HI 01JJv||$JJv../ 12++//CLL,,3C@LL))= 67JJv**+ 566;U;UNN6++eEl];NN6++U5\:NN6,,uU|m<NN6,,eEl; 12v}}7P7PKK(JJv''( 8Q2rD   )r@   rA   rB   r:   rO   supports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backend_no_split_modules_skip_keys_device_placementinput_modalitiesrM   r   r8  ro   rp   s   @rE   r0  r0    s^    &*#N!"&b#46H"I:U]]_2) 2)rD   r0  zAThe base Gemma 4 language model without a language modeling head.custom_introc                       e Zd ZU eed<    eed      eedZ	def fdZ
dej                  dz  dej                  dz  d	ej                  fd
Z	 ddej                  dej                  dz  d	ej                  fdZeee	 	 	 	 	 	 	 ddej$                  dz  dej                  dz  dej$                  dz  dedz  dej(                  dz  dej                  dz  dedz  dee   d	efd                     Z xZS )Gemma4TextModelrR   r   )r  )router_logitsrh   
attentionsc           
         t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        t        |      | _	        t        | j                  j                        | _        |j                  | _        | j                  rt        |j                   |j
                  |j                  z  | j"                  |j                  dz        | _        d| _        t        j(                  |j*                  |j
                  |j                  z  d      | _        |j*                  dz  | _        t1        |j                  |j2                        | _        g | _        t9        | j                        D ]K  \  }}|j:                  j<                  s| j6                  j?                  dD cg c]
  }d	| d
|  c}       M y c c}w c c}w )NrK  )r@  g;f?FrW   r   r  )r   r   r  r  zlayers.z.self_attn.) r^   r_   r   r  rq  r  r  r  r  r  r  rR   r  unique_layer_typesr  r.  vocab_size_per_layer_inputpadding_idxembed_tokens_per_layerper_layer_input_scalera   r{   per_layer_model_projection per_layer_model_projection_scalerr   r   per_layer_projection_norm"_keys_to_ignore_on_load_unexpected	enumerater/  r  extend)rf   rR   r   r  layernamerg   s         rE   r_   zGemma4TextModel.__init__  s    mmHMfNfNfHgh9#FI6h
 4F;"%dkk&=&=">+1+M+M(++*G11((6+M+MM  ">>C	+D' *3D&.0ii""((6+M+MM/D+
 5;4F4F4LD1-:6;];]cicvcv-wD* 35/!$++. 	HAu1177>>@hiwqcTF3i	1 i6 js   GG"
	input_idsNr  rU   c                    | j                   st        d| j                         |t        j                         5  |d d d d d d d f   | j
                  j                  d d d d d d f   | j                  j                  dz  z  k(  j                  d      j                         d d df   }	 |j                  |j                  d d       }	 d d d         | j                  |      j                  g |j                  | j                  j                  | j                    S # t        $ r t        d      w xY w# 1 sw Y   nxY w)Nz}Attempting to call get_per_layer_inputs() from a model initialized with a config that does not support per-layer embeddings. rK  r   r   r$   a)  It seems like you tried to call `forward` from `inputs_embeds` without providing `input_ids`, and that the `inputs_embeds` you provided do not exactly match the embedding weights. Since Gemma4 needs to reverse the embedding to compute another embedding, make sure you provide exact `inputs_embeds`)r  RuntimeErrorrR   rM   r   embed_tokensr   r{   r[  nonzeror   r   r_  r   r  )rf   ri  r  s      rE   get_per_layer_inputsz$Gemma4TextModel.get_per_layer_inputs  sS   //**.++8    &aD!m4,,33D$14DEH_H_adHdde SQSZWYq!t%  )}/B/B2A/F GI$ >t**95== 
__
KK))
 ,,
 	
 $ &r  s   A1D9-D!!D66D99Eper_layer_inputsc                 V   | j                   st        d| j                         | j                  |      | j                  z  } |j
                  g |j                  d d | j                  j                  | j                    }| j                  |      }||S ||z   | j                  z  S )NzAttempting to call project_per_layer_inputs() from a model initialized with a config that does not support per-layer embeddings. r   )
r  rk  rR   ra  rb  r   r   r  rc  r`  )rf   r  ro  r   s       rE   project_per_layer_inputsz(Gemma4TextModel.project_per_layer_inputs  s    
 //226++@ 
  $>>}MPTPuPuu;3;;  
  "% 
KK)) 
 ,, 

  $==>RS#''$'774;U;UUUrD   rK   r   r  	use_cacher4  c           
         |du |duz  rt        d      || j                  |      }| j                  r&|| j                  ||      }| j	                  ||      }|r|t        | j                        }|V||j                         nd}	t        j                  |j                  d   |j                        |	z   }|j                  d      }t        |x}
t              s)| j                  ||||d}t        di |t!        di |d}
|}i }| j"                  D ]  }| j%                  |||      ||<    i }t'        | j(                  d| j                  j*                         D ]\  \  }}||dddd|ddf   nd} |||f||| j                  j,                  |      |
| j                  j,                  |      ||d	|}^ | j/                  |      }t1        ||
      S )uq  
        per_layer_inputs (`torch.Tensor` of shape `(batch_size, sequence_length, num_hidden_layers, hidden_size_per_layer_input)`, *optional*):
            Pre-computed per-layer input embeddings. When provided, these are used directly instead of being
            computed from `input_ids` via `get_per_layer_inputs()`. This is primarily used by the multimodal
            model (`Gemma4Model`) which pre-computes per-layer inputs from the original `input_ids` *before*
            merging multimodal soft tokens into `inputs_embeds` — at which point the original token ids are
            no longer recoverable.
        N:You must specify exactly one of input_ids or inputs_embeds)rR   r   r8   r   rR   r  rK   r  r   r  r  )r  r   rK   r   r  )r  r  rC   )rX  rl  r  rn  rq  r   rR   get_seq_lengthrM   r   r   r   r   r  r  r   r   r\  r  re  r  r  r  r   r   )rf   ri  rK   r   r  r  ro  rr  r4  past_seen_tokenscausal_mask_mappingmask_kwargsrh   r   r  r  r  r  r(  s                      rE   rl   zGemma4TextModel.forward)  s3   , -t";<YZZ  --i8M++'#'#<#<Y#V #<<]L\]0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-F ++!."0#2 ,K #5"C{"C%F%U%U# & 11 	gJ.2oom\[e.f
+	g  !*$++6U8U8U*V W 	A}>N>Z.q!Qz:`dO)	 "2$78O8OPQ8R$S24;;3J3J13MN) /	 	M	 		-0&++
 	
rD   rj   )NNNNNNN)r@   rA   rB   r;   rO   r"   r  r  r  _can_record_outputsr_   rM   rn   rn  rq  r!   r#   r   r  r   r  boolr   r   r   rl   ro   rp   s   @rE   rX  rX    s   '(8B/)/ B 
ellT.A  
RWR^R^aeRe  
jojvjv  
J 15V||V  ,,-V 
	V0   .2.204(,2604!%S
##d*S
 t+S
 &&-	S

 S
 ((4/S
  ,,-S
 $;S
 +,S
 
!S
    S
rD   rX  z>The base Gemma 4 language model with a language modeling head.c                   (     e Zd ZdZdef fdZ xZS )Gemma4ForCausalLMmodelrR   c                     t         |   |       | j                  j                  D cg c]  }d| 	 c}| _        y c c}w Nzmodel.r^   r_   r  rd  rf   rR   rh  rg   s      rE   r_   zGemma4ForCausalLM.__init__  ?      )-

(U(U3
 $fTFO3
/ 3
   >)r@   rA   rB   base_model_prefixr;   r_   ro   rp   s   @rE   r~  r~    s    
/ 
 
rD   r~  c                   ,    e Zd ZU dZeed<   dZdZee	dZ
def fdZdej                  dej                  fd	Zee ed
      	 ddej                  dej                  dz  dee   deej                  ej*                  f   fd                     Z xZS )Gemma4AudioModelznAn audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture.rR   r   zmodel.audio_towerrh   rZ  c           	         t         |   |       || _        t        |      | _        t        |      | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                  |j                  |j                  d      | _        | j#                          y c c}w )NTrW   )r^   r_   rR   r   subsample_conv_projectionrt   rel_pos_encr   r  rq  r  r+  r  ra   r{   output_proj_dimsoutput_proj	post_initr   s      rE   r_   zGemma4AudioModel.__init__  s     )KF)S&;FCmmBGH`H`BabYfi0b
 99V%7%79P9PW[\	 cs   B?mask_4drU   c                    |j                   \  }}}}|j                  }| j                  j                  }| j                  j                  dz
  }| j                  j
                  }||z   dz
  |z  }	|	|z  }
|
|z
  }t        j                  |d|d|fd      }|j                  |d|	||
      }t        j                  |||fd      }t        j                  |	|      |z  }t        j                  ||z   |z   |      }|dddf   |dddf   z   }|dddddddf   j                  |dd|d      }|j                  d|      S )z
        Convert a standard 4D attention mask `[batch_size, 1, seq_len, seq_len]` to the 5D blocked format
        `[batch_size, 1, num_blocks, chunk_size, context_size]` expected by the chunked local attention,
        r8   r   F)valuer   Nr   )r   r   rR   r|   r}   r~   r   r   r   rM   r   r  gather)rf   r  r   r   r   r   r   r   r   r   padded_seq_len
pad_amountmask_5dblock_startsoffsets
kv_indicess                   rE   _convert_4d_mask_to_blocked_5dz/Gemma4AudioModel._convert_4d_mask_to_blocked_5d  sN   
 %,MM!
Aw[[55
;;==A![[@@
*Q.:=
#j0#g-
%%!ZJ!?uM//*aZX%%"24F!GuU||Jv>K,,z,<<?QQZ`a!!T'*WT1W-==
dAtQ 67>>z1bR\^`a
~~b*--rD   z&Encodes audio features to soft tokens.rU  NrK   r4  c           	         | j                  ||      \  }}| j                  |      }t        | j                  ||t	        | j                  j
                  dz
  | j                  j                  f            }| j                  |      }| j                  d | j                  j                   D ]  } ||f||d|} | j                  |      }t        ||      S )Nr8   )rR   r  rK   and_mask_function)rK   r   )r  rK   )r  r  r   rR   r7   r}   r~   r  r  r  r  rJ   )rf   r   rK   r4  rh   output_maskr   encoder_layers           rE   rl   zGemma4AudioModel.forward  s     &*%C%CNTb%c"{"..}=2;;'&:33a79\9\]	
 <<^L![[)H4;;+H+HI 	M)-$7 	M	 ((7%VabbrD   rj   )r@   rA   rB   rL   r9   rO   main_input_namer  r+  r   r{  r_   rM   rn   r  r!   r#   r   r   r   r   rN   rl   ro   rp   s   @rE   r  r    s    x&O+)*
0 .ell .u|| .6  !IJ /3cc t+c +,	c
 
u||U---	.c K   crD   r  c                        e Zd ZdZeZeedZdef fdZ	e
e ed      dej                  dej                  d	ee   d
efd                     Z xZS )rE  zThe Gemma 4 Vision Encoder.r  rR   c                    t         |   |       t        |      | _        t	        |      | _        t        |      | _        | j                  j                  rr| j                  dt        j                  | j                  j                               | j                  dt        j                  | j                  j                               | j                          y )NrG  rH  )r^   r_   r7  patch_embedderr  encoderrM  poolerrR   rF  rc   rM   emptyr{   r  r  s     rE   r_   zGemma4VisionModel.__init__  s     7?*62(0;;""  U[[9P9P-QR  ekk$++:Q:Q.RSrD   z1Encodes image pixels to soft tokens from patches.rU  rI  r>  r4  rU   c                    | j                   j                  }|j                  d   ||z  z  }|dk(  j                  d      }| j	                  |||      } | j
                  d|| |d|}| j                  |j                  |||      \  }	}
|	|
   }	| j                   j                  r|	| j                  z
  | j                  z  }	t        |	      S )a  
        pixel_values (`torch.FloatTensor` or `list[torch.FloatTensor]`):
            The images to encode. Either a single `[batch, channels, height, width]` tensor
            (all images same size) or a list of `[1, channels, height, width]` tensors (different sizes).
        pixel_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`):
            The patch positions as (x, y) coordinates in the image. Padding patches are indicated by (-1, -1).
        r   r   )r  rK   r>  )rh   r>  r?  rd  r  rC   )rR   pooling_kernel_sizer   r[  r  r  r  r  rF  rG  rH  r   )rf   rI  r>  r4  r  rd  r?  r  rb  rh   pooler_masks              rE   rl   zGemma4VisionModel.forward  s      #kk==$**2.3FI\3\]/25::r:B++L:LN_` 
'--1
 	
 &*[[ 221/'	 &1 &
"{ &k2;;""*T]]:dnnLM&GGrD   )r@   rA   rB   rL   r<   rR   r  r  r{  r_   r!   r#   r   rM   r  r  r   r   r   rl   ro   rp   s   @rE   rE  rE    s    %F1+

1 
  !TU&H''&H ",,&H +,	&H
 
!&H V   &HrD   rE  c                   f     e Zd Zdeez  def fdZdej                  dej                  fdZ	 xZ
S )Gemma4MultimodalEmbeddermultimodal_configtext_configc                     t         |   ||       | `| `| `| `| `| `t        |d|j                        | _
        t        | j                  | j                  d      | _        y )Nr  Fr   )r^   r_   	embeddinghard_embedding_normsoft_embedding_normvocab_offset
vocab_sizeembedding_post_projection_normr  r{   multimodal_hidden_sizerr   r   embedding_pre_projection_norm)rf   r  r  rg   s      rE   r_   z!Gemma4MultimodalEmbedder.__init__%  sp     	*K8N$$O/&-.?ASUfUrUr&s#-:4;V;V\`\d\dqv-w*rD   r  rU   c                 F    | j                  |      }| j                  |      S )a:  Embeds token ids or soft tokens for multimodal content into language model space.
        Args:
            inputs_embeds: A torch.Tensor containing the soft tokens to embed.
        Returns:
            A torch.Tensor of embeddings with shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
        )r  embedding_projection)rf   r  embs_normeds      rE   rl   z Gemma4MultimodalEmbedder.forward7  s%     88G((55rD   )r@   rA   rB   r9   r<   r;   r_   rM   rn   rl   ro   rp   s   @rE   r  r  $  s>    x,/AAx &x$6U\\ 6ell 6rD   r  token_type_idsimage_group_idsc           
      V    | ydt         dt         dt         dt         dt        f
fd}|S )z
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    N	batch_idxhead_idxq_idxkv_idxrU   c                    	j                   d   }|j                  |dz
        }|j                  |dz
        }	| |f   }	| |f   }t        j                  ||k  |d      }t        j                  ||k  |d      }||k(  |dk\  z  S )Nr   r8   )r   r   )r   rk   rM   rF  )
r  r  r  r  r   q_idx_clampedkv_idx_clampedq_groupkv_groupr  s
            rE   
inner_maskz0token_type_ids_mask_function.<locals>.inner_maskO  s    $**2.
 
Q7*q.9 ")]":;"9n#<=++ej0'2>;;v
2HbA8#155rD   )rm   r|  )r  r  r  s    ` rE   token_type_ids_mask_functionr  C  s>     6c 6S 6 6c 6d 6 rD   rR   r  rK   r  mm_token_type_idsrI  is_trainingis_first_iterationc	                    |r|t        d      | j                         ||||d}
|
j                         }||n|du xs |j                   xs |du}||r|dk(  |dk(  z  }t	        j
                  |dd      }d|d	<   || z  }t	        j                  |j                         d
      dz
  }t	        j                  ||d      }t        |j                  |j                        |      |d<   t        di |
t        di |dS )a  
    Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
    for all kinds of forward passes. Gemma4 uses a bidirectional mask for images.

    Uses `pixel_values` as an optional input to disambiguate edge cases.
    Nz>`mm_token_type_ids` is required as a model input when trainingru  r8   r$   r   )shiftsdimsFrS  r   or_mask_functionrv  rC   )rX  get_text_configcopyis_initializedrM   rollcumsumrm   rF  r  r   r   r   r   )rR   r  rK   r  r   r  rI  r  r  r4  rz  sliding_mask_kwargs	is_visionis_prev_visionnew_vision_startsvision_group_idss                   rE   create_causal_mask_mappingr  a  sH   $ (0YZZ ((*&(*$K &**, ) 	%g_-K-K)Kg|cgOg 
 $); '!+0AQ0FG	IabA!&v%7 <<(9(=(=(?QG!K ;;y2BBG2N  !5!568H3
./
 -;{;>UATU rD   z
    The base Gemma 4 model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    c            "           e Zd Zdef fdZe ed      	 ddej                  dej                  dz  de
e   d	efd
              Ze ed      	 ddej                  dej                  dz  de
e   d	efd              Z	 	 ddej                  dz  dej                  dz  d	eej                   ej                   ej                   f   fdZeee	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej&                  dz  dej&                  dz  dej                  dz  dedz  dej                  dz  dej                  dz  dedz  dej                  dz  dej                  dz  de
e   d	efd                     Ze ed      dej&                  dej&                  de
e   d	eez  fd              Z xZS )Gemma4ModelrR   c                 *   t         |   |       | `| `|j                  t        j                  |j                        nd | _        |j                   t        |j                  |j                        nd | _        | `	| `
|j                  t        j                  |j                        nd | _	        |j                   t        |j                  |j                        nd | _
        | j                  j                  D cg c]  }d| 	 c}| _        y c c}w )Nzlanguage_model.)r^   r_   vision_towerembed_visionvision_configr%   from_configr  r  audio_towerembed_audioaudio_configlanguage_modelrd  r  s      rE   r_   zGemma4Model.__init__  s    KQK_K_KkI11&2F2FGqu ##/ %V%9%96;M;MN 	
 IOI\I\Ih9001D1DEnr "". %V%8%8&:L:LM 	 261D1D1g1g3
)-odV$3
/ 3
s   ;DzOProjects the last hidden state from the vision model into language model space.rU  NrI  image_position_idsr4  rU   c                 v     | j                   d||d|}|j                  }| j                  |      |_        |S )z
        image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
            The patch positions as (x, y) coordinates in the image. Padding patches are indicated by (-1, -1).
        rI  r>  r  rC   )r  r  r  pooler_output)rf   rI  r  r4  vision_outputsr  s         rE   get_image_featureszGemma4Model.get_image_features  sV     +** 
%1
 

 +<<'+'8'8GX'8'Y$rD   zQProjects the last hidden state from the vision encoder into language model space.pixel_values_videosvideo_position_idsc                     |j                  dd      }|j                  dd      } | j                  d||d|}|j                  }| j                  |      |_        |S )a9  
        video_position_ids (`torch.LongTensor` of shape `(num_videos, num_frames, max_patches, 2)`, *optional*):
            2D patch position coordinates from the video processor, with `(-1, -1)` indicating padding.
            Passed through to the vision encoder for positional embedding computation.
        r   r8   r  r  rC   )flattenr  r  r  r  )rf   r  r  r4  r  r  s         rE   get_video_featureszGemma4Model.get_video_features  s|     299!Q?/771=*** 
,1
 

 +<<'+'8'8GX'8'Y$rD   ri  r  c                 &   |M|| j                   j                  k(  }|| j                   j                  k(  }|| j                   j                  k(  }n>| | j	                         t        j                  | j                   j                  t
        j                  |j                              k(  j                  d      }| | j	                         t        j                  | j                   j                  t
        j                  |j                              k(  j                  d      }| | j	                         t        j                  | j                   j                  t
        j                  |j                              k(  j                  d      }|||fS )a  
        Obtains mask for multimodal placeholders (replaced by soft tokens) and hard text tokens.

        Masks will be obtained from `mm_token_type_ids`, `input_ids`, or `inputs_embeds` as available and in that
        precedence order. If passing `input_ids` or `inputs_embeds`, the image mask will be derived using
        `config.image_token_id`. Same goes for audio and video masks

        Args:
            input_ids: A tensor containing the hard token IDs from the text tokenizer.
            inputs_embeds: A tensor containing the embeddings for all hard text tokens.

        Returns:
            image_mask, video_mask, audio_mask
        )r   r   r   )
rR   image_token_idvideo_token_idaudio_token_idget_input_embeddingsrM   rd   rZ  r   r[  )rf   ri  r  special_image_maskspecial_video_maskspecial_audio_masks         rE   get_placeholder_maskz Gemma4Model.get_placeholder_mask  sP   &  !*dkk.H.H!H!*dkk.H.H!H!*dkk.H.H!H .4,,.LL!;!;5::VcVjVjk c"g  .4,,.LL!;!;5::VcVjVjk c"g  .4,,.LL!;!;5::VcVjVjk c"g  "#57IIIrD   r   rK   r  r   r  r  rr  c                 H   |du |
duz  rt        d      | j                  ||
      \  }}}||z  |z  }d}|
I|j                         }| j                  j                  j
                  ||<    | j                         |      }
| j                  j                         j                  r| j                  j                  j                  | j                  j                  j
                  ddf   }t        j                  |d   |j                  ddd      |
      }| j                  j                  ||      }nd}|| j!                  ||d      j"                  }|j%                  |
j&                  |
j(                        }|j+                         }|j-                  d      j/                  |
      j%                  |
j&                        }t1        |
|   j3                         |j3                         k(  d| d	|j4                  d
           |
j7                  |j%                  |
j&                        |j%                  |
j&                              }
|| j9                  ||d      j"                  }|j%                  |
j&                  |
j(                        }|j+                         }|j-                  d      j/                  |
      j%                  |
j&                        }t1        |
|   j3                         |j3                         k(  d| d	|j4                  d
           |
j7                  |j%                  |
j&                        |j%                  |
j&                              }
||| j;                  ||d      }|j"                  }|j<                  }||   }|j+                         }|j-                  d      j/                  |
      j%                  |
j&                        }t1        |
|   j3                         |j3                         k(  d| d	|j4                  d
   |j4                  d   z          |
j7                  |j%                  |
j&                        |j%                  |
j&                              }
|V||j?                         nd
}t        j@                  |
j4                  d   |
j&                        |z   }|j-                  d
      }tC        |x} tD              sh| j                  j                         jF                  dk(  r(tI        | j                  |
||||	|| jJ                        } ntM        | j                  |
|||      }  | j                  d|| |||
|dd|}!tO        |!jP                  |!jR                  |!jT                  |!jV                  |nd|      S d      S )  
        input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
            The attention mask for the input audio.
        image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
            2D patch position coordinates from the image processor, with `(-1, -1)` indicating padding.
            Passed through to the vision encoder for positional embedding computation.
        video_position_ids (`torch.LongTensor` of shape `(num_videos, num_frames, max_patches, 2)`, *optional*):
            2D patch position coordinates from the video processor, with `(-1, -1)` indicating padding.
            Passed through to the vision encoder for positional embedding computation.
        Nrt  r   r8   r   T)return_dictz6Image features and image tokens do not match, tokens: z, features: r   z6Video features and video tokens do not match, tokens: z6Audio features and audio tokens do not match, tokens: r   vision)r  )ro  rK   r   r  r  rr  r  )r  r  rh   rZ  image_hidden_statesaudio_hidden_statesrC   ),rX  r  r  rR   r  pad_token_idr  r  r  r  rl  r   rM   rF  r   rn  r  r  r   r   r   rE  r   	expand_asr   numelr   masked_scatterr  get_audio_featuresrK   rw  r   r  r  r  r  r  r   r>   r  r  rh   rZ  )"rf   ri  rI  r  r   rK   r  r   r  r  r  rr  r  r  r4  
image_mask
video_mask
audio_maskmultimodal_maskllm_input_idspad_embeddingllm_inputs_embedsro  image_featuresn_image_tokensvideo_featuresn_video_tokensaudio_outputaudio_featuresaudio_mask_from_encodern_audio_tokensrx  ry  outputss"                                     rE   rl   zGemma4Model.forward  s   < -t";<YZZ-1-F-FyR_-`*
J
$z1J>  %OO-M-1[[-D-D-Q-QM/*7D557FM;;&&(DD //<<CCDKKD[D[DhDhjkDklM %OI,FHZHZ[\^_acHdfs t#22GGWhi# #!44\CUcg4hvvN+..}/C/C]EXEXYN (^^-N#--b1;;MJMMmNbNbcJ"j)//1^5I5I5KKHHX Y"((+,. *88m223^5F5F}G[G[5\M *!44#%7T 5 m  ,..}/C/C]EXEXYN (^^-N#--b1;;MJMMmNbNbcJ"j)//1^5I5I5KKHHX Y"((+,. *88m223^5F5F}G[G[5\M
 %*=*I22>CVdh2iL)77N&2&A&A#
 ,,CDN'^^-N#--b1;;MJMMmNbNbcJ"j)//1^5I5I5KKHHX Y"((+n.B.B1.EEFH *88m223^5F5F}G[G[5\M
 CRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L?-F{{**,HHHT&@KK!"# %  $	'# '@KK!"# '# &$%% 	
-.%+'	
 	
 )%77#33!//))2>2JPT2@2L
 	
 SW
 	
rD   zPProjects the last hidden state from the audio encoder into language model space.c                     | j                   t        d       | j                   ||fddi|}| j                  |j                        |_        |S )a0  
        input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`):
            The tensors corresponding to the input audio.
        input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
            The attention mask for the input audio.
        zAudio features were requested, but the model was initialized without an audio_config. Cannot process audio without an audio tower and audio embedder.r  Tr  )r  rX  r  r  r  )rf   r   r  r4  audio_outputss        rE   r  zGemma4Model.get_audio_features  sh     #R 
 )((9LiZ^ibhi&*&6&6]EdEd&6&e#rD   rj   r  )NNNNNNNNNNNNN)r@   rA   rB   r:   r_   r   r   rM   r  r  r   r   r   r  r  r   rN   r  r!   rn   r   r|  r>   rl   rJ   r  ro   rp   s   @rE   r  r    s   
| 
. !rs 7;'' ",,t3 +,	
 
$ t & !tu 7;".. ",,t3 +,	
 
$ v 0 .226+J##d*+J ((4/+J 
u!1!153C3CC	D	+JZ   .2158<37.23704(,5926!%6:6:Z
##d*Z
 ''$.Z
 #..5	Z

 ))D0Z
 t+Z
 #\\D0Z
 &&-Z
 Z
 !++d2Z
 ((4/Z
 $;Z
 ",,t3Z
 ",,t3Z
 +,Z
  
#!Z
    Z
x !st #\\ +,	
 
'	' u rD   r  z
    The base Gemma 4 model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    c            #           e Zd ZdZdef fdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  de
dz  dej                  dz  dej                  dz  dej                  dz  dedz  deej                  z  dee   def"dZe	 ddej                  dej                  dz  dee   fd       Ze	 	 ddedej                  d	ej                  dz  de
dz  dej                  dz  dej                  dz  dedz  defd       Z	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )Gemma4ForConditionalGenerationr  rR   c                     t         |   |       | j                  j                  D cg c]  }d| 	 c}| _        y c c}w r  r  r  s      rE   r_   z'Gemma4ForConditionalGeneration.__init__  r  r  Nri  rI  r  r   rK   r  r   r  r  r  r  r  labelsrr  logits_to_keepr4  rU   c                     | j                   d	||||||||
||||||	dd|}|j                  }t        |t              rt	        | d      n|}| j                  |dd|ddf         }| j                  j                         j                  x}||z  }t        j                  |      }||z  }d}|S|j                         }|dddddf   }|dddf   }||dd|j                  d    df   j                  |j                        }||j                  |j                        dk7     j                         }||j                  |j                        dk7     j                         }n |j                         }|j                         }t!        j"                         }|j%                  d| j                  j                         j&                        }|j%                  d      j                  |j                        } |||      }t)        |||j*                  |j,                  |j.                  |j0                  |j2                        S )
r  T)ri  rI  r  r   rK   r  r   r  r  r  r  rr  r  r  r  N.r   r8   r   )losslogitsr  rh   rZ  r  r  rC   )r  r  r  rm   slicelm_headrR   r  final_logit_softcappingrM   r   re   r   r   r   r   r   CrossEntropyLossr   r  rH   r  rh   rZ  r  r  )rf   ri  rI  r  r   rK   r  r   r  r  r  r  r  r  rr  r  r4  r  rh   slice_indicesr  r!  r  shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelss                                rE   rl   z&Gemma4ForConditionalGeneration.forward  sY   : $** 
% 3)) 3%+/'11
  !
&  118B>SV8W~ot4]kmA}a,?@A'+{{'B'B'D'\'\\#i55FZZ'F55F\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0K0K0M0X0XYK&++B/22<3F3FGKK5D+#33!//)) ' ; ; ' ; ;
 	
rD   c                 >     | j                   j                  ||fi |S )a-  
        image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
            2D patch position coordinates from the image processor, with `(-1, -1)` indicating padding.
            Passed through to the vision encoder for positional embedding computation.
        )r  r  )rf   rI  r  r4  s       rE   r  z1Gemma4ForConditionalGeneration.get_image_features7  s$     -tzz,,\;MXQWXXrD   r  c           
          t        | j                         dd       dk(  r;t        | |||||fd|i|j                         D 	ci c]  \  }}	|dk7  s||	 c}	}S t	        | ||||fi |S c c}	}w )Nr  r  r  rI  )r  r  r  r>  r   )
rR   r  rK   r  r   r  r  r4  r]  vs
             rE   r   z8Gemma4ForConditionalGeneration.create_masks_for_generateE  s     6))+-JDQU]]-!	 $6	 %+LLNJDAqa>6I1a4J	 	 -~X^ 	 Ks   A*A*c                 j    t        |   |f|||||||
|d|}|s|s||d<   ||d<   ||d<   |	|d<   |S )N)r  r  rK   r   rr  r  r  r  rI  r  r   r  )r^   prepare_inputs_for_generation)rf   ri  r  r  r   rI  r  r   rK   r  r  rr  r  r  r  r4  model_inputsrg   s                    rE   r.  z<Gemma4ForConditionalGeneration.prepare_inputs_for_generationb  sv    & w<
+')%))1
 
 Y+7L(2EL./-;L)*2EL./rD   )NNNNNNNNNNNNNNr   rj   )NF)NNNNNNNNNTNNF)r@   rA   rB   r  r:   r_   rM   r  r  rn   r   r|  rm   r   r   rH   rl   r   r  r  r   r  r   r.  ro   rp   s   @rE   r  r    s     
| 
 .2158<37.237046:6:(,5926*.!%-.!W
##d*W
 ''$.W
 #..5	W

 ))D0W
 t+W
 #\\D0W
 &&-W
 ",,t3W
 ",,t3W
 W
 !++d2W
 ((4/W
   4'W
 $;W
  ell*!W
" +,#W
$ 
&%W
r  7;Y''Y ",,t3Y +,	Y Y  26*/ || t+ 	
 llT) !<<$. !4K 
 >    ' 'rD   r  )r  r~  r  r  r0  rX  rE  )r$   )NNFN)r   collections.abcr   dataclassesr   	functoolsr   rM   r   torch.nnr   r    r	   r9  activationsr
   cache_utilsr   r   configuration_utilsr   integrationsr   masking_utilsr   r   r   r   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr    r!   utils.output_capturingr"   r#   auto.modeling_autor%   gemma3.modeling_gemma3r&   r'   r(   r)   r*   r+   r,   gemma3n.modeling_gemma3nr-   r.   r/   r0   r1   r2   r3   r4   llama.modeling_llamar5   mixtral.modeling_mixtralr6   0moonshine_streaming.modeling_moonshine_streamingr7   configuration_gemma4r9   r:   r;   r<   
get_loggerr@   loggerr>   rH   rJ   r  rQ   rr   rt   r   r   r   r  Conv1dr  r  r+  r7  rM  rg  rn   rm   rz  r|  r  r  r  r  r  r  r  r  r  r.  r0  rX  r~  r  rE  r  r  r  r|  r  r  r  r  __all__rC   rD   rE   <module>rM     s    $ ! %   $ & ! . 3 /  C S K F &  H E *  	 	 	 8 5 [ g g 
		H	%	 : 		#@ 	 37 3  3BII :	N 	7ryy 7>i)299 i)X#bii #8; ;< RYY  H"bii "B&RYY &R0ryy 0l3		 3D80 80vai a 5&||5&	5& 
5& ,,	5&
 5& \\5&p:"6 :z:)O :)|"1 "J)H")) )H^^I ^U 5 UD )*x)")) x) +x)v7 7!@ryy !@HO/ Od	$A 	?)O ?)D `ay
o y
 by
x ]^
) 
 _
Rc, Rcj>H- >HB68 6>LL4'\\D( _H .2-1&*99<<9 LL4'9 T\	9
 ,,%9 ||d*9 ##d*9 9 t9 
9x h, hhV	 u%D uuprD   