
    iۼ                        d dl mZ d dlmZ d dlZd dlmZ d dlmZ ddl	m
Z ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;  e-jx                  e=      Z> e+d      e G d de4                    Z? e+d      e G d de                    Z@ G d de9      ZA G d de7      ZB G d  d!e:      ZC G d" d#e6      ZD G d$ d%e6      ZE G d& d'e      ZF G d( d)e      ZG G d* d+ej                        ZI G d, d-ej                        ZJe+ G d. d/e8             ZKd0ej                  dz  d1ej                  d2eNdz  d3ej                  fd4ZO G d5 d6eK      ZP G d7 d8eK      ZQe+ G d9 d:eK             ZRe+ G d; d<eK             ZS G d= d>eKe      ZTe+ G d? d@eK             ZUe+ G dA dBeK             ZVg dCZWy)D    )Callable)AnyN)strict   )initialization)CacheDynamicCacheEncoderDecoderCache)PreTrainedConfig)GenerationMixin)create_bidirectional_mask(create_bidirectional_sliding_window_maskcreate_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )Gemma2Config)Gemma2Attention	Gemma2MLPGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingeager_attention_forwardzgoogle/t5_gemma_module-7b)
checkpointc                   .    e Zd ZU dZdZeed<    e       Zy)T5GemmaModuleConfigaA  
    query_pre_attn_scalar (`float`, *optional*, defaults to 256):
        scaling factor used on the attention scores
    final_logit_softcapping (`float`, *optional*, defaults to 30.0):
        scaling factor when applying tanh softcapping on the logits.
    attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
        scaling factor when applying tanh softcapping on the attention scores.

    ```python
    >>> from transformers import T5GemmaModuleModel, T5GemmaModuleConfig
    >>> # Initializing a T5GemmaModule t5_gemma_module-7b style configuration
    >>> configuration = T5GemmaModuleConfig()
    >>> # Initializing a model from the t5_gemma_module-7b style configuration
    >>> model = T5GemmaModuleModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```F
is_decoderN)	__name__
__module____qualname____doc__r.   bool__annotations__AttributeErroruse_bidirectional_attention     |/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/t5gemma/modular_t5gemma.pyr-   r-   B   s    $ J"0"2r8   r-   c                        e Zd ZU dZdZdgZeedZdZee	e
e
f   z  dz  ed<   dZee	e
e
f   z  dz  ed<   dZeed	<   d
Zeez  ed<   d
Zeez  ed<   d
Zeez  ed<   dZeed<   dZeed<    fdZ xZS )T5GemmaConfiga  
    encoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*):
        Configuration for the encoder.
    decoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*):
        Configuration for the decoder.

    Example:

    ```python
    >>> from transformers import T5GemmaConfig, T5GemmaModel
    >>> t5gemma_config = T5GemmaConfig.from_pretrained("google/t5gemma-2b-2b-prefixlm-it")
    >>> model = T5GemmaModel(t5gemma_config)
    ```t5gemmapast_key_values)encoderdecoderNr>   r?   Tis_encoder_decoder        dropout_rateclassifier_dropout_rateattention_dropouttie_word_embeddingsi  
vocab_sizec                    t        | j                  t              rt        di | j                  | _        n| j                  t               | _        t        | j                  t              rt        di | j                  | _        n| j                  t               | _        d| j                  _        | j                  | j                  _        | j                  | j                  _        d| j                  _        d| j                  _        | j                  | j                  _        | j                  | j                  _        | j                  j                  | j                  _
        |j                  d| j                  j                        | _        dD ]   }||vst        | j                  |      ||<   " t        | <  di | y )NFTinitializer_range)bos_token_idpad_token_ideos_token_idr7   )
isinstancer>   dictr-   r?   r.   rB   rD   	use_cachehidden_sizecross_attention_hidden_sizepoprH   getattrsuper__post_init__)selfkwargsspecial_token_key	__class__s      r9   rT   zT5GemmaConfig.__post_init__y   sP   dllD).>>DL\\!.0DLdllD).>>DL\\!.0DL"'$($5$5!)-)?)?&"&!%$($5$5!)-)?)?&37<<3K3K0!',?A_A_!`!Q 	U .,3DLLBS,T()	U 	''r8   )r/   r0   r1   r2   
model_typekeys_to_ignore_at_inferencer-   sub_configsr>   rM   r   r4   r?   r@   r3   rB   intfloatrC   rD   rE   rF   rT   __classcell__rX   s   @r9   r;   r;   [   s     J#4"51>QRK;?G 4S>1D8?;?G 4S>1D8?## #L#+#+.S5[.%(us{( $$J( (r8   r;   c                       e Zd Zy)T5GemmaRMSNormNr/   r0   r1   r7   r8   r9   ra   ra          r8   ra   c                   $     e Zd Z fdZd Z xZS )
T5GemmaMLPc                 l    t         |   |       t        j                  |j                        | _        y N)rS   __init__nnDropoutrB   dropoutrU   configrX   s     r9   rh   zT5GemmaMLP.__init__   s&     zz&"5"56r8   c                     | j                  | j                  |            | j                  |      z  }| j                  |      }| j	                  |      }|S rg   )act_fn	gate_projup_projrk   	down_proj)rU   xhidden_statesrr   s       r9   forwardzT5GemmaMLP.forward   sH    DNN1$56aH]3NN=1	r8   )r/   r0   r1   rh   ru   r^   r_   s   @r9   re   re      s    7r8   re   c                       e Zd Zy)T5GemmaRotaryEmbeddingNrb   r7   r8   r9   rw   rw      rc   r8   rw   c                   (     e Zd Zdedef fdZ xZS )T5GemmaSelfAttentionrm   	layer_idxc                 H    t         |   ||       |j                  | _        y rg   )rS   rh   r.   	is_causalrU   rm   rz   rX   s      r9   rh   zT5GemmaSelfAttention.__init__   s    +**r8   )r/   r0   r1   r-   r\   rh   r^   r_   s   @r9   ry   ry      s    +2 +s + +r8   ry   c                        e Zd Zdedef fdZ	 ddej                  dej                  dz  dej                  dz  dedz  d	e	e
   d
eej                  ej                  dz  eej                     dz  f   fdZ xZS )T5GemmaCrossAttentionrm   rz   c                    t         |   ||       | `| `d| _        |j
                  t        d      t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        y )NFzBCross-attention needs cross_attention_hidden_size to be specified.bias)rS   rh   sliding_window
layer_typer|   rP   
ValueErrorri   Linearnum_key_value_headshead_dimattention_biask_projv_projr}   s      r9   rh   zT5GemmaCrossAttention.__init__   s    +O--5abbii..0J0JT]]0Zagavav
 ii..0J0JT]]0Zagavav
r8   Nrt   attention_maskencoder_hidden_statesr=   rV   returnc                    |t        d      |j                  d d }g |d| j                  }| j                  |      j	                  |      j                  dd      }|1|j                  j                  | j                        }	|j                  }
|	s|j                  d d }g |d| j                  }| j                  |      j	                  |      j                  dd      }| j                  |      j	                  |      j                  dd      }|
j                  ||| j                        \  }}d|j                  | j                  <   nF
j                  | j                     j                  }|
j                  | j                     j                  }t!        j"                  | j$                  j&                  t(              } || ||||f| j*                  r| j,                  nd| j.                  d | j0                  d|\  }} |j2                  g |d j5                         }| j7                  |      }||fS )Nz5Encoder hidden state is required for cross attention.   r#   TrA   )rk   scalingr   softcap)r   shaper   q_projview	transpose
is_updatedgetrz   cross_attention_cacher   r   updatelayerskeysvaluesr   get_interfacerm   _attn_implementationr*   trainingrD   r   attn_logit_softcappingreshape
contiguouso_proj)rU   rt   r   r   r=   rV   input_shapehidden_shapequery_statesr   curr_past_key_valuesencoder_input_shapeencoder_hidden_shape
key_statesvalue_statesattention_interfaceattn_outputattn_weightss                     r9   ru   zT5GemmaCrossAttention.forward   s?    !(TUU#))#2.88b8$--8{{=166|DNNqRST&(3377GJ#2#H#H "*"7"="=cr"B#L%8#L"#Ldmm#L %:;@@AUV``abdefJ;;'<=BBCWXbbcdfghL*+?+F+FzS_aeaoao+p(
L=A**4>>:-44T^^DIIJ/66t~~FMML(?(M(MKK,,.E)
 %8%
 /3mmD**LL//%
 %
!\ *k));;;;FFHkk+.L((r8   rg   )r/   r0   r1   r-   r\   rh   torchTensorr   r   r   tupleru   r^   r_   s   @r9   r   r      s    
2 
s 
* )-3)||3) t+3)  %||d2	3)
 3) -.3) 
u||U\\D0%2E2LL	M3)r8   r   c                        e Zd ZdZdef fdZ	 	 	 ddej                  deej                  ej                  f   dz  dej                  dz  dej                  dz  d	eej                  f   f
d
Z xZS )T5GemmaEncoderLayerzEncoder sub-layer.rz   c                 D   t         |           |j                  | _        || _        || _        |j
                  |   | _        t        ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t        |      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t#        j$                  |j&                        | _        y N)rm   rz   eps)rS   rh   rO   rm   rz   layer_typesattention_typery   	self_attnra   rms_norm_epspre_self_attn_layernormpost_self_attn_layernormre   mlppre_feedforward_layernormpost_feedforward_layernormri   rj   rB   rk   r}   s      r9   rh   zT5GemmaEncoderLayer.__init__   s    !--"$00;-
 (6f6H6HfNaNa'b$(6v7I7IvObOb(c%f%)78J8JPVPcPc)d&*89K9KQWQdQd*e'zz&"5"56r8   Nrt   position_embeddingsr   position_idsr   c           	      >   |}| j                  |      } | j                  d||||d d|\  }}| j                  |      }|| j                  |      z   }|}| j	                  |      }| j                  |      }| j                  |      }|| j                  |      z   }|S )N)rt   r   r   r   r=   r7   )r   r   r   rk   r   r   r   )rU   rt   r   r   r   rV   residual_s           r9   ru   zT5GemmaEncoderLayer.forward  s     !44]C)4>> 
' 3)% 
 
q 55mD 4<<#>> 66}E/77F 4<<#>>r8   )NNN)r/   r0   r1   r2   r\   rh   r   r   r   
LongTensorFloatTensorru   r^   r_   s   @r9   r   r      s    7# 7. IM.204|| #5<<#=>E t+	
 &&- 
u  !	"r8   r   c                   8    e Zd ZdZdef fdZ	 	 	 	 	 	 	 ddej                  deej                  ej                  f   dz  dej                  dz  dej                  dz  d	e
dz  d
edz  dej                  dz  dej                  dz  dej                  fdZ xZS )T5GemmaDecoderLayerz2Decoder sub-layer: an extra cross-attention layer.rz   c                     t         |           |j                  | _        || _        || _        |j
                  |   | _        t        ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t        |      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t#        j$                  |j&                        | _        t+        ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y r   )rS   rh   rO   rm   rz   r   r   ry   r   ra   r   r   r   re   r   r   r   ri   rj   rB   rk   r   
cross_attnpre_cross_attn_layernormpost_cross_attn_layernormr}   s      r9   rh   zT5GemmaDecoderLayer.__init__0  s&   !--"$00;-
 (6f6H6HfNaNa'b$(6v7I7IvObOb(c%f%)78J8JPVPcPc)d&*89K9KQWQdQd*e'zz&"5"56/vS(6v7I7IvObOb(c%)78J8JPVPcPc)d&r8   Nrt   r   r   r   r=   rN   r   encoder_attention_maskr   c	           
         |}
| j                  |      } | j                  d||||||j                  nd |d|	\  }}| j                  |      }|
| j	                  |      z   }|}
| j                  |      } | j                  d|||||d|	\  }}| j                  |      }|
| j	                  |      z   }|}
| j                  |      }| j                  |      }| j                  |      }|
| j	                  |      z   }|S )N)rt   r   r   r   r=   rN   )rt   r   r   r=   rN   r7   )r   r   self_attention_cacher   rk   r   r   r   r   r   r   )rU   rt   r   r   r   r=   rN   r   r   rV   r   r   s               r9   ru   zT5GemmaDecoderLayer.forwardG  s;    !44]C)4>> 
' 3)%DSD_O@@ei
 
q 55mD 4<<#>> 55mD*4?? 
'"71+
 
q 66}E 4<<#>> 66}E/77F 4<<#>>r8   )NNNNFNN)r/   r0   r1   r2   r\   rh   r   r   r   r   r
   r3   r   ru   r^   r_   s   @r9   r   r   -  s    <e# e4 IM.2046:!&596:,||, #5<<#=>E, t+	,
 &&-, -t3, $;,  %||d2, !&t 3, 
		,r8   r   c                   j     e Zd ZdZd	dededef fdZdej                  dej                  fdZ	 xZ
S )
T5GemmaClassificationHeadz-Head for sentence-level classification tasks.rO   
num_labelsrC   c                     t         |           t        j                  |      | _        t        j
                  ||      | _        y )N)p)rS   rh   ri   rj   rk   r   out_proj)rU   rO   r   rC   rX   s       r9   rh   z"T5GemmaClassificationHead.__init__y  s1    zz$;<		+z:r8   rt   r   c                 J    | j                  |      }| j                  |      }|S rg   )rk   r   )rU   rt   s     r9   ru   z!T5GemmaClassificationHead.forward~  s$    ]3m4r8   )rA   )r/   r0   r1   r2   r\   r]   rh   r   r   ru   r^   r_   s   @r9   r   r   v  s<    7;C ;S ;SX ;
U\\ ell r8   r   c                   j     e Zd ZdZd	dededef fdZdej                  dej                  fdZ	 xZ
S )
T5GemmaLMHeadz.Head for language modeling (generation) tasks.rO   rF   r   c                 \    t         |           t        j                  |||      | _        y )Nr   )rS   rh   ri   r   r   )rU   rO   rF   r   rX   s       r9   rh   zT5GemmaLMHead.__init__  s"    		+zEr8   rt   r   c                 (    | j                  |      }|S rg   )r   )rU   rt   logitss      r9   ru   zT5GemmaLMHead.forward  s    }-r8   )F)r/   r0   r1   r2   r\   r3   rh   r   r   ru   r^   r_   s   @r9   r   r     s?    8FC FS F FU\\ ell r8   r   c                       e Zd ZU eed<   dZdZddgZe e	e
dd       e	e
dd	       e	edd	      gd
Z ej                         d        Zd Zy)T5GemmaPreTrainedModelrm   modelTr   r   r   r   )index
layer_namer   )rt   
attentionsc                 @   t        j                  | |       | j                  j                  }t	        |t
              r|j                  j                  j                  d   dz  }t        j                  |j                  j                  d||z         t        |j                  d      rA|j                  j                  *t        j                  |j                  j                         y y y t	        |t              rm| j                  j                  sV|j                  j                  j                  d   dz  }t        j                  |j                  j                  d||z         y y d|j                   j"                  v r t        j                  |j                         y y )Nr   g      rA   )meanstdr   RMSNorm)r   _init_weightsrm   rH   rL   r   r   weightr   initnormal_hasattrr   zeros_r   rE   rX   r/   )rU   moduler   scales       r9   r   z$T5GemmaPreTrainedModel._init_weights  s+    	%%dF3kk++f78OO**003t;ELL//csU{Kv/FOO4H4H4TFOO001 5U/.;;22..44Q74?V__33#3;O 3 &**333KK& 4r8   c                 `   | j                   j                  j                  }| j                   j                  j                  }|t	        d      |j                  |j                        }|dddf   j                         |dddf<   ||d<   |t	        d      |j                  |dk(  |       |S )	z
        Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
        pad_token_id replacement for labels that were -100.
        This is a common preparation step for decoder inputs in sequence-to-sequence models.
        Nz:self.model.config.decoder.bos_token_id has to be defined. .r   r   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	rm   r?   rI   rJ   r   	new_zerosr   clonemasked_fill_)rU   	input_idsdecoder_start_token_idrJ   shifted_input_idss        r9   _shift_rightz#T5GemmaPreTrainedModel._shift_right  s     "&!4!4!A!A{{**77!)YZZ &//	@%.sCRCx%8%>%>%@#qr'"$:&!XYY 	&&'8D'@,O  r8   N)r/   r0   r1   r;   r4   base_model_prefixsupports_gradient_checkpointing_no_split_modulesr   r!   ry   r   _can_record_outputsr   no_gradr   r   r7   r8   r9   r   r     sx    &*#.0EF,/q[Q/q\R0lS
 U]]_' '"!r8   r   	token_idsrt   rJ   r   c                    | <|t        d      | |k7  j                  |j                  t        j                        }|S t        j
                  |j                  d   |j                  d   f|j                  t        j                        }|S )z%Construct the default attention mask.z3`pad_token_id` is required for padding information.r   r   devicedtype)r   tor  r   longonesr   )r   rt   rJ   r   s       r9   make_default_2d_attention_maskr    s     RSS#|3778L8LejjY
    #]%8%8%;<]EYEYafakak
 r8   c                        e Zd ZeedZ fdZee	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dee   d	eez  fd
              Z xZS )T5GemmaEncoder)r   rt   c           	      T   t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        |j                  |j                        | _        d| _        t        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t        j$                  |j&                        | _        t+        |      | _        | j/                          y c c}w Nr   Frm   )rS   rh   rJ   padding_idxrF   ri   	EmbeddingrO   embed_tokensra   r   normgradient_checkpointing
ModuleListrangenum_hidden_layersr   r   rj   rB   rk   rw   
rotary_emb	post_initr}   s      r9   rh   zT5GemmaEncoder.__init__       !.. ++LL):):F<N<NPTP`P`a"6#5#56;N;NO	&+#mmEJ6KcKcEde	 3e
 zz&"5"560? 	 f    D%Nr   r   r   inputs_embedsrV   r   c                    |d u |d uz  rt        d      |j                  dd        || j                  |      }|?t        j                  |j
                  d   |j                        }|j                  d      }|!t        ||| j                  j                        }t        |x}t              s'| j                  ||d}t        di |t        di |d}|}t        j                  | j                  j                   dz  |j"                  	      }	||	z  }| j%                  |      }| j'                  ||      }
t)        | j*                  d | j                  j,                         D ]+  \  }} |||
|| j                  j.                  |      |fi |}- | j1                  |      }| j%                  |      }t3        |
      S )N:You must specify exactly one of input_ids or inputs_embedsr=   r   r  r   )rm   r  r   full_attentionsliding_attention      ?r  )last_hidden_stater7   )r   rQ   r  r   aranger   r  	unsqueezer  rm   rJ   rL   rM   r   r   tensorrO   r  rk   r  	enumerater   r  r   r  r   )rU   r   r   r   r  rV   self_attn_mask_mappingmask_kwargsrt   
normalizerr   ilayer_modules                r9   ru   zT5GemmaEncoder.forward  s    -t";<YZZ 	

$d+  --i8M <<(;(;A(>}G[G[\L'11!4L!;I}VZVaVaVnVnoNNB0DI++!."0K #<"Jk"J%M%\P[%\&"
 &\\$++"9"93">mFYFYZ
%
2]3"oom\J(5Tt{{7T7T)UV 	OA|(#&t{{'>'>q'AB	
 M	 		-0]3+
 	
r8   NNNN)r/   r0   r1   ry   r   r   rh   r    r"   r   r   r   r   r   r   r   r   ru   r^   r_   s   @r9   r  r    s    *,
$   .2.204266
##d*6
 t+6
 &&-	6

 ((4/6
 +,6
 
	 6
   6
r8   r  c                   T    e Zd Z eed       eed      edZ fdZe	e
	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	edz  d
ej                  dz  dedz  dej                  dz  dej                  dz  dee   deez  fd              Z xZS )T5GemmaDecoderr   )r   )r   cross_attentionsrt   c           	      T   t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        |j                  |j                        | _        d| _        t        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t        j$                  |j&                        | _        t+        |      | _        | j/                          y c c}w r
  )rS   rh   rJ   r  rF   ri   r  rO   r  ra   r   r  r  r  r  r  r   r   rj   rB   rk   rw   r  r  r}   s      r9   rh   zT5GemmaDecoder.__init__7  r  r  Nr   r   r   r=   r  rN   r   r   rV   r   c	                    |d u |d uz  rt        d      |t        d      || j                  |      }| j                  s,|r*|(t        t	        | j
                        t	                     }|V||j                         nd}
t        j                  |j                  d   |j                        |
z   }|j                  d      }|#|!t        ||| j
                  j                        }t        |x}t              s7| j
                  ||||j                   nd |d}t#        di |t%        di |d}t        |x}t              sd	t'        | j
                  |||
      i}|}t        j(                  | j
                  j*                  dz  |j,                        }||z  }| j/                  |      }| j1                  ||      }t3        | j4                  d | j
                  j6                         D ]2  \  }} ||||| j
                  j8                  |      |||||d	   fi |	}4 | j;                  |      }| j/                  |      }t=        ||      S )Nr  z0`encoder_hidden_states` must be given in decoderr  r   r   r  )rm   r  r   r=   r   r  r  )rm   r  r   r   r  r   )r!  r=   r7   )r   r  r   r
   r	   rm   get_seq_lengthr   r"  r   r  r#  r  rJ   rL   rM   r   r   r   r   r$  rO   r  rk   r  r%  r   r  r   r  r   )rU   r   r   r   r=   r  rN   r   r   rV   past_seen_tokensr&  r'  cross_attn_mask_mappingrt   r(  r   r)  r*  s                      r9   ru   zT5GemmaDecoder.forwardI  su    -t";<YZZ (OPP  --i8M}}/F 2,dkk2RT`TbcOCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L!o&=;I}VZVaVaVnVnoNNB0DI++!."0KZKf?#G#Glp ,K #5"C{"C%F%U%U&"
 5KK1TR ";;;"/#9*?	#'# &\\$++"9"93">mFYFYZ
%
2]3"oom\J(5Tt{{7T7T)UV 	OA|(#&t{{'>'>q'AB%'(89
 
M	 		-0]38++
 	
r8   )NNNNNNNN)r/   r0   r1   r!   ry   r   r   r   rh   r    r"   r   r   r   r
   r   r3   r   r   r   r   ru   r^   r_   s   @r9   r-  r-  0  s   $%9C*+@J,$   .2.2046:26!%596:P
##d*P
 t+P
 &&-	P

 -t3P
 ((4/P
 $;P
  %||d2P
 !&t 3P
 +,P
 
:	:P
   P
r8   r-  c                       e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  dedz  dedz  de	j                  dz  de	j                  dz  dedz  dee   defd              Z xZS )T5GemmaModelrm   c                     t         |   |       |j                  st        d      t	        |j
                        | _        t        |j                        | _        | j                          y )NzVT5GemmaModel only support encoder-decoder modeling. Use `T5GemmaEncoderModel` instead.)	rS   rh   r@   r   r  r>   r-  r?   r  rl   s     r9   rh   zT5GemmaModel.__init__  sO     ((uvv%fnn5%fnn5r8   c                 6    | j                   j                         S rg   r>   get_input_embeddingsrU   s    r9   r9  z!T5GemmaModel.get_input_embeddings      ||0022r8   c                 8    | j                   j                  |      S rg   r>   set_input_embeddingsrU   new_embeddingss     r9   r>  z!T5GemmaModel.set_input_embeddings      ||00@@r8   Nr   r   r   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsr=   r  decoder_inputs_embedsrN   rV   r   c                    | | j                   d||||	d|}|j                  } | j                  d||||
||||d|}t        |j                  |j                  |j                  dd      r|j                  n|j                  f|j                  |j                  |j                  |j                  |j                        S )aX  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        r   r   r   r  )r   r   r   r  r=   r   r   rN   output_hidden_statesF)r!  r=   decoder_hidden_statesdecoder_attentionsr.  encoder_last_hidden_stater   encoder_attentionsr7   )	r>   r!  r?   r   r=   r   rt   r   r.  )rU   r   r   r   rB  rC  rD  rE  r=   r  rF  rN   rV   r   decoder_outputss                  r9   ru   zT5GemmaModel.forward  s    , "*dll #-)+	
 O !0 A A&$,, 

'1-/+"7#1

 

 "-??+;;zz0%8 #2"?"?!335.99,==&5&G&G"1"?"?.99
 	
r8   )NNNNNNNNNNN)r/   r0   r1   r;   rh   r9  r>  r   r   r   r   r   
BoolTensorr   r
   r   r3   r   r   r   ru   r^   r_   s   @r9   r5  r5    sA   	} 	3A  .2370459:>8<266:-159!%6
##d*6
 ))D06
 &&-	6

 !++d26
 !& 0 04 76
 $..56
 )4/6
 -t36
 ||d*6
  %||d26
 $;6
 +,6
 
6
  6
r8   r5  c                        e Zd Zdef fdZd Zd Zee	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
ee   defd              Z xZS )T5GemmaEncoderModelrm   c                     t         |   |       |j                  rt        d      t	        |j
                        | _        | j                          y )NzQT5GemmaEncoderModel only supports encoder-only model. Use `T5GemmaModel` instead.)rS   rh   r@   r   r  r>   r  rl   s     r9   rh   zT5GemmaEncoderModel.__init__  s?     $$pqq%fnn5r8   c                 6    | j                   j                         S rg   r8  r:  s    r9   r9  z(T5GemmaEncoderModel.get_input_embeddings  r;  r8   c                 8    | j                   j                  |      S rg   r=  r?  s     r9   r>  z(T5GemmaEncoderModel.set_input_embeddings  rA  r8   Nr   r   r   r  rV   r   c                 4     | j                   d||||d|}|S )NrH  r7   )r>   )rU   r   r   r   r  rV   rE  s          r9   ru   zT5GemmaEncoderModel.forward  s7     '$,, 
)%'	

 
 r8   r+  )r/   r0   r1   r;   rh   r9  r>  r   r   r   r   r   r   r   r   r   ru   r^   r_   s   @r9   rQ  rQ    s    } 3A  .23704-1##d* ))D0 &&-	
 ||d* +, 
  r8   rQ  c            "            e Zd ZddiZddiZddgdgfiZdef fdZd	 Zd
 Z	e
e	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dej                  dz  dej                  dz  dej                  dz  dedz  deej(                  z  dee   deej                     ez  fd              Zdej(                  fdZ xZS )T5GemmaForConditionalGenerationzlm_head.out_proj.weightz!model.decoder.embed_tokens.weightzlm_head.out_projcolwise_gather_outputrt   r   rm   c                    d|_         t        | 	  |       t        |      | _        |j
                  j                  | _        t        |j
                  j                  | j                        | _	        d| _
        | j                          y )NTForMaskedLM)r@   rS   rh   r5  r   r?   rF   r   rO   lm_head	loss_typer  rl   s     r9   rh   z(T5GemmaForConditionalGeneration.__init__  sb    $(! !&)
 ..33$V^^%?%?Q&r8   c                 &    || j                   _        y rg   r[  r   r?  s     r9   set_output_embeddingsz5T5GemmaForConditionalGeneration.set_output_embeddings!  s     .r8   c                 .    | j                   j                  S rg   r^  r:  s    r9   get_output_embeddingsz5T5GemmaForConditionalGeneration.get_output_embeddings$  s    ||$$$r8   Nr   r   r   rB  rC  rD  rE  r=   r  rF  labelsrN   logits_to_keeprV   r   c                    |||
| j                  |      } | j                  d|||||||||	|
|d|}|j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         }| j                         j                  }|j                  3||j                  z  }t        j                  |      }||j                  z  }d}| | j                  ||| j                  fi |}t        |||j                  |j                   |j"                  |j$                  |j&                  |j(                  |j*                  	      S )a  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)r   r   r   rB  rC  rD  rE  r=   r  rF  rN   )	lossr   r=   rJ  rK  r.  rL  r   rM  r7   )r   r   r!  rL   r\   slicer[  get_decoderrm   final_logit_softcappingr   tanhloss_functionrF   r   r=   rJ  rK  r.  rL  r   rM  )rU   r   r   r   rB  rC  rD  rE  r=   r  rF  rb  rN   rc  rV   rN  rt   slice_indicesr   decoder_configre  s                        r9   ru   z'T5GemmaForConditionalGeneration.forward'  ss   : "3";@U@] $ 1 1& 9.8djj /
)%/#9!5++'"7/
 /
 (998B>SV8W~ot4]kmA}a,?@A))+2211=nDDDFZZ'FnDDDF%4%%ffdooPPD+;;"1"G"G.AA,==&5&O&O"1"G"G.AA

 
	
r8   c                 $    | j                  |      S rg   )r   )rU   rb  s     r9   %prepare_decoder_input_ids_from_labelszET5GemmaForConditionalGeneration.prepare_decoder_input_ids_from_labelsr  s      ((r8   )NNNNNNNNNNNNr   )r/   r0   r1   _tied_weights_keys_tp_plan_pp_planr;   rh   r_  ra  r   r   r   r   r   rO  r   r
   r3   r\   r   r   r   r   r   ru   rn  r^   r_   s   @r9   rW  rW    s   35XY"$;<H"o%6
$CDH	} 	/%  .2370459:>8<266:26:>*.!%-.G
##d*G
 ))D0G
 &&-	G

 !++d2G
 !& 0 04 7G
 $..5G
 )4/G
 -t3G
 ((4/G
  %0047G
   4'G
 $;G
 ell*G
 +,G
  
u  	!O	3!G
  G
R)ELL )r8   rW  c                       e Zd Zddededz  f fdZd Zd Zee		 	 	 	 	 	 	 	 	 	 dde
j                  dz  de
j                  dz  d	e
j                  dz  d
e
j                  dz  de
j                  dz  de
j                  dz  dedz  de
j                  dz  de
j                  dz  de
j                  dz  dee   defd              Z xZS ) T5GemmaForSequenceClassificationNrm   r@   c                    |||_         t        | 	  |       |j                  | _        |j                   rt	        |      | _        nt        |      | _        |j                  j                  }|j                   r|j                  j                  }t        |dd      }t        || j                  |      | _        | j                          y)z
        is_encoder_decoder (`Optional`, *optional*):
            Whether use encoder_decoder for sequence classification. When set to False, only encoder is used.
        NrC   皙?r@   rS   rh   r   r5  r   rQ  r>   rO   r?   rR   r   scorer  rU   rm   r@   rO   classifier_dropoutrX   s        r9   rh   z)T5GemmaForSequenceClassification.__init__x  s    
 )(:F%  ++$$%f-DJ,V4DJnn00$$ ..44K$V-FL.{DOOM_`
r8   c                 6    | j                   j                         S rg   r   r9  r:  s    r9   r9  z5T5GemmaForSequenceClassification.get_input_embeddings      zz..00r8   c                 :    | j                   j                  |       y rg   r   r>  rU   values     r9   r>  z5T5GemmaForSequenceClassification.set_input_embeddings      

''.r8   r   r   r   rB  rC  rD  rE  r  rF  rb  rV   r   c                    | j                   j                  r'|%|#t        d| j                  j                   d      | j                   j                  r"| |	|t        d      | j                  |      }| j                   j                  rB | j                  |f||||||||	dd	|}|j                  }|j                  }|j                  }n; | j                  |f|||d|}|j                  }|j                  }|j                  }| j                  |      }||j                  d   }n|j                  d   }| j                   j                  |d	k7  rt        d
      | j                   j                  d}n||| j                   j                  k7  j!                  |j"                  t$        j&                        }t%        j(                  |j                  d   |j"                  t$        j&                        }||z  j+                  d      }| j                   j                  r[|d	z  }t%        j,                  ||j                  d   d	z
        }n.d}t.        j1                  | j                  j                   d       |t%        j(                  ||j"                        |f   }d}|
| j3                  ||
|| j                         }t5        ||||      S )  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N8Passing input embeddings is currently not supported for  in encoder-decoder mode.If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.F	r   r   rB  rC  rD  rE  r  rF  rN   r   r   r  r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r   r   )maxz will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r  )r   rb  pooled_logitsrm   re  r   rt   r   )rm   r@   NotImplementedErrorrX   r/   r   r   r   r!  rJ  rK  rt   r   rw  r   rJ   r  r  r   int32r"  argmaxclamploggerwarning_oncerj  r   )rU   r   r   r   rB  rC  rD  rE  r  rF  rb  rV   outputsr!  rt   r   r   
batch_sizelast_non_pad_tokennon_pad_masktoken_indicesr  re  s                          r9   ru   z(T5GemmaForSequenceClassification.forward  s   2 ;;))y/@]E^%J4>>KbKbJcc|} 
 ;;))/@/HMbMj  U 
 !% 1 1) <;;))*4$**+-)"3'=%9 /+&;+ +G !( 9 9#99M 33J'1tzz(-)+	(
 (G !( 9 9#11M ++J-. "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J{{--"a'"%*[[1CIZI`I`acIdghIh%i"!#>>**+ ,Z Z
 u||Jv}}MOaab%%VFR_hlhshs%tD' '!	
 	
r8   rg   
NNNNNNNNNN)r/   r0   r1   r;   r3   rh   r9  r>  r   r   r   r   r   r   r   r   r   r   ru   r^   r_   s   @r9   rs  rs  v  sN   } $+ .1/  .2.204596:8<2626:>*.i
##d*i
 t+i
 &&-	i

 !++d2i
 !&t 3i
 $..5i
 )4/i
 ((4/i
  %0047i
   4'i
 +,i
 
"i
  i
r8   rs  c                       e Zd Zddededz  f fdZd Zd Zee		 	 	 	 	 	 	 	 	 	 dde
j                  dz  de
j                  dz  d	e
j                  dz  d
e
j                  dz  de
j                  dz  de
j                  dz  dedz  de
j                  dz  de
j                  dz  de
j                  dz  dee   defd              Z xZS )T5GemmaForTokenClassificationNrm   r@   c                    |||_         t        | 	  |       |j                  | _        |j                   rt	        |      | _        nt        |      | _        |j                  j                  }|j                   r|j                  j                  }t        |dd      }t        || j                  |      | _        | j                          y)z
        is_encoder_decoder (`Optional`, *optional*):
            Whether use encoder_decoder for token classification. When set to False, only encoder is used.
        NrC   ru  rv  rx  s        r9   rh   z&T5GemmaForTokenClassification.__init__  s    
 )(:F%  ++$$%f-DJ,V4DJnn00$$ ..44K$V-FL.{DOOM_`
r8   c                 6    | j                   j                         S rg   r{  r:  s    r9   r9  z2T5GemmaForTokenClassification.get_input_embeddings  r|  r8   c                 :    | j                   j                  |       y rg   r~  r  s     r9   r>  z2T5GemmaForTokenClassification.set_input_embeddings   r  r8   r   r   r   rB  rC  rD  rE  r  rF  rb  rV   r   c                    | j                   j                  r'|%|#t        d| j                  j                   d      | j                   j                  r"| |	|t        d      | j                  |      }| j                   j                  rB | j                  |f||||||||	dd	|}|j                  }|j                  }|j                  }n; | j                  |f|||d|}|j                  }|j                  }|j                  }| j                  |      }d}|
| j                  ||
| j                         }t        ||||      S )	r  Nr  r  r  Fr  r  r  )rm   r@   r  rX   r/   r   r   r   r!  rJ  rK  rt   r   rw  rj  r   )rU   r   r   r   rB  rC  rD  rE  r  rF  rb  rV   r  r!  rt   r   r   re  s                     r9   ru   z%T5GemmaForTokenClassification.forward#  s   4 ;;))y/@]E^%J4>>KbKbJcc|}  ;;))/@/HMbMj  U 
 !% 1 1) <;;))*4$**+-)"3'=%9 /+&;+ +G !( 9 9#99M 33J'1tzz(-)+	(
 (G !( 9 9#11M ++J-.%%ffdkkBD$'!	
 	
r8   rg   r  )r/   r0   r1   r;   r3   rh   r9  r>  r   r   r   r   r   r   r   r   r   r   ru   r^   r_   s   @r9   r  r    sN   } $+ 01/  .2.204596:8<2626:>*.N
##d*N
 t+N
 &&-	N

 !++d2N
 !&t 3N
 $..5N
 )4/N
 ((4/N
  %0047N
   4'N
 +,N
 
N
  N
r8   r  )r;   r-   rW  r5  rQ  r   rs  r  )Xcollections.abcr   typingr   r   torch.nnri   huggingface_hub.dataclassesr    r   r   cache_utilsr   r	   r
   configuration_utilsr   
generationr   masking_utilsr   r   r   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr    utils.output_capturingr!   r"   gemma2.configuration_gemma2r$   gemma2.modeling_gemma2r%   r&   r'   r(   r)   r*   
get_loggerr/   r  r-   r;   ra   re   rw   ry   r   r   r   Moduler   r   r   r   r   r\   r  r  r-  r5  rQ  rW  rs  r  __all__r7   r8   r9   <module>r     sc   %    . & C C 3 )  C 9  G &  8 E 6  
		H	% 673, 3  83. 677($ 7(  87(t	] 		 		2 	+? +D)O D)N14 1hF4 FR		 	BII 	 8!2 8! 8!v$&<< * \\	"P
+ P
fk
+ k
\ J
) J
 J
Z !0 ! !Hb)&<o b)J I
'= I
 I
X o
$: o
 o
d	r8   