
    iH                     2   d dl mZ d dlZd dlmZ d dlmZ ddlmZ ddl	m
Z
mZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZmZ ddlmZ ddlmZ ddl m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1  e#jd                  e3      Z4 e"d      e G d de                    Z5 G d de.      Z6 G d de+      Z7 G d de/      Z8	 	 	 d6dejr                  dejt                  dejt                  d ejt                  d!ejt                  dz  d"e;e<z  d#e;dz  d$e;dz  d%e=ejt                  ejt                  f   fd&Z> G d' d(e'      Z? G d) d*e      Z@ G d+ d,e-      ZA G d- d.e,      ZB G d/ d0e(      ZC G d1 d2e)      ZD G d3 d4e*      ZEg d5ZFy)7    )CallableN)strict   )ACT2FN)CacheDynamicCache)PreTrainedConfig)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSRopeParametersdynamic_rope_update)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging)maybe_autocast   )GemmaAttentionGemmaForCausalLMGemmaForSequenceClassificationGemmaForTokenClassificationGemmaMLP
GemmaModelGemmaPreTrainedModelGemmaRMSNormGemmaRotaryEmbeddingapply_rotary_pos_emb	repeat_kvzgoogle/gemma2-7b)
checkpointc                   &    e Zd ZU dZdZdgZddddddddZdgdgfd	d
gd	gfd	gd	gfdZdZe	e
d<   dZe	e
d<   dZe	e
d<   dZe	e
d<   dZe	e
d<   dZe	e
d<   dZe	e
d<   dZee
d<   dZe	e
d<   dZee
d<   d Zee
d!<   d"Zee
d#<   d$Ze	d%z  e
d&<   d'Ze	ee	   z  d%z  e
d(<   d)Ze	d%z  e
d*<   d"Zee
d+<   d%Zee z  d%z  e
d,<   d-Z!ee
d.<   d/Z"e	ez  d%z  e
d0<   dZ#e	e
d1<   d2Z$e	d%z  e
d3<   d%Z%ee   d%z  e
d4<   d5Z&ed%z  e
d6<   d7Z'ed%z  e
d8<   d%Z(ed%z  e
d9<    fd:Z)d; Z* xZ+S )<Gemma2Configa  
    query_pre_attn_scalar (`float`, *optional*, defaults to 256):
        scaling factor used on the attention scores
    final_logit_softcapping (`float`, *optional*, defaults to 30.0):
        scaling factor when applying tanh softcapping on the logits.
    attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
        scaling factor when applying tanh softcapping on the attention scores.
    use_bidirectional_attention (`bool`, *optional*):
        If True, the model will attend to all text tokens instead of using a causal mask.

    ```python
    >>> from transformers import Gemma2Model, Gemma2Config
    >>> # Initializing a Gemma2 gemma2-7b style configuration
    >>> configuration = Gemma2Config()
    >>> # Initializing a model from the gemma2-7b style configuration
    >>> model = Gemma2Model(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```gemma2past_key_valuescolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi  
vocab_sizei 	  hidden_sizei $  intermediate_size   num_hidden_layers   num_attention_heads   num_key_value_heads   head_dimgelu_pytorch_tanhhidden_activationi    max_position_embeddingsg{Gz?initializer_rangegư>rms_norm_epsT	use_cacher   Npad_token_id   eos_token_idr   bos_token_idtie_word_embeddingsrope_parametersFattention_bias        attention_dropoutquery_pre_attn_scalari   sliding_windowlayer_typesg      >@final_logit_softcappingg      I@attn_logit_softcappinguse_bidirectional_attentionc                     | j                   ;t        | j                        D cg c]  }t        |dz   dz        rdnd c}| _         t	        |   di | y c c}w )NrE   r   sliding_attentionfull_attention )rO   ranger7   boolsuper__post_init__)selfkwargsi	__class__s      z/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/gemma2/modular_gemma2.pyrZ   zGemma2Config.__post_init__y   s`    #X]^b^t^tXu STtQUaK'8#>NN D 	''	 s   Ac                     | j                   | j                  z  dk7  r&t        d| j                    d| j                   d      y)zOPart of `@strict`-powered validation. Validates the architecture of the config.r   zThe hidden size (z6) is not a multiple of the number of attention heads (z).N)r4   r9   
ValueError)r[   s    r_   validate_architecturez"Gemma2Config.validate_architecture   sS    d666!;#D$4$4#5 622327  <    ),__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr3   int__annotations__r4   r5   r7   r9   r;   r=   r?   strr@   rA   floatrB   rC   rX   rD   rF   listrG   rH   rI   r   dictrJ   rL   rM   rN   rO   rP   rQ   rR   rZ   rb   __classcell__r^   s   @r_   r'   r'   7   s   ( J#4"5%.%.%.%."+ )"+ &(9:#%568IJ!"_$56 JK!s!s    Hc0s0#'S'#u#L%It L#* +,L#S	/D(, L#*  $$48O^d*T18 ND ,/sU{T)/!$3$!%NC$J%$(KcT!(,0UT\0+/EDL//33(rc   r'   c                       e Zd Zy)Gemma2RMSNormNrd   re   rf   rV   rc   r_   ru   ru          rc   ru   c                        e Zd Z fdZ xZS )	Gemma2MLPc                 T    t         |   |       t        |j                     | _        y N)rY   __init__r   r?   act_fnr[   configr^   s     r_   r|   zGemma2MLP.__init__   s"     V556rc   )rd   re   rf   r|   rr   rs   s   @r_   ry   ry      s    7 7rc   ry   c                   R    e Zd ZddefdZ ej                         ed               Zy)Gemma2RotaryEmbeddingNr   c                    t         j                  j                          |j                  | _        |j                  | _        || _        | j                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j                  |      \  }| _        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultinv_freqF)
persistentoriginal_inv_freq)nnModuler|   r@   max_seq_len_cachedoriginal_max_seq_lenr   rI   r   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r[   r   devicerope_init_fnr   s        r_   r|   zGemma2RotaryEmbedding.__init__   s    
		"("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuUrc   c                 N   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   rE   mpscpuF)device_typeenabledr   )dim)dtype)r   ro   expandshapetor   
isinstancetypern   r   	transposetorchcatcosr   sinr   )
r[   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r_   forwardzGemma2RotaryEmbedding.forward   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   BFF$r{   )	rd   re   rf   r'   r|   r   no_gradr   r   rV   rc   r_   r   r      s4    V| V  U]]_<  <rc   r   modulequerykeyvaluer/   dropoutscalingsoftcapreturnc                 |   || j                   dz  }t        || j                        }	t        || j                        }
t        j                  ||	j                  dd            |z  }|||z  }t        j                  |      }||z  }|||z   }t        j                  j                  |dt        j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||
      }|j                  dd      j                         }||fS )N      r   r   r   )r   r   )ptrainingrE   )r=   r$   num_key_value_groupsr   matmulr   tanhr   
functionalsoftmaxfloat32r   r   r   r   
contiguous)r   r   r   r   r/   r   r   r   r\   
key_statesvalue_statesattn_weightsattn_outputs                r_   eager_attention_forwardr      s    //4'3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL#g-zz,/#g-!#n4 ==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$rc   c                       e Zd Zdedef fdZ	 	 	 ddej                  deej                  ej                  f   dz  dej                  dz  de	dz  d	e
e   d
eej                  ej                  dz  eej                     dz  f   fdZ xZS )Gemma2Attentionr   	layer_idxc                 t   t        |d      r|j                  |   nd | _        t        |   ||       | j
                  j                  | _        | j
                  j                  | _        t        |dd       | _	        |j                  dz  | _        | j                  dk(  r|j                  | _        y d | _        y )NrO   rR   Fr   rT   )hasattrrO   
layer_typerY   r|   r   rQ   rL   getattr	is_causalrM   r   rN   r[   r   r   r^   s      r_   r|   zGemma2Attention.__init__   s    ;B6=;Y&,,Y7_c+&*kk&H&H#!%!>!>$V-JERR33T97;J]7]f33cgrc   Nr.   position_embeddingsr/   r)   r\   r   c                 6   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
|\  }}t        ||	||      \  }}	| |j                  |	|
| j                        \  }	}
t        j                  | j                  j                  t              } || ||	|
|f| j                  r| j                   nd| j"                  | j$                  | j&                  d|\  }} |j(                  g |d j+                         }| j-                  |      }||fS )Nr   rE   r   rK   )r   r   rN   r   )r   r=   q_projviewr   k_projv_projr#   updater   r   get_interfacer   _attn_implementationr   r   rL   r   rN   rQ   reshaper   o_proj)r[   r.   r   r/   r)   r\   input_shapehidden_shapequery_statesr   r   r   r   attention_interfacer   r   s                   r_   r   zGemma2Attention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j&'6'='=j,X\XfXf'g$J(?(M(MKK,,.E)
 %8%
 /3mmD**LL..//%
 %
!\ *k));;;;FFHkk+.L((rc   )NNN)rd   re   rf   r'   rl   r|   r   Tensortupler   r   r   r   rr   rs   s   @r_   r   r      s    h| h h IM.2(,()||() #5<<#=>E() t+	()
 () -.() 
u||U\\D0%2E2LL	M()rc   r   c                   .    e Zd Zdedef fdZ	 	 	 	 ddej                  deej                  ej                  f   dz  dej                  dz  dej                  dz  d	e
dz  d
eej                  eej                  ej                  f   dz  f   fdZ xZS )Gemma2DecoderLayerr   r   c                    t         |           |j                  | _        || _        t	        ||      | _        t        |      | _        t        |j                  |j                        | _
        t        |j                  |j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )N)r   r   )eps)rY   r|   r4   r   r   	self_attnry   mlpru   rB   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   s      r_   r|   zGemma2DecoderLayer.__init__  s    !--()LV$,V-?-?VEXEXY(5f6H6HfNaNa(b%)6v7I7IvObOb)c&*78J8JPVPcPc*d'rc   Nr.   r   r/   r   r)   r   c           	         |}| j                  |      } | j                  d|||||d|\  }}| j                  |      }||z   }|}| j                  |      }| j	                  |      }| j                  |      }||z   }|S )N)r.   r   r/   r   r)   rV   )r   r   r   r   r   r   )	r[   r.   r   r/   r   r)   r\   residual_s	            r_   r   zGemma2DecoderLayer.forward  s     !,,]; *4>> 
' 3)%+
 
q 55mD =0 66}E/77F =0rc   )NNNN)rd   re   rf   r'   rl   r|   r   r   r   
LongTensorr   FloatTensorr   rr   rs   s   @r_   r   r     s    
e| 
e 
e IM.204(,|| #5<<#=>E t+	
 &&-  
u  %(9(95;L;L(L"MPT"TT	Urc   r   c                       e Zd Zy)Gemma2PreTrainedModelNrv   rV   rc   r_   r   r   <  rw   rc   r   c                        e Zd Zdef fdZ	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	e
dz  d
ee   defdZ xZS )Gemma2Modelr   c           	          t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        t        |      | _	        y c c}w r{   )
rY   r|   r   
ModuleListrW   r7   r   r1   r   
rotary_embr   s      r_   r|   zGemma2Model.__init__A  sU     mmDI&JbJbDcdy	2d
 07 es   A'Nr,   r/   r   r)   r-   rC   r\   r   c           	         |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|V||j	                         nd}t        j                  |j                  d   |j                        |z   }|j                  d      }t        |x}	t              s)| j                  ||||d}
t        d
i |
t        d
i |
d}	|}| j                  ||      }t        | j                   d | j                  j"                         D ]-  \  }} ||f|	| j                  j$                  |      |||d|}/ | j'                  |      }t)        ||	      S )Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   rE   )r   )r   r-   r/   r)   r   )rU   rT   )r/   r   r   r)   )last_hidden_stater)   rV   )ra   r0   r   r   get_seq_lengthr   aranger   r   	unsqueezer   rq   r
   r   r   	enumerater1   r7   rO   r2   r   )r[   r,   r/   r   r)   r-   rC   r\   past_seen_tokenscausal_mask_mappingmask_kwargsr.   r   r]   decoder_layers                  r_   r   zGemma2Model.forwardH  s    -t";<YZZ *.*;*;I*FM0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-F ++!."0#2 ,K #5"C{"C%F%U%U# &"oom\J )$++6U8U8U*V W 	A})24;;3J3J13MN$7) / M	 		-0&++
 	
rc   )NNNNNN)rd   re   rf   r'   r|   r   r   r   r   r   rX   r   r   r   r   rr   rs   s   @r_   r   r   @  s    8| 8 .2.204(,26!%;
##d*;
 t+;
 &&-	;

 ;
 ((4/;
 $;;
 +,;
 
!;
rc   r   c                       e Zd Z fdZ	 	 	 	 	 	 	 	 ddej
                  dz  dej                  dz  dej
                  dz  dedz  dej                  dz  dej
                  dz  d	e	dz  d
e
ej                  z  dee   defdZ xZS )Gemma2ForCausalLMc                 d    t         |   |       t        |      | _        | j	                          y r{   )rY   r|   r   model	post_initr~   s     r_   r|   zGemma2ForCausalLM.__init__  s&      (
rc   Nr,   r/   r   r)   r-   labelsrC   logits_to_keepr\   r   c	           
          | j                   d||||||d|	}
|
j                  }t        |t              rt	        | d      n|}| j                  |dd|ddf         }| j                  j                  G|| j                  j                  z  }t        j                  |      }|| j                  j                  z  }d}| | j                  ||| j                  fi |	}t        |||
j                  |
j                  |
j                        S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma2ForCausalLM

        >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```)r,   r/   r   r)   r-   rC   N)losslogitsr)   r.   
attentionsrV   )r  r   r   rl   slicelm_headr   rP   r   r   loss_functionr3   r   r)   r.   r  )r[   r,   r/   r   r)   r-   r  rC   r  r\   outputsr.   slice_indicesr  r  s                  r_   r   zGemma2ForCausalLM.forward  s   < ,64:: ,
)%+',
 ,
  118B>SV8W~ot4]kmA}a,?@A;;..:dkkAAAFZZ'FdkkAAAF%4%%ffdooPPD%#33!//))
 	
rc   )NNNNNNNr   )rd   re   rf   r|   r   r   r   r   r   rX   rl   r   r   r   r   rr   rs   s   @r_   r   r     s     .2.204(,26*.!%-.;
##d*;
 t+;
 &&-	;

 ;
 ((4/;
   4';
 $;;
 ell*;
 +,;
 
 ;
rc   r   c                       e Zd Zy)Gemma2ForSequenceClassificationNrv   rV   rc   r_   r  r    rw   rc   r  c                       e Zd Zy)Gemma2ForTokenClassificationNrv   rV   rc   r_   r  r    rw   rc   r  )r'   r   r   r   r  r  )rK   NN)Gcollections.abcr   r   torch.nnr   huggingface_hub.dataclassesr   activationsr   cache_utilsr   r   configuration_utilsr	   masking_utilsr
   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   gemma.modeling_gemmar   r   r   r   r   r   r    r!   r"   r#   r$   
get_loggerrd   loggerr'   ru   ry   r   r   r   ro   rl   r   r   r   r   r   r   r   r  r  __all__rV   rc   r_   <module>r%     s   %   . ! . 3 R B 9 O 
 6 & @ @ +    
		H	% -.N# N  /Nb	L 	7 7<0 <N   %II%<<% 
% <<	%
 LL4'% S[% T\% T\% 5<<%&%D3)n 3)l,3 ,^	0 	C
* C
LA
( A
H	&D 		#> 	rc   