
    i2                        d dl mZ d dlZd dlmZ d dlmZ ddlmZm	Z	 ddl
mZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(  ejR                  e*      Z+ ed      e G d de                    Z, G d de$      Z- G d de"      Z. G d de      Z/ G d de       Z0 G d de#      Z1 G d d e(      Z2 G d! d"e!      Z3g d#Z4y)$    )CallableN)strict   )CacheDynamicCache)PreTrainedConfig)create_causal_mask!create_sliding_window_causal_mask)BaseModelOutputWithPast)RopeParametersdynamic_rope_update)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging)maybe_autocast   )CohereAttentionCohereDecoderLayerCohereForCausalLMCohereLayerNormCoherePreTrainedModelCohereRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)Gemma2ModelzCohereForAI/c4ai-command-r-v01)
checkpointc                       e Zd ZU dZdZdgZddddddddZdgdgfd	d
gd	gfd	gd	gfdZdZe	e
d<   dZe	e
d<   dZe	e
d<   dZee
d<   dZe	e
d<   dZe	e
d<   dZe	dz  e
d<   dZee
d<   dZe	e
d<   dZee
d<   dZee
d <   d!Zee
d"<   d#Ze	dz  e
d$<   d%Ze	dz  e
d&<   d'Ze	ee	   z  dz  e
d(<   d!Zee
d)<   dZee z  dz  e
d*<   d+Z!ee
d,<   d-Z"ee	z  e
d.<   d/Z#e	dz  e
d0<   dZ$ee   dz  e
d1<    fd2Z% xZ&S )3Cohere2Configa  
    logit_scale (`float`, *optional*, defaults to 0.0625):
        The scaling factor for the output logits.

    ```python
    >>> from transformers import Cohere2Model, Cohere2Config

    >>> # Initializing a Cohere Nextmodel configuration
    >>> configuration = Cohere2Config()

    >>> # Initializing a model from the Cohere2 configuration
    >>> model = Cohere2Model(configuration) # doctest: +SKIP

    >>> # Accessing the model configuration
    >>> configuration = model.config # doctest: +SKIP
    ```
    cohere2past_key_valuescolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi  
vocab_sizei    hidden_sizei X  intermediate_sizeg      ?logit_scale(   num_hidden_layers@   num_attention_headsNnum_key_value_headssilu
hidden_actmax_position_embeddingsg{Gz?initializer_rangegh㈵>layer_norm_epsT	use_cacher   pad_token_id   bos_token_idi eos_token_idtie_word_embeddingsrope_parametersFattention_bias        attention_dropouti   sliding_windowlayer_typesc                 V   | j                   | j                  | _         | j                  | j                  z  | _        | j                  M|j                  dd      }t        | j                        D cg c]  }t        |dz   |z        rdnd c}| _        t        | (  di | y c c}w )Nsliding_window_pattern      sliding_attentionfull_attention )r4   r3   r-   head_dimrE   popranger1   boolsuper__post_init__)selfkwargs_sliding_window_patterni	__class__s       |/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/cohere2/modular_cohere2.pyrR   zCohere2Config.__post_init__m   s    ##+'+'?'?D$ ((D,D,DD #&,jj1I1&M# t556  (,QU6M,M'N#Tdd D
 	'' s   2B&)'__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr,   int__annotations__r-   r.   r/   floatr1   r3   r4   r6   strr7   r8   r9   r:   rP   r;   r=   r>   listr?   r@   r   dictrA   rC   rD   rE   rR   __classcell__rW   s   @rX   r    r    1   s   $ J#4"5%.%.%.%."+ )"+ &(9:#%568IJ!"_$56 JK"s"Ks!!&*t*J#'S'#u# NE It L#*  L#* +1L#S	/D(1 $$48O^d*T18 ND %(us{(!%NC$J%$(KcT!(( (    r    c                   D    e Zd Z ej                         ed               Zy)Cohere2RotaryEmbeddingc                    | j                   d d d d f   j                         j                  |j                  d   dd      }|d d d d d f   j                         }t	        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  |dd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j                  |j                   
      	j                  |j                   
      fS # 1 sw Y   AxY w)Nr   rI   mpscpuF)device_typeenabledr   )dim)dtype)inv_freqrc   expandshape
isinstancedevicetyperd   r   	transposetorchrepeat_interleavecosattention_scalingsintors   )
rS   xposition_idsinv_freq_expandedposition_ids_expandedrp   freqsembr}   r   s
             rX   forwardzCohere2RotaryEmbedding.forward   s@    !MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))%;C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   =BFF
N)rY   rZ   r[   r{   no_gradr   r   rL   ri   rX   rk   rk      s$    U]]_<  <ri   rk   c                       e Zd Zy)Cohere2LayerNormNrY   rZ   r[   rL   ri   rX   r   r          ri   r   c                      e Zd ZdZddededz  fdZ	 ddej                  de	ej                  ej                  f   dej                  dz  d	e
dz  d
ee   de	ej                  ej                  dz  e	ej                     dz  f   fdZy)Cohere2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                    t         j                  j                  |        || _        || _        t        |d|j                  |j                  z        | _        |j                  |j                  z  | _
        | j                  dz  | _        |j                  | _        d| _        t        |d      r|j                  |   nd }|dk(  r|j                   nd | _        t        j"                  |j                  |j                  | j                  z  |j$                        | _        t        j"                  |j                  |j                  | j                  z  |j$                        | _        t        j"                  |j                  |j                  | j                  z  |j$                        | _        t        j"                  |j                  | j                  z  |j                  |j$                        | _        y )NrM   g      TrE   rJ   )bias)nnModule__init__r   r   getattrr-   r3   rM   r4   num_key_value_groupsscalingrC   	is_causalhasattrrE   rD   LinearrA   q_projk_projv_projo_proj)rS   r   r   
layer_types       rX   r   zCohere2Attention.__init__   s   
		4 "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!96=fm6TV''	2Z^
7AEX7Xf33^bii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
ri   r'   position_embeddingsr(   r"   rT   returnc                 8   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
|\  }}| j                  t        ||	||      \  }}	| |j                  |	|
| j                        \  }	}
t        j                  | j                  j                  t              } || ||	|
|f| j                   sdn| j"                  | j$                  | j                  d|\  }} |j&                  g |d j)                         }| j+                  |      }||fS )Nrm   rI   r   rB   )dropoutr   rD   )rv   rM   r   viewrz   r   r   rD   r   updater   r   get_interfacer   _attn_implementationr   trainingrC   r   reshape
contiguousr   )rS   r'   r   r(   r"   rT   input_shapehidden_shapequery_states
key_statesvalue_statesr}   r   attention_interfaceattn_outputattn_weightss                   rX   r   zCohere2Attention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S*';L*VY[^'_$L*&'6'='=j,X\XfXf'g$J(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HLL..
%
 
%
!\ *k));;;;FFHkk+.L((ri   N)rY   rZ   r[   r\   r    ra   r   r{   Tensortupler   r   r   r   rL   ri   rX   r   r      s    G
} 
t 
< )-()||() #5<<#=>() t+	()
 () +,() 
u||U\\D0%2E2LL	M()ri   r   c                   $    e Zd Zdedef fdZ	 	 	 	 ddej                  deej                  ej                  f   dz  dej                  dz  de	dz  d	e
dz  d
ee   deej                  eej                  ej                  f   dz  f   fdZ xZS )Cohere2DecoderLayerr   r   c                 &    t         |   ||       y r   )rQ   r   )rS   r   r   rW   s      rX   r   zCohere2DecoderLayer.__init__   s    +ri   Nr'   r   r(   r"   r:   rT   r   c           	          |}| j                  |      } | j                  d|||||d|\  }}	| j                  |      }
||z   |
z   }|S )N)r'   r   r(   r"   r:   rL   )input_layernorm	self_attnmlp)rS   r'   r   r(   r"   r:   rT   residualhidden_states_attention_hidden_states_mlps              rX   r   zCohere2DecoderLayer.forward   su     !,,];%3T^^ &
' 3)+&
 &
" !HH]3 #::=NNri   )NNNF)rY   rZ   r[   r    ra   r   r{   r   r   r   rP   r   r   FloatTensorr   rg   rh   s   @rX   r   r      s    ,} , , IM.2(,!&|| #5<<#=>E t+	
  $; +, 
u  %(9(95;L;L(L"MPT"TT	Uri   r   c                   "    e Zd ZU eed<   eedZy)Cohere2PreTrainedModelr   )r'   
attentionsN)rY   rZ   r[   r    rb   r   r   _can_record_outputsrL   ri   rX   r   r      s    ,&ri   r   c                        e Zd Zdef fdZ	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	e
dz  d
ee   defdZ xZS )Cohere2Modelr   c                     t         |   |       t        |j                  |j                        | _        t        j                  |j                  |j                  | j                        | _
        y )N)r-   eps)rQ   r   r   r-   r9   r+   r   	Embeddingr,   padding_idxr)   )rS   r   rW   s     rX   r   zCohere2Model.__init__  sR     $&2D2D6K`K`a	LL):):F<N<NPTP`P`ari   Nr%   r(   r   r"   r&   r:   rT   r   c           
         |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|V||j	                         nd}t        j                  |j                  d   |j                        |z   }|j                  d      }t        |x}	t              s)| j                  ||||d}
t        d
i |
t        d
i |
d}	|}| j                  ||      }t        | j                         D ].  \  }} ||f|	| j                  j"                  |      ||||d|}0 | j%                  |      }t'        ||	      S )Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   rI   )rx   )r   r&   r(   r"   r   )rK   rJ   )r(   r   r"   r:   r   )last_hidden_stater"   rL   )
ValueErrorr)   r   r   get_seq_lengthr{   arangerv   rx   	unsqueezerw   rf   r	   r
   
rotary_emb	enumerater*   rE   r+   r   )rS   r%   r(   r   r"   r&   r:   rT   past_seen_tokenscausal_mask_mappingmask_kwargsr'   r   rV   decoder_layers                  rX   r   zCohere2Model.forward  s    -t";<YZZ  --i8M0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L?-F++!."0#2 ,K #5"C{"C%F%U%U#
 &"oom\J )$++ 6 		A})24;;3J3J13MN$7 /#) M		 		-0&++
 	
ri   )NNNNNN)rY   rZ   r[   r    r   r{   
LongTensorr   r   r   rP   r   r   r   r   rg   rh   s   @rX   r   r     s    b} b .2.204(,26!%7
##d*7
 t+7
 &&-	7

 7
 ((4/7
 $;7
 +,7
 
!7
ri   r   c                       e Zd Zy)Cohere2ForCausalLMNr   rL   ri   rX   r   r   A  r   ri   r   )r    r   r   r   )5collections.abcr   r{   torch.nnr   huggingface_hub.dataclassesr   cache_utilsr   r   configuration_utilsr   masking_utilsr	   r
   modeling_outputsr   modeling_rope_utilsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   cohere.modeling_coherer   r   r   r   r   r   r   r   gemma2.modeling_gemma2r   
get_loggerrY   loggerr    rk   r   r   r   r   r   r   __all__rL   ri   rX   <module>r      s    %   . . 3 R 7 6 & @ @ +	 	 	 1 
		H	% ;<J($ J(  =J(Z<2 <"	 	D) D)N, :2 =
; =
@	* 	 \ri   