
    i'Y                        d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
mZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z,  e%jZ                  e.      Z/ G d dej`                        Z1d Z2d3dZ3 G d dej`                        Z4	 d4dej`                  dejj                  dejj                  dejj                  d ejj                  dz  d!e6d"e6fd#Z7 G d$ d%ej`                        Z8 G d& d'e      Z9e# G d( d)e             Z:e# G d* d+e:             Z; G d, d-e:e      Z< G d. d/ee:      Z= G d0 d1ee:      Z>g d2Z?y)5zPyTorch Persimmon model.    )Callable)OptionalN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)FlashAttentionKwargs) GenericForSequenceClassificationGenericForTokenClassificationGradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)maybe_autocastmerge_with_config_defaults)capture_outputs   )PersimmonConfigc                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Z xZS )PersimmonRotaryEmbeddinginv_freqNconfigc                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultr"   F)
persistentoriginal_inv_freq)super__init__max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr#   rope_parametersr%   compute_default_rope_parametersr   attention_scalingregister_bufferclone)selfr#   devicerope_init_fnr"   	__class__s        /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/persimmon/modeling_persimmon.pyr*   z!PersimmonRotaryEmbedding.__init__<   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuU    r4   ztorch.deviceseq_lenreturnztorch.Tensorc                 n   | j                   d   }| j                   j                  dd      }t        | dd      xs | j                  | j                  z  }t        ||z        }d}d|t        j                  d|dt        j                        j                  |t        j                  	      |z  z  z  }||fS )
a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetapartial_rotary_factorg      ?head_dimNr      dtype)r4   rA   )r.   getgetattrhidden_sizenum_attention_headsinttorcharangeint64tofloat)	r#   r4   r9   baser=   r>   dimattention_factorr"   s	            r7   r/   z8PersimmonRotaryEmbedding.compute_default_rope_parametersL   s    ( %%l3 & 6 6 : :;RTW X6:t4h8J8JfNhNh8h(223 U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r8   c                 N   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r   mpscpuF)device_typeenabledr?   rM   r@   )r"   rK   expandshaperJ   r4   
isinstancetypestrr   	transposerG   catcosr0   sinrA   )
r3   xposition_idsinv_freq_expandedposition_ids_expandedrS   freqsembr]   r^   s
             r7   forwardz PersimmonRotaryEmbedding.forwardm   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   BFF$N)NNN)__name__
__module____qualname__rG   Tensor__annotations__r   r*   staticmethodr   rF   tuplerK   r/   no_gradr   re   __classcell__r6   s   @r7   r!   r!   9   s    llV V   *.+/"*$&*(* t* 
~u$	%	* *> U]]_<  <r8   r!   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..NrP   r?   rU   )rW   rG   r\   )r_   x1x2s      r7   rotate_halfrt   ~   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r8   c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezert   )qkr]   r^   unsqueeze_dimq_embedk_embeds          r7   apply_rotary_pos_embr|      sY    $ --
&C
--
&C3w;q>C/0G3w;q>C/0GGr8   c                   $     e Zd Z fdZd Z xZS )PersimmonMLPc                    t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        |j                     | _
        y rf   )r)   r*   r   LinearrD   intermediate_sizedense_h_to_4hdense_4h_to_hr   
hidden_actactr3   r#   r6   s     r7   r*   zPersimmonMLP.__init__   s^    YYv'9'96;S;STYYv'?'?ASAST&++,r8   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rf   )r   r   r   )r3   hidden_statess     r7   re   zPersimmonMLP.forward   s6    **=9/**=9r8   )rg   rh   ri   r*   re   ro   rp   s   @r7   r~   r~      s    -r8   r~   modulequerykeyvalueattention_maskscalingdropoutc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr?   r   rP   )rM   rA   )ptrainingr   )rG   matmulr[   r   
functionalsoftmaxfloat32rJ   rA   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r7   eager_attention_forwardr      s     <<s}}Q':;gEL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r8   c                       e Zd ZdZddededz  f fdZdej                  de	ej                  ej                  ej                  f   fdZ
	 	 	 	 	 	 dd	ej                  d
ej                  dz  dej                  dz  dedz  dedede	ej                  ej                  f   dz  dee   de	ej                  ej                  dz  e	ej                     dz  f   fdZ xZS )PersimmonAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNr#   	layer_idxc                    t         |           || _        || _        |-t        j                  d| j                  j                   d       |j                  | _        |j                  | _
        | j                  | j                  z  | _        t        | j                  |j                  d   z        | _        d| _        | j                  | j                  z  | j                  k7  r&t!        d| j                   d| j                   d      t#        j$                  | j                  d| j                  z  d	      | _        t#        j$                  | j                  | j                  z  | j                  d	      | _        |j*                  | _        | j                  d
z  | _        | j*                  r|t#        j.                  |j                  | j                  z  |j0                  d      | _        t#        j.                  |j                  | j                  z  |j0                  d      | _        t#        j6                  |j8                        | _        y )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.r=   Tz?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).r   biasg      )epselementwise_affine)r)   r*   r#   r   loggerwarning_oncer6   rg   rD   rE   	num_headsr>   rF   r.   rotary_ndims	is_causal
ValueErrorr   r   query_key_valuedenseqk_layernormr   	LayerNormlayer_norm_epsq_layernormk_layernormDropoutattention_dropoutr3   r#   r   r6   s      r7   r*   zPersimmonAttention.__init__   s   " !8!8 9 :, , "--33((DNN:0F0FG^0_ _`MMDNN*t/?/??QRVRbRbQc$T^^$4B8   "yy)9)91t?O?O;OVZ[YYt~~=t?O?OVZ[
"//}}d*!||""dnn4&:O:Odh D  "||""dnn4&:O:Odh D "$F,D,D!Er8   	fused_qkvr:   c                     |j                   \  }}}|j                  ||| j                  d| j                        }|ddddf   |ddddf   |ddddf   fS )a  
        Split the last dimension into (num_heads, head_dim) without making any copies, results share same memory
        storage as `fused_qkv`

        Args:
            fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

        Returns:
            query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
            value: [batch_size, seq_length, num_heads, head_dim]
        r   .r   Nr   r?   )rW   viewr   r>   )r3   r   
batch_size
seq_lengththree_times_hidden_sizes        r7   _split_headszPersimmonAttention._split_heads   sb     ;D//7
J 7NN:z4>>1dmm\	a#YsAqy%99S!QY;OOOr8   r   r   r`   past_key_valuesoutput_attentions	use_cacheposition_embeddingsr   c                    |j                         \  }	}
}| j                  |      }| j                  |      \  }}}| j                  r"| j	                  |      }| j                  |      }|j                  dd      }|j                  dd      }|j                  dd      }|\  }}|dd | j                  f   |d| j                  d f   }}|dd | j                  f   |d| j                  d f   }}t        ||||      \  }}t        j                  ||fd      }t        j                  ||fd      }| |j                  ||| j                        \  }}t        j                  | j                  j                   t"              } || ||||f| j$                  sdn| j                  j&                  | j(                  d|\  }}|j+                  |	|
d      }| j-                  |      }||fS )Nr   r?   .rP   rU           )r   r   )sizer   r   r   r   r   r[   r   r|   rG   r\   updater   r   get_interfacer#   _attn_implementationr   r   r   r   reshaper   )r3   r   r   r`   r   r   r   r   r   bszq_len_r   query_states
key_statesvalue_statesr]   r^   	query_rot
query_passkey_rotkey_passattention_interfacer   r   s                            r7   re   zPersimmonAttention.forward   s!    &**,UA ((7	 483D3DY3O0z<++L9L))*5J $--a3#--a3))!Q/
&S1 1 1112d//112 	
 s/d////0sD--//0 
 2)Wc3O	7 yy)Z!8bAYY2;
&'6'='=j,X\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$++2O2OLL	%
 	%
!\ "))#ub9jj-L((r8   rf   )NNNFFN)rg   rh   ri   __doc__r   rF   r*   rG   rj   rm   r   
LongTensorr   boolr   r   re   ro   rp   s   @r7   r   r      s1   G#F #F3: #FJPell PuU\\5<<Y^YeYe=e7f P& /304(,"'HLA)||A) t+A) &&-	A)
 A)  A) A) #5<<#=>EA) -.A) 
u||U\\D0%2E2LL	MA)r8   r   c                       e Zd Zdedef fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
eej                  ej                  f   dz  dee   dej                  fdZ xZS )PersimmonDecoderLayerr#   r   c                    t         |           |j                  | _        t        ||      | _        t        |      | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        t        j                  |j                        | _        y )N)r#   r   r   )r)   r*   rD   r   	self_attnr~   mlpr   r   r   input_layernormpost_attention_layernormr   hidden_dropoutr   r   s      r7   r*   zPersimmonDecoderLayer.__init__A  s    !--+6YO'!||F,>,>FDYDYZ(*V5G5GVMbMb(c%zz&"7"78r8   Nr   r   r`   r   r   r   r   r:   c           
          |}| j                  |      } | j                  d||||||d|\  }}	||z   }|}| j                  |      }| j                  |      }| j	                  |      }||z   }|S )N)r   r   r`   r   r   r    )r   r   r   r   r   )
r3   r   r   r`   r   r   r   r   residualr   s
             r7   re   zPersimmonDecoderLayer.forwardJ  s     !,,]; *4>> 
')%+ 3
 
q !=0 !55mD/]3%0r8   )NNNFN)rg   rh   ri   r   rF   r*   rG   rj   r   r   r   rm   r   r   re   ro   rp   s   @r7   r   r   @  s    9 93 9 /304(,!&HL"||" t+" &&-	"
 " $;" #5<<#=>E" -." 
"r8   r   c                   D    e Zd ZU eed<   dZdZdgZdZdZ	dZ
dZdZeedZy)PersimmonPreTrainedModelr#   modelTr   r   )r   
attentionsN)rg   rh   ri   r   rk   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraph_supports_sdpa_supports_flash_attn_supports_attention_backendr   r   _can_record_outputsr   r8   r7   r   r   o  sH    &*#01"3!N"&.(r8   r   c                        e Zd ZdZdef fdZeee	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  dedz  d	e	j                  dz  d
edz  dee   defd                     Z xZS )PersimmonModelz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PersimmonDecoderLayer`]

    Args:
        config: PersimmonConfig
    r#   c           	      4   t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                  |j                  |j                        | _        t#        | j$                        | _        d| _        | j+                          y c c}w )Nr   r#   F)r)   r*   pad_token_idpadding_idx
vocab_sizer   	EmbeddingrD   embed_tokens
ModuleListrangenum_hidden_layersr   layersr   r   final_layernormr!   r#   
rotary_embgradient_checkpointing	post_initr   s      r7   r*   zPersimmonModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammGLVMeMeGfg)"695g
  "||F,>,>FDYDYZ2$++F&+# hs   DN	input_idsr   r`   r   inputs_embedsr   r   r:   c           
         |d u |d uz  rt        d      |r|t        | j                        }|| j                  |      }|V||j	                         nd}t        j                  |j                  d   |j                        |z   }|j                  d      }t        | j                  ||||      }	|}
| j                  |
|      }| j                  D ]  } ||
f|	||||d|}
 | j                  |
      }
t        |
|	      S )
Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )r4   )r#   r   r   r   r`   )r`   )r   r`   r   r   r   )last_hidden_stater   )r   r	   r#   r   get_seq_lengthrG   rH   rW   r4   rv   r   r   r   r   r   )r3   r   r   r`   r   r   r   r   past_seen_tokenscausal_maskr   r   decoder_layers                r7   re   zPersimmonModel.forward  s<    -t";<YZZ0*$++>O  --i8MCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 &"oom,oW![[ 		M)*) /#$7 M		 ,,];&++
 	
r8   )NNNNNN)rg   rh   ri   r   r   r*   r   r   r   rG   r   rj   r   FloatTensorr   r   r   r   re   ro   rp   s   @r7   r   r     s        .2.204(,26!%3
##d*3
 t+3
 &&-	3

 3
 ((4/3
 $;3
 +,3
 
!3
    3
r8   r   c                   *    e Zd ZddiZ fdZee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  d	ej                  dz  d
ej                  dz  dedz  deej                  z  dee   defd              Z xZS )PersimmonForCausalLMzlm_head.weightzmodel.embed_tokens.weightc                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y )NFr   )
r)   r*   r   r   r   r   r   rD   lm_headr   r   s     r7   r*   zPersimmonForCausalLM.__init__  sU     #F+
 ++yy!3!3V5F5FUS 	r8   Nr   r   r`   r   r   labelsr   logits_to_keepr   r:   c	           
      x    | j                   d||||||d|	}
|
j                  }t        |t              rt	        | d      n|}| j                  |dd|ddf         }d}|* | j                  ||fd| j                  j                  i|	}t        |||
j                  |
j                  |
j                        S )uk  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, PersimmonForCausalLM

        >>> model = PersimmonForCausalLM.from_pretrained("adept/persimmon-8b-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("adept/persimmon-8b-base")

        >>> prompt = "human: Hey, what should I eat for dinner?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        'human: Hey, what should I eat for dinner?\n\ncat: 🐱\n\nhuman: 😐\n\n'
        ```)r   r   r`   r   r   r   Nr   )losslogitsr   r   r   r   )r   r  rX   rF   slicer
  loss_functionr#   r   r   r   r   r   )r3   r   r   r`   r   r   r  r   r  r   outputsr   slice_indicesr  r  s                  r7   re   zPersimmonForCausalLM.forward  s    H ,64:: ,
)%+',
 ,
  118B>SV8W~ot4]kmA}a,?@A%4%%ffbAWAWb[abD%#33!//))
 	
r8   )NNNNNNNr   )rg   rh   ri   _tied_weights_keysr*   r   r   rG   r   rj   r   r  r   rF   r   r   r   re   ro   rp   s   @r7   r  r    s    *,GH  .2.204(,26*.!%-.:
##d*:
 t+:
 &&-	:

 :
 ((4/:
   4':
 $;:
 ell*:
 +,:
 
 :
  :
r8   r  c                       e Zd Zy)"PersimmonForSequenceClassificationNrg   rh   ri   r   r8   r7   r  r        r8   r  c                       e Zd Zy)PersimmonForTokenClassificationNr  r   r8   r7   r  r     r  r8   r  )r  r   r   r  r  )r   )r   )@r   collections.abcr   typingr   rG   r   activationsr   cache_utilsr   r	   
generationr
   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   r   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_persimmonr   
get_loggerrg   r   Moduler!   rt   r|   r~   rj   rK   r   r   r   r   r   r  r  r  __all__r   r8   r7   <module>r.     s  &  $    ! . ) / B 
 G & R R G 5 4 
		H	%A<ryy A<J(4299 * %II%<<% 
% <<	%
 LL4'% % %,y) y)x,6 ,^     M
- M
 M
`I
3_ I
X j)IKc i d&CE] cr8   