
    iG                        d dl Z d dlZd dlmZ ddlmZ ddlmZmZ ddl	m
Z
mZ ddlmZ ddlmZ d	d
lmZ d	dlmZmZmZmZmZmZmZmZmZ d	dlmZ ddlmZ  ej@                  e!      Z"dZ#dZ$ G d de      Z%d Z& G d de      Z' G d dejP                        Z) G d de)      Z* G d de)      Z+e)e*e+dZ, G d de      Z- G d d e      Z. G d! d"e      Z/ G d# d$e      Z0 G d% d&e      Z1 G d' d(e      Z2 G d) d*e      Z3g d+Z4y),    N)nn   )initialization)CacheStaticCache)_flash_attention_forward!flash_attn_supports_top_left_mask)PreTrainedModel)logging   )GemmaForCausalLM)	LlamaDecoderLayerLlamaForQuestionAnsweringLlamaForSequenceClassificationLlamaForTokenClassification
LlamaModelLlamaPreTrainedModelLlamaRotaryEmbeddingapply_rotary_pos_emb	repeat_kv)
MistralMLP   )DiffLlamaConfigzkajuma/DiffLlama-0.3B-handcutr   c                       e Zd Zy)DiffLlamaMLPN__name__
__module____qualname__     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/diffllama/modular_diffllama.pyr   r   1       r!   r   c                 >    ddt        j                  d| z        z  z
  S )Ng?g333333?g333333ӿ)mathexp)	layer_idxs    r"   lambda_init_fnr(   5   s     txxy 01111r!   c                       e Zd Zy)DiffLlamaRotaryEmbeddingNr   r    r!   r"   r*   r*   9   r#   r!   r*   c                   <    e Zd ZdZddededz  f fdZ	 	 	 	 ddej                  de	ej                  ej                  f   dej                  dz  d	ej                  dz  d
edz  dede	ej                  ej                  dz  e	ej                     dz  f   fdZ xZS )DiffLlamaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNconfigr'   c                    t         |           || _        || _        |-t        j                  d| j                  j                   d       |j                  | _        |j                  | _	        |j                  | _        t        |d| j                  | j                  z        | _        |j                  | _        | j                  | j                  z  | _        |j                   | _        d| _        t%        j&                  | j                  | j                  | j                  z  |j(                        | _        t%        j&                  | j                  | j                  | j                  z  |j(                        | _        t%        j&                  | j                  | j                  | j                  z  |j(                        | _        t%        j&                  | j                  | j                  z  | j                  |j(                        | _        t3        |      | _        t%        j6                  t9        j:                  d|j<                  | j                  f            | _        t%        j6                  t9        j:                  d|j<                  | j                  f            | _         t%        j6                  t9        j:                  d|j<                  | j                  f            | _!        t%        j6                  t9        j:                  d|j<                  | j                  f            | _"        t%        jF                  d| j                  z  |jH                  d	
      | _%        y )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.head_dimT)biasr   )sizer   F)epselementwise_affine)&super__init__r-   r'   loggerwarning_once	__class__r   attention_dropouthidden_sizenum_attention_heads	num_headsgetattrr/   num_key_value_headsnum_key_value_groupsmax_position_embeddings	is_causalr   Linearattention_biasq_projk_projv_projo_projr(   lambda_init	Parametertorchnormallambda_std_dev	lambda_q1	lambda_k1	lambda_q2	lambda_k2RMSNormrms_norm_eps	groupnormselfr-   r'   r8   s      r"   r5   zDiffLlamaAttention.__init__@   sq   " !8!8 9 :, , "(!9!9!--33
D4D4D4VW#)#=#= $(NNd6N6N$N!'-'E'E$ii 0 0$..4==2PW]WlWlmii 0 0$2J2JT]]2Zagavavwii 0 0$2J2JT]]2Zagavavwii >@P@PW]WlWlm))4ell1f6K6KSWS`S`Rb&cdell1f6K6KSWS`S`Rb&cdell1f6K6KSWS`S`Rb&cdell1f6K6KSWS`S`Rb&cdA$56;N;Nchir!   hidden_statesposition_embeddingsattention_maskposition_idspast_key_values	use_cachereturnc                    |j                         \  }}	}
|	}| j                  |      }| j                  |      }| j                  |      }|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j                  | j                        j                  dd      }|j	                  ||| j                  | j                        j                  dd      }|\  }}t        ||||      \  }}| |j                  ||| j                        \  }}t        || j                        }t        || j                        }t        j                  t        j                   |dd      d      }|j#                  dddd      }t        j$                  ||j                  dd            t'        j(                  | j                        z  }|||z   }t*        j,                  j/                  |dt        j0                        j3                  |j4                        }t*        j,                  j7                  || j8                  | j:                        }t        j<                  t        j>                  | j@                  | jB                  z  dt        j0                              j3                  |j4                        }t        j<                  t        j>                  | jD                  | jF                  z  dt        j0                              j3                  |j4                        }||z
  | jH                  z   }t        j$                  ||      }t        j                   |dd      \  }}|||z  z
  }d| jH                  z
  | jK                  |      z  }|j                  dd      jM                         }|jO                  ||d      }| jQ                  |      }||fS )Nr   r   dimr   r_   dtype)ptraining))r1   rD   rE   rF   viewr<   r/   	transposer>   r   updater'   r   r?   rJ   catchunkrepeatmatmulr%   sqrtr   
functionalsoftmaxfloat32torb   dropoutr9   rd   r&   sumrM   rN   rO   rP   rH   rS   
contiguousreshaperG   )rU   rV   rW   rX   rY   rZ   r[   kwargsbsz
target_len_q_lenquery_states
key_statesvalue_statescossinattn_weightslambda_1lambda_2lambda_fullattn_outputattn_output1attn_output2s                           r"   forwardzDiffLlamaAttention.forwarda   sI    +//1Z{{=1[[/
{{=1#((eT^^T]]S]]^_abc__S%1I1I4==Yccdeghi
#((eT5M5Mt}}]gghiklm&S#7jRUWZ#[ j&'6'='=j,X\XfXf'g$Jz4+D+DE
 t/H/HIyy\1!!D"M#**1aA6||L*2F2Fq!2LMPTPYPYZ^ZgZgPhh%'.8L }},,\r,WZZ[g[m[mn}},,\T=S=S^b^k^k,l99UYYt~~'FBV[VcVcdehh
 99UYYt~~'FBV[VcVcdehh
 )D,<,<<ll<>%*[[aQ%G"l"[<%??4+++t~~k/JJ!++Aq1<<>!))#ub9kk+.L((r!   NNNNF)r   r   r   __doc__r   intr5   rJ   Tensortuple
LongTensorr   boolr   __classcell__r8   s   @r"   r,   r,   =   s    Gj j3: jJ /304(,8)||8) #5<<#=>8) t+	8)
 &&-8) 8) 8) 
u||U\\D0%2E2LL	M8)r!   r,   c                        e Zd ZdZ fdZ	 	 	 	 ddej                  deej                  ej                  f   dej                  dz  dej                  dz  de	dz  d	e
d
eej                  df   fdZ xZS )DiffLlamaFlashAttention2aN  
    DiffLlama flash attention module. This module inherits from `DiffLlamaAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    c                 B    t        |   |i | t               | _        y r   )r4   r5   r	   _flash_attn_uses_top_left_mask)rU   argsru   r8   s      r"   r5   z!DiffLlamaFlashAttention2.__init__   s#    $)&)
 /P.Q+r!   NrV   rW   rX   rY   rZ   r[   r\   c                 	   t        |t              rt        d      |j                         \  }}}	| j	                  |      }
| j                  |      }| j                  |      }|
j                  ||| j                  | j                        j                  dd      }
|j                  ||| j                  | j                        j                  dd      }|j                  ||| j                  | j                        j                  dd      }|\  }}t        |
|||      \  }
}| |j                  ||| j                        \  }}|
j                  dd      }
|j                  dd      }|j                  dd      }| j                  r| j                   nd}|
j"                  }|
j$                  j&                  dk7  r|
j$                  j&                  nd}|t(        j*                  k(  rt)        j,                  |      rt)        j.                  |      }nMt1        | j2                  d      r| j2                  j"                  }n | j                  j4                  j"                  }t6        j9                  d| d	       |
j;                  |      }
|j;                  |      }|j;                  |      }t)        j<                  |dd
      \  }}|j?                  dddd      }|j?                  dddd      }tA        |
||||||tC        | dd       | jD                  | jF                  
      }tA        |
||||||tC        | dd       | jD                  | jF                  
      }t)        jH                  ||gd
      }t)        j<                  |dd
      \  }}t)        jJ                  t)        jL                  | jN                  | jP                  z  dt(        j*                              j;                  |
j"                        }t)        jJ                  t)        jL                  | jR                  | jT                  z  dt(        j*                              j;                  |
j"                        }||z
  | jV                  z   }|||z  z
  }d| jV                  z
  | jY                  |      z  }|j[                  ||d      j]                         }| j_                  |      }|d fS )Nz`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformersr   r           mpscpu_is_quantizedzThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .r^   sliding_window)rY   rq   r   use_top_left_maskrA   r`   ra   )0
isinstancer   
ValueErrorr1   rD   rE   rF   re   r<   r/   rf   r>   r   rg   r'   rd   r9   rb   devicetyperJ   ro   is_autocast_enabledget_autocast_dtypehasattrr-   weightr6   r7   rp   ri   rj   r   r=   r   rA   rh   r&   rr   rM   rN   rO   rP   rH   rS   rt   rs   rG   )rU   rV   rW   rX   rY   rZ   r[   rv   ry   rx   rz   r{   r|   r}   r~   dropout_rateinput_dtypedevice_typetarget_dtypevalue_states1value_states2r   r   r   r   r   r   s                              r"   r   z DiffLlamaFlashAttention2.forward   sE    o{3} 
 &**,UA{{=1[[/
{{=1
 $((eT^^T]]S]]^_abc__S%1I1I4==Yccdeghi
#((eT5M5Mt}}]gghiklm&S#7jRUWZ#[ j&'6'='=j,X\XfXf'g$J $--a3))!Q/
#--a315t--C #((2>2E2E2J2Je2Sl))..Y^%--'((5$77Do6#{{00#{{1177 >$ (??<8L#|4J'??<8L',{{<'J$}%,,Q1a8%,,Q1a8/% "4)94@"AAnn
 0% "4)94@"AAnn
 ii| <"E%*[[aQ%G"l99UYYt~~'FBV[VcVcdehh
 99UYYt~~'FBV[VcVcdehh
 )D,<,<<"[<%??4+++t~~k/JJ!))#ub9DDFkk+.D  r!   r   )r   r   r   r   r5   rJ   r   r   r   r   r   r   r   r   s   @r"   r   r      s    R 3704(,r!||r! #5<<#=>r! ((4/	r!
 &&-r! r! r! 
u||T!	"r!r!   r   c                      e Zd ZdZ	 	 	 	 ddej
                  deej
                  ej
                  f   dej
                  dz  dej                  dz  dedz  de	d	eej
                  ej
                  dz  eej
                     dz  f   fd
Z
y)DiffLlamaSdpaAttentiona   
    DiffLlama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `DiffLlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    NrV   rW   rX   rY   rZ   r[   r\   c                    |j                         \  }}	}
| j                  |      }| j                  |      }| j                  |      }|j	                  ||	| j
                  | j                        j                  dd      }|j	                  ||	| j                  | j                        j                  dd      }|j	                  ||	| j                  | j                        j                  dd      }|\  }}t        ||||      \  }}| |j                  ||| j                        \  }}t        || j                        }t        || j                        }t        j                  t        j                   |dd      d      }|j#                  dddd      }|}||d d d d d d d |j$                  d   f   }|d u xr |	dkD  }t        j&                  j(                  j+                  ||||| j,                  r| j.                  nd|      }t        j                   |dd      \  }}t        j0                  t        j2                  | j4                  | j6                  z  dt        j8                              j;                  |j<                        }t        j0                  t        j2                  | j>                  | j@                  z  dt        j8                              j;                  |j<                        }||z
  | jB                  z   }|||z  z
  }d| jB                  z
  | jE                  |      z  }|j                  dd      jG                         }|j	                  ||	d      }| jI                  |      }|d fS )	Nr   r   r^   r`   r   )	attn_mask	dropout_prA   ra   )%r1   rD   rE   rF   re   r<   r/   rf   r>   r   rg   r'   r   r?   rJ   rh   ri   rj   shaper   rm   scaled_dot_product_attentionrd   r9   r&   rr   rM   rN   ro   rp   rb   rO   rP   rH   rS   rs   rG   )rU   rV   rW   rX   rY   rZ   r[   ru   rv   ry   rx   rz   r{   r|   r}   r~   causal_maskrA   r   r   r   r   r   r   s                           r"   r   zDiffLlamaSdpaAttention.forward(  s    &**,UA{{=1[[/
{{=1#((eT^^T]]S]]^_abc__S%1I1I4==Yccdeghi
#((eT5M5Mt}}]gghiklm&S#7jRUWZ#[ j&'6'='=j,X\XfXf'g$Jz4+D+DE
 t/H/HIyy\1!!D"M#**1aA6$%%aA/E1A1A"1E/E&EFK  4'5EAI	hh))FF!04d,,3 G 
 &+[[aQ%G"l99UYYt~~'FBV[VcVcdehh
 99UYYt~~'FBV[VcVcdehh
 )D,<,<<"[<%??4+++t~~k/JJ!++Aq1<<>!&&sE26kk+.D  r!   r   )r   r   r   r   rJ   r   r   r   r   r   r   r    r!   r"   r   r      s     /304(,?!||?! #5<<#=>?! t+	?!
 &&-?! ?! ?! 
u||U\\D0%2E2LL	M?!r!   r   )eagerflash_attention_2sdpac                   (     e Zd Zdedef fdZ xZS )DiffLlamaDecoderLayerr-   r'   c                 d    t         |   ||       t        |j                     ||      | _        y )N)r-   r'   )r4   r5   DIFFLLAMA_ATTENTION_CLASSES_attn_implementation	self_attnrT   s      r"   r5   zDiffLlamaDecoderLayer.__init__r  s-    +4V5P5PQY_ktur!   )r   r   r   r   r   r5   r   r   s   @r"   r   r   q  s    v v3 v vr!   r   c                   B    e Zd ZdZdZ ej                         d        Zy)DiffLlamaPreTrainedModelFc                    t        j                  | |       t        |t              rt	        j
                  |j                  d| j                  j                         t	        j
                  |j                  d| j                  j                         t	        j
                  |j                  d| j                  j                         t	        j
                  |j                  d| j                  j                         y y )Nr   )r
   _init_weightsr   r,   initnormal_rM   r-   rL   rN   rO   rP   )rU   modules     r"   r   z&DiffLlamaPreTrainedModel._init_weights|  s    %%dF3f01LL))1dkk.H.HILL))1dkk.H.HILL))1dkk.H.HILL))1dkk.H.HI	 2r!   N)r   r   r   _supports_flex_attn_supports_attention_backendrJ   no_gradr   r    r!   r"   r   r   x  s*    "'U]]_J Jr!   r   c                       e Zd Zy)DiffLlamaModelNr   r    r!   r"   r   r     r#   r!   r   c                       e Zd Zy)DiffLlamaForCausalLMNr   r    r!   r"   r   r     r#   r!   r   c                       e Zd Zy)"DiffLlamaForSequenceClassificationNr   r    r!   r"   r   r     r#   r!   r   c                       e Zd Zy)DiffLlamaForQuestionAnsweringNr   r    r!   r"   r   r     r#   r!   r   c                       e Zd Zy)DiffLlamaForTokenClassificationNr   r    r!   r"   r   r     r#   r!   r   )r   r   r   r   r   r   )5r%   rJ   r    r   r   cache_utilsr   r   modeling_flash_attention_utilsr   r	   modeling_utilsr
   utilsr   gemma.modeling_gemmar   llama.modeling_llamar   r   r   r   r   r   r   r   r   mistral.modeling_mistralr   configuration_diffllamar   
get_loggerr   r6   _CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr   r(   r*   Moduler,   r   r   r   r   r   r   r   r   r   r   __all__r    r!   r"   <module>r      s!  "    & - i -  3
 
 
 2 4 
		H	%5 #	: 	2	3 	\) \)~A!1 A!HG!/ G!V  1" v- vJ3 J	Z 		+ 		)G 		$= 		&A 	r!   