
    iK                    F   U d Z ddlmZ ddlZddlZddlmZ erddlmZ ddl	m
Z
 ddlmZ ddlmZmZ ddlZdd	lmZ d
dlmZ dgZdaded<   daded<   e
 G d d             Zed*d       Z	 d+	 	 	 d,dZd-dZd.dZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d/dZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d0dZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d1dZ  ed      Z!d2dZ"d3dZ#	 	 	 	 d4	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d5dZ$	 d6	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d7dZ%	 	 	 d8ddddddd	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d9d Z&ddddddd	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d:d!Z'dddd"	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d;d#Z(	 	 	 	 	 	 d<dd$	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d=d%Z)	 	 	 d>dd$	 	 	 	 	 	 	 	 	 	 	 	 	 d?d&Z*dd$	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d@d'Z+ ejX                  d(e)       y)Az
PROTOTYPE!
Flash Attention 3 implementation.
For fp8: only supports forward pass right now.
For fp16/bf16: supports forward and backward pass.
    )annotationsN)TYPE_CHECKING)Callable)	dataclass)cache)TypeVarTupleUnpack)Library   )	_registryregister_flash_attention_fa3zCallable | None_FA3_CUDA_FWD_FA3_CUDA_BWDc                       e Zd ZU ded<   ddZy)
_FA3HandlezLibrary | Nonelibraryc                P    d | _         t        j                  j                  d       y )NF)r   torch_C_set_sdp_use_fa3)selfs    h/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/torch/nn/attention/_fa3.pyremovez_FA3Handle.remove*   s    !!%(    N)returnNone)__name__
__module____qualname____annotations__r    r   r   r   r   &   s    )r   r   c                J    t         j                  j                  |       \  }}|S N)r   cudaget_device_capability)devicemajor_s      r   _get_device_majorr)   0   s    zz//7HE1Lr   c                |    t        |        t        j                  j                  d       t	        t                     S )z
    Register FA3 flash attention kernels with the PyTorch dispatcher.

    Args:
        module_path: Python module path to the FA3 implementation.
    T)_fa3_import_moduler   r   r   r   _fa3_register_kernelsmodule_paths    r   r   r   6   s/     {# 
HHd#+-..r   c                   t        j                  |        t        t        j                  d      st        d|  d      t        t        j                  j                  d      st        d|  d      t        t        j                  j                  d      st        d|  d      t        j                  j                  j                  at        j                  j                  j                  a
y )Nflash_attn_3zModule 'z' does not expose FA3 kernelsfwdz%' does not expose FA3 forward kernelsbwdz&' does not expose FA3 backward kernels)	importlibimport_modulehasattrr   opsRuntimeErrorr0   r1   r   r2   r   r-   s    r   r+   r+   G   s    K(599n-Xk]2OPQQ599))51{m#HI
 	
 599))51{m#IJ
 	
 II**..MII**..Mr   c                 4   t        ddd      } | j                  dt        d       | j                  dt        d       | j                  dt        d       | j                  dt
        d       | j                  dt        d       | j                  d	t        d       | S )
NatenIMPLCUDAz"_flash_attention_forward.quantizedz-_scaled_dot_product_flash_attention.quantized_flash_attention_forward#_scaled_dot_product_flash_attention_flash_attention_backward,_scaled_dot_product_flash_attention_backward)r
   impl!_fa3_flash_attention_forward_impl4_fa3_scaled_dot_product_flash_attention_forward_impl)_fa3_flash_attention_forward_impl_default<_fa3_scaled_dot_product_flash_attention_forward_impl_default"_fa3_flash_attention_backward_impl5_fa3_scaled_dot_product_flash_attention_backward_impl)libs    r   r,   r,   X   s    
&&&
)CHH,.OQW HH7<
 HH"$Mv HH-D HH(*LfUHH6=
 Jr   c                   |dk7  ryt        d |D              syt        |D ch c]  }|j                   c}      dk7  ry| j                  t        j
                  k(  r |||t        j                  dt               || j                         dk7  ry	|| j                         d
k7  ryt        j                  j                         syt        | j                        dk7  ryy c c}w )N        zdropout_p must be 0c              3  4   K   | ]  }|j                     y wr#   )is_cuda.0ts     r   	<genexpr>z,_fa3_common_support_error.<locals>.<genexpr>   s     *Qqyy*s   zinputs must be CUDA tensorsr   inputs must share devicezWhen using SDPA with fp8, descale tensor should always be used for accurate dequantization. Please use _scaled_dot_product_attention_quantized and provide the descale tensors.   zdense query must be 4D   zragged query must be 3DzCUDA not available	   z#FA3 requires compute capability 9.0)alllenr&   dtyper   float8_e4m3fnwarningswarnUserWarningdimr$   is_availabler)   )querytensors	dropout_p	cum_seq_q	q_descale	k_descale	v_descalerN   s           r   _fa3_common_support_errorrd   t   s     C$*'**,
g&AHH&'1,){{e)))Y.)2C+ 	
 UYY[A-'!1(::""$#&!+4) 's   C#c           	        |ry|y|+|j                   t        j                  k7  ry|j                  syt        j                  t        j
                  t        j                  ft        fd| ||hD              sd S t        | ||hD ch c]  }|j                    c}      dk7  ryt        | | ||f||||	|
      }||d	k(  ry
|S y c c}w )Nzreturn_debug_mask must be Falsezalibi_slopes not supportedzseqused_k must be int32zseqused_k must be CUDAc              3  :   K   | ]  }|j                   v   y wr#   rV   rM   rN   supported_dtypess     r   rO   z-_fa3_forward_support_error.<locals>.<genexpr>   s     Hqqww**H   inputs must be one of r   #all inputs must have the same dtyperP   z(query, key, value must be on same device)
rV   r   int32rK   rW   float16bfloat16rT   rU   rd   )r]   keyvaluer_   return_debug_maskalibi_slopes	seqused_kr`   ra   rb   rc   rN   errorri   s                @r   _fa3_forward_support_errorrv      s     0+??ekk),  +++U]]ENNKHUC4GHH'(8'9::
eS%01AGG12a74%	UE ..= 2s   C	c
           	        |j                   t        j                  k(  r	 y|j                   t        j                  k7  ryt        j                  t        j
                  ft        fd| ||||hD              sd S t        | ||||hD 
ch c]  }
|
j                    c}
      dk7  ryt        || |||||f||d d d       }||S y c c}
w )NzHFA3 backward does not support fp8 - use inference only (torch.no_grad())zlogsumexp dtype must be float32c              3  :   K   | ]  }|j                   v   y wr#   rg   rh   s     r   rO   z._fa3_backward_support_error.<locals>.<genexpr>   s     Wqqww**Wrj   rk   r   rl   )	rV   r   rW   float32rn   ro   rT   rU   rd   )grad_outr]   rp   rq   out	logsumexpr_   r`   window_size_leftwindow_size_rightrN   ru   ri   s               @r   _fa3_backward_support_errorr      s     {{e)))V	
 %--'0u~~6WXuc5RU4VWW'(8'9::
hsE3?@AGG@AQF4%	5#uc95E  As   CTsc                 &    t        d | D              S )Nc              3  @   K   | ]  }|j                  d d        yw)r      N)	transposerL   s     r   rO   z#_transpose_dense.<locals>.<genexpr>   s     4qQ"4s   )tuple)r^   s    r   _transpose_denser      s    4G444r   c                R    | $| j                  d      dk7  r| j                         S | S )z2Ensure tensor is contiguous in the last dimension.r   )stride
contiguous)xs    r   _maybe_contiguousr      s&    ]qxx|q/@1<<>GaGr   c                   t         t        d      t        |       }t        |      }|j                  t        j
                  k(  r8|j                  d      dk7  r$|j                  d      dk7  r|j                         n
t        |      }t        |      }t        |      }t        |      }t        g |||ddd|||dd|||dddddd||||||	|	nd|
|
ndddddt	        j                         rdnddt        j                  j                         xs d \  }}}}||j                         fS )	zF
    Run the FA3 forward pass by calling the C++ kernel directly.
    NFA3 not registeredr   r   r   rI   T)r   r7   r   rV   r   rW   r   r   $are_deterministic_algorithms_enabledr   _get_sm_carveout_experimental)r]   rp   rq   cu_seq_qcu_seq_kmax_qmax_kscale	is_causalr}   r~   rt   r{   ra   rb   rc   qkvcu_seqlens_qcu_seqlens_ksoftmax_lse	out_accumsoftmax_lse_accums                           r   _fa3_run_forwardr      sM   * /00% A#A ;;%---LL!LL! 	 u%  %X.L$X.L!),I5B #6	#6	#6 	
#6 		#6
 	#6 	#6 	#6 	#6 	#6 	#6 	#6 	#6 	#6 	#6 	#6  	!#6" 	##6$ 	%#6& 	'#6( 	)#6* 	+#6, 	-#6. 	/#60 	1#62 	3#64 -8b5#66 /:7#68 	
9#6: 	;#6< 	=#6> 	?#6@ 779qA#6B 	C#6D 	..05AE#62Ci!2H &&(((r   c                R   t         t        d      t        |       }|j                  d      dk7  r|j	                         n|}|j                  d      dk7  r|j	                         n|}|j                  d      dk7  r|j	                         n|}t        |      }t        |      }t        j                  |      }t        j                  |      }t        j                  |      }t        |||||||||||d d ||	|
|||d|t
        j                  j                         xs d       |||fS )Nr   r   r   rI   r   )	r   r7   r   r   r   r   
empty_liker   r   )rz   r]   rp   rq   r{   r|   r   r   max_seqlen_qmax_seqlen_kr   r   r}   r~   deterministicdoutr   r   r   olsedqdkdvs                           r   _fa3_run_backwardr   C  s/   " /00 X&D#ll2.!3AJJrNa/SA#ll2.!3A#A
I
&C 
		!	B			!	B			!	B				


..05A-0 r2:r   r   r   r}   r~   rt   rs   r{   c                  t        | ||||	||||
||      }|t        d|       t        | |||||||||||||
||      \  }}t        j                  dt        j
                  | j                        }t        j                  dt        j
                  | j                        }t        j                  d| j                  | j                        }|||||fS )Nz)FA3 flash_attention forward unsupported: )r   )rV   r&   r!   r   )	rv   r7   r   r   zerosuint64r&   emptyrV   )r]   rp   rq   r`   	cum_seq_kr   r   r_   r   rr   ra   rb   rc   r   r}   r~   rt   rs   r{   ru   r   	rng_statephilox_offset
debug_masks                           r   rA   rA   ~  s    , 'E FugNOO!HC$ DU\\JIKK%,,u||LMQekk%,,GJYz99r   c
               >    t        | |||||||||	d d d |
|||||      S )Nr   )rA   )r]   rp   rq   r`   r   r   r   r_   r   rr   r   r}   r~   rt   rs   r{   s                   r   rC   rC     sJ    & -)+!' r   )r   r}   r~   c                   t        | ||||||
|||
      }|t        d|       t        j                         }t	        | |||||||||	||||nd||nd|      \  }}}|||fS )z0FA3 implementation of _flash_attention_backward.z*FA3 flash_attention backward unsupported: r   )r   r7   r   r   r   )rz   r]   rp   rq   r{   r|   r`   r   r   r   r_   r   r   unusedr   r}   r~   ru   r   r   r   r   s                         r   rE   rE     s    * (E GwOPP>>@M",8b.:JBB" r2:r   r   c	                  t        | ||||d d d |||      }
|
t        d|
       t        | ||      \  }}}| j                  t        j
                  k(  rt        j                  n| j                  }t	        j                  | |      }|j                  dd      }|j                  d      }|j                  d      }t        |||d d ||||||	||||      \  }}}}}| j                  d      }|j                  d      }||d d |||||f	S )NzFA3 SDPA forward unsupported: rg   r   r   )r   r{   ra   rb   rc   )rv   r7   r   rV   r   rW   ro   r   r   sizerA   )r]   rp   rq   ra   rb   rc   r_   r   rr   r   ru   r   r   r   	out_dtypeout_bhsdout_bshdmax_q_flashmax_k_flashr(   r   r   r   r   r   r   s                             r   rB   rB      sA    'E ;E7CDDuc51GAq!
 #(++1D1D"D%++IY7H!!!Q'H&&)K&&)K3T			40AsI}j" JJqMEHHQKE
 
r   c               ,    t        | ||d d d ||||
      S )Nr   )rB   )r]   rp   rq   r_   r   rr   r   s          r   rD   rD   g  s0     @ r   c                   t        | ||||||
ddd
      }|t        d|       t        | ||||      \  }}}}}t        ||||||dd||	|
||||      \  }}}t        |||      \  }}}|||fS )zCFA3 implementation of _scaled_dot_product_flash_attention_backward.NzFA3 SDPA backward unsupported: r   )r   r7   r   rE   )rz   r]   rp   rq   r{   r|   r`   r   r   r   r_   r   philox_seedr   r   ru   
grad_out_tq_tk_tv_tout_tr   r   r   dq_outdk_outdv_outs                              r   rF   rF     s    & (%eS)YdDE <UGDEE (8%eS($JS#u 4JBB& .b"b9FFF66!!r   FA3)register_fn)r&   ztorch.devicer   int)flash_attn_interface)r.   strr   r   )r.   r   r   r   )r   r
   )r]   torch.Tensorr^   ztuple[torch.Tensor, ...]r_   floatr`   torch.Tensor | Nonera   r   rb   r   rc   r   r   
str | None)r]   r   rp   r   rq   r   r_   r   rr   boolrs   r   rt   r   r`   r   ra   r   rb   r   rc   r   r   r   )rz   r   r]   r   rp   r   rq   r   r{   r   r|   r   r_   r   r`   r   r}   
int | Noner~   r   r   r   )r^   z
Unpack[Ts]r   ztuple[Unpack[Ts]])r   r   r   r   )NNNN)"r]   r   rp   r   rq   r   r   r   r   r   r   r   r   r   r   float | Noner   r   r}   r   r~   r   rt   r   r{   r   ra   r   rb   r   rc   r   r   z!tuple[torch.Tensor, torch.Tensor])F) rz   r   r]   r   rp   r   rq   r   r{   r   r|   r   r   r   r   r   r   r   r   r   r   r   r   r   r}   r   r~   r   r   r   r   z/tuple[torch.Tensor, torch.Tensor, torch.Tensor])NNN)&r]   r   rp   r   rq   r   r`   r   r   r   r   r   r   r   r_   r   r   r   rr   r   ra   r   rb   r   rc   r   r   r   r}   r   r~   r   rt   r   rs   r   r{   r   ) r]   r   rp   r   rq   r   r`   r   r   r   r   r   r   r   r_   r   r   r   rr   r   r   r   r}   r   r~   r   rt   r   rs   r   r{   r   )"rz   r   r]   r   rp   r   rq   r   r{   r   r|   r   r`   r   r   r   r   r   r   r   r_   r   r   r   r   r   r   r   r   r   r}   r   r~   r   )NNNrI   FF)r]   r   rp   r   rq   r   ra   r   rb   r   rc   r   r_   r   r   r   rr   r   r   r   )rI   FF)r]   r   rp   r   rq   r   r_   r   r   r   rr   r   r   r   )rz   r   r]   r   rp   r   rq   r   r{   r   r|   r   r`   r   r   r   r   r   r   r   r_   r   r   r   r   r   r   r   r   r   )-__doc__
__future__r   r3   rX   typingr   collections.abcr   dataclassesr   	functoolsr   typing_extensionsr   r	   r   torch.libraryr
    r   __all__r   r    r   r   r)   r   r+   r,   rd   rv   r   r   r   r   r   r   rA   rC   rE   rB   rD   rF   register_flash_attention_implr!   r   r   <module>r      s[   #     ( !  2  !  #
 "& %!% % ) ) )   .///"/"8""%" " #	"
 #" #" #" "J((	( ( 	(
 ( &( #( #( #( #( #( (V### 
# 	#
 
# # # ## !# "# #L $5H$  $%)%)%)!J)J)	J) J) "	J)
 "J) J) J) J) J) !J) "J) #J) 
J) #J) #J)  #!J)" '#J)x  888 
8 	8
 
8 8 "8 "8 8 8 8 8 8 8 8  5!8L &*%)%):: %)(,#)::::	:: :: #	::
 #:: :: :: :: :: :: #:: #:: #:: ::  !::" #::$ #%::& &'::( 
)::R %)(,##''	' ' #	'
 #' ' ' ' ' ' ' ' ' #'  &!'" 
#'t #'$(%888 
8 	8
 
8 8 #8 #8 8 8 8 8 8 8  !8" !#8$ "%8~ &*%)%)#D DD	D D #	D
 #D #D D D D DV # 	  	
   P !2"2"2" 
2" 	2"
 
2" 2" #2" #2" 2" 2" 2" 2" 2"  2"  !2"j (	 ' ';W Xr   