
    i\1              #          d Z ddlZddlmZ ddlmZmZ ddlZ ej                  e	      Z
ddgZdee   dz  dee   fd	Z ed
      dedefd       Z G d de      Zej$                  j'                  di       	 	 	 d0dej(                  dej(                  dej(                  dej(                  dej(                  dededededz  dee   dz  deej(                  ej(                  ej(                  f   fd       Zej0                  	 	 	 d0dej(                  dej(                  dej(                  dej(                  dej(                  dededededz  dee   dz  deej(                  ej(                  ej(                  f   fd       Zdddddej(                  dej(                  dej(                  dej(                  dej(                  dedededz  dedz  deeef   dej(                  eej(                  ej(                  f   z  fdZd ed!eed"f   d#eddfd$Zej$                  j'                  d%i       	 	 d1d&ej(                  dej(                  dej(                  dej(                  d'ej(                  d(ej(                  dej(                  dej(                  dededed)ej(                  dedz  dee   dz  deej(                  ej(                  ej(                  f   fd*       Zej0                  	 	 d1d&ej(                  dej(                  dej(                  dej(                  d'ej(                  d(ej(                  dej(                  dej(                  dededed)ej(                  dedz  dee   dz  deej(                  ej(                  ej(                  f   fd+       Zd ed&ej(                  d,ej(                  d-ej(                  deej(                  dz  d"f   f
d.Zej?                  ee/       y)2z
Variable-length attention implementation using Flash Attention.

This module provides a high-level Python interface for variable-length attention
that calls into the optimized Flash Attention kernels.
    N)	lru_cache)Any
NamedTuplevarlen_attn
AuxRequestwindow_sizereturnc                 \    | ddg} t        |       dk7  rt        dt        |              | S )N   z$window_size must have length 2, got )len
ValueError)r   s    j/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/torch/nn/attention/varlen.py_normalize_window_sizer      s=    2h
;1?K@P?QRSS       )maxsizedevice_indexc                      y)z;Cache device capability check to avoid repeated CUDA calls.F )r   s    r   _should_use_cudnnr      s     r   c                        e Zd ZU dZdZeed<   y)r   z
    Request which auxiliary outputs to compute from varlen_attn.

    Each field is a boolean indicating whether that auxiliary output should be computed.
    FlseN)__name__
__module____qualname____doc__r   bool__annotations__r   r   r   r   r   #   s     Cr   ztorch_attn::_varlen_attn)mutates_argsquerykeyvaluecu_seq_qcu_seq_kmax_qmax_k	is_causalscalec
                 z   t        |	      }	| j                  xr t        | j                  j                        }
|
rvt
        j                  d       |	d   dk7  s|	d   dk7  rt        d      t        j                  j                  j                  | ||d||||dd|d	|
      }|d   |d   |d   }}}nWt
        j                  d       t        j                  j                  j                  | ||||||d|d	||	d   |	d         \  }}}}}t        j                  dt        j                  | j                        }|||fS )z
    Private custom op for variable-length attention.

    This is the internal implementation. Users should use the public varlen_attn function instead.
    #Using cuDNN backend for varlen_attnr   r      TcuDNN backend does not support window attention. Please use Flash Attention backend.NT        Fr)      -Using Flash Attention backend for varlen_attn)return_debug_maskr)   window_size_leftwindow_size_rightr   dtypedevice)r   is_cudar   r8   indexloginfoRuntimeErrortorchopsaten_cudnn_attention_forward_flash_attention_forwardzerosuint64)r!   r"   r#   r$   r%   r&   r'   r(   r)   r   	use_cudnnresultoutputsoftmax_lse	rng_state_
rng_state_s                    r   _varlen_attnrL   -   sW   $ )5KG"3ELL4F4F"GI67q>R;q>R#7f  88 9 
  *0F1IvayY@A/4yy~~/V/V#(^)!n 0W 0
,Y1  ELLJ ;
**r   c
                    t        |	      }	t        j                  |       }
| j                  d      }| j                  d      }t        j                  j
                  rH|j                  d      dz
  }t        j                  |||ft        j                  | j                        }n2t        j                  ||ft        j                  | j                        }t        j                  dt        j                  | j                        }|
||fS )z
    Fake implementation for meta tensor computation and tracing.

    Based on the 3D varlen path from meta__flash_attention_forward:
    - query shape: (total, num_heads, head_dim)
    - logsumexp shape: (num_heads, total_q)
    r   r,   r6   r5   )
r   r>   
empty_likesizeversionhipemptyfloatr8   rD   )r!   r"   r#   r$   r%   r&   r'   r(   r)   r   rG   total_q	num_heads
batch_size	logsumexprI   s                   r   _varlen_attn_fakerX   s   s    ( )5K e$F jjmG

1I}}]]1%)
KKE*%++ell
	 KK ELL
	 DU\\JI9i''r   )r   r   )
return_auxr)   r   rY   c                    |	dk(  }
t         j                  j                  j                  | |||||||
|t	        |	      
      \  }}}||j
                  r||fS |S )au  
    Compute variable-length attention using Flash Attention.
    This function is similar to scaled_dot_product_attention but optimized for
    variable-length sequences using cumulative sequence position tensors.

    Args:
        query (Tensor): Query tensor; shape :math:`(T_q, H, D)`
        key (Tensor): Key tensor; shape :math:`(T_k, H, D)`
        value (Tensor): Value tensor; shape :math:`(T_k, H, D)`
        cu_seq_q (Tensor): Cumulative sequence positions for queries; shape :math:`(N+1,)`
        cu_seq_k (Tensor): Cumulative sequence positions for keys/values; shape :math:`(N+1,)`
        max_q (int): Maximum query sequence length in the batch.
        max_k (int): Maximum key/value sequence length in the batch.
        return_aux (Optional[AuxRequest]): If not None and ``return_aux.lse`` is True, also returns the logsumexp tensor.
        scale (float, optional): Scaling factor for attention scores
        window_size (tuple[int, int], optional): Window size for sliding window attention as (left, right).
            Use (-1, -1) for full attention (default), (-1, 0) for causal attention,
            or (W, 0) for causal attention with sliding window of size W.

    Returns:
        output (Tensor): Output tensor from attention computation; shape :math:`(T_q, H, D)`.

        If ``return_aux`` is not None and ``return_aux.lse`` is True:
            lse (Tensor): Log-sum-exp of attention scores; shape :math:`(T_q, H)`.

    Shape legend:
        - :math:`N`: Batch size
        - :math:`T_q`: Total number of query tokens in the batch (sum of all query sequence lengths)
        - :math:`T_k`: Total number of key/value tokens in the batch (sum of all key/value sequence lengths)
        - :math:`H`: Number of attention heads
        - :math:`D`: Head dimension

    Example::

        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
        >>> batch_size, max_seq_len, embed_dim, num_heads = 2, 512, 1024, 16
        >>> head_dim = embed_dim // num_heads
        >>> seq_lengths = []
        >>> for _ in range(batch_size):
        ...     length = torch.randint(1, max_seq_len // 64 + 1, (1,)).item() * 64
        ...     seq_lengths.append(min(length, max_seq_len))
        >>> seq_lengths = torch.tensor(seq_lengths, device="cuda")
        >>> total_tokens = seq_lengths.sum().item()
        >>>
        >>> # Create packed query, key, value tensors
        >>> query = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>> key = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>> value = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>>
        >>> # Build cumulative sequence tensor
        >>> cu_seq = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32)
        >>> cu_seq[1:] = seq_lengths.cumsum(0)
        >>> max_len = seq_lengths.max().item()
        >>>
        >>> # Call varlen_attn
        >>> output = varlen_attn(
        ...     query, key, value, cu_seq, cu_seq, max_len, max_len
        ... )
    )r   r   )r>   r?   
torch_attnrL   listr   )r!   r"   r#   r$   r%   r&   r'   rY   r)   r   r(   outr   rJ   s                 r   r   r      sn    \ w&I))&&33[KCa *..CxJr   ctxinputs.rG   c           
          |\
  }}}}}}}	}
}}|\  }}}| j                  ||||||||       || _        |	| _        |
| _        || _        || _        y N)save_for_backwardr&   r'   r(   r)   r   )r^   r_   rG   r!   r"   r#   r$   r%   r&   r'   r(   r)   r   r]   r   rI   s                   r   _setup_contextrc      su     	 Ci%eXxc9UCICICMCI!COr   z!torch_attn::_varlen_attn_backwardgrad_outr]   r   rI   c                 N   t        |      }t        j                  d|j                        }|j                  xr t        |j                  j                        }|rmt        j                  d       |d   dk7  s|d   dk7  rt        d      t        j                  j                  j                  | |||||||||	d|
|||      \  }}}nYt        j                  d	       t        j                  j                  j                  | |||||||||	d|
||||d   |d   
      \  }}}|||fS )Nr   )r8   r+   r   r,   r-   r.   r/   r1   )r)   r3   r4   )r   r>   rR   r8   r9   r   r:   r;   r<   r=   r?   r@   _cudnn_attention_backward_flash_attention_backward)rd   r!   r"   r#   r]   r   r$   r%   r&   r'   r(   rI   r)   r   unusedrE   dqdkdvs                      r   _varlen_attn_backwardrl     sD   " )5K[[5<<0FG"3ELL4F4F"GI67q>R;q>R#7f  YY^^== > 

B$ 	@AYY^^==(^)!n# > 

B& r2:r   c                     t        |      }t        j                  |      }t        j                  |      }t        j                  |      }|||fS )zF
    Fake implementation for meta tensor computation and tracing.
    )r   r>   rN   )rd   r!   r"   r#   r]   r   r$   r%   r&   r'   r(   rI   r)   r   
grad_querygrad_key
grad_values                    r   _varlen_attn_backward_fakerq   \  sK    ( )5K!!%(J$H!!%(Jx++r   grad_lsegrad_rngc                 0   | j                   \  }}}}}}	}
}| j                  }| j                  }| j                  }| j                  }| j
                  }t        j                  j                  j                  |||||	|
||||||||      \  }}}|||d d d d d d d f
S ra   )
saved_tensorsr&   r'   r(   r)   r   r>   r?   r[   rl   )r^   rd   rr   rs   r!   r"   r#   r$   r%   r]   r   rI   r&   r'   r(   r)   r   ri   rj   rk   s                       r   	_backwardrv   y  s     BEARAR>E3x3YIIEIIEIIIE//K%%;;JBB  r2tT4tT4??r   )setup_context)FNN)NN) r   logging	functoolsr   typingr   r   r>   	getLoggerr   r;   __all__r\   intr   r   r   r   library	custom_opTensorrS   tuplerL   register_fakerX   r   rc   rl   rq   rv   register_autogradr   r   r   <module>r      s     "  g!,
'S	D(8 T#Y  1C D  
  3"E $(B+<<B+	B+ <<B+ ll	B+
 llB+ B+ B+ B+ 4<B+ cT!B+ 5<<u||34B+ FB+J  $(((<<((	(( <<(( ll	((
 ll(( (( (( (( 4<(( cT!(( 5<<u||34(( ((h %)#+]<<]	] <<] ll	]
 ll] ] ] T!] 4<] sCx] \\E%,,455]@" "U38_ "c "d "0 <2N $(AllA<<A 
A <<	A
 
A 
A llA llA A A A ||A 4<A cT!A 5<<u||34A OAH $$ $(,ll,<<, 
, <<	,
 
, 
, ll, ll, , , , ||, 4<, cT!, 5<<u||34, %,8@	@@05@HM@
5<<$#$@<   y  Gr   