
    i                        d dl Z ddlmZ ddlmZ e j
                  j                  de j                  j                  de j                  de j                  de j                  d	e j                  dz  d
ede j                  de j                  e
ee j                  f   z  dedee
eef   z  de j                  dz  dee j                  df   fd       Zy)    N   )PagedAttentionCache)!lazy_import_paged_flash_attentionmoduleqkvattention_maskcachecu_seq_lens_qcu_seq_lens_kmax_seqlen_qmax_seqlen_kblock_tablereturnc                 2   t        | j                  j                        \  }}t        | dd      sdn| j                  dz
  df}|dk(  rdnd}t        |t              r
||   }|	|   }	|
	|j                  ||| j                  |d	   |d
         \  }}d|v rd|j                  d      ini } ||j                  dd      j                  d      j                         |j                         |j                         |j                  t        j                        |j                  t        j                        j!                         ||	f| j"                  d|d|}t        |t$              r|d   }|dfS |j&                  | j                     \  }}|j(                  |   j+                  d|j,                  |j.                  |j0                        }|j2                  |   j+                  d|j,                  |j.                  |j0                        }|j5                  dddd      j                         }|j5                  dddd      j                         }|j5                  dddd      j                         }|j7                  d      }|d|dz    |d| z
  dz
  j                  t        j                        }|j9                  |      |
|   i}d|v r|d   |d<    |d||||||| j"                  d|d	|}t        |t$              r|d   }|j                  d      }|dfS )ap  Performs the forward pass of attention with paged key-value cache. This function handles the cache updates and
    performs the attention computation. For decode-only batches (when block_table is provided), uses
    `flash_attn_with_kvcache` for fused attention + cache update. Otherwise uses `flash_attn_varlen_func`.
    See the [paged attention guide](https://huggingface.co/docs/transformers/en/paged_attention) for more details.

    Args:
        q: (1, nheads, total_q, headdim), where total_q = total number of query tokens in the batch.
        k: (1, nheads_k, total_k, headdim), where total_k = total number of key tokens in the batch.
        v: (1, nheads_k, total_k, headdim), where total_k = total number of key tokens in the batch.
        cu_seq_lens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seq_lens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        block_table: (num_groups, batch_size, max_blocks_per_seq), dtype int32. Block table for paged KV cache.
            If provided, uses flash_attn_with_kvcache for fused attention + cache update. For each request, the block
            table is a vector of size (max_blocks_per_seq,) with indices indicating the physical location of the cache
            to read from and write to. The kernel, using the cache_seqlens for that request, knows how much cache to
            read and dispatches the read using the block table. Same for the write. If a request has fewer than
            max_blocks_per_seq blocks, the block table is padded with -1s to indicate that the block is not allocated.
    sliding_windowF)r      r   full_attentionsliding_attentionN
read_indexwrite_index)
key_statesvalue_states	layer_idxr   r   s_auxr   T)softmax_scalecausalwindow_sizer      )	r   k_cachev_cacher   r	   cache_seqlensr   r   r     )r   config_attn_implementationgetattrr   
isinstancedictupdater   get	transposesqueeze
contiguoustotorchint32clonescalingtuplelayer_index_to_group_indices	key_cacheview
block_sizenum_key_value_headshead_dimvalue_cachepermutesizeget_block_table_key)r   r   r   r	   r
   r   r   r   r   r   r   kwargsflash_attn_varlen_funcflash_attn_with_kvcacher   
layer_typecustom_kwargsattn_output	group_idxlayer_idx_in_groupr"   r#   
batch_sizer$   flash_kwargss                            v/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/integrations/flash_paged.pypaged_attention_forwardrK      sI   L 7X**733
 &-V5Eu%MXTZTiTilmTmopSqN%3x%?!EXJ-&%j1#J/ ||&&l+}-  
1 ;BV:K&**W"56QS,KK1%%a(335LLNLLNU[[)U[[)//1
 !..&
 
 k5)%a.KT K ).(J(J6K[K[(\%	%//"45::  %";";U^^
 ##$67<<  %";";U^^
 IIaAq!,,.IIaAq!,,.IIaAq!,,. VVAY
&q:>:];J=WWZ[[__`e`k`kl112IJKXaLbcf$*7OL!- 
' ..&
 
 k5)%a.K!))!,    )r1   generation.continuous_batchingr   modeling_flash_attention_utilsr   compilerdisablennModuleTensorr*   strintr5   rK   r%   rL   rJ   <module>rV      s
    @ N sHHOOs||s ||s ||	s
 LL4's s <<s <<$sELL'8"99s s S#X&s $s 5<<s srL   