
    i                        d dl mZ d dlmZ d dlmZ d dlZd dlmZ ddl	m
Z
mZmZmZ  G d d	      Zd
edefdZdedededefdZdedededefdZ	 d dej(                  dee   dee   deddf
dZdedededededee
   fdZy)!    )OrderedDict)ceil)AnyN)PretrainedConfig   )FutureRequestStateRequestStateRequestStatusloggerc                       e Zd ZdZdeddfdZddZdededej                  j                  dz  fd	Z
dd
eddfdZdededej                  j                  ddfdZy)CudaGraphBufferz>A fixed-size dict for CUDA graphs with LRU eviction when full.max_sizereturnNc                 V    |dk  rt        d|       || _        t               | _        y )Nr   z#max_size must be positive, but got )
ValueErrorr   r   _storage)selfr   s     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/generation/continuous_batching/utils.py__init__zCudaGraphBuffer.__init__   s,    q=B8*MNN LWM    c                 \    | j                   }d| _         | j                  d       || _         y )Nr   T)silent)r   plan_for_new_graph)r   original_max_sizes     r   __del__zCudaGraphBuffer.__del__"   s+     MMt,)r   q_lenkv_lenc                 ~    | j                   j                  ||f      }|| j                   j                  ||f       |S N)r   getmove_to_endr   r   r   graphs       r   	get_graphzCudaGraphBuffer.get_graph(   s;    !!5&/2MM%%ufo6r   r   c                 "   t        | j                        | j                  k\  rm| j                  j                  d      \  }}|st	        j
                  d|       |j                          t        | j                        | j                  k\  rly y )NF)lastz!Evicting graph for evicted_key = )lenr   r   popitemr   inforeset)r   r   evicted_keyevicted_graphs       r   r   z"CudaGraphBuffer.plan_for_new_graph.   sn    $-- DMM1)-)>)>E)>)J&K@+1ABC!	 $-- DMM1r   r#   c                 |    | j                          t        j                  d|d|       || j                  ||f<   y )NzSetting graph for q_len = z, kv_len = )r   r   r)   r   r"   s       r   	set_graphzCudaGraphBuffer.set_graph5   s:    !1
,VK@A).ufo&r   )r   N)F)__name__
__module____qualname____doc__intr   r   torchcuda	CUDAGraphr$   boolr   r.    r   r   r   r      s    HZ Z Z*s C EJJ4H4H44O " "$ "/s /C /

8L8L /QU /r   r   configr   c                     | j                   dv S )z:Checks if attention mask is needed for the given (config).)zpaged|eagerz
paged|sdpa)_attn_implementation)r9   s    r   attn_mask_is_neededr<   <   s    &&*GGGr   sizeinterval_size	max_valuec                 X    |dk  r|S | dkD  rt        | |z        |z  n|}t        ||      S )zQReturn the smallest multiple of (interval_size) >= (size), capped at (max_value).r   )r   min)r=   r>   r?   paddeds       r   pad_to_intervalrC   A   s9    ;?!8T$&'-7Fvy!!r   x	divide_byalign_toc                 T    t        t        | |z              } | |z  r| || |z  z
  z  } | S r   )r3   r   )rD   rE   rF   s      r   aligned_dividerH   I   s4    DY A8|	XX&&Hr   attention_maskcumulative_seqlens_qcumulative_seqlens_ksliding_windowc                 J   t        j                  | j                        j                  }t	        t        |      dz
        D ]  }||dz      ||   z
  }||dz      ||   z
  }||k  r|dk\  r	||z
  dz   }nd}t        ||   ||dz            }	t        ||   ||dz            }
t        j                  | d|	|
f   j                  || j                  | j                        }t        j                  ||      }|dkD  r"||z
  |z
  }|t        j                  ||      z  }|| d|	|
f<    y)u  Builds an attention mask inplace using the cumulative seqlens of the query and key. If given a sliding window, it
    will also apply a sliding window mask on top. The attention mask is not boolean, it uses zeroes and -inf (or its
    equivalent) so it's more of an attention score bias tensor.
    The attention mask is a block-diagonal matrix, with each block an attention mask for a single query-key pair.
    Each of those block is built from a causal mask and, if there is a sliding window, a sliding window mask.

    An example is represented below, with seqlen_k = 8, seqlen_q = 4 and sliding_window = 6:

    CAUSAL MASK:

           █ █ █ █ █ ░ ░ ░
           █ █ █ █ █ █ ░ ░
           █ █ █ █ █ █ █ ░
           █ █ █ █ █ █ █ █

    SLIDING WINDOW MASK:
         ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 8 - 4 - 6 = -2 offset to the left
       <─┴─>
     ░ █ | █ █ █ █ █ █ █ █
     ░ ░ | █ █ █ █ █ █ █ █
     ░ ░ | ░ █ █ █ █ █ █ █
     ░ ░ | ░ ░ █ █ █ █ █ █

    ATTENTION MASK (sum of causal and sliding window masks):

           █ █ █ █ █ ░ ░ ░
           █ █ █ █ █ █ ░ ░
           ░ █ █ █ █ █ █ ░
           ░ ░ █ █ █ █ █ █

    Another example with seqlen_k = 5, seqlen_q = 3 and sliding_window = 2:

    CAUSAL MASK:

           █ █ █ ░ ░
           █ █ █ █ ░
           █ █ █ █ █

    SLIDING WINDOW MASK:
         ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 5 - 3 - 2 = 0 offset to the left
        <┴>
         | ░ █ █ █ █
         | ░ ░ █ █ █
         | ░ ░ ░ █ █

    ATTENTION MASK (sum of causal and sliding window masks):

           ░ █ █ ░ ░
           ░ ░ █ █ ░
           ░ ░ ░ █ █

    r   .)dtypedevice)diagonalN)r4   finforN   rA   ranger'   slicefullshaperO   triutril)rI   rJ   rK   rL   	min_valueiseqlen_qseqlen_kcausal_diagonalquery_range	key_range	minus_infmaskedsliding_diagonals                 r   build_attention_maskrb   P   sS   t N00155I3+,q01 ='A.1Ea1HH'A.1Ea1HHh8q=&1A5OO035I!a%5PQ.q13GA3NO	JJ3Y67== &&!((	
	 I@A'(2^Cejj5EFFF6<sK23-=r   numstatusnum_query_tokensnum_cache_tokenscachec                    t        |       D cg c]  }d|j                   d| d }}||z   }t        ||j                  z        }g }	|D ]m  }
t	        |
dg|z  d      }||_        dg|z  |_        ||_        |j                  ||j                  d      }||	c S |	j                  t        |dd             o |	S c c}w )	zQAn utility function to create a list of FutureRequestStates for the warmup of CB.	__warmup____r   r   )
request_idinitial_tokensmax_new_tokensT)has_new_tokencomplete_blocks)rR   namer   
block_sizer	   _statustokens_to_processposition_offsetallocate_blocksrl   appendr   )rc   rd   re   rf   rg   rY   request_idstotal_tokensblocks_neededfuture_statesreq_idstate	allocateds                r   create_warmup_future_statesr      s     =B#JGqYv{{m1QCr2GKG#&66L(8(889MM 	_s\?Qbcd#$#(8"8 0))-9I9I1M	  /T[\]^	_  Hs   B=)r   )collectionsr   mathr   typingr   r4    transformers.configuration_utilsr   requestsr   r	   r
   r   r   r7   r<   r3   rC   rH   Tensorlistrb   r   r8   r   r   <module>r      s   $    = M M /  /FH 0 HT H
"# "c "c "c "c c S S  	Q=LLQ=s)Q= s)Q= 	Q=
 
Q=h	  	
  

r   