
    i9                     p   d Z ddlmZmZ ddlZddlmZ ddlmZm	Z	 ddl
mZmZmZmZ  ed      Z e       rdd	lmZ dd
lmZmZmZ erddlmZ ndZ e	j.                  e      Z G d d      Zdedeeeed   z  f   fdZ	 d%dej>                  dej>                  dej>                  dej>                  e ej>                  ej>                  f   z  fdZ!ej>                  e"z  Z#	 	 	 	 	 d&dej>                  de"dz  de e#e#f   dz  dedz  ddf
dZ$dej>                  de"dej>                  fdZ%	 	 	 d'dejL                  jN                  dej>                  dej>                  dej>                  d eej>                  df   d!e(dz  d"e(dz  d#ej>                  dz  de ej>                  ej>                  dz  f   fd$Z)y)(a7  
Partially inspired by torchtune's flex attention implementation

Citation:
@software{torchtune,
  title = {torchtune: PyTorch's finetuning library},
  author = {torchtune maintainers and contributors},
  url = {https//github.com/pytorch/torchtune},
  license = {BSD-3-Clause},
  month = apr,
  year = {2024}
}
    )OptionalUnionN)version   )is_torch_flex_attn_availablelogging)get_torch_versionis_torch_greater_or_equalis_torch_less_or_equalis_torchdynamo_compilingz2.9.0)_DEFAULT_SPARSE_BLOCK_SIZE)	BlockMaskcreate_block_maskflex_attention)
AuxRequestc                   x     e Zd ZdZdZdZdZ fdZej                  j                  d      d        Zd Z xZS )WrappedFlexAttentionzh
    We are doing a singleton class so that flex attention is compiled once when it's first called.
    NFc                 \    | j                   t        | 	  |       | _         | j                   S N)	_instancesuper__new__)clsargskwargs	__class__s      y/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/integrations/flex_attention.pyr   zWrappedFlexAttention.__new__D   s'    == !GOC0CM}}    )	recursivec                    | j                   r|| j                  k7  r|| _        t        d      r!t        j                  t
        d      | _        nlt        j                  t                     j                  dk(  r$|r"t        j                  t
        dd      | _        nt        j                  t
              | _        d| _         yy)	z>
        Initialize or update the singleton instance.
        2.5.1F)dynamicz2.6.0zmax-autotune-no-cudagraphs)r"   modeTN)_is_flex_compiledtrainingr   torchcompiler   _compiled_flex_attentionr   parser	   base_version)selfr%   s     r   __init__zWrappedFlexAttention.__init__J   s    
 %%T]])B$DM%g.05nV[0\- 023@@GKPX05"E8T1-
 16n0M-%)D" *Cr   c                     | j                   S r   )r(   )r+   s    r   __call__zWrappedFlexAttention.__call__`   s    ,,,r   )__name__
__module____qualname____doc__r   r$   r(   r   r&   compilerdisabler,   r.   __classcell__)r   s   @r   r   r   ;   sK     I# ^^e,* -**-r   r   
return_lsereturnr   c                 <    t         rd| rt        d      iS diS d| iS )aU  
    Requests the LSE from flex_attention in a version-agnostic fashion.

    Before torch 2.9, the LSE was requested via the boolean return_lse field. However, starting with
    torch 2.9, an AuxRequest object must be passed via the aux_request field. This method conditionally
    returns the correct form based on the python version.
    
return_auxT)lseNr6   )_TORCH_FLEX_USE_AUXr   )r6   s    r   get_flex_attention_lse_kwargsr<   d   s,     jjT2KKdKK*%%r   querykeyvaluec                 X    t               s t        |             nt        } || ||fi |S r   )r   r   r   )r=   r>   r?   r%   r   flex_attention_compileds         r   compile_friendly_flex_attentionrB   r   s@     G_F`<28<>ft" 	 r   attention_mask_2dattention_chunk_sizeoffsets	is_causalr   c                 D     j                   \  }}|s|}|s|}|t        z  dz   t        z  }t        j                  j                  j                   dd||z
  f        j                  }	 j                         |4j                         j                  d      j                  d      dz
  |z   fdfd}
 fd}|s|n|n|
|0|d   j                  |	      |d   j                  |	      fd	}n}t        ||d|||	t        d
             S )aG  
    IMPORTANT NOTICE: This function is deprecated in favor of using the mask primitives in `masking_utils.py`,
    and will be removed in a future version without warnings. New code should not use it. It is only kept here
    for BC for now, while models using it are being patched accordingly.

    Create a block (causal) document mask for a batch of sequences, both packed and unpacked.
    Create Block (causal) logic and passing it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
    The resultant BlockMask is a compressed representation of the full (causal) block
    mask. BlockMask is essential for performant computation of flex attention.
    See: https://pytorch.org/blog/flexattention/

    Args:
        attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences
        of shape (batch_size, total_seq_len). e.g.

        For unpacked sequence:
        [[1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 0, 0]]

        For packed sequence:
        [[1, 1, 1, 2, 2, 2, 0],
         [1, 1, 2, 2, 2, 3, 3]]

    Returns:
        BlockMask
       r   )r?   padNc                 T    ||k\  }	| |f   	| |f   k(  }| |f   dkD  }||z  |z  }|S )z
        Defines the logic of a block causal mask by combining both a standard causal mask
        and a block diagonal document mask.
        See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
        for an illustration.
        r    )
	batch_idxhead_idxq_idxkv_idxcausal_maskdocument_maskpadding_mask
final_maskrC   document_idss
           r   causal_mask_modz4make_flex_block_causal_mask.<locals>.causal_mask_mod   sV     vo$Y%56,yRXGX:YY(E)9:Q> </-?
r   c                 B    | |f   | |f   k(  } | |||      }||z  S )zU
        Combines the chunk mask with the causal mask for chunked attention.
        rL   )rM   rN   rO   rP   
chunk_maskcausal_doc_maskrV   
chunk_idxss         r   chunk_causal_mask_modz:make_flex_block_causal_mask.<locals>.chunk_causal_mask_mod   s>      	5 01Z	6@Q5RR
))XufMO++r   c                 D    | |f   | |f   k(  }| |f   dkD  }||z  }|S )zp
        Utilizes default attention mask to enable encoder and encoder-decoder
        attention masks.
        r   rL   )	rM   rN   rO   rP   rR   rS   rT   rC   rU   s	          r   default_mask_modz5make_flex_block_causal_mask.<locals>.default_mask_mod   sH    
 %Y%56,yRXGX:YY(F):;a?!M1
r   c                 .    |z   }|z   } | |||      S r   rL   )	rM   rN   rO   rP   offset_q	offset_kv	kv_offsetmask_mod_maybe_combinedq_offsets	         r   mask_modz-make_flex_block_causal_mask.<locals>.mask_mod   s(    x'H*I*9h)TTr   r!   )rd   BHQ_LENKV_LENdevice_compile)shapeflex_default_block_sizer&   nn
functionalrI   ri   clonefill_cumsumtor   r   )rC   rD   query_length
key_lengthrE   rF   
batch_sizetotal_seq_lenpad_lenri   r[   r]   rd   rV   rZ   rU   ra   rb   rc   s   `            @@@@@@r   make_flex_block_causal_maskrx      sC   D !2 7 7J"
$55:>UUG++//0AQRT[^hThPi/j%%F$**,L'"((*003::2>BH\]
,	 "25I5Q/Wl1:==(AJMM&)		U
 +

+G44	 	r   hidden_statesn_repc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    rH   N)rk   expandreshape)ry   rz   batchnum_key_value_headsslenhead_dims         r   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr   moduleattention_maskscalingsoftcaps_auxc           
      X   |j                  dd      dkD  rt        d      d }	d t        |t              r|}	n|d d d d d d d |j                  d   f   fd}
d}|j                  d   }||dz
  z  dk7  rTt        ||j                  d   |j                  d   z        }t        ||j                  d   |j                  d   z        }d	}|j                  d
      }|j                  j                  dk7  }|s|t        d      t        |||f|
|	|||| j                  dt        |      }|rt        r|\  }}|j                  }n|\  }}|j                  |j                        }||j                  \  }}}}|j                  dddd      j!                  |||d      }|j#                  d      }t%        j&                  t%        j(                  ||gd      dd      }t%        j*                  ||z
        }||z  }n|}d }|j-                  dd      j/                         }||fS )Ndropoutg        r   z`flex_attention` does not support `dropout`. Please use it with inference only (`model.eval()`) or turn off the attention dropout in the respective config.c                 h    t        j                  | z        z  } | |   d   |   |   z   } | S )Nr   )r&   tanh)scorerM   rN   rO   rP   
score_maskr   s        r   	score_modz)flex_attention_forward.<locals>.score_mod!  sK    ejj99E!Jy1!4U;FCCE r   TrH   Fkernel_optionscpuzhAttention sinks cannot be run on CPU with flex attention. Please switch to a different device, e.g. CUDA)r   
block_mask
enable_gqascaler   r%   rJ   )dim)r   keepdimr   )get
ValueError
isinstancer   rk   r   ri   typerB   r%   r<   r;   r:   rr   dtypeviewr|   	unsqueezer&   	logsumexpcatexp	transpose
contiguous)r   r=   r>   r?   r   r   r   r   r   r   r   r   num_local_query_headsr   r6   flex_attention_outputattention_outputauxr:   ru   	num_heads	seq_len_q_sinkslse_expandedcombined_lserenorm_factorr   s         `                    @r   flex_attention_forwardr     si    zz)S!A%a
 	

 JJ.),#
#
1a399R= 89
 J!KKN 	!6!:;AU[[^syy|;<%Q5;;q>!AB
ZZ 01N""e+J%+v
 	
 < %  (

3   $9!c''C$9!c ffU[[!2B2H2H/J	9aJJq"a+22:y)UVWE
 ==,L ??599lE5JPR+SY[eijL "IIl\&ABM/-?0'11!Q7BBDS  r   )F)NNNNT)NNN)*r2   typingr   r   r&   	packagingr   utilsr   r   utils.import_utilsr	   r
   r   r   r;   !torch.nn.attention.flex_attentionr   rl   r   r   r   r   
get_loggerr/   loggerr   booldictstrr<   TensortuplerB   intOffsetrx   r   rm   Modulefloatr   rL   r   r   <module>r      s<  8 #   9  08   !g^^@
 
		H	%&- &-R&d &tCQ]H^A^<^7_ &$ 	<<	 << \\E%,,455$ 
	 (,,0!o||o*o
 66>"T)o d{o od	UU\\ 	U# 	U%,, 	U$ ! !%f!HHOOf!<<f! 
f! <<	f!
 %,,34f! T\f! T\f! <<$f! 5<<,,-f!r   