
    i2                        d dl mZ d dlmZ d dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZmZmZmZ ddlmZ ddlmZmZ ddl m!Z!  ejD                  e#      Z$e ed       G d de                    Z%e ed       G d de                    Z& G d dejN                        Z( G d dejN                        Z)	 dMdejN                  dejT                  dejT                  dejT                  d ejT                  dz  d!e+d"e+fd#Z,d$ Z- G d% d&ejN                        Z.dNd'ejT                  d(e+d)e/d*ejT                  fd+Z0 G d, d-ejN                        Z1 G d. d/ejN                        Z2 G d0 d1e      Z3 G d2 d3ejN                        Z4d4ejT                  d5e5ejT                     d*ejT                  fd6Z6 G d7 d8ejN                        Z7 G d9 d:ejN                        Z8 G d; d<ejN                        Z9 G d= d>ejN                        Z: G d? d@e      Z; G dA dBe      Z< G dC dDejN                        Z=e G dE dFe             Z>e G dG dHe>             Z? edI       G dJ dKe>             Z@g dLZAy)O    )Callable)	dataclassN)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )VJEPA2ConfigzO
    VJEPA Predictor outputs that also contains the masked encoder outputs
    )custom_introc                       e Zd ZU dZej
                  ed<   dZej
                  dz  ed<   dZe	ej
                  df   dz  ed<   dZ
e	ej
                  df   dz  ed<   dZej
                  dz  ed<   y)	$VJEPA2WithMaskedInputPredictorOutputa  
    masked_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when `context_mask` is provided which is applied on VJEPA2Encoder outputs):
        The masked hidden state of the model.
    target_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when `target_mask` is provided which is applied on VJEPA2Encoder outputs):
        The target hidden state of the model.
    last_hidden_stateNmasked_hidden_state.hidden_states
attentionstarget_hidden_state)__name__
__module____qualname____doc__torchFloatTensor__annotations__r   r   tupler   r         {/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/vjepa2/modeling_vjepa2.pyr   r   #   s     (((48**T18:>M5**C/047>7;Je'',-4;48**T18r*   r   zs
    VJEPA outputs that also contains the masked encoder outputs
    Optionally contains the predictor outputs
    c                        e Zd ZU dZej
                  ed<   dZej
                  dz  ed<   dZe	ej
                  df   dz  ed<   dZ
e	ej
                  df   dz  ed<   dZedz  ed<    fd	Z xZS )
 VJEPA2WithMaskedInputModelOutputaq  
    masked_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when `context_mask` is provided which is applied on VJEPA2Encoder outputs):
        The masked hidden state of the model.
    predictor_output (`VJEPA2WithMaskedInputPredictorOutput`, *optional*):
        The output from the Predictor module.
    r   Nr   .r   r   predictor_outputc                     t        t        | 	               }t        |d   t              r|d   j                         |d<   t        |      S )N)listsuperto_tuple
isinstancer   r(   )selfoutput	__class__s     r+   r3   z)VJEPA2WithMaskedInputModelOutput.to_tupleM   sD    eg&()fRj"FG,,.F2JV}r*   )r!   r"   r#   r$   r%   r&   r'   r   r   r(   r   r.   r   r3   __classcell__r7   s   @r+   r-   r-   8   s     (((48**T18:>M5**C/047>7;Je'',-4;DH:TAH r*   r-   c                   x     e Zd ZdZ	 d	dedef fdZed        Zde	j                  de	j                  fdZ xZS )
VJEPA2PatchEmbeddings3Dz"
    Image to Patch Embedding
    confighidden_sizec                 H   t         |           |j                  | _        |j                  | _        || _        t        j                  |j                  ||j                  |j                  |j                  f|j                  |j                  |j                  f      | _        y )N)in_channelsout_channelskernel_sizestride)	r2   __init__
patch_sizetubelet_sizer=   r   Conv3din_chansprojr5   r<   r=   r7   s      r+   rC   z VJEPA2PatchEmbeddings3D.__init__Y   s    
 	 ++"//&II$,,f.?.?ARARS''):):F<M<MN	
	r*   c                     | j                   | j                  z  | j                  | j                  z  z  | j                  | j                  z  z  S Nframes_per_cliprE   	crop_sizerD   r<   s    r+   num_patchesz#VJEPA2PatchEmbeddings3D.num_patchesj   sO     ##v':'::6#4#4466#4#446	
r*   pixel_values_videosreturnc                 f    | j                  |      j                  d      j                  dd      }|S )N   r   )rH   flatten	transpose)r5   rQ   xs      r+   forwardzVJEPA2PatchEmbeddings3D.forwardr   s.    II)*2215??1Er*      )r!   r"   r#   r$   r   intrC   staticmethodrP   r%   TensorrX   r8   r9   s   @r+   r;   r;   T   sS      

 
" 
 
5<< ELL r*   r;   c                   f     e Zd ZdZddedef fdZdej                  dej                  fdZ	 xZ
S )	VJEPA2Embeddings>
    Construct mask token, position and patch embeddings.
    r<   r=   c                     t         |           || _        || _        t	        ||      | _        | j
                  j                  | _        |j                  | _        y )Nr=   )r2   rC   r<   r=   r;   patch_embeddingsrP   rD   rI   s      r+   rC   zVJEPA2Embeddings.__init__|   sM    & 7K X00<< ++r*   rQ   rR   c                 l   |j                   d   }|j                  ddddd      }|| j                  j                  k  r)|j	                  dd| j                  j                  dd      }| j
                  j                  j                  j                  }|j                  |      }| j                  |      }|S )Nr   r   rT   r      )dtype)
shapepermuter<   rE   repeatrc   rH   weightrf   to)r5   rQ   
num_framestarget_dtype
embeddingss        r+   rX   zVJEPA2Embeddings.forward   s    (..q1
 299!Q1aH 000"5"<"<Q4;;C[C[]^`a"b,,1188>>144<4H**+>?
r*   rY   )r!   r"   r#   r$   r   r[   rC   r%   r]   rX   r8   r9   s   @r+   r_   r_   w   s6    ,| ,# ,5<< ELL r*   r_   modulequerykeyvalueattention_maskscalingdropoutc                    t        j                  ||j                  dd            |z  }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr0   )dimrf   )ptrainingr   rT   )r%   matmulrV   r   
functionalsoftmaxfloat32rk   rf   ru   rz   
contiguous)
ro   rp   rq   rr   rs   rt   ru   kwargsattn_weightsattn_outputs
             r+   eager_attention_forwardr      s     <<s}}R'<=GL ==((2U]](SVVW\WbWbcL ==((6??([L,,|U3K''1-88:K$$r*   c                    | j                         \  }}}}t        j                  |dz  | j                  | j                        }||dz  z  }dd|z  z  }|j                  d      |z  }|j                         }|j                         }	|j                  dddd      }|	j                  dddd      }	| j                  dd      }
|
j                  d	      \  }}t        j                  | |fd	      }
|
j                  d
      }
| |	z  |
|z  z   S )NrT   rf   deviceg       @g      ?i'  r0   r   )r0   rT   rx   rw   )sizer%   arangerf   r   	unsqueezesincosri   	unflattenunbindstackrU   )rW   posB	num_headsNDomegafreqemb_sinemb_cosyy1y2s                r+   rotate_queries_or_keysr      s   Ay!Q
 LLaqwwqxx@E	QWE%,E==u$D hhjGhhjGnnQ1a(GnnQ1a(G 	
B AXX"XFBbS"I2&A			"AKAK((r*   c                        e Zd Z	 	 ddededef fdZd Zd ZddZd	 Z		 dd
e
j                  dz  dee
j                  e
j                  f   fdZ xZS )VJEPA2RopeAttentionr<   r=   num_attention_headsc                 z   t         |           || _        || _        || _        ||z  dk7  rt        d|f d| d      t        ||z        | _        | j                  | j                  z  | _        t        j                  || j                  |j                        | _        t        j                  || j                  |j                        | _        t        j                  || j                  |j                        | _        t        j                  ||      | _        |j                   | _        t        j$                  | j"                        | _        | j                  j(                  | j                  j*                  z  | _        | j                  j.                  | j                  j0                  z  | _        t        d| j                  dz  dz  z        | _        t        d| j                  dz  dz  z        | _        t        d| j                  dz  dz  z        | _        | j                  dz  | _        d	| _        y )
Nr   zThe hidden size z4 is not a multiple of the number of attention heads .biasrT   r         F)r2   rC   r<   r=   r   
ValueErrorr[   attention_head_sizeall_head_sizer   Linearqkv_biasrp   rq   rr   rH   attention_probs_dropout_probdropout_probDropoutru   rN   rD   	grid_sizerM   rE   
grid_depthd_dimh_dimw_dimrt   	is_causal)r5   r<   r=   r   r7   s       r+   rC   zVJEPA2RopeAttention.__init__   s    	&#6 ,,1"K>"2 3,-Q0 
 $'{5H'H#I !558P8PPYY{D,>,>V__U
99[$*<*<6??SYY{D,>,>V__U
IIk;7	"??zz$"3"34..$++2H2HH++559Q9QQt771<BCD
t771<BCD
t771<BCD
//5r*   c                 P    t        | j                  | j                  z        }||z  S rK   )r[   r   )r5   idstokens_per_frames      r+   _get_frame_posz"VJEPA2RopeAttention._get_frame_pos   s&    t~~>?&&&r*   c                     t        | j                  | j                  z        }| j                  |      }|||z  z
  }| j                  }||z  S rK   )r[   r   r   )r5   r   r   	frame_idstokens_per_rows        r+   _get_height_posz#VJEPA2RopeAttention._get_height_pos   sN    t~~>?'',	$y00n$$r*   Nc                    |j                   }|j                  d      }|-|j                  d      j                  d| j                  d      }nt        j                  ||      }t        | j                  | j                  z        }| j                  |      }| j                  }| j                  |      }	|||z  z
  ||	z  z
  }
||	|
fS )Nr   r   )r   r   r   ri   r   r%   r   r[   r   r   r   )r5   rW   masksr   
token_sizer   r   r   r   
height_ids	width_idss              r+   get_position_idsz$VJEPA2RopeAttention.get_position_ids  s    VVAY
 //!$++At/G/GKC,,z&9Ct~~>?'',	))#.
 +i77>J;VV	*i//r*   c                    |\  }}}d}t        |d||| j                  z   f   |      }|| j                  z  }t        |d||| j                  z   f   |      }|| j                  z  }t        |d||| j                  z   f   |      }	|| j                  z  }|| j                  k  r&|d|d f   }
t        j                  |||	|
gd      }|S t        j                  |||	gd      }|S )Nr   .)r   r0   r   )r   r   r   r   r   r%   cat)r5   qkpos_idsd_maskh_maskw_masksqkdqkhqkwqkrs              r+   apply_rotary_embeddingsz+VJEPA2RopeAttention.apply_rotary_embeddings  s    !($RQTZZ-?(?%@fM	TZZ$RQTZZ-?(?%@fM	TZZ$RQTZZ-?(?%@fM	TZZt'''S!"W+CCc3/R8B 	 Cc?3B	r*   position_maskrR   c           
      8   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  ||      }| j                  ||      }| j                  ||      }t        j                  | j                  j                  t              }	 |	| |||d | j                  | j                  | j                   sdn| j"                        \  }
}|
j%                         d d | j&                  fz   }| j)                  |
j+                  |            }
|
|fS )Nr0   r   rT   )r           r   rt   ru   rw   )rg   r   rp   viewrV   rq   rr   r   r   r   get_interfacer<   _attn_implementationr   r   rt   rz   r   r   r   rH   reshape)r5   r   r   input_shapehidden_shapequery_layer	key_layervalue_layerr   attention_interfacecontext_layerattention_probsnew_context_layer_shapes                r+   rX   zVJEPA2RopeAttention.forward(  s   
 $))#2.CCbC$*B*BCjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR'']'K00GD	22;H(?(M(MKK,,.E)
 *=nnLL#}}C$2C2C	*
& #0"4"4"6s";t?Q?Q>S"S		-"7"78O"PQo--r*   )rZ      rK   )r!   r"   r#   r   r[   rC   r   r   r   r   r%   r]   r(   rX   r8   r9   s   @r+   r   r      sz      #%	## # !	#J'%0*( .2!. ||d*!. 
u||U\\)	*	!.r*   r   input	drop_probrz   rR   c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    r   r   r   r   r   )rg   ndimr%   randrf   r   floor_div)r   r   rz   	keep_probrg   random_tensorr6   s          r+   	drop_pathr   M  s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr*   c                   t     e Zd ZdZd	dedz  f fdZdej                  dej                  fdZde	fdZ
 xZS )
VJEPA2DropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   c                 0    t         |           || _        y rK   )r2   rC   r   )r5   r   r7   s     r+   rC   zVJEPA2DropPath.__init__`  s    "r*   r   rR   c                 D    t        || j                  | j                        S rK   )r   r   rz   )r5   r   s     r+   rX   zVJEPA2DropPath.forwardd  s    FFr*   c                      d| j                    S )Nzp=)r   r5   s    r+   
extra_reprzVJEPA2DropPath.extra_reprg  s    DNN#$$r*   rK   )r!   r"   r#   r$   floatrC   r%   r]   rX   strr   r8   r9   s   @r+   r   r   ]  s@    b#%$, #GU\\ Gell G%C %r*   r   c                   f     e Zd Zddededef fdZdej                  dej                  fdZ	 xZ
S )		VJEPA2MLPr<   r=   	mlp_ratioc                     t         |           |x}}t        ||z        }t        j                  ||d      | _        t        |j                     | _        t        j                  ||d      | _	        y NTr   )
r2   rC   r[   r   r   fc1r   
hidden_act
activationfc2)r5   r<   r=   r   in_featuresout_featureshidden_featuresr7   s          r+   rC   zVJEPA2MLP.__init__l  sa    %00lkI5699[/E !2!2399_lFr*   hidden_staterR   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rK   )r   r   r   )r5   r  s     r+   rX   zVJEPA2MLP.forwardt  s2    xx-|4xx-r*   )rZ         @)r!   r"   r#   r   r[   r   rC   r%   r]   rX   r8   r9   s   @r+   r   r   k  s=    G| G# GQV GELL U\\ r*   r   c                        e Zd ZdZ	 	 	 	 ddededededef
 fdZ	 dd	ej                  d
ej                  dz  de
e   deej                  df   fdZ xZS )VJEPA2LayerzCThis corresponds to the Block class in the original implementation.r<   drop_path_rater=   r   r   c                    t         |           || _        || _        || _        || _        t        j                  ||j                        | _	        t        |||      | _        |j                  dkD  rt        |      nt        j                         | _        t        j                  ||j                        | _        t#        |||      | _        y )Nepsr   )r=   r   )r2   rC   r<   r=   r   r   r   	LayerNormlayer_norm_epsnorm1r   	attentionr  r   Identityr   norm2r   mlp)r5   r<   r  r=   r   r   r7   s         r+   rC   zVJEPA2Layer.__init__~  s     	&#6 "\\+63H3HI
,V[BUV;A;P;PSV;V7\^\g\g\i\\+63H3HI
V	Rr*   Nr   r   r   rR   .c                     |}| j                  |      }| j                  ||      \  }}| j                  |      |z   }|}| j                  |      }| j	                  |      }| j                  |      |z   }||fS )N)r   )r  r  r   r  r  )r5   r   r   r   residualattention_outputr   s          r+   rX   zVJEPA2Layer.forward  s     !

=1)-' *8 *
&, '788C !

=1/}5@ l**r*   )r   rZ   r   r  rK   )r!   r"   r#   r$   r   r   r[   rC   r%   r]   r   r   r(   rX   r8   r9   s   @r+   r  r  {  s    M
 !$#%SS S 	S
 !S S. .2+||+ ||d*+ +,	+
 
u||S 	!+r*   r  c                   \     e Zd Zdef fdZ	 ddej                  dz  dee   de	fdZ
 xZS )	VJEPA2Encoderr<   c                 ^   t         |           || _        t        ||j                        | _        t        |j                        D cg c]2  }|j                  dkD  r|j                  |z  |j                  dz
  z  nd4 }}t        j                  t        |j                        D cg c]3  }t        |||   |j                  |j                  |j                        5 c}      | _        t        j                  |j                  |j                         | _        d| _        y c c}w c c}w )Nrb   r   r   r  r=   r   r   r  F)r2   rC   r<   r_   r=   rn   rangenum_hidden_layersr  r   
ModuleListr  r   r   layerr
  r  	layernormgradient_checkpointingr5   r<   idrop_path_ratesr7   s       r+   rC   zVJEPA2Encoder.__init__  s   *6v?Q?QR 6334
 LRKcKcfgKgV""Q&&*B*BQ*FGmpp
 
 ]] v778	  #21#5 & 2 2(.(B(B$..	

 f&8&8f>S>ST&+##

	s   
7D%)8D*NrQ   r   rR   c                     | j                  |      }t        | j                        D ]  \  }} ||d fi |}|d   } | j                  |      }t	        |      S )Nr   r   )rn   	enumerater  r  r
   )r5   rQ   r   r   r  layer_modulelayer_outputss          r+   rX   zVJEPA2Encoder.forward  sj    
 (;<(4 	-OA|(GGM)!,M	- }5+
 	
r*   rK   )r!   r"   r#   r   rC   r%   r]   r   r   r
   rX   r8   r9   s   @r+   r  r    sD    ,| ,4 48
"\\D0
 +,
 
	
r*   r  tensorr   c                    g }|D ]j  }|j                  | j                        }|j                  d      j                  dd| j	                  d            }|t        j                  | d|      gz  }l t        j                  |d      S )z
    Args:
        tensor (`torch.Tensor`):
            Tensor of shape [batch_size, num_patches, feature_dim]
        masks (`List[torch.Tensor]`):
            List of tensors of shape [batch_size, num_patches] containing indices of patches to keep
    r0   r   rx   indexr   r   )rk   r   r   ri   r   r%   gatherr   )r&  r   all_masked_tensorsmask	mask_keeps        r+   apply_masksr.    s      Mwwv}}%NN2&--aFKKOD	u||FKLLM
 99'Q//r*   c                        e Zd ZdZdef fdZed        Z	 ddej                  de
ej                     de
ej                     ded	eej                  ej                  f   f
d
Z xZS )VJEPA2PredictorEmbeddingsr`   r<   c                    t         |           || _        t        j                  |j
                  |j                        | _        d| _        |j                  | _
        |j                  | _        t        j                  t        j                  | j                  dd|j                              | _        |j                   | _        || _        y )Nr   r   )r2   rC   r<   r   r   r=   pred_hidden_sizepredictor_embeddingsnum_mask_tokenspred_zero_init_mask_tokenszero_init_mask_tokenspred_num_mask_tokens	Parameterr%   zerosmask_tokensrD   r5   r<   r7   s     r+   rC   z"VJEPA2PredictorEmbeddings.__init__  s    $&IIf.@.@&BYBY$Z! %+%F%F"%::<<D4H4H!QPVPgPg(hi ++r*   c                     | j                   dkD  rM| j                   | j                  z  | j                  | j                  z  z  | j                  | j                  z  z  S | j                  | j                  z  | j                  | j                  z  z  S Nr   rL   rO   s    r+   rP   z%VJEPA2PredictorEmbeddings.num_patches  s    !!A%''6+>+>>##v'8'88:##v'8'88: $$(9(99f>N>NRXRcRc>cddr*   r   context_masktarget_mask
mask_indexrR   c                    |j                  d      }| j                  |      }|| j                  z  }| j                  |   }|d   j	                         dz   }|j                  ||d      }t        ||      }|j                  t        |      dd      }t        j                  ||gd      }	t        j                  |d      }
t        j                  |d      }t        j                  |
|gd      }|	|fS )z
        hidden_states : encoder outputs (context)
        context_mask: tokens of the context (outputs from the encoder)
        target_mask: tokens to predict
        mask_index: index of the target mask to choose (useful for multiclip?)
        r   r   r   )
r   r3  r4  r:  maxri   r.  lenr%   r   )r5   r   r>  r?  r@  r   contexttargetmax_patch_numrn   cmtmr   s                r+   rX   z!VJEPA2PredictorEmbeddings.forward  s     q!++M:  $"6"66
!!*- $A**,q0q-3V[1 ..\!2Aq9YY0a8
 YY|+YY{*		2r(*5  r*   r   )r!   r"   r#   r$   r   rC   r\   rP   r%   r]   r1   r[   r(   rX   r8   r9   s   @r+   r0  r0    s    |  e e &!||&! 5<<(&! %,,'	&!
 &! 
u||U\\)	*&!r*   r0  c            
            e Zd Zdef fdZd Zd Zdej                  de	ej                     de	ej                     de
e   d	ef
d
Z xZS )VJEPA2Predictorr<   c                    t         |           || _        d| _        t	        |      | _        t        |j                        D cg c]2  }|j                  dkD  r|j                  |z  |j                  dz
  z  nd4 }}t        j                  t        |j                        D cg c]3  }t        |||   |j                  |j                  |j                        5 c}      | _        t        j                   |j                  |j"                        | _        t        j&                  |j                  |j(                  d      | _        y c c}w c c}w )NFr   r   r  r  Tr   )r2   rC   r<   r  r0  rn   r  pred_num_hidden_layersr  r   r  r  r2  pred_num_attention_headspred_mlp_ratior  r
  r  r  r   r=   rH   r  s       r+   rC   zVJEPA2Predictor.__init__.  s5   &+#3F; 6889
  0014 %%)V-J-JQ-NO
 
 ]] v<<=	  #21#5 & 7 7(.(G(G$33	

 f&=&=6CXCXYIIf55v7I7IPTU	+
	s   7E
$8Ec                 8   |j                  |j                        }t        j                  |d|      }|j                  |j                        }|j	                  d      j                  dd|j                  d            }t        j                  |d|      }||fS )Nr   r(  r0   )rk   r   r%   r*  r   expandr   )r5   r   position_masksargsorthidden_states_argsorts        r+   sort_tokenszVJEPA2Predictor.sort_tokensJ  s    **^223n!7K **]112 ' 1 1" 5 < <R]EWEWXZE[ \]AVWn,,r*   c                     |j                  |j                        }t        j                  |d      }|j	                  d      j                  dd|j                  d            }t        j                  |d|      }|S )Nr   r   r0   r(  )rk   r   r%   rR  r   rP  r   r*  )r5   r   rR  reverse_argsorts       r+   unsort_tokenszVJEPA2Predictor.unsort_tokensV  si    **]112--Q7)33B7>>r2}GYGYZ\G]^]Qr*   encoder_hidden_statesr>  r?  r   rR   c                    t        ||      }|j                  \  }}}| j                  |||      \  }}	t        j                  |	d      }
| j                  ||	|
      \  }}	t        | j                        D ]  \  }} |||	fi |}|d   } | j                  |      }| j                  ||
      }|d d |d f   }| j                  |      }t        |      S )Nr   r   r   r"  )r.  rg   rn   r%   rR  rT  r#  r  r  rW  rH   r
   )r5   rX  r>  r?  r   _N_ctxtr   r   rQ  rR  r  r$  r%  s                 r+   rX   zVJEPA2Predictor.forward]  s     !,,A< P,2261(,8M|]h(i%~ --A6(,(8(8X_(`%~(4 	-OA|(Q&QM)!,M	- }5**='B%aj1		-0+
 	
r*   )r!   r"   r#   r   rC   rT  rW  r%   r]   r1   r   r   r
   rX   r8   r9   s   @r+   rJ  rJ  -  sl    V| V8
-
$||
 5<<(
 %,,'	

 +,
 

r*   rJ  c            	            e Zd ZdZdef fdZ	 d	dej                  dej                  dz  deej                  ej                  f   fdZ	 xZ
S )
VJEPA2PoolerSelfAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr<   c                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   F)r2   rC   r<   r=   	embed_dimr   r   head_dimr   scaleattention_dropoutru   r   r   r   k_projv_projq_projout_projr;  s     r+   rC   z"VJEPA2PoolerSelfAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar*   Nr   rs   rR   c           
         |j                   dd }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }t        j                  | j                  j                  t              } || ||||| j                  | j                  | j                  sdn| j                        \  }	}
 |	j                   g |d j#                         }	| j%                  |	      }	|	|
fS )#Input shape: Batch x Time x ChannelNr0   r   rT   r   r   )rg   ra  rf  r   rV   rd  re  r   r   r<   r   r   r   rb  rz   ru   r   r   rg  )r5   r   rs   r   r   querieskeysvaluesr   r   r   s              r+   rX   z!VJEPA2PoolerSelfAttention.forward  s<    $))#2.88b8$--8++m,11,?II!QO{{=)..|<FFq!L]+00>HHAN(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
!\ *k));;;;FFHmmK0L((r*   rK   r!   r"   r#   r$   r   rC   r%   r]   r(   rX   r8   r9   s   @r+   r]  r]  ~  sY    GB| B. /3)||) t+) 
u||U\\)	*	)r*   r]  c                        e Zd ZdZdef fdZ	 ddej                  dej                  dej                  dej                  dz  d	eej                  ej                  f   f
d
Z	 xZ
S )VJEPA2PoolerCrossAttentionz_It's different from other cross-attention layers, doesn't have output projection layer (o_proj)r<   c                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y r_  )r2   rC   r<   r=   r`  r   r   ra  r   rb  rc  ru   r   r   r   rd  re  rf  r;  s     r+   rC   z#VJEPA2PoolerCrossAttention.__init__  s    ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?r*   Nrj  rk  rl  rs   rR   c           
         |j                   \  }}}|j                   d   }| j                  |      }| j                  |      }| j                  |      }|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }t        j                  | j                  j                  t              }	 |	| ||||| j                  | j                  | j                  sdn| j                         \  }
}|
j#                  |||      j%                         }
|
|fS )ri  r   rT   r   r   )rg   rf  rd  re  r   r   ra  rV   r   r   r<   r   r   r   rb  rz   ru   r   r   )r5   rj  rk  rl  rs   
batch_sizeq_seq_lengthr`  kv_seq_lengthr   r   r   s               r+   rX   z"VJEPA2PoolerCrossAttention.forward  sT    /6mm+
L)

1++g&{{4 V$,,z<Waabcefgyy]DNNDMMR\\]^`abZV``abdef(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
!\ "))*lINYY[L((r*   rK   rm  r9   s   @r+   ro  ro    su    i@| @0 /3%)%) ll%) 	%)
 t+%) 
u||U\\)	*%)r*   ro  c                        e Zd Zdef fdZdej                  dej                  deej                  ej                  f   fdZ xZ	S )VJEPA2PoolerSelfAttentionLayerr<   c                 :   t         |           t        j                  |j                  |j
                        | _        t        |      | _        t        j                  |j                  |j
                        | _	        t        ||j                        | _        y Nr  rb   )r2   rC   r   r
  r=   r  layer_norm1r]  	self_attnlayer_norm2r   r  r;  s     r+   rC   z'VJEPA2PoolerSelfAttentionLayer.__init__  sl    <<(:(:@U@UV26:<<(:(:@U@UVV1C1CDr*   r   rs   rR   c                     |}| j                  |      }| j                  ||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }||fS )aR  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
        )r   rs   )ry  rz  r{  r  )r5   r   rs   r  r   s        r+   rX   z&VJEPA2PoolerSelfAttentionLayer.forward  s{     !((7&*nn') '5 '
#| !=0 ((7/ =0l**r*   
r!   r"   r#   r   rC   r%   r]   r(   rX   r8   r9   s   @r+   rv  rv    sL    E| E+||+ + 
u||U\\)	*	+r*   rv  c                        e Zd Zdef fdZ	 d	dej                  dej                  dej                  dz  deej                  ej                  f   fdZ xZ	S )
VJEPA2PoolerCrossAttentionLayerr<   c                 :   t         |           t        j                  |j                  |j
                        | _        t        |      | _        t        j                  |j                  |j
                        | _	        t        ||j                        | _        y rx  )r2   rC   r   r
  r=   r  ry  ro  
cross_attnr{  r   r  r;  s     r+   rC   z(VJEPA2PoolerCrossAttentionLayer.__init__  sl    <<(:(:@U@UV4V<<<(:(:@U@UVV1C1CDr*   Nrj  r  rs   rR   c                     |}| j                  |      }| j                  ||||      ^}}||z   }|}| j                  |      }| j                  |      }||z   }|g|S )Nrs   )ry  r  r{  r  )r5   rj  r  rs   r  r   s         r+   rX   z'VJEPA2PoolerCrossAttentionLayer.forward#  s     ''5&*oo)	 '6 '
#|  ,.  ''5xx-,.*l**r*   rK   r}  r9   s   @r+   r  r    sd    E| E /3	++ ll+ t+	+
 
u||U\\)	*+r*   r  c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )VJEPA2AttentivePoolerzAttentive Poolerr<   c                 F   t         |           t        j                  t	        j
                  dd|j                              | _        t        |      | _	        t        j                  t        |j                        D cg c]  }t        |       c}      | _        y c c}w r=  )r2   rC   r   r8  r%   r9  r=   query_tokensr  cross_attention_layerr  r  num_pooler_layersrv  self_attention_layers)r5   r<   rZ  r7   s      r+   rC   zVJEPA2AttentivePooler.__init__@  sr    LLQ6;M;M)NO%DV%L"%']]=B6C[C[=\]+F3]&
"]s   ?Br  rR   c                     | j                   D ]  } ||d       d   } | j                  j                  |j                  d   dd      }| j	                  ||      d   }|j                  d      S )Nr  r   r   )r  r  ri   rg   r  squeeze)r5   r  r  rj  s       r+   rX   zVJEPA2AttentivePooler.forwardH  sw    // 	GE dCAFL	G##**<+=+=a+@!QG11'<HK##A&&r*   )
r!   r"   r#   r$   r   rC   r%   r]   rX   r8   r9   s   @r+   r  r  =  s-    
| 
'ELL 'U\\ 'r*   r  c                       e Zd ZU eed<   dZdZdZdZg dZ	dZ
dZ eed       eed	d
      dZ ej"                         d        Zy)VJEPA2PreTrainedModelr<   vjepa2rQ   videoT)r  rv  r  r0  zencoder.layer)
layer_namer   )r)  r  )r   r   c                    | j                   j                  }t        |t              rt	        j
                  |j                  |       t        |j                  d      D ]w  \  }}||dz  z  }t	        j
                  |j                  j                  j                  |       t	        j
                  |j                  j                  j                  |       y |t        |j                        dz   dz  z  }t	        j
                  |j                  j                  j                  j                  |       yt        |t               rN|j"                  r t	        j$                  |j&                         yt	        j
                  |j&                  |       yt        |t(        j*                  t(        j,                  t(        j.                  f      rNt	        j
                  |j                  |       |j0                   t	        j$                  |j0                         yyt        |t(        j2                        r?t	        j$                  |j0                         t	        j4                  |j                         yy)zInitialize the weights)stdr   g      ?N)r<   initializer_ranger4   r  inittrunc_normal_r  r#  r  rz  rg  rj   r  r   rC  r  r0  r6  zeros_r:  r   r   Conv2drF   r   r
  ones_)r5   ro   init_stdr  r  r  s         r+   _init_weightsz#VJEPA2PreTrainedModel._init_weightsd  s    ;;00f34v22A%f&B&BAF B5!S&)""5??#;#;#B#BL""599==#7#7SAB c&">">?!CKKCv;;??CCJJPST 9:++F../""6#5#58DBIIryy ABv}}(;{{&FKK( '-KK$JJv}}% .r*   N)r!   r"   r#   r   r'   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attnr   r  r   _can_record_outputsr%   no_gradr  r)   r*   r+   r  r  P  so     +O&*# N'P$%8o^
 U]]_& &r*   r  c                        e Zd Zdef fdZdefdZe ed      e		 	 	 dde
j                  d	ee
j                     dz  d
ee
j                     dz  dedee   defd                     Zde
j                  fdZ xZS )VJEPA2Modelr<   c                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y rK   )r2   rC   r<   r  encoderrJ  	predictor	post_initr;  s     r+   rC   zVJEPA2Model.__init__  s;     $V,(0 	r*   rR   c                 B    | j                   j                  j                  S rK   )r  rn   rc   r   s    r+   get_input_embeddingsz VJEPA2Model.get_input_embeddings  s    ||&&777r*   F)tie_last_hidden_statesNrQ   r>  r?  skip_predictorr   c                    |t        d       | j                  d
d|i|}|j                  }|||j                  d      }|j                  d      }	t	        j
                  |	|j                        j                  d      j                  |df      g}t	        j
                  |	|j                        j                  d      j                  |df      g}|sN | j                  d
|||d|}
t        |
j                  t        ||      |
j                  |
j                        }nd}t        |t        ||      |j                  |j                  |	      }|S )az  
        context_mask (`torch.Tensor` with shape `[batch_size, patch_size, 1]`, *optional*):
            The mask position ids indicating which encoder output patches are going to be exposed to the predictor.
            By default, this mask is created as torch.arange(N).unsqueeze(0).repeat(B,1), indicating full context
            available to the predictor.
        target_mask (`torch.Tensor` with shape `[batch_size, patch_size, 1]`, *optional*):
            The mask position ids indicating which encoder output patches are going to be used as a prediction target
            for the predictor. By default, this mask is created as torch.arange(N).unsqueeze(0).repeat(B,1), indicating
            that the predictor should predict all encoder patches.
        skip_predictor (bool):
            flag to skip the predictor forward, useful if you just need the encoder outputs
        Nz'You have to specify pixel_values_videosrQ   r   r   r   )rX  r>  r?  )r   r    r   r   )r   r   r   r   r.   r)   )r   r  r   r   r%   r   r   r   ri   r  r   r.  r   r   r-   )r5   rQ   r>  r?  r  r   encoder_outputssequence_outputr   r   predictor_outputsr.   encoder_outputs                r+   rX   zVJEPA2Model.forward  sy   . &FGG+74<< ,
 3,
,
 *;;K$7#((+A$$Q'A!LL3F3M3MNXXYZ[bbdeghcijkL <<2E2L2LMWWXYZaacdfgbhijK1? 2&5)'2 	2  D"3"E"E$/$M/==,77	   $9- +O\ J)77&11-
 r*   c                 @    | j                  |d      }|j                  S )NT)r  )rX   r   )r5   rQ   r  s      r+   get_vision_featureszVJEPA2Model.get_vision_features  s!    &9$O///r*   )NNF)r!   r"   r#   r   rC   r;   r  r   r   r   r%   r]   r1   boolr   r   r-   rX   r  r8   r9   s   @r+   r  r    s    | 8&= 8  E2 3715$;"\\; 5<<(4/; %,,'$.	;
 ; +,; 
*;  3  ;z0%,, 0r*   r  z}
    V-JEPA 2 Model transformer with a video classification head on top (a linear layer on top of the attentive pooler).
    c                        e Zd Zdef fdZee	 d	dej                  dej                  dz  de	e
   deez  fd              Z xZS )
VJEPA2ForVideoClassificationr<   c                    t         |   |       |j                  | _        t        |      | _        t        |      | _        t        j                  |j                  |j                  d      | _
        | j                          y r   )r2   rC   
num_labelsr  r  r  poolerr   r   r=   
classifierr  r;  s     r+   rC   z%VJEPA2ForVideoClassification.__init__  sd      ++!&) ,F3))F$6$68I8IPTU 	r*   NrQ   labelsr   rR   c                     | j                   d|dd|}|j                  }| j                  |      }| j                  |      }d}|| j	                  ||| j
                        }t        |||j                  |j                        S )ag  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import torch
        >>> import numpy as np
        >>> from transformers import AutoVideoProcessor, VJEPA2ForVideoClassification

        >>> device = "cuda"

        >>> video_processor = AutoVideoProcessor.from_pretrained("facebook/vjepa2-vitl-fpc16-256-ssv2")
        >>> model = VJEPA2ForVideoClassification.from_pretrained("facebook/vjepa2-vitl-fpc16-256-ssv2").to(device)

        >>> video = np.ones((64, 256, 256, 3))  # 64 frames, 256x256 RGB
        >>> inputs = video_processor(video, return_tensors="pt").to(device)

        >>> # For inference
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> logits = outputs.logits

        >>> predicted_label = logits.argmax(-1).item()
        >>> print(model.config.id2label[predicted_label])

        >>> # For training
        >>> labels = torch.ones(1, dtype=torch.long, device=device)
        >>> loss = model(**inputs, labels=labels).loss

        ```T)rQ   r  N)pooled_logitsr  r<   )losslogitsr   r   r)   )	r  r   r  r  loss_functionr<   r   r   r   )	r5   rQ   r  r   outputsr   pooler_outputr  r  s	            r+   rX   z$VJEPA2ForVideoClassification.forward  s    V $++ 
 3
 
 $55$56/%%F6RVR]R]%^D$!//))	
 	
r*   rK   )r!   r"   r#   r   rC   r   r   r%   r]   r   r   r(   r   rX   r8   r9   s   @r+   r  r    so    |   '+<
"\\<
 t#<
 +,	<

 
&	&<
  <
r*   r  )r  r  r  )r   )r   F)Bcollections.abcr   dataclassesr   r%   r    r   r  activationsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   utils.output_capturingr   r   configuration_vjepa2r   
get_loggerr!   loggerr   r-   Moduler;   r_   r]   r   r   r   r   r  r   r   r   r  r  r1   r.  r0  rJ  r]  ro  rv  r  r  r  r  r  __all__r)   r*   r+   <module>r     s   % !   & ! 9 F F & _ _ 7 E . 
		H	% 
9; 9 9 {  * bii  Fryy T %II%<<% 
% <<	%
 LL4'% % %4)6z.")) z.|U\\ e T V[VbVb  %RYY %		  -+, -+`(
BII (
V0 0T%,,-? 0ELL 0"C!		 C!LN
bii N
b5)		 5)p=) =)B!+%? !+H+&@ +D'BII '& +&O +& +&\ P0' P0 P0f 
L
#8 L

L
^ Sr*   