
    if                        d dl Zd dlZd dl mZ d dlmZ d dlZd dlZd dl	m
c mZ d dlmZm
Z
 ddlmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'  e       rd dl(m)Z)  e!       rd dl*m+Z+ d dl,m-Z-  G d de
j\                        Z/ G d de
j\                        Z0 G d de
j\                        Z1 G d de
j\                        Z2	 dRde
j\                  dej                  dej                  dej                  d ej                  dz  d!e3d"e3fd#Z4 G d$ d%e
j\                        Z5dSd&ej                  d'e3d(e6d)ej                  fd*Z7 G d+ d,e
j\                        Z8 G d- d.e
j\                        Z9 G d/ d0e      Z: G d1 d2e
j\                        Z;e e d34       G d5 d6e                    Z<	 dTd7ej                  d8ej                  d)ej                  fd9Z=d:ed;ed)efd<Z>d:ej                  d;ej                  d)ej                  fd=Z? G d> d?e
j\                        Z@d:ed;ed@eAd)efdAZBd:ej                  d;ej                  d@eAd)ej                  fdBZC G dC dDe
j\                        ZDe  G dE dFe             ZE G dG dHe
j                        ZG G dI dJe
j\                        ZH G dK dLe
j\                        ZI G dM dNe
j\                        ZJ e dO4       G dP dQeE             ZKdFdQgZLy)U    N)Callable)	dataclass)Tensornn   )initialization)ACT2FN)ModelOutputis_scipy_availablerequires_backends)GradientCheckpointingLayer)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringis_accelerate_available)merge_with_config_defaults)capture_outputs   )VideomtConfig)linear_sum_assignment)PartialState)reducec                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )VideomtPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        || _
        t        j                  ||||      | _        y )Nr   r   kernel_sizestride)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesr   Conv2d
projection)selfconfigr#   r$   r%   r&   r+   	__class__s          }/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/videomt/modeling_videomt.pyr"   zVideomtPatchEmbeddings.__init__:   s    !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&))L+:^hi    pixel_valuesreturnc                 :   |j                   d   }|| j                  k7  rt        d| j                   d| d      |j                  | j                  j
                  j                        }| j	                  |      j                  d      j                  dd      }|S )Nr   zoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .dtype   )	shaper%   
ValueErrortor-   weightr8   flatten	transpose)r.   r3   r%   
embeddingss       r1   forwardzVideomtPatchEmbeddings.forwardI   s    #))!,4,,,!../yaI 
 $T__-C-C-I-IJ__\2::1=GG1M
r2   )	__name__
__module____qualname____doc__r"   torchr   rA   __classcell__r0   s   @r1   r   r   3   s)    j
ELL 
U\\ 
r2   r   c                        e Zd ZdZdeddf fdZd	dej                  dej                  dz  dej                  fdZ xZ	S )
VideomtEmbeddingszM
    Construct the CLS token, mask token, position and patch embeddings.
    r/   r4   Nc                 .   t         |           || _        |j                  | _        t	        j
                  t        j                  dd|j                              | _	        t	        j
                  t        j                  d|j                  |j                              | _        t        |      | _        | j                  j                  }t	        j                   |j"                        | _        d|j                  z   | _        t	        j(                  ||j                        | _        | j-                  dt        j.                  |      j1                  d      d       t	        j
                  t        j                  dd|j                              | _        y )Nr   position_idsr   F)
persistent)r!   r"   r/   r$   r   	ParameterrF   randnr&   	cls_tokenzerosnum_register_tokensregister_tokensr   patch_embeddingsr+   Dropouthidden_dropout_probdropoutnum_prefix_tokens	Embeddingposition_embeddingsregister_bufferarangeexpand
mask_token)r.   r/   r+   r0   s      r1   r"   zVideomtEmbeddings.__init__[   s    ++ekk!Q8J8J&KL!||EKK6;U;UW]WiWi,jk 6v >++77zz&"<"<=!"V%?%?!?#%<<V=O=O#P ^U\\+-F-M-Mg-Vchi,,u{{1a9K9K'LMr2   r3   bool_masked_posc                    |j                   dk(  rA|j                  \  }}}}}|j                  ||z  |||      }|F|j                  ||z  d      }n0|.|j                   dkD  r|j                  |j                  d   d      }|j                  d   }| j                  |      }|[|j	                  |j
                  t        j                        j                  d      }	t        j                  |	| j                  |      }| j                  j                  |dd      }
| j                  j                  |dd      }|| j                  | j                        z   }t        j                   |
||gd      }| j#                  |      }|S )N   rN   r9   r   )devicer8   r   dim)ndimr:   reshaperV   r<   rd   rF   bool	unsqueezewherer`   rR   r_   rU   r\   rL   catrY   )r.   r3   ra   
batch_size
num_framesr%   heightwidthr@   mask
cls_tokensrU   s               r1   rA   zVideomtEmbeddings.forwardk   sq   !BNBTBT?J
L&%'//
Z0GW]_deL*"1"9"9*z:QSU"V(_-A-AA-E-55o6K6KA6NPRSO!''*
**<8
&"%%Z->->ejj%Q[[\^_DT4??JGJ^^**:r2>
..55j"bI$":":4;L;L"MM
YY
OZHaP
\\*-
r2   N
rB   rC   rD   rE   r   r"   rF   r   rA   rG   rH   s   @r1   rJ   rJ   V   sM    N} N N ELL 5<<RVCV bgbnbn r2   rJ   c                   X     e Zd Zd fdZdej
                  dej
                  fdZ xZS )
VideomtMLPr4   c                 ~   t         |           |j                  x}}t        |j                  |j                  z        }t        j                  ||d      | _        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  ||d      | _        y )NTbias)r!   r"   r&   int	mlp_ratior   Linearfc1r'   
hidden_actstrr	   
activationfc2r.   r/   in_featuresout_featureshidden_featuresr0   s        r1   r"   zVideomtMLP.__init__   s    %+%7%77lf0063C3CCD99[/Ef''-$V%6%67DO$//DO99_lFr2   hidden_statec                 l    | j                  |      }| j                  |      }| j                  |      }|S rs   )r}   r   r   r.   r   s     r1   rA   zVideomtMLP.forward   s2    xx-|4xx-r2   r4   NrB   rC   rD   r"   rF   r   rA   rG   rH   s   @r1   rv   rv      s$    	GELL U\\ r2   rv   c                   X     e Zd Zd fdZdej
                  dej
                  fdZ xZS )VideomtGatedMLPr4   c                 0   t         |           |j                  x}}t        |j                  |j                  z        }t        |dz  dz        dz   dz  dz  }t        j                  |d|z  d      | _        t        j                  ||d      | _        y Nr9   r         Trx   	r!   r"   r&   rz   r{   r   r|   
weights_inweights_outr   s        r1   r"   zVideomtGatedMLP.__init__       %+%7%77lf0063C3CCD2Q67!;AAE))K_1D4P99_lNr2   r   c                     | j                  |      }|j                  dd      \  }}t        j                  j	                  |      |z  }| j                  |      S Nr9   rN   re   r   chunkr   
functionalsilur   r.   r   x1x2hiddens        r1   rA   zVideomtGatedMLP.forward   S    |4##A2#.B##B'",''r2   r   r   rH   s   @r1   r   r      $    O(ELL (U\\ (r2   r   modulequerykeyvalueattention_maskscalingrY   c                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )NrN   )rf   r8   )ptrainingr   r9   )rF   matmulr?   r   r   softmaxfloat32r<   r8   rY   r   
contiguous)
r   r   r   r   r   r   rY   kwargsattn_weightsattn_outputs
             r1   eager_attention_forwardr      s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r2   c            
            e Zd ZdZ fdZ	 ddej                  dej                  dz  deej                  ej                  dz  f   fdZ xZ	S )	VideomtAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      F)r!   r"   r/   r&   	embed_dimnum_attention_heads	num_headshead_dimr;   scaleattention_dropoutrY   	is_causalr   r|   k_projv_projq_projout_projr.   r/   r0   s     r1   r"   zVideomtAttention.__init__   s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar2   Nhidden_statesr   r4   c           
         |j                   dd }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }t        j                  | j                  j                  t              }	 |	| ||||| j                  | j                  | j                  sdn| j                        \  }
} |
j                   g |d j#                         }
| j%                  |
      }
|
|fS )z#Input shape: Batch x Time x ChannelNrN   r   r9           )r   r   rY   )r:   r   r   viewr?   r   r   r   get_interfacer/   _attn_implementationr   r   r   r   rY   rh   r   r   )r.   r   r   r   input_shapehidden_shapequerieskeysvaluesattention_interfacer   r   s               r1   rA   zVideomtAttention.forward   s<    $))#2.88b8$--8++m,11,?II!QO{{=)..|<FFq!L]+00>HHAN(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
!\ *k));;;;FFHmmK0L((r2   rs   )
rB   rC   rD   rE   r"   rF   r   tuplerA   rG   rH   s   @r1   r   r      sV    GB. /3!)||!) t+!)
 
u||U\\D00	1!)r2   r   input	drop_probr   r4   c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    r   r   r   )r   r8   rd   )r:   rg   rF   randr8   rd   floor_div)r   r   r   	keep_probr:   random_tensoroutputs          r1   	drop_pathr      s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr2   c                   x     e Zd ZdZd	dedz  ddf fdZdej                  dej                  fdZde	fdZ
 xZS )
VideomtDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r4   c                 0    t         |           || _        y rs   )r!   r"   r   )r.   r   r0   s     r1   r"   zVideomtDropPath.__init__  s    "r2   r   c                 D    t        || j                  | j                        S rs   )r   r   r   r.   r   s     r1   rA   zVideomtDropPath.forward  s    FFr2   c                      d| j                    S )Nzp=)r   r.   s    r1   
extra_reprzVideomtDropPath.extra_repr  s    DNN#$$r2   rs   )rB   rC   rD   rE   floatr"   rF   r   rA   r   r   rG   rH   s   @r1   r   r   
  sG    b#%$, #$ #GU\\ Gell G%C %r2   r   c                   X     e Zd Zd fdZdej
                  dej
                  fdZ xZS )VideomtSwiGLUFFNr4   c                 0   t         |           |j                  x}}t        |j                  |j                  z        }t        |dz  dz        dz   dz  dz  }t        j                  |d|z  d      | _        t        j                  ||d      | _        y r   r   r   s        r1   r"   zVideomtSwiGLUFFN.__init__  r   r2   r   c                     | j                  |      }|j                  dd      \  }}t        j                  j	                  |      |z  }| j                  |      S r   r   r   s        r1   rA   zVideomtSwiGLUFFN.forward"  r   r2   r   r   rH   s   @r1   r   r     r   r2   r   c                        e Zd ZdZdeddf fdZ	 d	dej                  dej                  dz  dej                  fdZ xZ	S )
VideomtLayerzCThis corresponds to the Block class in the original implementation.r/   r4   Nc                    t         |           t        j                  |j                  |j
                        | _        t        |      | _        t        |      | _
        |j                  dkD  rt        |j                        nt        j                         | _        t        j                  |j                  |j
                        | _        |j                   rt#        |      | _        nt'        |      | _        t        |      | _        y )Nepsr   )r!   r"   r   	LayerNormr&   layer_norm_epsnorm1r   	attentionVideomtLayerScalelayer_scale1drop_path_rater   Identityr   norm2use_swiglu_ffnr   mlprv   layer_scale2r   s     r1   r"   zVideomtLayer.__init__,  s    \\&"4"4&:O:OP
)&1-f5CICXCX[^C^)>)>?dfdododq\\&"4"4&:O:OP
  '/DH!&)DH-f5r2   r   r   c                 *   | j                  |      }| j                  ||      \  }}| j                  |      }| j                  |      |z   }| j	                  |      }| j                  |      }| j                  |      }| j                  |      |z   }|S rs   )r   r   r   r   r   r   r   )r.   r   r   hidden_states_normself_attention_output_layer_outputs          r1   rA   zVideomtLayer.forward<  s    
 "ZZ6#'>>2Dn#U q $ 1 12G H '<=M zz-0xx-((6 ~~l3mCr2   rs   rt   rH   s   @r1   r   r   )  sP    M6} 6 6& /3|| t+ 
	r2   r   c                   X     e Zd Zd fdZdej
                  dej
                  fdZ xZS )r   r4   c                     t         |           t        j                  |j                  t        j                  |j                        z        | _        y rs   )	r!   r"   r   rP   layerscale_valuerF   onesr&   lambda1r   s     r1   r"   zVideomtLayerScale.__init__T  s8    ||F$;$;ejjI[I[>\$\]r2   r   c                      || j                   z  S rs   )r  r   s     r1   rA   zVideomtLayerScale.forwardX  s    dll**r2   r   r   rH   s   @r1   r   r   S  s$    ^+ELL +U\\ +r2   r   a  
    Class for outputs of [`VideomtForUniversalSegmentationOutput`].

    This output can be directly passed to [`~VideomtVideoProcessor.post_process_semantic_segmentation`] or
    [`~VideomtVideoProcessor.post_process_instance_segmentation`] or
    [`~VideomtVideoProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see
    [`~VideomtVideoProcessor`] for details regarding usage.
    )custom_introc                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   y)	%VideomtForUniversalSegmentationOutputa  
    loss (`torch.Tensor`, *optional*):
        The computed loss, returned when labels are present.
    class_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
        query. Note the `+ 1` is needed because we incorporate the null class.
    masks_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
        query.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last layer.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states all layers of the model.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Self and Cross Attentions weights from transformer decoder.
    Nlossclass_queries_logitsmasks_queries_logitslast_hidden_stater   
attentions)rB   rC   rD   rE   r  rF   FloatTensor__annotations__r  r  r	  r   r   r
   r2   r1   r  r  \  s    & &*D%

d
")59%++d2959%++d2926u((4/659M5**+d2926Je''(4/6r2   r  input_featurespoint_coordinatesc                     |j                         dk(  rd}|j                  d      }t        j                  j                  j
                  | d|z  dz
  fi |}|r|j                  d      }|S )a(  
    A wrapper around `torch.nn.functional.grid_sample` to support 3D point_coordinates tensors.

    Args:
        input_features (`torch.Tensor` of shape (batch_size, channels, height, width)):
            A tensor that contains features map on a height * width grid
        point_coordinates (`torch.Tensor` of shape (batch_size, num_points, 2) or (batch_size, grid_height, grid_width,:
        2)):
            A tensor that contains [0, 1] * [0, 1] normalized point coordinates
        add_dim (`bool`):
            boolean value to keep track of added dimension

    Returns:
        point_features (`torch.Tensor` of shape (batch_size, channels, num_points) or (batch_size, channels,
        height_grid, width_grid):
            A tensor that contains features for points in `point_coordinates`.
    r   Tr9   g       @      ?)rf   rj   rF   r   r   grid_samplesqueeze)r  r  add_dimr   point_featuress        r1   sample_pointr    st    ( !#-77: XX((44^SK\E\_bEbmflmN'//2r2   inputslabelsc                    | j                         j                  d      } dt        j                  | |j                        z  }| j                  d      dddf   |j                  d      dddf   z   }d|dz   |dz   z  z
  }|S )a  
    A pair wise version of the dice loss, see `dice_loss` for usage.

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        `torch.Tensor`: The computed loss between each pairs.
    r   r9   rN   N)sigmoidr>   rF   r   Tsum)r  r  	numeratordenominatorr  s        r1   pair_wise_dice_lossr    s|     ^^%%a(FELL22I**R.D)FJJrN47,CCK	A+/22DKr2   c                 \   | j                   d   }t        j                  d      } || t        j                  |             } || t        j
                  |             }t        j                  ||z  |j                        }t        j                  ||z  d|z
  j                        }||z   }|S )a  
    A pair wise version of the cross entropy loss, see `sigmoid_cross_entropy_loss` for usage.

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        loss (`torch.Tensor`): The computed loss between each pairs.
    r   none	reduction)r:   r   BCEWithLogitsLossrF   	ones_like
zeros_liker   r  )	r  r  height_and_width	criterioncross_entropy_loss_poscross_entropy_loss_negloss_posloss_negr  s	            r1   $pair_wise_sigmoid_cross_entropy_lossr-    s     ||A$$v6I&vuv/FG&vu/?/?/GH||25EEvxxPH||25EEF
~~VHhDKr2   c                        e Zd ZdZ	 ddedededef fdZ ej                         dej                  dej                  d	ej                  d
ej                  de
ee	      f
d       Z xZS )VideomtHungarianMatcheraq  This class computes an assignment between the labels and the predictions of the network.

    For efficiency reasons, the labels don't include the no_object. Because of this, in general, there are more
    predictions than labels. In this case, we do a 1-to-1 matching of the best predictions, while the others are
    un-matched (and thus treated as non-objects).
    
cost_class	cost_mask	cost_dice
num_pointsc                     t         |           |dk(  r|dk(  r|dk(  rt        d      || _        || _        || _        || _        y)aH  Creates the matcher

        Params:
            cost_class (`float`, *optional*, defaults to 1.0):
                Relative weight of the classification error in the matching cost.
            cost_mask (`float`, *optional*,  defaults to 1.0):
                This is the relative weight of the focal loss of the binary mask in the matching cost.
            cost_dice (`float`, *optional*, defaults to 1.0):
                This is the relative weight of the dice loss of the binary mask in the matching cost.
            num_points (`int`, *optional*, defaults to 12544):
                No. of points to sample on which the mask loss will be calculated. The same set of K points are
                uniformly sampled for all prediction and ground truth masks to construct the cost matrix for bipartite
                matching.
        r   zAll costs can't be 0N)r!   r"   r;   r3  r0  r1  r2  )r.   r0  r1  r2  r3  r0   s        r1   r"   z VideomtHungarianMatcher.__init__  sK    " 	?yA~)q.344$$""r2   r  r  mask_labelsclass_labelsr4   c           	         g }|j                   d   }t        |      D ]  }||   j                  d      }||   }	|dd||   f    }
||   j                  |	      }|dddf   }|	dddf   }	t	        j
                  d| j                  d|	j                        }|j                  |j                   d   dd      }t        ||d      j                  d      }|j                  |	j                   d   dd      }t        |	|d      j                  d      }	t        |	|      }t        |	|      }| j                  |z  | j                  |
z  z   | j                  |z  z   }t	        j                   |t	        j"                  d	            }t	        j$                  |t	        j"                  d
            }t	        j&                  |d      }t)        |j+                               }|j-                  |        |D cg c]O  \  }}t	        j.                  |t        j0                        t	        j.                  |t        j0                        fQ }}}|S c c}}w )ao  
        Params:
            masks_queries_logits (`torch.Tensor`):
                A tensor of dim `batch_size, num_queries, num_labels` with the classification logits.
            class_queries_logits (`torch.Tensor`):
                A tensor of dim `batch_size, num_queries, height, width` with the predicted masks.
            class_labels (`torch.Tensor`):
                A tensor of dim `num_target_boxes` (where num_target_boxes is the number of ground-truth objects in the
                target) containing the class labels.
            mask_labels (`torch.Tensor`):
                A tensor of dim `num_target_boxes, height, width` containing the target masks.

        Returns:
            matched_indices (`list[tuple[Tensor]]`): A list of size batch_size, containing tuples of (index_i, index_j)
            where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected labels (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes).
        r   rN   Nr   r9   rd   Falign_cornersg    _Bg    _r7   )r:   ranger   r<   rF   r   r3  rd   repeatr  r  r-  r  r1  r0  r2  minimumtensormaximum
nan_to_numr   cpuappend	as_tensorint64)r.   r  r  r5  r6  indicesrm   i
pred_probs	pred_maskr0  target_maskr  target_coordinatespred_coordinatesr1  r2  cost_matrixassigned_indicesjmatched_indicess                        r1   rA   zVideomtHungarianMatcher.forward  s/   8 *, *//2
z" 	-A-a088<J,Q/I %QQ%788J%a.++I6K%ag.K!!T'*I !&

1dooqIYIY Z!2!9!9+:K:KA:NPQST!U&{4FV[\ddefgK077	8JAqQ$Y0@PUV^^_`aI =YTI+I{CI..94t7SSVZVdVdgpVppK--U\\$5GHK--U\\%5HIK**;:K0EkooFW0XNN+,?	-F ho
_c_`bcU__Qekk2EOOAU[[4YZ
 
 
s   5AI)r  r  r  i 1  )rB   rC   rD   rE   r   rz   r"   rF   no_gradr   listr   rA   rG   rH   s   @r1   r/  r/    s     jo##27#JO#cf#4 U]]_D#llD $llD \\	D
 llD 
eFm	D Dr2   r/  	num_masksc                     | j                         j                  d      }d||z  j                  d      z  }|j                  d      |j                  d      z   }d|dz   |dz   z  z
  }|j                         |z  }|S )a4  
    Compute the DICE loss, similar to generalized IOU for masks as follows:

    $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x \cap y }{x \cup y + 1}} $$

    In practice, since `labels` is a binary mask, (only 0s and 1s), dice can be computed as follow

    $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x * y }{x + y + 1}} $$

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).
        num_masks (`int`):
            The number of masks present in the current batch, used for normalization.

    Returns:
        `torch.Tensor`: The computed loss.
    r   r9   rN   )r  r>   r  )r  r  rR  probsr  r  r  s          r1   	dice_lossrU  @  sz    , NN$$Q'EUV^((,,I))B-&**R.0K	A+/22D88:	!DKr2   c                     t        j                  d      } || |      }|j                  d      j                         |z  }|S )a|  
    Args:
        inputs (`torch.Tensor`):
            A float tensor of arbitrary shape.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        loss (`torch.Tensor`): The computed loss.
    r!  r"  r   )r   r$  meanr  )r  r  rR  r(  cross_entropy_lossr  s         r1   sigmoid_cross_entropy_lossrY  ^  sD     $$v6I"662""1%))+i7DKr2   c                       e Zd Zdedeeef   f fdZdeee	      dee	   fdZ
dee   deeef   fdZd	ed
ee   deej                     deeef   fdZdej                  deej                     deej                     de	deeej                  f   f
dZd Zd Zdej                  dej                  fdZdej                  de	de	dedej                  f
dZ	 ddej                  d	ej                  deej                     d
eej                     deeej                  f   dz  deeej                  f   fdZd
ej                  dej0                  dej                  fdZ xZS )VideomtLossr/   weight_dictc                    t         |           t        | dg       |j                  | _        || _        |j
                  | _        t        j                  | j                  dz         }| j                  |d<   | j                  d|       |j                  | _        |j                  | _        |j                  | _        t        |j                  |j                   |j"                  | j                        | _        y)aQ  
        The Videomt Loss. The loss is computed very similar to DETR. The process happens in two steps: 1) we
        compute hungarian assignment between ground truth masks and the outputs of the model 2) we supervise each pair
        of matched ground-truth / prediction (supervise class and mask)

        Args:
            config (`VideomtConfig`):
                The configuration for Videomt model also containing loss calculation specific parameters.
            weight_dict (`dict[str, float]`):
                A dictionary of weights to be applied to the different losses.
        scipyr   rN   empty_weight)r0  r2  r1  r3  N)r!   r"   r   
num_labelsr\  no_object_weighteos_coefrF   r   r]   train_num_pointsr3  oversample_ratioimportance_sample_ratior/  class_weightdice_weightmask_weightmatcher)r.   r/   r\  r_  r0   s       r1   r"   zVideomtLoss.__init__s  s     	$	* ++& //zz$//A"56==R^\: !11 & 7 7'-'E'E$.**((((	
r2   sizesr4   c                 n    |d   }|dd  D ]'  }t        |      D ]  \  }}t        ||   |      ||<    ) |S )Nr   r   )	enumeratemax)r.   rj  maxessublistindexitems         r1   _max_by_axiszVideomtLoss._max_by_axis  sS    aQRy 	7G(1 7t"5<6e7	7 r2   tensorsc                 `   | j                  |D cg c]  }t        |j                         c}      }t        |      g|z   }|\  }}}}|d   j                  }	|d   j
                  }
t        j                  ||	|
      }t        j                  |||ft        j                  |
      }t        |||      D ]o  \  }}}|d |j                  d   d |j                  d   d |j                  d   f   j                  |       d|d |j                  d   d |j                  d   f<   q ||fS c c}w )Nr   r   r   r9   F)rr  rQ  r:   lenr8   rd   rF   rS   r   ri   zipcopy_)r.   rs  r>  max_sizebatch_shaperm   r   ro   rp   r8   rd   padded_tensorspadding_maskspadded_tensorpadding_masks                  r1   _pad_images_to_max_in_batchz'VideomtLoss._pad_images_to_max_in_batch  s7   $$w%OVd6<<&8%OP7|nx/'2$
Avu
  ""[fM

J#>ejjY_`36wP]3^ 	G/FM<+FLLO+->v||A->@Q&,,q/@QQRXXY_`AFL*6<<?*,=fll1o,==>	G },, &Ps   D+r  r6  rE  c           	         |}|j                   \  }}}t        j                  | j                        }| j	                  |      }	t        j                  t        ||      D 
cg c]  \  }
\  }}|
|    c}}}
      }t        j                  ||f| j                  t
        j                  |j                        }|||	<   |j                  dd      } |||      }d|i}|S c c}}}
w )a  Compute the losses related to the labels using cross entropy.

        Args:
            class_queries_logits (`torch.Tensor`):
                A tensor of shape `batch_size, num_queries, num_labels`
            class_labels (`list[torch.Tensor]`):
                List of class labels of shape `(labels)`.
            indices (`tuple[np.array])`:
                The indices computed by the Hungarian matcher.

        Returns:
            `dict[str, Tensor]`: A dict of `torch.Tensor` containing the following key:
            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
        )r=   )
fill_valuer8   rd   r   r9   loss_cross_entropy)r:   r   CrossEntropyLossr_  $_get_predictions_permutation_indicesrF   rl   rv  fullr`  rD  rd   r?   )r.   r  r6  rE  pred_logitsrm   num_queriesr   r(  idxtargetrN  target_classes_otarget_classespred_logits_transposedloss_celossess                    r1   loss_labelszVideomtLoss.loss_labels  s    " +%0%6%6"
K''t/@/@A	77@ 99-0w-GHH>66AqVAYH
 %$//]h]o]o
 /s!,!6!6q!!<2NC&0 Is   #C!r  r5  rR  c                      j                  |      } j                  |      }||   } j                  |      \  }}	||   }|dddf   }|dddf   }t        j                         5   j                  | fd j                   j                   j                        }
t        ||
d      j                  d      }ddd       t        |
d      j                  d      }t        ||      t        |||      d}~~|S # 1 sw Y   ExY w)a  Compute the losses related to the masks using sigmoid_cross_entropy_loss and dice loss.

        Args:
            masks_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, height, width)`.
            mask_labels (`torch.Tensor`):
                List of mask labels of shape `(labels, height, width)`.
            indices (`tuple[np.array])`:
                The indices computed by the Hungarian matcher.
            num_masks (`int)`:
                The number of masks, used for normalization.

        Returns:
            losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing two keys:
            - **loss_mask** -- The loss computed using sigmoid cross entropy loss on the predicted and ground truth.
              masks.
            - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth,
              masks.
        Nc                 &    j                  |       S rs   )calculate_uncertainty)logitsr.   s    r1   <lambda>z(VideomtLoss.loss_masks.<locals>.<lambda>  s    t99&A r2   Fr9  r   )	loss_mask	loss_dice)r   _get_targets_permutation_indicesr~  rF   rP  sample_points_using_uncertaintyr3  rd  re  r  r  rY  rU  )r.   r  r5  rE  rR  src_idxtgt_idx
pred_maskstarget_masksr   r  point_labelspoint_logitsr  s   `             r1   
loss_maskszVideomtLoss.loss_masks  s,   4 ;;GD77@)'2
 ::;Ga#G,  4(
#AtG, ]]_ 		i $ D DA%%,,! (6GW\]eefghL		i $J0AQVW__`ab 4L,PYZ"<yI

 )		i 		is   (AD  D	c                    t        j                  t        |      D cg c]  \  }\  }}t        j                  ||        c}}}      }t        j                  |D cg c]  \  }}|	 c}}      }||fS c c}}}w c c}}w rs   rF   rl   rl  	full_like)r.   rE  rF  srcr   batch_indicespredictions_indicess          r1   r  z0VideomtLoss._get_predictions_permutation_indices  sj    		iX_N`"a"a{q(35??3#:"ab#iiW(E#q(EF111 #b(E   #A7A>
c                    t        j                  t        |      D cg c]  \  }\  }}t        j                  ||        c}}}      }t        j                  |D cg c]  \  }}|	 c}}      }||fS c c}}}w c c}}w rs   r  )r.   rE  rF  r   tgtr  target_indicess          r1   r  z,VideomtLoss._get_targets_permutation_indices  sh    		iX_N`"a"a{q(1c5??3#:"ab#@HQC#@An,, #b#@r  r  c                 2    t        j                  |       }|S )a  
        In Videomt paper, uncertainty is estimated as L1 distance between 0.0 and the logit prediction in 'logits'
        for the foreground class in `classes`.

        Args:
            logits (`torch.Tensor`):
            A tensor of shape (R, 1, ...) for class-specific or class-agnostic, where R is the total number of predicted masks in all images and C is:
            the number of foreground classes. The values are logits.

        Returns:
            scores (`torch.Tensor`): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most
            uncertain locations having the highest uncertainty score.
        )rF   abs)r.   r  uncertainty_scoress      r1   r  z!VideomtLoss.calculate_uncertainty  s      %yy01!!r2   r3  rd  re  c           	         |j                   d   }t        ||z        }t        j                  ||d|j                        }t        ||d      }	 ||	      }
t        ||z        }||z
  }t        j                  |
dddddf   |d      d   }|t        j                  |t        j                  |j                  	      z  }||dddf   z  }|j                  d
d      |j                  d
      ddf   j                  ||d      }|dkD  r:t        j                  |t        j                  ||d|j                        gd      }|S )a  
        This function is meant for sampling points in [0, 1] * [0, 1] coordinate space based on their uncertainty. The
        uncertainty is calculated for each point using the passed `uncertainty function` that takes points logit
        prediction as input.

        Args:
            logits (`float`):
                Logit predictions for P points.
            uncertainty_function:
                A function that takes logit predictions for P points and returns their uncertainties.
            num_points (`int`):
                The number of points P to sample.
            oversample_ratio (`int`):
                Oversampling parameter.
            importance_sample_ratio (`float`):
                Ratio of points that are sampled via importance sampling.

        Returns:
            point_coordinates (`torch.Tensor`):
                Coordinates for P sampled points.
        r   r9   r8  Fr9  Nr   )krf   r   rN   re   )r:   rz   rF   r   rd   r  topkr^   longr   rl   )r.   r  uncertainty_functionr3  rd  re  	num_boxesnum_points_sampledr  r  point_uncertaintiesnum_uncertain_pointsnum_random_pointsr  shifts                  r1   r  z+VideomtLoss.sample_points_using_uncertainty,  sI   < LLO	 .>!>? "JJy2DaPVP]P]^#F,=US2<@"#:Z#GH&)==jj,Q1W59MSTUVWX"U\\)5::V\VcVc%dduQW~-222q9#((2,/JOOPY[oqrsq  %		"EJJy:KQW]WdWd$ef! ! r2   Nauxiliary_predictionsc                    | j                  ||||      }| j                  ||d   j                        }i | j                  ||||      | j	                  |||      }|jt        |      D ]\  \  }	}
|
d   }|
d   }| j                  ||||      }|j                         D ci c]  \  }}| d|	 | }}}|j                  |       ^ |S c c}}w )a  
        This performs the loss computation.

        Args:
            masks_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, height, width)`.
            class_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, num_labels)`.
            mask_labels (`torch.Tensor`):
                List of mask labels of shape `(labels, height, width)`.
            class_labels (`list[torch.Tensor]`):
                List of class labels of shape `(labels)`.
            auxiliary_predictions (`dict[str, torch.Tensor]`, *optional*):
                if `use_auxiliary_loss` was set to `true` in [`VideomtConfig`], then it contains the logits from
                the inner layers of the VideomtMaskedAttentionDecoder.

        Returns:
            losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing three keys:
            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
            - **loss_mask** -- The loss computed using sigmoid cross_entropy loss on the predicted and ground truth
              masks.
            - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth
              masks.
            if `use_auxiliary_loss` was set to `true` in [`VideomtConfig`], the dictionary contains additional
            losses for each auxiliary predictions.
        r   r8  r  r  r   )	ri  get_num_masksrd   r  r  rl  rA   itemsupdate)r.   r  r  r5  r6  r  rE  rR  r  r  aux_outputs	loss_dictr   r   s                 r1   rA   zVideomtLoss.forwardc  s
   H ,,35I;Xde&&|LO<R<R&S	%
oo2K)T%
3\7K%

 !,$-.C$D ) ['23I'J$'23I'J$ LL)=?SU`bno	EN__EVWzsEuAcU^U2W	Wi()  Xs   "Crd   c                 &   t        d |D              }t        j                  |t        j                  |      }d}t	               r2t
        j                  i k7  rt        |      }t               j                  }t        j                  ||z  d      }|S )zk
        Computes the average number of target masks across the batch, for normalization purposes.
        c              3   2   K   | ]  }t        |        y wrs   )ru  ).0classess     r1   	<genexpr>z,VideomtLoss.get_num_masks.<locals>.<genexpr>  s     AGAs   r   r   )min)
r  rF   rC  r   r   r   _shared_stater   num_processesclamp)r.   r6  rd   rR  
world_sizes        r1   r  zVideomtLoss.get_num_masks  su     ALAA	OOIU[[P	
"$))R/"9-	)^99
KK	J 6A>	r2   rs   )rB   rC   rD   r   dictr   r   r"   rQ  rz   rr  r   r   r~  nparrayr  rF   r  r  r  r  r  rA   rd   r  rG   rH   s   @r1   r[  r[  r  s   !
} !
4U
;K !
F$tCy/ d3i -4< -E&RX.DY -" $* :>v, QVWYW_W_Q` 	c6k	 D<#ll< %,,'< rxx	<
 < 
c5<<	 <|2-"ELL "U\\ ""5!5! 	5!
 5! "'5! 
5!z AE5#ll5 $ll5 %,,'	5
 5<<(5  $C$56=5 
c5<<	 5n%,,  QVQ]Q] r2   r[  c                       e Zd ZU dZeed<   dZdZdZdZ	dgZ
dZeed	Z ej                          d
ej$                  ddfd       Zy)VideomtPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    r/   videomtpixel_values_videos)videoFr   T)r   r
  r   r4   Nc                 z   | j                   j                  }t        |t        j                  t        j
                  t        j                  f      rt        j                  |j                  t        j                  d             |j                  t        j                  j                  j                  |j                        \  }}|dkD  rdt        j                  |      z  nd}t        j                  |j                  | |       nxt        |t        j                         r@t        j"                  |j                         t        j$                  |j                         nt        |t        j&                        rut        j(                  |j                  dd       |j*                  t-        |j                  dd      st        j$                  |j                  |j*                            nt        |t.              rCt1        |d	      rrt        j2                  |j4                  | j                   j6                         n<t        |t8              rt        j:                  |j<                  d|       t        j$                  |j>                         t        j@                  |jB                  t        jD                  |jB                  jF                  d
         jI                  d             nt        |tJ              rRt        jL                  |jN                  dz         }|jP                  |d
<   t        j@                  |jR                  |       n/t        |tT              rt        j"                  |jV                         t        |t8              r*t        j                  j%                  |jX                         y y )Nrc   )ar   r   r   )rW  std_is_hf_initializedFr  rN   rM   )-r/   initializer_ranger'   r   r|   r,   ConvTranspose2dinitkaiming_uniform_r=   mathsqrtry   rF   _calculate_fan_in_and_fan_outuniform_r   ones_zeros_r[   normal_padding_idxgetattrr   hasattr	constant_r  r   rJ   trunc_normal_rR   rU   rw  rL   r^   r:   r_   r[  r   r`  rb  r_  VideomtForUniversalSegmentationattn_mask_probsr`   )r.   r   r  fan_inr   boundr_  s          r1   _init_weightsz$VideomtPreTrainedModel._init_weights  sX   kk++fryy"))R5G5GHI!!&--499Q<@{{&!HHMMGGV	17!DIIf--fkkE659-JJv}}%KK$-LLSa8!!-gfmmMach6iFMM&*<*<=> 12vy)v~~t{{/K/KL 12v//csCKK../JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh, ::f&7&7!&;<L%LJJv**L9 ?@JJv--.f/0GGNN6,,- 1r2   )rB   rC   rD   rE   r   r  base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpar   r   _can_record_outputsrF   rP  r   Moduler  r  r2   r1   r  r    sp    
 !+O!&+#'(N%&
 U]]_.BII .$ . .r2   r  c                   X     e Zd Zd fd	Zdej
                  dej
                  fdZ xZS )VideomtLayerNorm2dc                 *    t         |   |||       y )N)r   elementwise_affine)r!   r"   )r.   r%   r   affiner0   s       r1   r"   zVideomtLayerNorm2d.__init__  s    36Jr2   r   r4   c                     |j                  dddd      }t        j                  || j                  | j                  | j
                  | j                        }|j                  dddd      }|S )Nr   r9   r   r   )permuteF
layer_normnormalized_shaper=   ry   r   r   s     r1   rA   zVideomtLayerNorm2d.forward  sb    #++Aq!Q7||L$2G2GVZV_V_aeaiaij#++Aq!Q7r2   )gư>Tr   rH   s   @r1   r  r    s$    KELL U\\ r2   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )VideomtScaleLayerr/   c                    t         |           |j                  }t        j                  ||dd      | _        t        |j                     | _        t        j                  ||dd|d      | _
        t        |      | _        y )Nr9   r   r   r   F)r   paddinggroupsry   )r!   r"   r&   r   r  conv1r	   r~   r   r,   conv2r  layernorm2dr.   r/   r&   r0   s      r1   r"   zVideomtScaleLayer.__init__  su    ((''[aXYZ
 !2!23YY

 .k:r2   r   r4   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S rs   )r   r   r  r  r   s     r1   rA   zVideomtScaleLayer.forward  sB    

=16

=1((7r2   	rB   rC   rD   r   r"   rF   r   rA   rG   rH   s   @r1   r  r    s*    ;} ; U\\ ell r2   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )VideomtScaleBlockr/   c                     t         |           |j                  | _        t	        j
                  t        | j                        D cg c]  }t        |       c}      | _        y c c}w rs   )	r!   r"   num_upscale_blocks
num_blocksr   
ModuleListr;  r  blockr.   r/   r   r0   s      r1   r"   zVideomtScaleBlock.__init__  sH     33]]uT__G]#^!$5f$=#^_
#^s   A&r   r4   c                 8    | j                   D ]
  } ||      } |S rs   )r  )r.   r   r  s      r1   rA   zVideomtScaleBlock.forward	  s%    ZZ 	1E!-0M	1r2   r  rH   s   @r1   r  r    s,    `} `
U\\ ell r2   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )VideomtMaskHeadr/   c                    t         |           |j                  }t        j                  ||      | _        t        j                  ||      | _        t        j                  ||      | _        t        |j                     | _
        y rs   )r!   r"   r&   r   r|   r}   r   fc3r	   r~   r   r  s      r1   r"   zVideomtMaskHead.__init__  sa    ((99[+699[+699[+6 !2!23r2   r   r4   c                     | j                  | j                  |            }| j                  | j                  |            }| j                  |      }|S rs   )r   r}   r   r  r   s     r1   rA   zVideomtMaskHead.forward  sD    (?@(?@/r2   r  rH   s   @r1   r  r    s*    4} 4U\\ ell r2   r  zY
    The Videomt Model with head on top for instance/semantic/panoptic segmentation.
    c                   d    e Zd ZdZdef fdZdededededeeef   d	eeef   fd
Z	deeef   d	efdZ
eee	 	 	 	 ddej                  dz  deej                     dz  deej                     dz  deej                     dz  dee   d	efd                     Zd Zdej                  fdZ xZS )r  r  r/   c                 <   t         |   |       || _        |j                  | _        t	        |      | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        |       c}      | _        t%        |      | _        t)        |      | _        t        j,                  |j                  |j.                  dz         | _        |j2                  |j4                  z  |j2                  |j4                  z  f| _        |j8                  |j:                  |j<                  d| _        tA        || j>                        | _!        | jE                  dtG        jH                  |jJ                               t        j,                  |j                  |j                        | _&        | jO                          y c c}w )Nr   r   )r  r  r  )r/   r\  r  )(r!   r"   r/   num_hidden_layersrJ   r@   r   r   r&   r   	layernormr[   r  r   r  r;  r   layersr  upscale_blockr  	mask_headr|   r`  class_predictorr#   r$   	grid_sizerf  rh  rg  r\  r[  r(  r]   rF   r   r
  query_updater	post_initr  s      r1   r"   z(VideomtForUniversalSegmentation.__init__(  s    !'!9!9+F3f&8&8f>S>ST\\&"4"4f6H6HI
mm5IaIaCb$ca\&%9$cd.v6(0!yy););V=N=NQR=RS ++v/@/@@&BSBSW]WhWhBhi"("5"5++++.
 %F@P@PQ.

6;L;L0MNYYv'9'96;M;MN' %ds   >Hr  r  r5  r6  r  r4   c                     | j                  |||||      }| j                  j                         D ]'  \  }}|j                         D ]  \  }	}
||	v s|
|z  }
 ) |S )N)r  r  r5  r6  r  )r(  r\  r  )r.   r  r  r5  r6  r  r  r   r=   loss_keyr  s              r1   get_loss_dictz-VideomtForUniversalSegmentation.get_loss_dictE  s|     (,~~!5!5#%"7 (6 (
	  ++113 	#KC"+//"3 #$(?FND#	#
 r2   r  c                 4    t        |j                               S rs   )r  r   )r.   r  s     r1   get_lossz(VideomtForUniversalSegmentation.get_loss]  s    9##%&&r2   Npatch_offsetsr   c           	      J   d|v rt        d      |t        d      |j                  dk7  rt        d      ||t        d      |j                  \  }}}}	}
|j                  ||z  ||	|
      }| j	                  |      }| j
                  | j                  j                  z
  }| j                  d| D ]
  } ||      } |j                  |||j                  d   |j                  d	         }g }g }g }d}t        |      D ]S  }|dd|f   }|2| j                  j                  dddddf   j                  |d
d
      }nK| j                  |      | j                  j                  dddddf   j                  |j                         z   }t#        j$                  |j                  |j                         |fd      }| j                  |d D ]
  } ||      } | j'                  |      }| j)                  |      \  }}|j+                  |       |j+                  |       |j+                  |       |ddd| j                  j,                  ddf   }V t/        dt#        j$                  |d      t#        j$                  |d      t#        j$                  |d            S )a  
        pixel_values_videos (`torch.Tensor`, *optional*):
            Video inputs of shape `(batch_size, num_frames, num_channels, height, width)`.
        mask_labels (`list[torch.Tensor]`, *optional*):
            Not supported for 5D video inputs.
        class_labels (`list[torch.LongTensor]`, *optional*):
            Not supported for 5D video inputs.
        patch_offsets (`list[torch.Tensor]`, *optional*):
            Unused for video inputs and only kept for modular compatibility.
        r3   zAUse `pixel_values_videos` with `VideomtForUniversalSegmentation`.Nz'You have to specify pixel_values_videosrc   zyVideomtForUniversalSegmentation only supports 5D video inputs of shape (batch_size, num_frames, channels, height, width).zTraining with 5D video inputs is not supported in `VideomtForUniversalSegmentation`. Flatten frames and use `EomtForUniversalSegmentation` instead.r   r9   rN   re   r   )r  r  r  r	  )r;   rg   r:   rh   r@   r  r/   r
  r  r   r;  r   r=   r_   r  r<   rd   rF   rl   r  predictrB  r  r  )r.   r  r5  r6  r$  r   rm   rn   r%   ro   rp   flat_pixel_valuesr   query_start_idxlayer_moduleall_masks_queries_logitsall_class_queries_logitsall_last_hidden_statespropagated_query	frame_idxframe_hidden_statesquery_tokenssequence_outputr  r  s                            r1   rA   z'VideomtForUniversalSegmentation.forward`  s   * V#`aa&FGG##q(E 
 "l&>Q 
 ?R>W>W;
Jfe/77
Z8OQ]_eglm(9:004;;3I3II KK(89 	8L(7M	8 &**:z=CVCVWXCY[h[n[nop[qr#% #% !#z* 	TI"/9"='#zz00q!<CCJPRTVW#112BCdjjFWFWX\^_abXbFcFfFf'..G   #())\__=P=W=W-XZm,ntu"v $O,< = H&23F&G#H #nn-@AO9=o9V6 "6$++,@A$++,@A"))/:216O8O8O6OQR3RS)	T, 5!&+C!K!&+C!K#ii(>AF	
 	
r2   c                 .    | j                   j                  S rs   )r@   rV   r   s    r1   get_input_embeddingsz4VideomtForUniversalSegmentation.get_input_embeddings  s    ///r2   r  c                    |d d d | j                   j                  d d f   }| j                  |      }|d d | j                   j                  | j                  j                  z   d d d f   }|j                  dd      } |j                  |j                  d   dg| j                   }| j                  |      }| j                  |      }t        j                  d||      }||fS )Nr   r9   r   rN   zbqc, bchw -> bqhw)r/   r  r  r@   rZ   r?   rh   r:   r  r  r  rF   einsum)r.   r  r0  class_logitsprefix_tokensmask_logitss         r1   r&  z'VideomtForUniversalSegmentation.predict  s    a!:4;;#:#:!:A=>++L9q$++"9"9DOO<]<]"]"_abbc%//15---m.A.A!.DbZ4>>Z~~l3**=9ll#6mTL((r2   )NNNN)rB   rC   rD   r  r   r"   r   r  r   r!  r#  r   r   r   rF   rQ  r   r   r  rA   r3  r&  rG   rH   s   @r1   r  r     sF    ,O} :$ % 	
   $CK0 
c6k	0'$sF{"3 ' '   48152637O
"\\D0O
 %,,'$.O
 5<<(4/	O

 ELL)D0O
 +,O
 
/O
    O
b0)ell )r2   r  )r   )r   F)F)Mcollections.abcr(   r  r   dataclassesr   numpyr  rF   torch.nn.functionalr   r   r  r    r   r  activationsr	   
file_utilsr
   r   r   modeling_layersr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   configuration_videomtr   scipy.optimizer   
accelerater   accelerate.utilsr   r  r   rJ   rv   r   r   r   r   ri   r   r   r   r   r   r  r  r  r-  r/  rz   rU  rY  r[  r  r   r  r  r  r  r  __all__r  r2   r1   <module>rK     s+  *   $ !      & ! L L 9 F & P P 7 5 0 4'' RYY  F,		 ,^ &(bii (0 %II%<<% 
% <<	%
 LL4'% % %.8)ryy 8)vU\\ e T V[VbVb %bii %(ryy ("'- 'T+		 + 	7K 7	 7< LQLL5:\\
\\@  6 , u|| X]XdXd 8gbii gTf f   <u|| U\\ VY ^c^j^j (u")) up	 1._ 1. 1.h 		 2			 	bii " 
`)&< `)
`)F $%F
Gr2   