
    i)                        d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddlm	Z	m
Z
 dd	lmZ dd
lmZmZmZmZmZmZmZmZmZmZmZ  G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Ze e
d       G d de                    Z  G d de      Z! G d d e      Z" G d! d"e      Z# G d# d$e      Z$ G d% d&e      Z%g d'Z&y)(    )	dataclassN)nn   )ModelOutput)Unpack)TransformersKwargsauto_docstring   )
EomtConfig)EomtEmbeddingsEomtForUniversalSegmentation	EomtLayerEomtLayerNorm2dEomtLayerScaleEomtMLPEomtPatchEmbeddingsEomtPreTrainedModelEomtScaleBlockEomtScaleLayerEomtSwiGLUFFNc                       e Zd ZdZy)VideomtConfigvideomtN)__name__
__module____qualname__
model_type     |/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/videomt/modular_videomt.pyr   r   '   s    Jr   r   c                   D    e Zd Zdej                  dej                  fdZy)VideomtPatchEmbeddingspixel_valuesreturnc                 :   |j                   d   }|| j                  k7  rt        d| j                   d| d      |j                  | j                  j
                  j                        }| j	                  |      j                  d      j                  dd      }|S )N   zoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .)dtyper
   )	shapenum_channels
ValueErrorto
projectionweightr(   flatten	transpose)selfr#   r*   
embeddingss       r    forwardzVideomtPatchEmbeddings.forward,   s    #))!,4,,,!../yaI 
 $T__-C-C-I-IJ__\2::1=GG1M
r   N)r   r   r   torchTensorr3   r   r   r    r"   r"   +   s    
ELL 
U\\ 
r   r"   c                   |     e Zd Zdef fdZddej                  dej                  dz  dej                  fdZ xZS )	VideomtEmbeddingsconfigc                     t         |   |       t        |      | _        t	        j
                  t        j                  dd|j                              | _	        y )Nr&   )
super__init__r"   patch_embeddingsr   	Parameterr4   zeroshidden_size
mask_tokenr1   r8   	__class__s     r    r;   zVideomtEmbeddings.__init__:   s@      6v >,,u{{1a9K9K'LMr   Nr#   bool_masked_posr$   c                    |j                   dk(  rA|j                  \  }}}}}|j                  ||z  |||      }|F|j                  ||z  d      }n0|.|j                   dkD  r|j                  |j                  d   d      }|j                  d   }| j                  |      }|[|j	                  |j
                  t        j                        j                  d      }	t        j                  |	| j                  |      }| j                  j                  |dd      }
| j                  j                  |dd      }|| j                  | j                        z   }t        j                   |
||gd      }| j#                  |      }|S )N   r
   r   )devicer(   r&   dim)ndimr)   reshaper<   r,   rG   r4   bool	unsqueezewherer@   	cls_tokenexpandregister_tokensposition_embeddingsposition_idscatdropout)r1   r#   rC   
batch_size
num_framesr*   heightwidthr2   mask
cls_tokensrQ   s               r    r3   zVideomtEmbeddings.forward?   sq   !BNBTBT?J
L&%'//
Z0GW]_deL*"1"9"9*z:QSU"V(_-A-AA-E-55o6K6KA6NPRSO!''*
**<8
&"%%Z->->ejj%Q[[\^_DT4??JGJ^^**:r2>
..55j"bI$":":4;L;L"MM
YY
OZHaP
\\*-
r   N)	r   r   r   r   r;   r4   r5   r3   __classcell__rB   s   @r    r7   r7   9   s@    N} N
ELL 5<<RVCV bgbnbn r   r7   c                       e Zd Zy)
VideomtMLPNr   r   r   r   r   r    r`   r`   Y       r   r`   c                       e Zd Zy)VideomtGatedMLPNra   r   r   r    rd   rd   ]   rb   r   rd   c                       e Zd Zy)VideomtLayerNra   r   r   r    rf   rf   a   rb   r   rf   c                       e Zd Zy)VideomtLayerScaleNra   r   r   r    rh   rh   e   rb   r   rh   a  
    Class for outputs of [`VideomtForUniversalSegmentationOutput`].

    This output can be directly passed to [`~VideomtVideoProcessor.post_process_semantic_segmentation`] or
    [`~VideomtVideoProcessor.post_process_instance_segmentation`] or
    [`~VideomtVideoProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see
    [`~VideomtVideoProcessor`] for details regarding usage.
    )custom_introc                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   y)	%VideomtForUniversalSegmentationOutputa  
    loss (`torch.Tensor`, *optional*):
        The computed loss, returned when labels are present.
    class_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
        query. Note the `+ 1` is needed because we incorporate the null class.
    masks_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
        query.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last layer.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states all layers of the model.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Self and Cross Attentions weights from transformer decoder.
    Nlossclass_queries_logitsmasks_queries_logitslast_hidden_statehidden_states
attentions)r   r   r   __doc__rl   r4   FloatTensor__annotations__rm   rn   ro   rp   tuplerq   r   r   r    rk   rk   i   s    & &*D%

d
")59%++d2959%++d2926u((4/659M5**+d2926Je''(4/6r   rk   c                   l     e Zd ZdZdZ ej                         dej                  ddf fd       Z	 xZ
S )VideomtPreTrainedModelpixel_values_videos)videomoduler$   Nc                     t         |   |       t        |t              r*t        j
                  j                  |j                         y y r\   )r:   _init_weights
isinstancer7   r   initzeros_r@   )r1   rz   rB   s     r    r|   z$VideomtPreTrainedModel._init_weights   s6    f%f/0GGNN6,,- 1r   )r   r   r   main_input_nameinput_modalitiesr4   no_gradr   Moduler|   r]   r^   s   @r    rw   rw      s:    +O!U]]_.BII .$ . .r   rw   c                       e Zd Zy)VideomtLayerNorm2dNra   r   r   r    r   r      rb   r   r   c                       e Zd Zy)VideomtScaleLayerNra   r   r   r    r   r      rb   r   r   c                       e Zd Zy)VideomtScaleBlockNra   r   r   r    r   r      rb   r   r   c                        e Zd ZdZdef fdZd Z	 	 	 	 ddej                  dz  de	ej                     dz  de	ej                     dz  de	ej                     dz  d	e
e   d
efdZ xZS )VideomtForUniversalSegmentationrx   r8   c                     t         |   |       t        j                  |j                  |j                        | _        y r\   )r:   r;   r   Linearr?   query_updaterrA   s     r    r;   z(VideomtForUniversalSegmentation.__init__   s/     YYv'9'96;M;MNr   c                     t        d      )NzNot needed for Videomt)AttributeError)	attn_maskprobnum_query_tokensencoder_start_tokensrG   s        r    _disable_attention_maskz7VideomtForUniversalSegmentation._disable_attention_mask   s    566r   Nmask_labelsclass_labelspatch_offsetskwargsr$   c           	      J   d|v rt        d      |t        d      |j                  dk7  rt        d      ||t        d      |j                  \  }}}}	}
|j                  ||z  ||	|
      }| j	                  |      }| j
                  | j                  j                  z
  }| j                  d| D ]
  } ||      } |j                  |||j                  d   |j                  d	         }g }g }g }d}t        |      D ]S  }|dd|f   }|2| j                  j                  dddddf   j                  |d
d
      }nK| j                  |      | j                  j                  dddddf   j                  |j                         z   }t#        j$                  |j                  |j                         |fd      }| j                  |d D ]
  } ||      } | j'                  |      }| j)                  |      \  }}|j+                  |       |j+                  |       |j+                  |       |ddd| j                  j,                  ddf   }V t/        dt#        j$                  |d      t#        j$                  |d      t#        j$                  |d            S )a  
        pixel_values_videos (`torch.Tensor`, *optional*):
            Video inputs of shape `(batch_size, num_frames, num_channels, height, width)`.
        mask_labels (`list[torch.Tensor]`, *optional*):
            Not supported for 5D video inputs.
        class_labels (`list[torch.LongTensor]`, *optional*):
            Not supported for 5D video inputs.
        patch_offsets (`list[torch.Tensor]`, *optional*):
            Unused for video inputs and only kept for modular compatibility.
        r#   zAUse `pixel_values_videos` with `VideomtForUniversalSegmentation`.Nz'You have to specify pixel_values_videosrE   zyVideomtForUniversalSegmentation only supports 5D video inputs of shape (batch_size, num_frames, channels, height, width).zTraining with 5D video inputs is not supported in `VideomtForUniversalSegmentation`. Flatten frames and use `EomtForUniversalSegmentation` instead.r&   r
   rF   rH   r   )rl   rn   rm   ro   )r+   rJ   r)   rK   r2   num_hidden_layersr8   
num_blockslayersviewrangequeryr.   rP   r   r,   rG   r4   rT   	layernormpredictappendnum_queriesrk   )r1   rx   r   r   r   r   rV   rW   r*   rX   rY   flat_pixel_valuesrp   query_start_idxlayer_moduleall_masks_queries_logitsall_class_queries_logitsall_last_hidden_statespropagated_query	frame_idxframe_hidden_statesquery_tokenssequence_outputrn   rm   s                            r    r3   z'VideomtForUniversalSegmentation.forward   s   $ V#`aa&FGG##q(E 
 "l&>Q 
 ?R>W>W;
Jfe/77
Z8OQ]_eglm(9:004;;3I3II KK(89 	8L(7M	8 &**:z=CVCVWXCY[h[n[nop[qr#% #% !#z* 	TI"/9"='#zz00q!<CCJPRTVW#112BCdjjFWFWX\^_abXbFcFfFf'..G   #())\__=P=W=W-XZm,ntu"v $O,< = H&23F&G#H #nn-@AO9=o9V6 "6$++,@A$++,@A"))/:216O8O8O6OQR3RS)	T, 5!&+C!K!&+C!K#ii(>AF	
 	
r   )NNNN)r   r   r   r   r   r;   r   r4   r5   listr   r   rk   r3   r]   r^   s   @r    r   r      s    +OO} O7
 48152637O
"\\D0O
 %,,'$.O
 5<<(4/	O

 ELL)D0O
 +,O
 
/O
r   r   )r   rw   r   )'dataclassesr   r4   r   
file_utilsr   processing_utilsr   utilsr   r	   eomt.configuration_eomtr   eomt.modeling_eomtr   r   r   r   r   r   r   r   r   r   r   r   r"   r7   r`   rd   rf   rh   rk   rw   r   r   r   r   __all__r   r   r    <module>r      s    "   % & 7 0   J 0  @	 		m 		9 		 	 	7K 7	 78.0 .	 		 		 	Y
&B Y
xr   