
    iW                        d Z ddlZddlmZ ddlZddlmc mZ ddl	m
Z
 ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*  ejV                  e,      Z- ed      e
 G d de*                    Z.e ed       G d de                    Z/ G d de&      Z0 G d de#      Z1 G d  d!e       Z2 G d" d#e(      Z3 G d$ d%e"      Z4 G d& d'e!      Z5 G d( d)ejl                        Z7 G d* d+ejp                        Z9 G d, d-ejp                        Z: G d. d/ejp                        Z;e G d0 d1e             Z< ed2       G d3 d4e%             Z=g d5Z>y)6zPyTorch EoMT model.    N)	dataclass)strict)Tensornn   )initialization)ACT2FN)ModelOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)merge_with_config_defaults)capture_outputs   )Dinov2EmbeddingsDinov2LayerDinov2LayerScaleDinov2PatchEmbeddings)#Mask2FormerForUniversalSegmentationMask2FormerLoss)SiglipAttention)	ViTConfigz$tue-mps/coco_panoptic_eomt_large_640)
checkpointc                   &   e Zd ZU dZdZdZeed<   dZeed<   dZ	eed<   d	Z
eed
<   dZeed<   dZeez  ed<   dZeed<   dZeed<   dZeee   z  eeef   z  ed<   dZeee   z  eeef   z  ed<   dZeed<   dZeed<   dZeez  ed<   dZeed<   dZeez  ed<   dZeed<   d	Zeed <   d!Zeed"<   d#Zeed$<   d%Zeed&<   d%Z eed'<   d(Z!eed)<   d*Z"eed+<   d,Z#eed-<   d.Z$eed/<   d	Z%eed0<    e&       Z' e&       Z( e&       Z) e&       Z* e&       Z+ e&       Z,d1 Z-y2)3
EomtConfiga  
    layerscale_value (`float`, *optional*, defaults to 1.0):
        Initial value for the LayerScale parameter.
    num_upscale_blocks (`int`, *optional*, defaults to 2):
        Number of upsampling blocks used in the decoder or segmentation head.
    use_swiglu_ffn (`bool`, *optional*, defaults to `False`):
        Whether to use the SwiGLU feedforward neural network.
    num_blocks (`int`, *optional*, defaults to 4):
        Number of feature blocks or stages in the architecture.
    no_object_weight (`float`, *optional*, defaults to 0.1):
        Loss weight for the 'no object' class in panoptic/instance segmentation.
    class_weight (`float`, *optional*, defaults to 2.0):
        Loss weight for classification targets.
    mask_weight (`float`, *optional*, defaults to 5.0):
        Loss weight for mask prediction.
    train_num_points (`int`, *optional*, defaults to 12544):
        Number of points to sample for mask loss computation during training.
    oversample_ratio (`float`, *optional*, defaults to 3.0):
        Oversampling ratio used in point sampling for mask training.
    importance_sample_ratio (`float`, *optional*, defaults to 0.75):
        Ratio of points to sample based on importance during training.
    num_queries (`int`, *optional*, defaults to 200):
        Number of object queries in the Transformer.
    num_register_tokens (`int`, *optional*, defaults to 4):
        Number of learnable register tokens added to the transformer input.

    Example:

    ```python
    >>> from transformers import EomtConfig, EomtForUniversalSegmentation

    >>> # Initialize configuration
    >>> config = EomtConfig()

    >>> # Initialize model
    >>> model = EomtForUniversalSegmentation(config)

    >>> # Access config
    >>> config = model.config
    ```eomti   hidden_size   num_hidden_layers   num_attention_heads   	mlp_ratiogelu
hidden_act        hidden_dropout_probg{Gz?initializer_rangeư>layer_norm_epsi  
image_size
patch_sizer   num_channelsg      ?layerscale_valuedrop_path_rater   num_upscale_blocksattention_dropoutFuse_swiglu_ffn
num_blocksg?no_object_weightg       @class_weightg      @mask_weightdice_weighti 1  train_num_pointsg      @oversample_ratiog      ?importance_sample_ratio   num_queriesnum_register_tokensc                     t        d      )NzNot needed for EomtAttributeError)selfkwargss     v/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/eomt/modular_eomt.py__post_init__zEomtConfig.__post_init__   s    233    N).__name__
__module____qualname____doc__
model_typer   int__annotations__r!   r#   r%   r'   strr)   floatr*   r,   r-   listtupler.   r/   r0   r1   r2   r3   r4   boolr5   r6   r7   r8   r9   r:   r;   r<   r>   r?   rB   intermediate_sizeqkv_bias
pooler_actpooler_output_sizeencoder_strideattention_probs_dropout_probrF    rG   rE   r   r   4   s   'R JKs!!IsJ'**#u# NE 47Jd3i%S/1746Jd3i%S/16L#!e!"%NECK%%(us{( ND J!e!L%KK!c!!e!%)U)K  &(H!J')#%N#1#3 4rG   r   a  
    Class for outputs of [`EomtForUniversalSegmentationOutput`].

    This output can be directly passed to [`~EomtImageProcessor.post_process_semantic_segmentation`] or
    [`~EomtImageProcessor.post_process_instance_segmentation`] or
    [`~EomtImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see
    [`~EomtImageProcessor] for details regarding usage.
    )custom_introc                   <   e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZeej                      dz  ed	<   y)
"EomtForUniversalSegmentationOutputa*  
    loss (`torch.Tensor`, *optional*):
        The computed loss, returned when labels are present.
    class_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
        query. Note the `+ 1` is needed because we incorporate the null class.
    masks_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
        query.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last layer.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states all layers of the model.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Self and Cross Attentions weights from transformer decoder.
    patch_offsets (`list[torch.Tensor]`, *optional*):
        list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
    Nlossclass_queries_logitsmasks_queries_logitslast_hidden_statehidden_states
attentionspatch_offsets)rH   rI   rJ   rK   r^   torchFloatTensorrN   r_   r`   ra   rb   rR   rc   rd   rQ   r   rZ   rG   rE   r]   r]      s    * &*D%

d
")59%++d2959%++d2926u((4/659M5**+d2926Je''(4/6/3M4%,3rG   r]   c                       e Zd Zy)EomtLossNrH   rI   rJ   rZ   rG   rE   rh   rh          rG   rh   c                       e Zd Zy)EomtPatchEmbeddingsNri   rZ   rG   rE   rl   rl      rj   rG   rl   c                   Z    e Zd ZdeddfdZd Zdej                  dej                  fdZy)EomtEmbeddingsconfigreturnNc                    t         j                  j                  |        || _        |j                  | _        t        j
                  t        j                  dd|j                              | _	        t        j
                  t        j                  d|j                  |j                              | _        t        |      | _        | j                  j                  }t        j                   |j"                        | _        d|j                  z   | _        t        j(                  ||j                        | _        | j-                  dt        j.                  |      j1                  d      d       y )N   position_idsrr   F)
persistent)r   Module__init__ro   r.   	Parameterre   randnr   	cls_tokenzerosr?   register_tokensrl   patch_embeddingsnum_patchesDropoutr)   dropoutnum_prefix_tokens	Embeddingposition_embeddingsregister_bufferarangeexpand)rC   ro   r   s      rE   rx   zEomtEmbeddings.__init__   s    
		4  ++ekk!Q8J8J&KL!||EKK6;U;UW]WiWi,jk 3F ;++77zz&"<"<=!"V%?%?!?#%<<V=O=O#P ^U\\+-F-M-Mg-VchirG   c                     t        d      )NzNot needed for Eomt ModelrA   rC   s    rE   interpolate_pos_encodingz'EomtEmbeddings.interpolate_pos_encoding   s    899rG   pixel_valuesc                    |j                   \  }}}}| j                  j                  j                  j                  }| j                  |j                  |            }| j                  j                  |dd      }| j                  j                  |dd      }|| j                  | j                        z   }t        j                  |||gd      }| j                  |      }|S )N)dtyperu   rr   dim)shaper~   
projectionweightr   tor{   r   r}   r   rs   re   catr   )rC   r   
batch_size_target_dtype
embeddings
cls_tokensr}   s           rE   forwardzEomtEmbeddings.forward   s    *00
Aq!,,77>>DD**<???+NO
^^**:r2>
..55j"bI$":":4;L;L"MM
YY
OZHaP
\\*-
rG   )	rH   rI   rJ   r   rx   r   re   r   r   rZ   rG   rE   rn   rn      s8    jz jd j :ELL U\\ rG   rn   c                       e Zd Zy)EomtAttentionNri   rZ   rG   rE   r   r      rj   rG   r   c                       e Zd Zy)EomtLayerScaleNri   rZ   rG   rE   r   r      rj   rG   r   c                   f    e Zd Z	 ddej                  dej                  dz  dej                  fdZy)	EomtLayerNrb   attention_maskrp   c                 *   | j                  |      }| j                  ||      \  }}| j                  |      }| j                  |      |z   }| j	                  |      }| j                  |      }| j                  |      }| j                  |      |z   }|S N)norm1	attentionlayer_scale1	drop_pathnorm2mlplayer_scale2)rC   rb   r   hidden_states_normself_attention_outputr   layer_outputs          rE   r   zEomtLayer.forward   s    
 "ZZ6#'>>2Dn#U q $ 1 12G H '<=M zz-0xx-((6 ~~l3mCrG   r   )rH   rI   rJ   re   r   r   rZ   rG   rE   r   r      s9     /3|| t+ 
	rG   r   c                   X     e Zd Zd fd	Zdej
                  dej
                  fdZ xZS )EomtLayerNorm2dc                 *    t         |   |||       y )N)epselementwise_affine)superrx   )rC   r/   r   affine	__class__s       rE   rx   zEomtLayerNorm2d.__init__   s    36JrG   hidden_staterp   c                     |j                  dddd      }t        j                  || j                  | j                  | j
                  | j                        }|j                  dddd      }|S )Nr   r   r   rr   )permuteF
layer_normnormalized_shaper   biasr   )rC   r   s     rE   r   zEomtLayerNorm2d.forward  sb    #++Aq!Q7||L$2G2GVZV_V_aeaiaij#++Aq!Q7rG   )r+   T)rH   rI   rJ   rx   re   r   r   __classcell__r   s   @rE   r   r      s$    KELL U\\ rG   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )EomtScaleLayerro   c                    t         |           |j                  }t        j                  ||dd      | _        t        |j                     | _        t        j                  ||dd|d      | _
        t        |      | _        y )Nr   )kernel_sizestrider   rr   F)r   paddinggroupsr   )r   rx   r   r   ConvTranspose2dconv1r	   r'   
activationConv2dconv2r   layernorm2drC   ro   r   r   s      rE   rx   zEomtScaleLayer.__init__
  su    ((''[aXYZ
 !2!23YY

 +;7rG   rb   rp   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   r   rC   rb   s     rE   r   zEomtScaleLayer.forward  sB    

=16

=1((7rG   	rH   rI   rJ   r   rx   re   r   r   r   r   s   @rE   r   r   	  s*    8z 8 U\\ ell rG   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )EomtScaleBlockro   c                     t         |           |j                  | _        t	        j
                  t        | j                        D cg c]  }t        |       c}      | _        y c c}w r   )	r   rx   r2   r5   r   
ModuleListranger   block)rC   ro   r   r   s      rE   rx   zEomtScaleBlock.__init__#  sG     33]]E$//DZ#[qN6$:#[\
#[s   A&rb   rp   c                 8    | j                   D ]
  } ||      } |S r   )r   )rC   rb   r   s      rE   r   zEomtScaleBlock.forward(  s%    ZZ 	1E!-0M	1rG   r   r   s   @rE   r   r   "  s,    ]z ]
U\\ ell rG   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )EomtMaskHeadro   c                    t         |           |j                  }t        j                  ||      | _        t        j                  ||      | _        t        j                  ||      | _        t        |j                     | _
        y r   )r   rx   r   r   Linearfc1fc2fc3r	   r'   r   r   s      rE   rx   zEomtMaskHead.__init__/  sa    ((99[+699[+699[+6 !2!23rG   rb   rp   c                     | j                  | j                  |            }| j                  | j                  |            }| j                  |      }|S r   )r   r   r   r   r   s     rE   r   zEomtMaskHead.forward8  sD    (?@(?@/rG   r   r   s   @rE   r   r   .  s*    4z 4U\\ ell rG   r   c                       e Zd ZU dZeed<   dZdZdZdZ	dgZ
dZeed	Z ej                          d
ej$                  ddfd       Zy)EomtPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    ro   r   r   )imageFr   T)rb   rc   modulerp   Nc                     | j                   j                  }t        |t        j                  t        j
                  t        j                  f      rt        j                  |j                  t        j                  d             |j                  xt        j                  j                  j                  |j                        \  }}|dkD  rdt        j                  |      z  nd}t        j                  |j                  | |       y y t        |t        j                         r?t        j"                  |j                         t        j$                  |j                         y t        |t        j&                        rtt        j(                  |j                  dd       |j*                  Et-        |j                  dd      s-t        j$                  |j                  |j*                            y y y t        |t.              rBt1        |d	      r5t        j2                  |j4                  | j                   j6                         y y t        |t8              rt        j:                  |j<                  d|       t        j$                  |j>                         t        j@                  |jB                  t        jD                  |jB                  jF                  d
         jI                  d             y t        |tJ              rRt        jL                  |jN                  dz         }|jP                  |d
<   t        j@                  |jR                  |       y t        |tT              r t        j"                  |jV                         y y )N   )ar   rr   r(   )meanstd_is_hf_initializedFlambda1ru   rt   ),ro   r*   
isinstancer   r   r   r   initkaiming_uniform_r   mathsqrtr   re   _calculate_fan_in_and_fan_outuniform_	LayerNormones_zeros_r   normal_padding_idxgetattrr   hasattr	constant_r   r0   rn   trunc_normal_r{   r}   copy_rs   r   r   r   rh   ones
num_labelseos_coefempty_weightEomtForUniversalSegmentationattn_mask_probs)rC   r   r   fan_inr   boundr   s          rE   _init_weightsz!EomtPreTrainedModel._init_weightsR  sJ   kk++fryy"))R5G5GHI!!&--499Q<@{{&!HHMMGGV	17!DIIf--fkkE659 ' -JJv}}%KK$-LLSa8!!-gfmmMach6iFMM&*<*<=> 7j-/vy)v~~t{{/K/KL */v//csCKK../JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh) ::f&7&7!&;<L%LJJv**L9 <=JJv--. >rG   )rH   rI   rJ   rK   r   rN   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpar   r   _can_record_outputsre   no_gradr   rw   r  rZ   rG   rE   r   r   ?  so    
 $O!&+#$N"#
 U]]_/BII /$ / /rG   r   zV
    The EoMT Model with head on top for instance/semantic/panoptic segmentation.
    c                       e Zd ZdefdZd Zd Zdej                  fdZ	e
d        Zeee	 	 	 dd	ed
ee   dz  dee   dz  dee   dz  dee   defd                     Zy)r   ro   c                    t        j                  | |       || _        |j                  | _        t	        |      | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        |       c}      | _        t%        |      | _        t)        |      | _        t        j,                  |j                  |j.                  dz         | _        |j2                  |j4                  z  |j2                  |j4                  z  f| _        |j8                  |j:                  |j<                  d| _        tA        || j>                        | _!        | jE                  dtG        jH                  |jJ                               | jM                          y c c}w )N)r   rr   )loss_cross_entropy	loss_mask	loss_dice)ro   weight_dictr   )'r   rx   ro   r!   rn   r   r   r   r   r,   	layernormr   r>   queryr   r   r   layersr   upscale_blockr   	mask_headr   r   class_predictorr-   r.   	grid_sizer7   r8   r9   r  rh   	criterionr   re   r   r5   	post_init)rC   ro   r   s      rE   rx   z%EomtForUniversalSegmentation.__init__x  sr     v.!'!9!9(0f&8&8f>S>ST\\&"4"4f6H6HI
mmfF^F^@_$`1Yv%6$`a+F3%f-!yy););V=N=NQR=RS ++v/@/@@&BSBSW]WhWhBhi"("5"5++++.
 "T=M=MN.

6;L;L0MN% %as   G0c                 .    | j                   j                  S r   )r   r~   r   s    rE   get_input_embeddingsz1EomtForUniversalSegmentation.get_input_embeddings  s    ///rG   c                     t        d      )NzNote needed for Eomt Model.rA   r   s    rE   get_auxiliary_logitsz1EomtForUniversalSegmentation.get_auxiliary_logits  s    :;;rG   logitsc                    |d d d | j                   j                  d d f   }| j                  |      }|d d | j                   j                  | j                  j                  z   d d d f   }|j                  dd      } |j                  |j                  d   dg| j                   }| j                  |      }| j                  |      }t        j                  d||      }||fS )Nrr   r   r   ru   zbqc, bchw -> bqhw)ro   r>   r  r   r   	transposereshaper   r  r  r  re   einsum)rC   r  query_tokensclass_logitsprefix_tokensmask_logitss         rE   predictz$EomtForUniversalSegmentation.predict  s    a!:4;;#:#:!:A=>++L9q$++"9"9DOO<]<]"]"_abbc%//15---m.A.A!.DbZ4>>Z~~l3**=9ll#6mTL((rG   c                     |dk  r9t        j                  | j                  d   ||      |kD  }d| d d d ||d f   |<   | S )Nrr   r   )device)re   randr   )	attn_maskprobnum_query_tokensencoder_start_tokensr*  random_queriess         rE   _disable_attention_maskz4EomtForUniversalSegmentation._disable_attention_mask  sW    !8"ZZ	(:<LU[\_ccN VWIa***,@,AAB>RrG   Nr   mask_labelsclass_labelsrd   rD   rp   c                 (   d\  }}d}|t        d      | j                  |      }	t        | j                        D ]  \  }
}|
| j                  | j
                  j                  z
  k(  rp| j                  j                  dddddf   j                  |	j                  d   dd      j                  |	j                        }t        j                  ||	fd      }	|
| j                  | j
                  j                  z
  k\  r| j                  s7| j                   |
| j                  z
  | j
                  j                  z      dkD  r| j#                  |	      }| j%                  |      \  }}||fz  }||fz  }t        j&                  |	j                  d   |	j                  d   |	j                  d   |	j                  t        j(                        }t+        j,                  || j.                  d	
      }|j1                  |j3                  d      |j3                  d      d      }| j
                  j4                  }|| j                  j6                  z   }|dkD  |ddd||df<   | j9                  || j                   |
| j                  z
  | j
                  j                  z      |||j                        }|ddddf   j                  d| j
                  j:                  dd      }|j=                         j?                  | d      } ||	|      }	 | j#                  |	      }| j%                  |      \  }}||fz  }||fz  }d}|B|@d}tA        ||      D ]/  \  }}| jC                  ||||d      }|| jE                  |      z  }1 tG        |||||      S )ag  
        mask_labels (`list[torch.Tensor]`, *optional*):
            list of mask labels of shape `(num_labels, height, width)` to be fed to a model
        class_labels (`list[torch.LongTensor]`, *optional*):
            list of target class labels of shape `(num_labels, height, width)` to be fed to a model. They identify the
            labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` if `class_labels[i][j]`.
        patch_offsets (`list[torch.Tensor]`, *optional*):
            list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
        )rZ   rZ   Nz You have to specify pixel_valuesr   ru   rr   r   )r*  r   bilinear)sizemode)r-  r.  r/  r*  .g    er(   )r`   r_   r2  r3  auxiliary_predictions)r^   r`   r_   ra   rd   )$
ValueErrorr   	enumerater  r!   ro   r5   r  r   r   r   r   r*  re   r   trainingr   r  r(  r   rS   r   interpolater  viewr6  r>   r   r1  r#   rP   masked_fillzipget_loss_dictget_lossr]   )rC   r   r2  r3  rd   rD   masks_queries_logits_per_layerclass_queries_logits_per_layerr   rb   idxlayer_moduler  norm_hidden_statesr`   r_   interpolated_logitsr.  r/  sequence_outputr^   	loss_dicts                         rE   r   z$EomtForUniversalSegmentation.forward  s   * JPF&(F?@@5!*4;;!7 .	HCd,,t{{/E/EEE

))$1*5<<]=P=PQR=SUWY[\__`m`t`tu %		5-*@a Hd,,t{{/E/EEE!5!5cD<R<R6RUYU`U`UkUk6k!lop!p%)^^M%B"=A\\J\=]:$&:.3G2II..3G2II.!&!''*!''*!''*(//**" '(mm4Ht~~dn&o#&9&>&>',,Q/1D1I1I!1Lb'# $(;;#:#: '7$//:[:['[$ ObdeNeq"3#3"35I5JJK "&!=!="--cD4J4J.JT[[McMc.cd%5)=)00 "> " "04!=!D!DRIhIhjlnp!q!/!5!5!7!C!C^OUY!Z(GM].	H` ..759\\/5R22&+?*AA&&+?*AA&"|'?D>A.0N? 
1:$&: !..)=)= +!-*. / 	 i00
1 2!5!5-'
 	
rG   )NNN)rH   rI   rJ   r   rx   r  r  re   r   r(  staticmethodr1  r   r   r   rQ   r   r   r]   r   rZ   rG   rE   r   r   r  s    z 80<)ell )      ,0,0-1e
e
 &\D(e
 6lT)	e

 F|d*e
 +,e
 
,e
    e
rG   r   )r   r   r   )?rK   r   dataclassesr   re   torch.nn.functionalr   
functionalr   huggingface_hub.dataclassesr   r    r   r   activationsr	   
file_utilsr
   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   dinov2.modeling_dinov2r   r   r   r    mask2former.modeling_mask2formerr   r   siglip.modeling_siglipr   vit.configuration_vitr   
get_loggerrH   loggerr   r]   rh   rl   rn   r   r   r   r   r   rw   r   r   r   r   r   __all__rZ   rG   rE   <module>r^     s     !    .  & ! . & 
 8 5  d 4 - 
		H	% ABO4 O4  CO4d 	4 4	 4>	 		/ 	!% !H	O 		% 	 0bll RYY 2	RYY 	299 " /// // //d 
f
#F f

f
R PrG   