
    i$                     z   d Z ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZ ddlmZmZmZ ddlmZmZmZmZm Z m!Z!m"Z"m#Z#  ed      e G d de                    Z$ ed      e G d de                    Z% ed      e G d de                    Z& ed      e G d de                    Z' G d de      Z( G d de"      Z) G d de      Z* G d  d!e!      Z+ G d" d#e      Z,e G d$ d%e              Z- ed&'       G d( d)e#             Z. G d* d+e      Z/g d,Z0y)-zPyTorch SAM 2 model.    N)strict   )initialization)PreTrainedConfig)PreTrainedModel)Unpack)auto_docstring)TransformersKwargsmerge_with_config_defaults)capture_outputs   )CONFIG_MAPPING
AutoConfig)
Sam2ConfigSam2MaskDecoderConfigSam2PromptEncoderConfig)Sam2AttentionSam2FeedForwardSam2LayerNorm	Sam2ModelSam2PreTrainedModelSam2TwoWayAttentionBlockSam2VisionEncoderOutputSam2VisionModelzyonigozlan/EdgeTAM-hf)
checkpointc                       e Zd ZU dZdZdZdeiZdZe	e
z  dz  ed<   dZee   dz  ed<   dZedz  ed<   dZeed	<   d
Zeed<   d
Zeed<   dZeed<   dZee   dz  ed<   dZeed<   dZeed<   dZeed<   dZeed<    fdZ xZS )EdgeTamVisionConfiga  
    backbone_channel_list (`List[int]`, *optional*, defaults to `[384, 192, 96, 48]`):
        The list of channel dimensions for the backbone.
    backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[256, 256], [128, 128], [64, 64]]`):
        The spatial sizes of the feature maps from the backbone.
    fpn_hidden_size (`int`, *optional*, defaults to 256):
        The hidden dimension of the FPN.
    fpn_kernel_size (`int`, *optional*, defaults to 1):
        The kernel size for the convolutions in the neck.
    fpn_stride (`int`, *optional*, defaults to 1):
        The stride for the convolutions in the neck.
    fpn_padding (`int`, *optional*, defaults to 0):
        The padding for the convolutions in the neck.
    fpn_top_down_levels (`List[int]`, *optional*, defaults to `[2, 3]`):
        The levels for the top-down FPN connections.
    num_feature_levels (`int`, *optional*, defaults to 3):
        The number of feature levels from the FPN to use.
    vision_configedgetam_vision_modelbackbone_configNbackbone_channel_listbackbone_feature_sizes   fpn_hidden_size   fpn_kernel_size
fpn_strider   fpn_paddingfpn_top_down_levelsr   num_feature_levelsgelu
hidden_actgư>layer_norm_epsg{Gz?initializer_rangec                 0   | j                   g dn| j                   | _         | j                  ddgddgddggn| j                  | _        | j                  ddgn| j                  | _        t        | j                  t
              rT| j                  j                  dd      | j                  d<   t        | j                  d      di | j                  | _        n.| j                  "t        j                  d	dd
g dd      | _        t        | ,  di | y )N)i     `   0   r#      @   r   r   
model_typetimm_wrapperztimm/repvit_m1.dist_in1kT)r   r%   r   r   )in_chansfeatures_onlyout_indices)
model_args )r!   r"   r)   
isinstancer    dictgetr   r   from_pretrainedsuper__post_init__)selfkwargs	__class__s     |/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/edgetam/modular_edgetam.pyrA   z!EdgeTamVisionConfig.__post_init__Q   s   "&"<"<"D$JdJd 	" 372M2M2Uc3Z#sb"X.[_[v[v 	# .2-E-E-MAq6SWSkSk d**D1151E1E1I1I,Xf1gD  .#1$2F2F|2T#U#mX\XlXl#mD !!)#-#=#=*()DQ]^$D  	''    )__name__
__module____qualname____doc__base_config_keyr5   r   sub_configsr    r=   r   __annotations__r!   listintr"   r$   r&   r'   r(   r)   r*   r,   strr-   floatr.   rA   __classcell__)rD   s   @rE   r   r   (   s    & &O'J:K 7;OT,,t3:.249t+2*.D4K.OSOSJK,0cT)0J NE #u#( (rF   r   c                       e Zd Zy)EdgeTamPromptEncoderConfigNrG   rH   rI   r;   rF   rE   rT   rT   e        	rF   rT   c                       e Zd Zy)EdgeTamMaskDecoderConfigNrU   r;   rF   rE   rX   rX   k   rV   rF   rX   c                       e Zd ZdZy)EdgeTamConfiga  
    prompt_encoder_config (Union[`dict`, `EdgeTamPromptEncoderConfig`], *optional*):
        Dictionary of configuration options used to initialize [`EdgeTamPromptEncoderConfig`].
    mask_decoder_config (Union[`dict`, `EdgeTamMaskDecoderConfig`], *optional*):
        Dictionary of configuration options used to initialize [`EdgeTamMaskDecoderConfig`].

     Example:

     ```python
     >>> from transformers import (
     ...     EdgeTamVisionConfig,
     ...     EdgeTamPromptEncoderConfig,
     ...     EdgeTamMaskDecoderConfig,
     ...     EdgeTamModel,
     ... )

     >>> # Initializing a EdgeTamConfig with `"facebook/edgetam.1_hiera_tiny"` style configuration
     >>> configuration = EdgeTamConfig()

     >>> # Initializing a EdgeTamModel (with random weights) from the `"facebook/edgetam.1_hiera_tiny"` style configuration
     >>> model = EdgeTamModel(configuration)

     >>> # Accessing the model configuration
     >>> configuration = model.config

     >>> # We can also initialize a EdgeTamConfig from a EdgeTamVisionConfig, EdgeTamPromptEncoderConfig, and EdgeTamMaskDecoderConfig
     >>> # Initializing EDGETAM vision encoder, memory attention, and memory encoder configurations
     >>> vision_config = EdgeTamVisionConfig()
     >>> prompt_encoder_config = EdgeTamPromptEncoderConfig()
     >>> mask_decoder_config = EdgeTamMaskDecoderConfig()

     >>> config = EdgeTamConfig(vision_config, prompt_encoder_config, mask_decoder_config)
     ```
    N)rG   rH   rI   rJ   r;   rF   rE   rZ   rZ   q   s    !F 	rF   rZ   c                       e Zd Zy)EdgeTamLayerNormNrU   r;   rF   rE   r\   r\          rF   r\   c                       e Zd Zy)EdgeTamVisionEncoderOutputNrU   r;   rF   rE   r_   r_      r]   rF   r_   c                       e Zd Zy)EdgeTamAttentionNrU   r;   rF   rE   ra   ra      r]   rF   ra   c                       e Zd Zy)EdgeTamTwoWayAttentionBlockNrU   r;   rF   rE   rc   rc      r]   rF   rc   c                       e Zd Zy)EdgeTamFeedForwardNrU   r;   rF   rE   re   re      r]   rF   re   c                   >    e Zd ZdZ ej
                         d        Zy)EdgeTamPreTrainedModelNc                    t        j                  | |       t        |t              r-|j                   t        j                  |j                         y y t        |d      r,t        j                  |j                  |j                         y y )Npositional_embedding)std)r   _init_weightsr<   EdgeTamModelno_memory_embeddinginitzeros_hasattrnormal_ri   scale)rB   modules     rE   rk   z$EdgeTamPreTrainedModel._init_weights   sg    %%dF3fl+))5F667 6V34LL44&,,G 5rF   )rG   rH   rI   "_keys_to_ignore_on_load_unexpectedtorchno_gradrk   r;   rF   rE   rg   rg      s$    )-&U]]_H HrF   rg   zN
    The vision model from EdgeTAM without any head or projection on top.
    )custom_introc            
       p    e Zd ZeZdZi Zd Zee		 dde
j                  dz  dee   deez  fd              Zy)EdgeTamVisionModelpixel_valuesc                     t        d      Nz2Can't get input embeddings from timm wrapper modelNotImplementedErrorrB   s    rE   get_input_embeddingsz'EdgeTamVisionModel.get_input_embeddings       !"VWWrF   NrC   returnc           	      ^   |t        d       | j                  |fi |}|j                  }|D cg c]  }|j                  dddd       }}| j	                  |      \  }}|| j
                   d  d d d   }|| j
                   d  d d d   }t        |d   |||j                        S c c}w )Nz You have to specify pixel_valuesr   r   r   r%   )last_hidden_statefpn_hidden_statesfpn_position_encodinghidden_states)
ValueErrorbackboner   permuteneckr*   r_   r   )rB   rz   rC   backbone_outputintermediate_hidden_stateshidden_stater   r   s           rE   forwardzEdgeTamVisionModel.forward   s     ?@@ ($--??%4%F%F"[u%v<l&:&:1aA&F%v"%v3799=W3X00-t/F/F.F.HI$B$O 5t7N7N6N6P QRVTVRV W)8</"7)77	
 	
 &ws   B*)N)rG   rH   rI   r   config_classmain_input_name_can_record_outputsr   r   r   ru   FloatTensorr   r
   tupler_   r   r;   rF   rE   ry   ry      sp     'L$O X   26
''$.
 +,
 
+	+	
   
rF   ry   c                       e Zd Zg dZd Zy)rl   )z
^memory_.*z^mask_downsample.*zspatial_perceiver.*z^object_pointer_proj.*z0^temporal_positional_encoding_projection_layer.*no_memory_positional_encodingno_object_pointer%occlusion_spatial_embedding_parameterc                     t        d      r|   r}   r   s    rE   r   z!EdgeTamModel.get_input_embeddings   r   rF   N)rG   rH   rI   rt   r   r;   rF   rE   rl   rl      s    	*&XrF   rl   )rl   ry   rg   rZ   r   rT   rX   )1rJ   ru   huggingface_hub.dataclassesr    r   rn   configuration_utilsr   modeling_utilsr   processing_utilsr   utilsr	   utils.genericr
   r   utils.output_capturingr   autor   r   sam2.configuration_sam2r   r   r   sam2.modeling_sam2r   r   r   r   r   r   r   r   r   rT   rX   rZ   r\   r_   ra   rc   re   rg   ry   rl   __all__r;   rF   rE   <module>r      s~     . & 3 - & # K 5 - ` `	 	 	 238(* 8(  48(v 23	!8 	  4	 23	4 	  4	 23$	J $	  4$	N	} 		!8 		} 		": 		 	 
H0 
H 
H 
#
 #

#
LX9 X rF   