
    i~1                     \   d Z ddlmZ ddlmZ ddlmZ ddlmZm	Z	  ed	      e G d
 de                    Z
 ed	      e G d de                    Z ed	      e G d de                    Z ed	      e G d de                    Z ed	      e G d de                    Zg dZy)zSAM2 model configuration    )strict   )PreTrainedConfig)auto_docstring   )CONFIG_MAPPING
AutoConfigzfacebook/sam2.1-hiera-tiny)
checkpointc                       e Zd ZU dZdZdZdZeed<   dZ	eed<   dZ
eed	<   d
Zeee   z  d
z  ed<   d
Zeee   z  d
z  ed<   d
Zeee   z  d
z  ed<   d
Zeee   z  d
z  ed<   d
Zeee   z  d
z  ed<   d
Zee   d
z  ed<   dZeed<   d
Zee   d
z  ed<   d
Zee   d
z  ed<   d
Zee   d
z  ed<   d
Zee   d
z  ed<   d
Zee   d
z  ed<   dZeed<   dZeed<   dZeed<   dZeed<    fdZ xZS ) Sam2HieraDetConfiga,  
    patch_kernel_size (`list[int]`, *optional*, defaults to `[7, 7]`):
        The kernel size of the patch.
    patch_stride (`list[int]`, *optional*, defaults to `[4, 4]`):
        The stride of the patch.
    patch_padding (`list[int]`, *optional*, defaults to `[3, 3]`):
        The padding of the patch.
    query_stride (`list[int]`, *optional*, defaults to `[2, 2]`):
        The downsample stride between stages.
    window_positional_embedding_background_size (`list[int]`, *optional*, defaults to `[7, 7]`):
        The window size per stage when not using global attention.
    num_query_pool_stages (`int`, *optional*, defaults to 3):
        The number of query pool stages.
    blocks_per_stage (`list[int]`, *optional*, defaults to `[1, 2, 7, 2]`):
        The number of blocks per stage.
    embed_dim_per_stage (`list[int]`, *optional*, defaults to `[96, 192, 384, 768]`):
        The embedding dimension per stage.
    num_attention_heads_per_stage (`list[int]`, *optional*, defaults to `[1, 2, 4, 8]`):
        The number of attention heads per stage.
    window_size_per_stage (`list[int]`, *optional*, defaults to `[8, 4, 14, 7]`):
        The window size per stage.
    global_attention_blocks (`list[int]`, *optional*, defaults to `[5, 7, 9]`):
        The blocks where global attention is used.
    backbone_configsam2_hiera_det_model`   hidden_size   num_attention_headsr   num_channelsN
image_sizepatch_kernel_sizepatch_stridepatch_paddingquery_stride+window_positional_embedding_background_sizenum_query_pool_stagesblocks_per_stageembed_dim_per_stagenum_attention_heads_per_stagewindow_size_per_stageglobal_attention_blocksg      @	mlp_ratiogelu
hidden_actư>layer_norm_eps{Gz?initializer_rangec                    | j                   | j                   nddg| _         | j                  | j                  nddg| _        | j                  | j                  nddg| _        | j                  | j                  nddg| _        | j                  | j                  nddg| _        | j
                  | j
                  nddg| _        | j                  | j                  ng d| _        | j                  | j                  ng d| _        | j                  | j                  ng d| _        | j                  | j                  ng d	| _	        | j                  | j                  ng d
| _
        t        | 0  di | y )N         r   r   )r   r   r)   r   )r           )r   r   r*      )r.   r*      r)   )   r)   	    )r   r   r   r   r   r   r   r   r   r   r   super__post_init__selfkwargs	__class__s     |/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/sam2/configuration_sam2.pyr4   z Sam2HieraDetConfig.__post_init__J   s   -1__-H$//tUYl;?;Q;Q;]!7!7deghci151B1B1ND--UVXYTZ373E3E3QT//XY[\W]151B1B1ND--UVXYTZ ??K <<Q 	8
 :>9N9N9Z 5 5`l(,(@(@(LD$$Re 	  372T2T2`D..fr 	* +/*D*D*PD&&Vc 	" -1,H,H,TD((Zc 	$ 	''    ) __name__
__module____qualname____doc__base_config_key
model_typer   int__annotations__r   r   r   listr   r   r   r   r   r   r   r   r   r   r   r    floatr"   strr$   r&   r4   __classcell__r8   s   @r9   r   r      s]   2 (O'JK  L#)-Jd3i$&-04sT#Y-4+/L#S	/D(/,0M3c?T)0+/L#S	/D(/DH/cT1AH!"3")-d3i$&-,0cT)06:!49t#3:.249t+204T#Y-4IuJ NE #u#( (r:   r   c                       e Zd ZU dZdZdZdeiZdZe	e
z  dz  ed<   dZee   dz  ed<   dZedz  ed<   dZeed	<   d
Zeed<   d
Zeed<   dZeed<   dZee   dz  ed<   dZeed<   dZeed<   dZeed<   dZeed<    fdZ xZS )Sam2VisionConfiga  
    backbone_channel_list (`List[int]`, *optional*, defaults to `[768, 384, 192, 96]`):
        The list of channel dimensions for the backbone.
    backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[256, 256], [128, 128], [64, 64]]`):
        The spatial sizes of the feature maps from the backbone.
    fpn_hidden_size (`int`, *optional*, defaults to 256):
        The hidden dimension of the FPN.
    fpn_kernel_size (`int`, *optional*, defaults to 1):
        The kernel size for the convolutions in the neck.
    fpn_stride (`int`, *optional*, defaults to 1):
        The stride for the convolutions in the neck.
    fpn_padding (`int`, *optional*, defaults to 0):
        The padding for the convolutions in the neck.
    fpn_top_down_levels (`List[int]`, *optional*, defaults to `[2, 3]`):
        The levels for the top-down FPN connections.
    num_feature_levels (`int`, *optional*, defaults to 3):
        The number of feature levels from the FPN to use.
    vision_configsam2_vision_modelr   Nbackbone_channel_listbackbone_feature_sizes   fpn_hidden_sizer   fpn_kernel_size
fpn_strider   fpn_paddingfpn_top_down_levelsr   num_feature_levelsr!   r"   r#   r$   r%   r&   c                 
   | j                   g dn| j                   | _         | j                  ddgddgddggn| j                  | _        | j                  ddgn| j                  | _        t        | j                  t
              rT| j                  j                  dd      | j                  d<   t        | j                  d      d	i | j                  | _        n| j                  t               | _        t        | (  d	i | y )
N)r-   r,   r+   r   rN      @   r   r   r@   r   r2   )rL   rM   rS   
isinstancer   dictgetr   r   r3   r4   r5   s     r9   r4   zSam2VisionConfig.__post_init__   s    #'#=#=#E4KeKe 	" 372M2M2Uc3Z#sb"X.[_[v[v 	# .2-E-E-MAq6SWSkSk d**D1151E1E1I1I,Xn1oD  .#1$2F2F|2T#U#mX\XlXl#mD !!)#5#7D ''r:   )r;   r<   r=   r>   r?   r@   r	   sub_configsr   rY   r   rB   rL   rC   rA   rM   rO   rP   rQ   rR   rS   rT   r"   rE   r$   rD   r&   r4   rF   rG   s   @r9   rI   rI   e   s    & &O$J:K 7;OT,,t3:.249t+2*.D4K.OSOSJK,0cT)0J NE #u#( (r:   rI   c                       e Zd ZU dZdZdZeed<   dZee	e   z  e
eef   z  ed<   dZee	e   z  e
eef   z  ed<   dZeed	<   d
Zeed<   dZeed<   dZeed<   dZeed<   y)Sam2PromptEncoderConfigaY  
    mask_input_channels (`int`, *optional*, defaults to 16):
        The number of channels to be fed to the `MaskDecoder` module.
    num_point_embeddings (`int`, *optional*, defaults to 4):
        The number of point embeddings to be used.
    scale (`float`, *optional*, defaults to 1):
        The scale factor for the prompt encoder.
    prompt_encoder_configrN   r   r(   r      
patch_sizemask_input_channelsr*   num_point_embeddingsr!   r"   r#   r$   r   scaleN)r;   r<   r=   r>   r?   r   rA   rB   r   rC   tupler`   ra   rb   r"   rE   r$   rD   rc   r2   r:   r9   r]   r]      s     .OK48Jd3i%S/1846Jd3i%S/16!! !#!J NE E3Nr:   r]   c                       e Zd ZU dZdZdZeed<   dZe	ed<   dZ
eed<   d	Zeed
<   dZeed<   d	Zeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   y)Sam2MaskDecoderConfiga  
    mlp_dim (`int`, *optional*, defaults to 2048):
        The dimension of the MLP in the two-way transformer.
    attention_downsample_rate (`int`, *optional*, defaults to 2):
        The downsample rate for the attention layers.
    num_multimask_outputs (`int`, *optional*, defaults to 3):
        The number of multimask outputs.
    iou_head_depth (`int`, *optional*, defaults to 3):
        The depth of the IoU head.
    iou_head_hidden_dim (`int`, *optional*, defaults to 256):
        The hidden dimension of the IoU head.
    dynamic_multimask_via_stability (`bool`, *optional*, defaults to `True`):
        Whether to use dynamic multimask via stability.
    dynamic_multimask_stability_delta (`float`, *optional*, defaults to 0.05):
        The stability delta for the dynamic multimask.
    dynamic_multimask_stability_thresh (`float`, *optional*, defaults to 0.98):
        The stability threshold for the dynamic multimask.
    mask_decoder_configrN   r   r!   r"   i   mlp_dimr   num_hidden_layersr.   r   attention_downsample_rater   num_multimask_outputsiou_head_depthiou_head_hidden_dimTdynamic_multimask_via_stabilityg?!dynamic_multimask_stability_deltag\(\?"dynamic_multimask_stability_threshN)r;   r<   r=   r>   r?   r   rA   rB   r"   rE   rh   ri   r   rj   rk   rl   rm   rn   boolro   rD   rp   r2   r:   r9   rf   rf      s    & ,OKJGSs  %&s&!"3"NC"",0#T0/3%u304&4r:   rf   c                        e Zd ZU dZdZeeedZdZ	e
ez  dz  ed<   dZe
ez  dz  ed<   dZe
ez  dz  ed<   dZeed	<    fd
Z xZS )
Sam2Configag  
    prompt_encoder_config (Union[`dict`, `Sam2PromptEncoderConfig`], *optional*):
        Dictionary of configuration options used to initialize [`Sam2PromptEncoderConfig`].
    mask_decoder_config (Union[`dict`, `Sam2MaskDecoderConfig`], *optional*):
        Dictionary of configuration options used to initialize [`Sam2MaskDecoderConfig`].

    Example:

    ```python
    >>> from transformers import (
    ...     Sam2VisionConfig,
    ...     Sam2PromptEncoderConfig,
    ...     Sam2MaskDecoderConfig,
    ...     Sam2Model,
    ... )

    >>> # Initializing a Sam2Config with `"facebook/sam2.1_hiera_tiny"` style configuration
    >>> configuration = Sam2Config()

    >>> # Initializing a Sam2Model (with random weights) from the `"facebook/sam2.1_hiera_tiny"` style configuration
    >>> model = Sam2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a Sam2Config from a Sam2VisionConfig, Sam2PromptEncoderConfig, and Sam2MaskDecoderConfig

    >>> # Initializing SAM2 vision encoder, memory attention, and memory encoder configurations
    >>> vision_config = Sam2VisionConfig()
    >>> prompt_encoder_config = Sam2PromptEncoderConfig()
    >>> mask_decoder_config = Sam2MaskDecoderConfig()

    >>> config = Sam2Config(vision_config, prompt_encoder_config, mask_decoder_config)
    ```sam2)rJ   r^   rg   NrJ   r^   rg   r%   r&   c                 |   t        | j                  t              rT| j                  j                  dd      | j                  d<   t	        | j                  d      di | j                  | _        n| j                  t	        d          | _        t        | j
                  t              rt        di | j
                  | _        n| j
                  t               | _        t        | j                  t              rt        di | j                  | _        n| j                  t               | _        t        | (  di | y )Nr@   rK   r2   )rX   rJ   rY   rZ   r   r^   r]   rg   rf   r3   r4   r5   s     r9   r4   zSam2Config.__post_init__  s   d(($//3/A/A/E/ElTg/hD|,!/0B0B<0P!Q!gTXTfTf!gD'!/0C!D!FDd00$7)@)^4C]C])^D&''/)@)BD&d..5'<'Xt?W?W'XD$%%-'<'>D$''r:   )r;   r<   r=   r>   r@   r	   r]   rf   r[   rJ   rY   r   rB   r^   rg   r&   rD   r4   rF   rG   s   @r9   rs   rs      sx    !F J#!84K 59M4**T18<@4"22T9@:> 0047>#u#( (r:   rs   )rs   r   rI   r]   rf   N)r>   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   autor   r	   r   rI   r]   rf   rs   __all__r2   r:   r9   <module>r{      s     . 3 # - 78I() I(  9I(X 786(' 6(  96(r 78.   9, 78!5, !5  9!5H 78A(! A(  9A(Hr:   