
    iy*                        d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
mZ  ed	
      e G d de                    Z ed	
      e G d de                    Z ed	
      e G d de                    Z ed	
      e G d de                    Z ed	
      e G d de                    Z ed	
      e G d de                    Z ed	
      e G d de                    Zg dZy)zSAM3 model configuration    )strict)CLIPTextConfig   )PreTrainedConfig)auto_docstring   )CONFIG_MAPPING
AutoConfigzfacebook/sam3)
checkpointc                       e Zd ZU dZdZdZdZeed<   dZ	eed<   dZ
eed	<   d
Zeed<   dZeed<   dZeee   z  eeef   z  ed<   dZeee   z  eeef   z  ed<   dZeed<   dZeed<   dZeez  ed<   dZeed<   dZeed<   dZee   dz  ed<   dZedz  ed<   dZeee   z  eeef   z  ed <   dZeez  ed!<   d"Zeed#<    fd$Z xZS )%Sam3ViTConfiga8  
    rope_theta (`float`, *optional*, defaults to 10000.0):
        Base frequency for RoPE.
    window_size (`int`, *optional*, defaults to 24):
        Window size for windowed attention.
    global_attn_indexes (`list[int]`, *optional*, defaults to `[7, 15, 23, 31]`):
        Indexes of layers with global attention.
    pretrain_image_size (`int`, *optional*, defaults to 336):
        Pretrained model image size for position embedding initialization.
    hidden_dropout (`float`, *optional*, defaults to 0.0):
        Dropout probability for hidden states.
    backbone_configsam3_vit_model   hidden_sizei  intermediate_size    num_hidden_layers   num_attention_headsr   num_channelsi  
image_size   
patch_sizegelu
hidden_actư>layer_norm_eps        attention_dropoutg     @
rope_theta   window_sizeNglobal_attn_indexeslayer_scale_init_valueiP  pretrain_image_sizehidden_dropout{Gz?initializer_rangec                 P    t        |   di | | j                  
g d| _        y y )N)             )super__post_init__r$   selfkwargs	__class__s     |/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/sam3/configuration_sam3.pyr1   zSam3ViTConfig.__post_init__>   s,    ''##+'6D$ ,    )__name__
__module____qualname____doc__base_config_key
model_typer   int__annotations__r   r   r   r   r   listtupler   r   strr   floatr    r!   r#   r$   r%   r&   r'   r)   r1   __classcell__r5   s   @r6   r   r      s$    (O!JK!s!s!!L#48Jd3i%S/1846Jd3i%S/16J NE %(us{(JK,0cT)0+/EDL/=@tCy5c?:@"%NECK%#u#7 7r7   r   c                        e Zd ZU dZdZdZdeiZdZe	e
z  dz  ed<   dZeed<   dZedz  ed<   dZee   dz  ed	<   d
Zeed<   dZeed<   dZeed<    fdZed        Zej2                  d        Z xZS )Sam3VisionConfiga  
    fpn_hidden_size (`int`, *optional*, defaults to 256):
        The hidden dimension of the FPN.
    backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[288, 288], [144, 144], [72, 72]]`):
        The spatial sizes (height, width) of the feature maps from the backbone at different scales.
    scale_factors (`list[float]`, *optional*, defaults to `[4.0, 2.0, 1.0, 0.5]`):
        Scale factors for FPN multi-scale features. List of scaling factors for each FPN level.
    vision_configsam3_vision_modelr   N   fpn_hidden_sizebackbone_feature_sizesscale_factorsr   r   r   r   r(   r)   c                    | j                   g dn| j                   | _         | j                  ddgddgddgg| _        t        | j                  t              rT| j                  j                  dd      | j                  d<   t        | j                  d      di | j                  | _        n| j                  t        d          | _        t        |    di | y )N)g      @g       @g      ?g      ?i      H   r=   r   r/   )	rM   rL   
isinstancer   dictgetr	   r0   r1   r2   s     r6   r1   zSam3VisionConfig.__post_init__^   s    595G5G5O1UYUgUg&&.,/:SzB8*LD'd**D1151E1E1I1I,Xh1iD  .#1$2F2F|2T#U#mX\XlXl#mD !!)#12B#C#ED ''r7   c                 .    | j                   j                  S )z"Image size for the vision encoder.r   r   r3   s    r6   r   zSam3VisionConfig.image_sizek   s     ##...r7   c                 &    || j                   _        y)z-Set the image size and propagate to backbone.NrU   r3   values     r6   r   zSam3VisionConfig.image_sizep   s     +0'r7   )r8   r9   r:   r;   r<   r=   r
   sub_configsr   rR   r   r?   rK   r>   rL   r@   rM   rC   r   rB   r   r)   r1   propertyr   setterrD   rE   s   @r6   rG   rG   D   s     &O$J:K 7;OT,,t3:OS*.D4K.(,M4;%,J NE #u#( / / 0 0r7   rG   c                       e Zd ZU dZdZdZeed<   dZeed<   dZ	eed<   d	Z
eed
<   dZeez  ed<   dZeed<   dZeez  ed<   dZeed<   dZeed<   dZeed<   y)Sam3GeometryEncoderConfigzc
    roi_size (`int`, *optional*, defaults to 7):
        ROI size for box pooling operations.
    sam3_geometry_encoderrJ   r   r   
num_layers   r      r   皙?dropoutrelur   r   r'   r   r   r+   roi_sizer(   r)   N)r8   r9   r:   r;   r=   r   r>   r?   r`   r   r   rd   rC   r   rB   r'   r   rf   r)   r/   r7   r6   r^   r^   v   s}    
 )JKJ  !s!GUS[J"%NECK% NE Hc#u#r7   r^   c                       e Zd ZU dZdZdZeed<   dZeed<   dZ	eed<   d	Z
eed
<   dZeez  ed<   dZeed<   dZeez  ed<   dZeed<   dZeed<   y)Sam3DETREncoderConfigzo
    hidden_dropout (`float`, *optional*, defaults to 0.0):
        Dropout probability for hidden states.
    sam3_detr_encoderrJ   r      r`   ra   r   rb   r   rc   rd   re   r   r   r'   r   r   r(   r)   N)r8   r9   r:   r;   r=   r   r>   r?   r`   r   r   rd   rC   r   rB   r'   r   r)   r/   r7   r6   rh   rh      ss    
 %JKJ  !s!GUS[J"%NECK% NE #u#r7   rh   c                       e Zd ZU dZdZdZeed<   dZeed<   dZ	eed<   d	Z
eed
<   dZeed<   dZeez  ed<   dZeed<   dZeez  ed<   dZeed<   dZeed<   y)Sam3DETRDecoderConfigz]
    num_queries (`int`, *optional*, defaults to 200):
        Number of object queries.
    sam3_detr_decoderrJ   r   rj   r`      num_queriesra   r   rb   r   rc   rd   re   r   r   r'   r   r   r(   r)   N)r8   r9   r:   r;   r=   r   r>   r?   r`   ro   r   r   rd   rC   r   rB   r'   r   r)   r/   r7   r6   rl   rl      s}    
 %JKJK  !s!GUS[J"%NECK% NE #u#r7   rl   c                   p    e Zd ZU dZdZdZeed<   dZeed<   dZ	e
ed<   d	Ze
ez  ed
<   dZeed<   dZe
ed<   y)Sam3MaskDecoderConfigz
    num_upsampling_stages (`int`, *optional*, defaults to 3):
        Number of upsampling stages in the pixel decoder (FPN).
    sam3_mask_decoderrJ   r   r   num_upsampling_stagesr   r   r   rd   ra   r   r(   r)   N)r8   r9   r:   r;   r=   r   r>   r?   rs   r   rC   rd   r   r)   r/   r7   r6   rq   rq      sQ    
 %JK!"3" NE GUS[  #u#r7   rq   c                       e Zd ZU dZdZdZeeee	e
edZdZeez  dz  ed<   dZeez  dz  ed<   dZeez  dz  ed<   dZeez  dz  ed	<   dZeez  dz  ed
<   dZeez  dz  ed<   dZeed<    fdZed        Zej6                  d        Z xZS )
Sam3ConfigaD  
    geometry_encoder_config (`dict` or `Sam3GeometryEncoderConfig`, *optional*):
        Configuration for the geometry encoder.
    detr_encoder_config (`dict` or `Sam3DETREncoderConfig`, *optional*):
        Configuration for the DETR encoder.
    detr_decoder_config (`dict` or `Sam3DETRDecoderConfig`, *optional*):
        Configuration for the DETR decoder.
    mask_decoder_config (`dict` or `Sam3MaskDecoderConfig`, *optional*):
        Configuration for the mask decoder.

    Example:
    ```python
    >>> from transformers import Sam3Config, Sam3Model

    >>> # Initializing a SAM3 configuration
    >>> configuration = Sam3Config()

    >>> # Initializing a model from the configuration
    >>> model = Sam3Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    sam3T)rH   text_configgeometry_encoder_configdetr_encoder_configdetr_decoder_configmask_decoder_configNrH   rw   rx   ry   rz   r{   r(   r)   c                    | j                   t               | _         t        | j                   t              rt        d
i | j                   | _         | j                  t        d
i ddddddddd	| _        t        | j                  t              rt        d
i | j                  | _        | j                  t               | _        t        | j                  t              rt        d
i | j                  | _        | j                  t               | _        t        | j                  t              rt        d
i | j                  | _        | j                  t               | _
        t        | j                  t              rt        d
i | j                  | _
        | j                  t               | _        t        | j                  t              rt        d
i | j                  | _        t        | <  d
i | y )Ni   r   i   i   r"   r   r   r   )
vocab_sizer   r   projection_dimr   r   max_position_embeddingsr   r/   )rH   rG   rQ   rR   rw   r   rx   r^   ry   rh   rz   rl   r{   rq   r0   r1   r2   s     r6   r1   zSam3Config.__post_init__   s   %!1!3Dd(($/!1!GD4F4F!GD#-  "'#')-&))++-/1"(	 D d&&--A0@0@AD''/+D+FD(d22D9+D+dtGcGc+dD(##+'<'>D$d..5'<'Xt?W?W'XD$##+'<'>D$d..5'<'Xt?W?W'XD$##+'<'>D$d..5'<'Xt?W?W'XD$''r7   c                 .    | j                   j                  S )zImage size for the SAM3 model.rH   r   rV   s    r6   r   zSam3Config.image_size$  s     !!,,,r7   c                 &    || j                   _        y)z2Set the image size and propagate to vision config.Nr   rX   s     r6   r   zSam3Config.image_size)  s     ).%r7   )r8   r9   r:   r;   r=   is_compositionrG   r   r^   rh   rl   rq   rZ   rH   rR   r   r?   rw   rx   ry   rz   r{   r)   rC   r1   r[   r   r\   rD   rE   s   @r6   ru   ru      s    2 JN)%#<444K 59M4**T1826K((4/6>BT$44t;B:> 0047>:> 0047>:> 0047>#u#*(X - - . .r7   ru   )ru   r   rG   r^   rh   rl   rq   N)r;   huggingface_hub.dataclassesr   transformersr   configuration_utilsr   utilsr   autor	   r
   r   rG   r^   rh   rl   rq   ru   __all__r/   r7   r6   <module>r      sE    . ' 3 # - ?+&7$ &7  ,&7R ?+-0' -0  ,-0` ?+$ 0 $  ,$( ?+$, $  ,$& ?+$, $  ,$( ?+$, $  ,$  ?+a.! a.  ,a.Hr7   