
    i&                        d Z ddlmZ ddlmZ ddlmZmZ  ej                  e	      Z
 ed      e G d d	e                    Z ed      e G d
 de                    Z ed      e G d de                    Zg dZy)zGroupViT model configuration    )strict   )PreTrainedConfig)auto_docstringloggingznvidia/groupvit-gcc-yfcc)
checkpointc                      e Zd ZU dZdZdZdZeed<   dZ	eed<   dZ
eed	<   d
Zeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeez  ed<   dZeez  ed<   dZeed<   dZeed<   dZedz  ed<   dZedz  ed<   d Zeee   z  dz  ed!<   y)"GroupViTTextConfigaz  
    Example:

    ```python
    >>> from transformers import GroupViTTextConfig, GroupViTTextModel

    >>> # Initializing a GroupViTTextModel with nvidia/groupvit-gcc-yfcc style configuration
    >>> configuration = GroupViTTextConfig()

    >>> model = GroupViTTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```groupvit_text_modeltext_configi   
vocab_size   hidden_sizei   intermediate_size   num_hidden_layers   num_attention_headsM   max_position_embeddings
quick_gelu
hidden_acth㈵>layer_norm_eps        dropoutattention_dropout{Gz?initializer_range      ?initializer_factor   Npad_token_idi  bos_token_idi  eos_token_id)__name__
__module____qualname____doc__
model_typebase_config_keyr   int__annotations__r   r   r   r   r   r   strr   floatr   r   r   r!   r#   r$   r%   list     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/groupvit/configuration_groupvit.pyr
   r
      s     'J#OJK!s!s  #%S%"J" NE GUS[%(us{(#u# ## L#* $L#*$+0L#S	/D(0r2   r
   c                      e Zd ZU dZdZdZdZeed<   dZ	eed<   dZ
eed	<   d
Zee   eedf   z  ed<   dZee   eedf   z  ed<   dZee   eedf   z  ed<   dZeed<   dZeee   z  eeef   z  ed<   dZeee   z  eeef   z  ed<   dZeed<   dZeed<   dZeed<   dZeez  ed<   dZeez  ed<   d Zeed!<   d"Zeed#<   d"Zeed$<   d%Zeeez     eeez  df   z  ed&<   d' Zy())GroupViTVisionConfiga  
    depths (`list[int]`, *optional*, defaults to [6, 3, 3]):
        The number of layers in each encoder block.
    num_group_tokens (`list[int]`, *optional*, defaults to [64, 8, 0]):
        The number of group tokens for each stage.
    num_output_groups (`list[int]`, *optional*, defaults to [64, 8, 8]):
        The number of output groups for each stage, 0 means no group.
    assign_eps (`float`, *optional*, defaults to `1.0`):
        Epsilon used in layer norm
    assign_mlp_ratio (`list[int]`, *optional*, defaults to `[0.5, 4]`):
        Ratio used to infer hidden size of MLP layers.

    Example:

    ```python
    >>> from transformers import GroupViTVisionConfig, GroupViTVisionModel

    >>> # Initializing a GroupViTVisionModel with nvidia/groupvit-gcc-yfcc style configuration
    >>> configuration = GroupViTVisionConfig()

    >>> model = GroupViTVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```groupvit_vision_modelvision_configi  r   i   r   r   r   )   r   r   .depths)@      r   num_group_tokens)r:   r;   r;   num_output_groupsr8   r      
image_size   
patch_sizer   num_channelsgelur   r   r   r   r   r   r   r   r    r!   
assign_eps)g      ?r   assign_mlp_ratioc                     | j                   t        | j                        k7  r9t        j	                  d| j                    dt        | j                                yy)zOPart of `@strict`-powered validation. Validates the architecture of the config.z&Manually setting num_hidden_layers to z1, but we expect num_hidden_layers = sum(depth) = N)r   sumr9   loggerwarning)selfs    r3   validate_architecturez*GroupViTVisionConfig.validate_architecturer   sR    !!S%55NN89O9O8P Q!!$T[[!1 24 6r2   N)r&   r'   r(   r)   r*   r+   r   r,   r-   r   r   r9   r0   tupler<   r=   r   r?   rA   rB   r   r.   r   r/   r   r   r   r!   rD   rE   rK   r1   r2   r3   r5   r5   ?   sV   4 )J%OK!s!s*3FDIc3h'34>d3i%S/1>5?tCy5c?2?  47Jd3i%S/1746Jd3i%S/16L#J NE GUS[%(us{(#u# ##JDLd53;'%S0@*AALr2   r5   c                        e Zd ZU dZdZeedZdZe	e
z  dz  ed<   dZe	e
z  dz  ed<   dZeed<   d	Zeed
<   dZeed<   dZeed<   dZeed<   dZeed<    fdZ xZS )GroupViTConfiga  
    projection_intermediate_dim (`int`, *optional*, defaults to 4096):
        Dimensionality of intermediate layer of text and vision projection layers.
    output_segmentation (`bool`, *optional*, defaults to False):
        Whether or not to return the segmentation logits.
    groupvit)r   r7   Nr   r7   r   projection_dimi   projection_intermediate_dimg/L
F@logit_scale_init_valuer   r   r    r!   Foutput_segmentationc                    | j                   i }t        j                  d       nAt        | j                   t              r| j                   j                         }n| j                   }| j                  i }t        j                  d       nAt        | j                  t              r| j                  j                         }n| j                  }|j                  dd       }|j                  dd       }|t	        di |j                         }|j                         D ]B  \  }}||v s|||   k7  s|dk7  s||v r
d| d| d}	nd	| d
}	t        j                  |	       D |j                  |       |t        di |j                         }
d|
v r3|
d   j                         D ci c]  \  }}t        |      | c}}|
d<   |
j                         D ]B  \  }}||v s|||   k7  s|dk7  s||v r
d| d| d}	nd| d
}	t        j                  |	       D |j                  |
       t	        di || _         t        di || _        t        | 4  di | y c c}}w )NzS`text_config` is `None`. Initializing the `GroupViTTextConfig` with default values.zW`vision_config` is `None`. initializing the `GroupViTVisionConfig` with default values.text_config_dictvision_config_dicttransformers_version`zp` is found in both `text_config_dict` and `text_config` but with different values. The value `text_config_dict["z"]` will be used instead.zn`text_config_dict` is provided which will be used to initialize `GroupViTTextConfig`. The value `text_config["z"]` will be overridden.id2labelzv` is found in both `vision_config_dict` and `vision_config` but with different values. The value `vision_config_dict["zt`vision_config_dict` is provided which will be used to initialize `GroupViTVisionConfig`. The value `vision_config["r1   )r   rH   info
isinstancer
   to_dictr7   r5   popitemsupdater.   super__post_init__)rJ   kwargsr   r7   rU   rV   _text_config_dictkeyvaluemessage_vision_config_dict	__class__s              r3   ra   zGroupViTConfig.__post_init__   s   #KKKmn((*<=**224K**K%MKKqr**,@A ..668M ..M "::&8$?#ZZ(<dC' 2 F5E F N N P 0557 )
U+%%;s3C*COeHe..u %<<?5@Y[  336%7NP   KK()" 01)"6"L9K"L"T"T"V006I*6U6[6[6]3(2UCHeO3#J/
 2779 )
U-'E]35G,GCSiLi00u %FFIUJce  99<=TV   KK()"   !45 .<<1BMB''93s   &I)r&   r'   r(   r)   r*   r
   r5   sub_configsr   dictr   r-   r7   rP   r,   rQ   rR   r/   r   r!   rS   boolra   __classcell__)rh   s   @r3   rN   rN   {   s     J"4G[\K26K((4/648M4**T18NC'++$*E*#u# ## %%Q( Q(r2   rN   )rN   r
   r5   N)r)   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r   
get_loggerr&   rH   r
   r5   rN   __all__r1   r2   r3   <module>rr      s    # . 3 , 
		H	% 56!1) !1  7!1H 567+ 7  77t 56e(% e(  7e(P Kr2   