
    i                         d Z ddlmZ ddlmZ ddlmZmZ  ej                  e	      Z
 ed      e G d d	e                    Z ed      e G d
 de                    Z ed      e G d de                    Zg dZy)zPix2Struct model configuration    )strict   )PreTrainedConfig)auto_docstringloggingzgoogle/pix2struct-base)
checkpointc                   l   e Zd ZU dZdZdgZddddddddZdZee	d	<   d
Z
ee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZeez  e	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZed z  e	d!<   d"Zeee   z  d z  e	d#<   d Zed z  e	d$<   dZee	d%<   d&Zee	d'<   dZ ee	d(<   y ))Pix2StructTextConfiga  
    relative_attention_num_buckets (`int`, *optional*, defaults to 32):
        The number of buckets to use for each attention layer.
    relative_attention_max_distance (`int`, *optional*, defaults to 128):
        The maximum distance of the longer sequences for the bucket separation.
    dense_act_fn (`Union[Callable, str]`, *optional*, defaults to `"gelu_new"`):
        The non-linear activation function (function or string).

    Example:

    ```python
    >>> from transformers import Pix2StructTextConfig, Pix2StructTextModel

    >>> # Initializing a Pix2StructTextConfig with google/pix2struct-base style configuration
    >>> configuration = Pix2StructTextConfig()

    >>> # Initializing a Pix2StructTextModel (with random weights) from the google/pix2struct-base style configuration
    >>> model = Pix2StructTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```pix2struct_text_modelpast_key_valueshidden_size	num_heads
num_layers)r   num_attention_headsnum_hidden_layersdecoder_attention_headsencoder_attention_headsencoder_layersdecoder_layersiD  
vocab_size   @   d_kv   d_ff       relative_attention_num_buckets   relative_attention_max_distanceg?dropout_rateư>layer_norm_epsilon      ?initializer_factorgelu_newdense_act_fnr   decoder_start_token_idF	use_cacheNpad_token_id   eos_token_idbos_token_idtie_word_embeddingsT
is_decoderadd_cross_attention)!__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapr   int__annotations__r   r   r   r   r   r   r    r!   floatr#   r%   r'   strr(   r)   boolr*   r,   listr-   r.   r/   r0        /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/pix2struct/configuration_pix2struct.pyr
   r
      s   . )J#4"5$*)#.#.&&M JKD#ND#JIs*,"C,+.#S. #L%#+# $$ ##"L#""#C#It L#* +,L#S	/D(,#L#*# %%J %%r?   r
   c                       e Zd ZU dZdZdZeed<   dZeed<   dZ	eed<   dZ
eed	<   d
Zeed<   d
Zeed<   dZeed<   dZeed<   dZeez  ed<   dZeez  ed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   y)Pix2StructVisionConfiga  
    patch_embed_hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the input patch_embedding layer in the Transformer encoder.
    d_ff (`int`, *optional*, defaults to 2048):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    d_kv (`int`, *optional*, defaults to 64):
        Dimensionality of the key, query, value projections per attention head.
    The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"gelu"` are supported.
    dense_act_fn (`Union[Callable, str]`, *optional*, defaults to `"gelu_new"`):
        The non-linear activation function (function or string).
    seq_len (`int`, *optional*, defaults to 4096):
        Maximum sequence length (here number of patches) supported by the model.
    relative_attention_num_buckets (`int`, *optional*, defaults to 32):
        The number of buckets to use for each attention layer.
    relative_attention_max_distance (`int`, *optional*, defaults to 128):
        The maximum distance (in tokens) to use for each attention layer.

    Example:

    ```python
    >>> from transformers import Pix2StructVisionConfig, Pix2StructVisionModel

    >>> # Initializing a Pix2StructVisionConfig with google/pix2struct-base style configuration
    >>> configuration = Pix2StructVisionConfig()

    >>> # Initializing a Pix2StructVisionModel (with random weights) from the google/pix2struct-base style configuration
    >>> model = Pix2StructVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```pix2struct_vision_modelr   r   patch_embed_hidden_sizer   r   r   r   r   r   r   r&   r'   r"   layer_norm_epsg        r!   attention_dropoutg|=initializer_ranger$   r%   i   seq_lenr   r   r   r    N)r1   r2   r3   r4   r5   r   r8   r9   rD   r   r   r   r   r'   r;   rE   r:   r!   rF   rG   r%   rH   r   r    r>   r?   r@   rB   rB   U   s    B +JK#&S&D#D#Ns!!"L#" NE  #L%#+#%(us{($u$ ##GS*,"C,+.#S.r?   rB   c                        e Zd ZU dZdZeedZdZe	e
z  dz  ed<   dZe	e
z  dz  ed<   dZeed<   d	Zeed
<   dZeed<   dZeed<   dZeed<    fdZ xZS )Pix2StructConfiga  
    is_vqa (`bool`, *optional*, defaults to `False`):
        Whether the model has been fine-tuned for VQA or not.

    Example:

    ```python
    >>> from transformers import Pix2StructConfig, Pix2StructForConditionalGeneration

    >>> # Initializing a Pix2StructConfig with google/pix2struct-base style configuration
    >>> configuration = Pix2StructConfig()

    >>> # Initializing a Pix2StructForConditionalGeneration (with random weights) from the google/pix2struct-base style configuration
    >>> model = Pix2StructForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a Pix2StructConfig from a Pix2StructTextConfig and a Pix2StructVisionConfig

    >>> # Initializing a Pix2Struct text and Pix2Struct vision configuration
    >>> config_text = Pix2StructTextConfig()
    >>> config_vision = Pix2StructVisionConfig()

    >>> config = Pix2StructConfig(text_config=config_text, vision_config=config_vision)
    ```
pix2struct)text_configvision_configNrL   rM   r$   r%   g{Gz?rG   Fis_vqar.   Tis_encoder_decoderc                 X   | j                   <t        | j                  | j                        | _         t        j                  d       nft        | j                   t              rL| j                  | j                   d<   | j                  | j                   d<   t        di | j                   | _         | j                  %t               | _        t        j                  d       n4t        | j                  t              rt        di | j                  | _        | j                   j                  | _
        | j                   j                  | _        | j                   j                  | _        | j                  | j                   _        | j                  | j                  _        t        | <  di | y )N)rO   r.   zU`text_config` is `None`. initializing the `Pix2StructTextConfig` with default values.rO   r.   zY`vision_config` is `None`. initializing the `Pix2StructVisionConfig` with default values.r>   )rL   r
   rO   r.   loggerinfo
isinstancedictrM   rB   r(   r*   r,   rG   super__post_init__)selfkwargs	__class__s     r@   rV   zPix2StructConfig.__post_init__   sO   #3#'#:#:$($<$< D KKop(($/595L5LD126:6N6ND233Gd6F6FGD%!7!9DKKst**D1!7!M$:L:L!MD&*&6&6&M&M# ,,99 ,,99-1-C-C*/3/E/E,''r?   )r1   r2   r3   r4   r5   r
   rB   sub_configsrL   rT   r   r9   rM   r%   r:   rG   rN   r<   r.   rO   rV   __classcell__)rY   s   @r@   rJ   rJ      s    6 J"6I_`K26K((4/648M4**T18 ###u#FD %%##( (r?   rJ   )rJ   r
   rB   N)r4   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r   
get_loggerr1   rQ   r
   rB   rJ   __all__r>   r?   r@   <module>ra      s    % . 3 , 
		H	% 347&+ 7&  57&t 342/- 2/  52/j 34@(' @(  5@(F Qr?   