
    ij                     j    d Z ddlmZ ddlmZ ddlmZ  ed      e G d d	e                    Zd	gZy
)zPerceiver model configuration    )strict   )PreTrainedConfig)auto_docstringzdeepmind/language-perceiver)
checkpointc                      e Zd ZU dZdZdZeed<   dZeed<   dZ	eed<   d	Z
eed
<   dZeed<   dZeed<   dZeed<   dZedz  ed<   dZedz  ed<   dZeed<   d	Zeed<   d	Zeed<   dZeed<   dZeez  ed<   dZeed<   dZeed<   dZeed <   d!Zeed"<   d#Zeed$<   d%Zeee   z  eeef   z  ed&<   d'Z ee   eed(f   z  ed)<   d*Z!eed+<   d,Z"eed-<   d*Z#eed.<   d/Z$ee   eed(f   z  ed0<   d1Z%eed2<   d3Z&eed4<   y)5PerceiverConfiga  
    num_latents (`int`, *optional*, defaults to 256):
        The number of latents.
    d_latents (`int`, *optional*, defaults to 1280):
        Dimension of the latent embeddings.
    num_blocks (`int`, *optional*, defaults to 1):
        Number of blocks in the Transformer encoder.
    num_self_attends_per_block (`int`, *optional*, defaults to 26):
        The number of self-attention layers per block.
    num_self_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each self-attention layer in the Transformer encoder.
    num_cross_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each cross-attention layer in the Transformer encoder.
    qk_channels (`int`, *optional*):
        Dimension to project the queries + keys before applying attention in the cross-attention and self-attention
        layers of the encoder. Will default to preserving the dimension of the queries if not specified.
    v_channels (`int`, *optional*):
        Dimension to project the values before applying attention in the cross-attention and self-attention layers
        of the encoder. Will default to preserving the dimension of the queries if not specified.
    cross_attention_shape_for_attention (`str`, *optional*, defaults to `"kv"`):
        Dimension to use when downsampling the queries and keys in the cross-attention layer of the encoder.
    self_attention_widening_factor (`int`, *optional*, defaults to 1):
        Dimension of the feed-forward layer in the cross-attention layer of the Transformer encoder.
    cross_attention_widening_factor (`int`, *optional*, defaults to 1):
        Dimension of the feed-forward layer in the self-attention layers of the Transformer encoder.
    use_query_residual (`float`, *optional*, defaults to `True`):
        Whether to add a query residual in the cross-attention layer of the encoder.
    image_size (`int`, *optional*, defaults to 56):
        Size of the images after preprocessing, for [`PerceiverForImageClassificationLearned`].
    train_size (`list[int]`, *optional*, defaults to `[368, 496]`):
        Training size of the images for the optical flow model.
    num_frames (`int`, *optional*, defaults to 16):
        Number of video frames used for the multimodal autoencoding model.
    audio_samples_per_frame (`int`, *optional*, defaults to 1920):
        Number of audio samples per frame for the multimodal autoencoding model.
    samples_per_patch (`int`, *optional*, defaults to 16):
        Number of audio samples per patch when preprocessing the audio for the multimodal autoencoding model.
    output_shape (`list[int]`, *optional*, defaults to `[1, 16, 224, 224]`):
        Shape of the output (batch_size, num_frames, height, width) for the video decoder queries of the multimodal
        autoencoding model. This excludes the channel dimension.
    output_num_channels (`int`, *optional*, defaults to 512):
        Number of output channels for each modalitiy decoder.

    Example:

    ```python
    >>> from transformers import PerceiverModel, PerceiverConfig

    >>> # Initializing a Perceiver deepmind/language-perceiver style configuration
    >>> configuration = PerceiverConfig()

    >>> # Initializing a model from the deepmind/language-perceiver style configuration
    >>> model = PerceiverModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```	perceiver   num_latentsi   	d_latentsi   d_model   
num_blocks   num_self_attends_per_block   num_self_attention_headsnum_cross_attention_headsNqk_channels
v_channelskv#cross_attention_shape_for_attentionself_attention_widening_factorcross_attention_widening_factorgelu
hidden_actg?attention_probs_dropout_probg{Gz?initializer_rangeg-q=layer_norm_epsTuse_query_residuali  
vocab_sizei   max_position_embeddings8   
image_size)ip  i  .
train_size   
num_framesi  audio_samples_per_framesamples_per_patch)r   r'      r+   output_shapei   output_num_channelsi   _label_trainable_num_channels)'__name__
__module____qualname____doc__
model_typer   int__annotations__r   r   r   r   r   r   r   r   r   strr   r   r   r   floatr   r    r!   boolr"   r#   r%   listtupler&   r(   r)   r*   r,   r-   r.        /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/perceiver/configuration_perceiver.pyr	   r	      sn   8t JKIsGSJ&(($%c%%&s&"Kt"!Jd
!/3'3*+"C++,#S,J03 %#+3#u#!NE!##J#'S'46Jd3i%S/16.8JS	E#s(O+8J#'S's0AL$s)eCHo-A"")-!3-r<   r	   N)	r2   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r	   __all__r;   r<   r=   <module>rB      sK    $ . 3 # 89W.& W.  :W.t 
r<   