
    i                         d dl mZ ddlmZ ddlmZ ddlmZmZ ddl	m
Z
mZ  ej                  e      Z ed	      e G d
 de                    Z ed	      e G d de                    Z ed	      e G d de                    Zg dZy)    )strict   )PreTrainedConfig)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)auto_docstringlogging   )CONFIG_MAPPING
AutoConfigz"Salesforce/instructblip-flan-t5-xl)
checkpointc                       e Zd ZU dZdZdZdZeed<   dZ	eed<   dZ
eed	<   d
Zeed<   dZeee   z  eeef   z  ed<   dZeee   z  eeef   z  ed<   dZeed<   dZeed<   dZeez  ed<   dZeed<   dZeed<   y)InstructBlipVideoVisionConfigaH  
    Example:

    ```python
    >>> from transformers import InstructBlipVideoVisionConfig, InstructBlipVideoVisionModel

    >>> # Initializing a InstructBlipVideoVisionConfig with Salesforce/instructblip-flan-t5-xl style configuration
    >>> configuration = InstructBlipVideoVisionConfig()

    >>> # Initializing a InstructBlipVideoVisionModel (with random weights) from the Salesforce/instructblip-flan-t5-xl style configuration
    >>> model = InstructBlipVideoVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```instructblipvideo_vision_modelvision_config  hidden_sizei   intermediate_size'   num_hidden_layers   num_attention_heads   
image_size   
patch_sizegelu
hidden_actgư>layer_norm_epsg        attention_dropoutg|=initializer_rangeTqkv_biasN)__name__
__module____qualname____doc__
model_typebase_config_keyr   int__annotations__r   r   r   r   listtupler   r   strr   floatr   r    r!   bool     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/configuration_instructblipvideo.pyr   r   !   s      2J%OK!s!s!!47Jd3i%S/1746Jd3i%S/16J NE %(us{($u$Hdr0   r   c                       e Zd ZU dZdZdZdZeed<   dZ	eed<   dZ
eed	<   dZeed
<   dZeed<   dZeed<   dZeez  ed<   dZeez  ed<   dZeed<   dZeed<   dZeed<   dZedz  ed<   dZeed<   dZeed<   y)InstructBlipVideoQFormerConfiga3  
    cross_attention_frequency (`int`, *optional*, defaults to 2):
        The frequency of adding cross-attention to the Transformer layers.
    encoder_hidden_size (`int`, *optional*, defaults to 1408):
        The hidden size of the hidden states for cross-attention.

    Examples:

    ```python
    >>> from transformers import InstructBlipVideoQFormerConfig, InstructBlipVideoQFormerModel

    >>> # Initializing a InstructBlipVideo Salesforce/instructblip-flan-t5-xl style configuration
    >>> configuration = InstructBlipVideoQFormerConfig()

    >>> # Initializing a model (with random weights) from the Salesforce/instructblip-flan-t5-xl style configuration
    >>> model = InstructBlipVideoQFormerModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```instructblipvideo_qformerqformer_configi:w  
vocab_sizei   r      r   r   i   r   r   r   g?hidden_dropout_probattention_probs_dropout_probi   max_position_embeddings{Gz?r    g-q=r   r   Npad_token_idr	   cross_attention_frequencyr   encoder_hidden_size)r"   r#   r$   r%   r&   r'   r6   r(   r)   r   r   r   r   r   r,   r8   r-   r9   r:   r    r   r<   r=   r>   r/   r0   r1   r3   r3   D   s    ( -J&OJKs!!!s!J'**03 %#+3#&S&#u#!NE! L#* %&s&##r0   r3   c                        e Zd ZU dZdZddiZeeedZ	dZ
eez  dz  ed<   dZeez  dz  ed<   dZeez  dz  ed	<   d
Zeed<   dZeed<   dZeed<   dZedz  ed<    fdZ xZS )InstructBlipVideoConfiga  
    qformer_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
    num_query_tokens (`int`, *optional*, defaults to 32):
        The number of query tokens passed through the Transformer.

    Example:

    ```python
    >>> from transformers import (
    ...     InstructBlipVideoVisionConfig,
    ...     InstructBlipVideoQFormerConfig,
    ...     OPTConfig,
    ...     InstructBlipVideoConfig,
    ...     InstructBlipVideoForConditionalGeneration,
    ... )

    >>> # Initializing a InstructBlipVideoConfig with Salesforce/instructblip-flan-t5-xl style configuration
    >>> configuration = InstructBlipVideoConfig()

    >>> # Initializing a InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instructblip-flan-t5-xl style configuration
    >>> model = InstructBlipVideoForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a InstructBlipVideoConfig from a InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PreTrainedConfig

    >>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations
    >>> vision_config = InstructBlipVideoVisionConfig()
    >>> qformer_config = InstructBlipVideoQFormerConfig()
    >>> text_config = OPTConfig()

    >>> config = InstructBlipVideoConfig(vision_config=vision_config, qformer_config=qformer_config, text_config=text_config)
    ```instructblipvideovideo_token_idvideo_token_index)text_configr5   r   Nr   r5   rD       num_query_tokensg      ?initializer_factorr;   r    c                 R   | j                   (t        d          | _         t        j                  d       nSt	        | j                   t
              r9| j                   j                  dd      }t        |   di | j                   | _         | j                  %t               | _        t        j                  d       n4t	        | j                  t
              rt        di | j                  | _        | j                  %t               | _	        t        j                  d       n4t	        | j                  t
              rt        di | j                  | _	        | j                  j                  | j                  _        | j                   j                  t        v | _        t!        | D  di | y )NoptzTtext_config is None. Initializing the text config with default values (`OPTConfig`).r&   z\qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.z``vision_config` is `None`. initializing the `InstructBlipVideoVisionConfig` with default values.r/   )rD   r
   loggerinfo
isinstancedictgetr5   r3   r   r   r   r>   r&   r   use_decoder_only_language_modelsuper__post_init__)selfkwargstext_model_type	__class__s      r1   rQ   z%InstructBlipVideoConfig.__post_init__   sK   #-e46DKKno(($/"..22<GO-o>RAQAQRD&"@"BDKKvw++T2"@"W4CVCV"WD%!>!@DKKr **D1!>!TASAS!TD262D2D2P2P//3/?/?/J/JNo/o,''r0   )r"   r#   r$   r%   r&   attribute_mapr   r3   r   sub_configsr   rM   r   r)   r5   rD   rF   r(   rG   r-   r    rC   rQ   __classcell__)rU   s   @r1   r@   r@   n   s    "H %J%':;M!86K 59M4**T1859ND++d2926K((4/6c ###u#$(sTz(( (r0   r@   )r@   r3   r   N)huggingface_hub.dataclassesr   configuration_utilsr   models.auto.modeling_autor   utilsr   r   autor
   r   
get_loggerr"   rJ   r   r3   r@   __all__r/   r0   r1   <module>r`      s   , / 3 J , - 
		H	% ?@$4   AB ?@%$%5 %$  A%$P ?@N(. N(  AN(b ir0   