
    im_                        d dl Z d dlmZ d dlmZmZmZ d dlmZm	Z	m
Z
mZmZmZmZmZ ddlmZ ddlmZ ddlmZmZ  ed	
      e G d de                    Z ed	
      e G d de                    Z ed	
      e G d de                    Z G d de      Z G d de      Z G d de      Z G d de
      Z G d de      Z G d de	      Zg dZ y)    N)strict)InstructBlipConfigInstructBlipQFormerConfigInstructBlipVisionConfig)'BaseModelOutputWithVisionQformerOutputs$InstructBlipForConditionalGeneration/InstructBlipForConditionalGenerationModelOutputInstructBlipModelInstructBlipPreTrainedModelInstructBlipQFormerModelInstructBlipVisionModelTransformersKwargs   )BaseModelOutputWithPooling)Unpack)auto_docstringcan_return_tuplez"Salesforce/instructblip-flan-t5-xl)
checkpointc                       e Zd ZdZy)InstructBlipVideoVisionConfigaH  
    Example:

    ```python
    >>> from transformers import InstructBlipVideoVisionConfig, InstructBlipVideoVisionModel

    >>> # Initializing a InstructBlipVideoVisionConfig with Salesforce/instructblip-flan-t5-xl style configuration
    >>> configuration = InstructBlipVideoVisionConfig()

    >>> # Initializing a InstructBlipVideoVisionModel (with random weights) from the Salesforce/instructblip-flan-t5-xl style configuration
    >>> model = InstructBlipVideoVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```N__name__
__module____qualname____doc__     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/modular_instructblipvideo.pyr   r   (   s    r   r   c                       e Zd ZdZy)InstructBlipVideoQFormerConfiga3  
    cross_attention_frequency (`int`, *optional*, defaults to 2):
        The frequency of adding cross-attention to the Transformer layers.
    encoder_hidden_size (`int`, *optional*, defaults to 1408):
        The hidden size of the hidden states for cross-attention.

    Examples:

    ```python
    >>> from transformers import InstructBlipVideoQFormerConfig, InstructBlipVideoQFormerModel

    >>> # Initializing a InstructBlipVideo Salesforce/instructblip-flan-t5-xl style configuration
    >>> configuration = InstructBlipVideoQFormerConfig()

    >>> # Initializing a model (with random weights) from the Salesforce/instructblip-flan-t5-xl style configuration
    >>> model = InstructBlipVideoQFormerModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```Nr   r   r   r   r    r    <   s    r   r    c                   <    e Zd ZU dZddiZdZedz  ed<    e       Z	y)InstructBlipVideoConfiga  
    qformer_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
    num_query_tokens (`int`, *optional*, defaults to 32):
        The number of query tokens passed through the Transformer.

    Example:

    ```python
    >>> from transformers import (
    ...     InstructBlipVideoVisionConfig,
    ...     InstructBlipVideoQFormerConfig,
    ...     OPTConfig,
    ...     InstructBlipVideoConfig,
    ...     InstructBlipVideoForConditionalGeneration,
    ... )

    >>> # Initializing a InstructBlipVideoConfig with Salesforce/instructblip-flan-t5-xl style configuration
    >>> configuration = InstructBlipVideoConfig()

    >>> # Initializing a InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instructblip-flan-t5-xl style configuration
    >>> model = InstructBlipVideoForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a InstructBlipVideoConfig from a InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PreTrainedConfig

    >>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations
    >>> vision_config = InstructBlipVideoVisionConfig()
    >>> qformer_config = InstructBlipVideoQFormerConfig()
    >>> text_config = OPTConfig()

    >>> config = InstructBlipVideoConfig(vision_config=vision_config, qformer_config=qformer_config, text_config=text_config)
    ```video_token_idvideo_token_indexN)
r   r   r   r   attribute_mapr$   int__annotations__AttributeErrorimage_token_indexr   r   r   r"   r"   T   s-    "H &':;M$(sTz(&(r   r"   c                       e Zd ZdZy) InstructBlipVideoPreTrainedModel)videotextNr   r   r   input_modalitiesr   r   r   r+   r+      s    (r   r+   c                       e Zd ZdZy)InstructBlipVideoVisionModelr,   Nr.   r   r   r   r1   r1      s    r   r1   c                       e Zd Zy)InstructBlipVideoQFormerModelNr   r   r   r   r   r   r3   r3          r   r3   c                       e Zd Zy)4InstructBlipVideoForConditionalGenerationModelOutputNr4   r   r   r   r7   r7      r5   r   r7   c                   @   e Zd Zee	 	 	 	 	 	 	 	 ddej                  dej                  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e	de	dz  de
e   deez  fd              Zy)InstructBlipVideoModelNpixel_valuesqformer_input_idsqformer_attention_mask	input_idsattention_maskdecoder_input_idsdecoder_attention_maskinputs_embedsinterpolate_pos_encoding	use_cachekwargsreturnc           	      x   |j                   \  }}}}}|j                  ||z  |||      } | j                  d||	d|}|d   }t        j                  |j                         d d t        j                  |j                        }| j                  j                  |j                   d   dd      }t        j                  |j                         d d t        j                  |j                        }|t        j                  |      }|j                  |d      }|j                  |d      }t        j                  ||gd      } | j                  d|||||d|}|d   d d d |j                  d      d d f   }| j                  |      }|j                  || j                  j                   |z  d      }|Q | j"                  j%                         |      }|| j                  j&                  k(  }|t        j                  |      }nl| | j%                         t        j(                  | j                  j&                  t        j                  |j                              k(  }|j+                  d      }|j-                  d      j/                  |      j1                  |j                        }|j1                  |j                  |j2                        }|j5                  ||      }| j                  j6                  r | j"                  d|||
d|}n | j"                  d|||||
d	|}t9        |||
      S )Nr:   rB   r   dtypedevicedim   r=   r>   query_embedsencoder_hidden_statesencoder_attention_maskrA   r>   rC   )rA   r>   r?   r@   rC   )vision_outputsqformer_outputslanguage_model_outputsr   )shapereshapevision_modeltorchonessizelongrK   query_tokensexpand	ones_likerepeat_interleavecatqformerlanguage_projectionconfignum_query_tokenslanguage_modelget_input_embeddingsr#   tensorall	unsqueeze	expand_astorJ   masked_scatteruse_decoder_only_language_modelr7   )selfr:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   
batch_sizeframeschannelheightwidthrT   image_embedsimage_attention_maskr^   query_attention_maskquery_outputsquery_outputlanguage_model_inputsspecial_image_maskoutputss                              r   forwardzInstructBlipVideoModel.forward   s`   $ 6B5G5G2
FGVU#++J,?&RWX*** 
%%=
 

 &a(  %zz,*;*;*=cr*B%**]i]p]pq ((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"-??A?N!7!I!I&VW!I!X!&,@BX+Y_`!a$ 
'1%".#7
 
 %Q'+A\->->q-A+A1(DE !% 8 8 F !6 = =j$++JfJfioJoqs t FD//DDFyQM!*dkk.H.H!H%!&!;!.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;/99"=GGVYYZgZnZno 5 8 89M9M}ObOb c%445GI^_;;66)d)) +-# 	G *d)) +-"3'=# G D))#*
 	
r   )NNNNNNFN)r   r   r   r   r   rZ   FloatTensor
LongTensorTensorboolr   r   tupler7   r~   r   r   r   r9   r9      s   
 ;?.22659:>-1).!%Z
''Z
 !,,Z
 !& 0 04 7	Z

 $$t+Z
 ((4/Z
 !++d2Z
 !& 0 04 7Z
 ||d*Z
 #'Z
 $;Z
 +,Z
 
E	EZ
  Z
r   r9   c                   4   e Zd Zee	 	 ddej                  dej                  dej                  dz  dedz  de	e
   deez  fd              Zd	 Zd
ej                  dej                  fdZee	 	 	 	 	 	 	 	 	 ddej                  dej                  dej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dededz  de	e
   deez  fd              Z ej$                         	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dedej                  fd       Zy))InstructBlipVideoForConditionalGenerationNr:   r;   r<   rB   rD   rE   c           	         |j                   \  }}}}	}
|j                  ||z  ||	|
      } | j                  d
||d|}t        |j                  |j
                  |j                  |j                  |d      }|d   }t        j                  |j                         dd t        j                  |j                        }| j                  j                  |j                   d   dd      }t        j                  |j                         dd t        j                  |j                        }|t        j                  |      }|j!                  |d      }|j!                  |d      }t        j"                  ||gd      } | j$                  d
|||||d	|}||_        |d   ddd|j                  d      ddf   }| j)                  |      }|j                  || j*                  j,                  |z  d      }||_        |S )a  
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.
        rG   N)last_hidden_statepooler_outputhidden_states
attentionsrT   rU   r   rH   rI   rL   rN   rO   r   )rW   rX   rY   r   r   r   r   r   rZ   r[   r\   r]   rK   r^   r_   r`   ra   rb   rc   rU   rd   re   rf   )rp   r:   r;   r<   rB   rD   rq   rr   rs   rt   ru   rT   rv   rw   r^   rx   rU   rz   video_featuress                      r   get_video_featuresz<InstructBlipVideoForConditionalGeneration.get_video_features   s(   ( 6B5G5G2
FGVU#++J,?&RWX5FT5F5F 6
%%=6
 6

 A,>>(66(66%00) 
 &a(  %zz,*;*;*=cr*B%**]i]p]pq ((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"-??A?N!7!I!I&VW!I!X!&,@BX+Y_`!a&$,, 
'1%".#7
 
 *9&&q)!-C|/@/@/C-CQ*FG 11,? (//
DKK<X<X[a<acef'5$r   c                      t        d      )Nz=No need to inherit as this architecture only supports videos.)r(   )super_kwargss    r   get_image_featuresz<InstructBlipVideoForConditionalGeneration.get_image_features8  s    \]]r   r=   rA   c                    |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                  d      j                  |      j                  |j                        }|S )zZ
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        rI   rH   )rh   rZ   ri   re   r#   r]   rK   rj   rk   rl   rm   )rp   r=   rA   r|   s       r   get_placeholder_maskz>InstructBlipVideoForConditionalGeneration.get_placeholder_mask;  s     !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H/99"=GGVYYZgZnZno!!r   r>   r?   r@   labelsrC   c           
          | j                   |f|||
d|}|j                  }|j                  }|j                  }| | j	                         |      }|t        j                  |      }|j                  |j                  |j                        }| j                  ||      }|j                  ||      }| j                  j                  rT | j                  d	|||d|}|d   }d}|	f | j                  d	||	| j                  j                   j"                  d|}n1 | j                  d	|||||	|d|}|j$                  }|j&                  }t)        |||||      S )
a  
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.

        Examples:

        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
        >>> import torch
        >>> from huggingface_hub import hf_hub_download
        >>> import av
        >>> import numpy as np

        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> file_path = hf_hub_download(
        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample uniformly 4 frames from the videWhy is this video funny?o
        >>> total_frames = container.streams.video[0].frames
        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
        >>> clip = read_video_pyav(container, indices)

        >>> prompt = "What is happening in the video?"
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
        ```r;   r<   rB   NrA   rS   r   )logitsr   
vocab_size)rA   r>   r?   r@   r   rC   )lossr   rT   rU   rV   r   )r   r   rU   rT   rh   rZ   r`   rm   rK   rJ   r   rn   re   ro   rg   loss_functiontext_configr   r   r   r7   )rp   r:   r;   r<   r=   r>   r?   r@   rA   r   rB   rC   rD   r   r{   rU   rT   r|   r}   r   r   s                        r   r~   z1InstructBlipVideoForConditionalGeneration.forwardJ  s   ` CZ$BYBYC
/#9%=	C

 C
 !/ < <(88'66 7D557	BM!"__Y7N 5 8 89M9M}ObOb c!66yP]6^%445GI^_;;66)d)) +-# 	G QZFD!)t)) !&T[[=T=T=_=_ci
 *d)) +-"3'=# G <<D^^FC)+#*
 	
r   c                 V   t        | d      r| j                          |j                  d   }	| j                  ||||      }
|
j                  }||| j
                  j                  g| j
                  j                  z  dz  }|| j
                  j                  j                  gz   }t        j                  |gt        j                  |j                        }|j                  |	d      } | j                         |      }|t        j                   |      }|j#                  |j                  |j$                        }| j'                  ||      }|j)                  ||      }||d}| j*                  j
                  j,                  s||d	<    | j*                  j.                  d
i ||}|S )a  
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        hf_device_mapr   r      rI   rN   r   )rA   r>   r=   r   )hasattr_preprocess_acceleraterW   r   r   re   r$   rf   r   bos_token_idrZ   ri   r]   rK   repeatrh   r`   rm   rJ   r   rn   rg   is_encoder_decodergenerate)rp   r:   r;   r<   r=   r>   rA   rB   generate_kwargsrq   r   r{   video_tokensstart_tokensr|   inputsr}   s                    r   r   z2InstructBlipVideoForConditionalGeneration.generate  s   D 4)'')!''*
BFBYBY/#9%=	 CZ C
 !/ < <   $ = =>A]A]]`aa+t{{/F/F/S/S.TT!LL,uzzR^ReRef	%,,Z;	7D557	BM!"__Y7N 5 8 89M9M}ObOb c!66yP]6^%445GI^_#0NS""))<<"+F;.$%%..KK?Kr   )NF)	NNNNNNNFN)NNNNNF)r   r   r   r   r   rZ   r   r   r   r   r   r   r   r   r   r   r7   r~   no_gradr   r   r   r   r   r      s   
 ;?05C''C !++C !& 0 04 7	C
 #'+C +,C 
8	8C  CJ^"e.>.> "uO`O` " 
 ;?.22659:>26*.).!%D
''D
 !,,D
 !& 0 04 7	D

 $$t+D
 ((4/D
 !++d2D
 !& 0 04 7D
 ((4/D
   4'D
 #'D
 $;D
 +,D
 
E	ED
  D
L U]]_ 6::>-12626).C''C !++d2C !& 0 04 7	C
 ##d*C ((4/C ((4/C #'C 
		C Cr   r   )r"   r    r   r1   r+   r3   r9   r   )!rZ   huggingface_hub.dataclassesr   ;transformers.models.instructblip.configuration_instructblipr   r   r   6transformers.models.instructblip.modeling_instructblipr   r   r	   r
   r   r   r   r   modeling_outputsr   processing_utilsr   utilsr   r   r   r    r"   r+   r1   r3   r7   r9   r   __all__r   r   r   <module>r      s     . 
	 	 	 ; & 5 ?@$<   A$ ?@%>   A, ?@')0 ')  A')T)'B )#: 	$< 		;j 	]
. ]
@f0T fR		r   