
    i<                     :   d Z ddlZddlmZ ddlmZ ddlmZ ddlmZm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ  ej0                  e      Ze G d de             Z ed       G d de             Z ed       G d dee             Zg dZy)zPyTorch Fuyu model.    N)nn   )Cache)GenerationMixin)BaseModelOutputWithPoolingCausalLMOutputWithPast)PreTrainedModel)	AutoModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check   )
FuyuConfigc                   <    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZg ZdZy)FuyuPreTrainedModelconfigmodel)imagetextTpast_key_valuesN)__name__
__module____qualname__r   __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_no_split_modules_skip_keys_device_placement     w/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/fuyu/modeling_fuyu.pyr   r       s=    (&*#"&N"3r(   r   zt
    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
    )custom_introc                   @    e Zd Zdef fdZd Zd Zdej                  de	ej                     dej                  dej                  fd	Z
eed
ej                  dee   deez  fd              Zdej&                  dej                  dej                  fdZee	 	 	 	 	 	 	 	 ddej&                  dz  dej                  dz  dej                  dz  dej                  dz  dej&                  dz  dedz  dej                  dz  dedz  dee   deez  fd              Z xZS )	FuyuModelr   c                    t         |   |       |j                  | _        |j                  j
                  | _        t        j                  |j                        | _        t        j                  |j                  |j                  z  |j                  z  |j                        | _        d| _        | j!                          y )NF)super__init__pad_token_idpadding_idxtext_config
vocab_sizer
   from_configlanguage_modelr   Linear
patch_sizenum_channelshidden_sizevision_embed_tokensgradient_checkpointing	post_initselfr   	__class__s     r)   r/   zFuyuModel.__init__4   s     !.. ,,77'33F4F4FG#%99 1 11F4G4GGI[I[$
  ',#r(   c                 6    | j                   j                         S N)r5   get_input_embeddingsr>   s    r)   rB   zFuyuModel.get_input_embeddingsA   s    ""7799r(   c                 :    | j                   j                  |       y rA   )r5   set_input_embeddingsr>   values     r)   rE   zFuyuModel.set_input_embeddingsD   s    007r(   word_embeddingscontinuous_embeddingsimage_patch_input_indicesreturnc           
         |j                   d   t        |      k(  s't        dt        |      d|j                   d         |j                         }t	        |j                   d         D ]  }t        j                  ||   dk\  d      d   }||   |   }|j                   d   ||   j                   d   kD  r,t        d||   j                   d|j                   d| d	      ||   |   j                  |j                        |||f<    |S )
a  This function places the continuous_embeddings into the word_embeddings at the locations
        indicated by image_patch_input_indices. Different batch elements can have different numbers of continuous
        embeddings.

        Args:
            word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Tensor of word embeddings.
            continuous_embeddings (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
                Tensor of continuous embeddings. The length of the list is the batch size. Each entry is shape
                [num_image_embeddings, hidden], and num_image_embeddings needs to match the number of non-negative
                indices in image_patch_input_indices for that batch element.
            image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Tensor of indices of the image patches in the input_ids tensor.
        r   z7Batch sizes must match! Got len(continuous_embeddings)=z and word_embeddings.shape[0]=T)as_tuplezGNumber of continuous embeddings continuous_embeddings[batch_idx].shape=zA does not match number of continuous token ids src_indices.shape=z in batch element .)	shapelen
ValueErrorclonerangetorchnonzerotodevice)r>   rH   rI   rJ   output_embeddings	batch_idxdst_indicessrc_indicess           r)   gather_continuous_embeddingsz&FuyuModel.gather_continuous_embeddingsG   sZ   (  %%a(C0E,FFJs3H/I.KKjQ`QfQfghQiPkl  ,11344Q78 	I  --(A)(LPQ(Q\`abcdK 4I>{KK  #&;I&F&L&LQ&OO ^7LY7W7]7]6_ `I6A6G6G5II[\e[ffgi  9Ni8XYd8e8h8h!((9i45	  ! r(   pixel_valueskwargsc                 <    | j                  |      }t        |      S )z
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        )last_hidden_state)r:   r   )r>   r]   r^   patch_embeddingss       r)   get_image_featureszFuyuModel.get_image_featuress   s!      33LA)<LMMr(   	input_idsinputs_embedsimage_featuresc                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        dtyperW   r   r   z6Image features and image tokens do not match, tokens: z, features: )rB   rT   tensorr   image_token_idlongrW   allsumrO   	unsqueeze	expand_asrV   r   numel)r>   rc   rd   re   special_image_maskn_image_tokensn_image_featuress          r)   get_placeholder_maskzFuyuModel.get_placeholder_mask   s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!r(   Nimage_patchesimage_patches_indicesattention_maskposition_idsr   	use_cachec	           	      l   |du |duz  rt        d      |  | j                  j                         |      }|j                  d   }
|i||j                  n|j                  }||j                         nd}t        j                  ||
|z   t        j                  |      }|j                  d      }|i| j                  |d      j                  }|j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                  d
|||||d	|	}|S )a  
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        Nz:You must specify exactly one of input_ids or inputs_embedsr   r   rg   T)return_dict)rd   re   )rd   rx   ry   r   rz   r'   )rQ   r5   rB   rO   rW   get_seq_lengthrT   arangerl   ro   rb   r`   rV   rh   ru   masked_scatter)r>   rc   rv   rw   rx   ry   r   rd   rz   r^   seq_lenrW   past_key_values_lengthra   rr   outputss                   r)   forwardzFuyuModel.forward   sl   , -t";<YZZ FD//DDFyQM%%a()2)>Y%%MDXDXFIXId_%C%C%Ejk" <<&2H(HPUPZPZciL (11!4L$#66}RV6Wii/22=3G3GI\I\]!%!:!:GW "; " *889KM]^M%$%% 
')%+
 
 r(   )NNNNNNNN)r   r   r   r   r/   rB   rE   rT   Tensorlistr\   r   r   FloatTensorr   r   tupler   rb   
LongTensorru   r   boolr   r   __classcell__r?   s   @r)   r,   r,   .   s   z :8*!*!  $ELL1*! $)<<	*!
 
*!X N!--N9?@R9SN	+	+N  N"))":?:K:K"]b]n]n"0  .2-159.204(,26!%5##d*5 ||d*	5
  %||d25 t+5 &&-5 5 ((4/5 $;5 +,5 
'	'5  5r(   r,   zz
    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
    c                       e Zd ZddiZdef fdZd Zd Zee		 	 	 	 	 	 	 	 	 	 dde
j                  dz  d	e
j                  dz  d
e
j                  dz  de
j                  dz  de
j                  dz  dedz  de
j                  dz  dedz  de
j                  dz  dedz  dee   deez  fd              Z	 	 	 	 	 	 d fd	Z xZS )FuyuForCausalLMzlm_head.weightz(model.language_model.embed_tokens.weightr   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NF)bias)r.   r/   r,   r   r   r6   r2   r9   r3   lm_headr<   r=   s     r)   r/   zFuyuForCausalLM.__init__   sS     v&
yy!3!3!?!?ASASA^A^ejkr(   c                 6    | j                   j                         S rA   )r   rB   rC   s    r)   rB   z$FuyuForCausalLM.get_input_embeddings   s    zz..00r(   c                 :    | j                   j                  |       y rA   )r   rE   rF   s     r)   rE   z$FuyuForCausalLM.set_input_embeddings   s    

''.r(   Nrc   rv   rw   rx   ry   r   rd   rz   labelslogits_to_keepr^   rK   c                     | j                   d||||||||d|}|d   }t        |
t              rt        |
 d      n|
}| j	                  |dd|ddf         }d}|	4 | j
                  d||	| j                  j                  j                  d|}t        |||j                  |j                  |j                        S )a  
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Examples:

        ```python
        >>> from transformers import FuyuProcessor, FuyuForCausalLM
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
        >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> prompt = "Generate a coco-style caption.\n"

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=7)
        >>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
        >>> print(generation_text[0])
        A blue bus parked on the side of a road.
        ```)rc   rv   rw   rd   rx   ry   r   rz   r   N)logitsr   r3   )lossr   r   hidden_states
attentionsr'   )r   
isinstanceintslicer   loss_functionr   r2   r3   r   r   r   r   )r>   rc   rv   rw   rx   ry   r   rd   rz   r   r   r^   r   r   slice_indicesr   r   s                    r)   r   zFuyuForCausalLM.forward   s    j $** 

'"7')%+

 

  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD &#33!//))
 	
r(   c           
      r    t        
|   |f||||||d|}	|s|j                  dd      r
d |	d<   d |	d<   |	S )N)r   rx   rd   rv   rw   is_first_iterationrz   Trw   rv   )r.   prepare_inputs_for_generationget)r>   rc   r   rx   rd   rv   rw   r   r^   model_inputsr?   s             r)   r   z-FuyuForCausalLM.prepare_inputs_for_generation9  se     w<	
+)''"71	
 	
 "fjjd&C48L01,0L)r(   )
NNNNNNNNNr   )NNNNNF)r   r   r   _tied_weights_keysr   r/   rB   rE   r   r   rT   r   r   r   r   r   r   r   r   r   r   r   r   r   r   s   @r)   r   r      sc    +,VWz 1/  .2-159.204(,26!%&*%&P
##d*P
 ||d*	P

  %||d2P
 t+P
 &&-P
 P
 ((4/P
 $;P
 t#P
 d
P
 +,P
 
'	'P
  P
j "  r(   r   )r   r   r,   )__doc__rT   r   cache_utilsr   
generationr   modeling_outputsr   r   modeling_utilsr	   models.auto.modeling_autor
   processing_utilsr   utilsr   r   r   r   r   configuration_fuyur   
get_loggerr   loggerr   r,   r   __all__r'   r(   r)   <module>r      s         ) R - 2 & j j * 
		H	% 
4/ 
4 
4 
[# [
[| 
@)? @
@F Br(   