Ë
    ¥ãiÆ<  ã                   ó:  — d Z ddlZddlmZ ddlmZ ddlmZ ddlmZm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ  ej0                  e«      Ze G d„ de«      «       Z ed¬«       G d„ de«      «       Z ed¬«       G d„ dee«      «       Zg d¢Zy)zPyTorch Fuyu model.é    N)Únné   )ÚCache)ÚGenerationMixin)ÚBaseModelOutputWithPoolingÚCausalLMOutputWithPast)ÚPreTrainedModel)Ú	AutoModel)ÚUnpack)ÚTransformersKwargsÚauto_docstringÚcan_return_tupleÚloggingÚtorch_compilable_checké   )Ú
FuyuConfigc                   ó<   — e Zd ZU eed<   dZdZdZdZdZ	dZ
dZg ZdZy)ÚFuyuPreTrainedModelÚconfigÚmodel)ÚimageÚtextTÚpast_key_valuesN)Ú__name__Ú
__module__Ú__qualname__r   Ú__annotations__Úbase_model_prefixÚinput_modalitiesÚsupports_gradient_checkpointingÚ_supports_attention_backendÚ_supports_flash_attnÚ_supports_sdpaÚ_supports_flex_attnÚ_no_split_modulesÚ_skip_keys_device_placement© ó    úw/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/fuyu/modeling_fuyu.pyr   r       s=   … àÓØÐØ(ÐØ&*Ð#Ø"&ÐØÐØ€NØÐØÐØ"3Ñr(   r   zt
    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
    )Úcustom_introc                   ó@  ‡ — e Zd Zdefˆ fd„Zd„ Zd„ Zdej                  de	ej                     dej                  dej                  fd	„Z
eed
ej                  dee   deez  fd„«       «       Zdej&                  dej                  dej                  fd„Zee	 	 	 	 	 	 	 	 ddej&                  dz  dej                  dz  dej                  dz  dej                  dz  dej&                  dz  dedz  dej                  dz  dedz  dee   deez  fd„«       «       Zˆ xZS )Ú	FuyuModelr   c                 ó„  •— t         ‰|   |«       |j                  | _        |j                  j
                  | _        t        j                  |j                  «      | _        t        j                  |j                  |j                  z  |j                  z  |j                  «      | _        d| _        | j!                  «        y )NF)ÚsuperÚ__init__Úpad_token_idÚpadding_idxÚtext_configÚ
vocab_sizer
   Úfrom_configÚlanguage_modelr   ÚLinearÚ
patch_sizeÚnum_channelsÚhidden_sizeÚvision_embed_tokensÚgradient_checkpointingÚ	post_init©Úselfr   Ú	__class__s     €r)   r/   zFuyuModel.__init__4   s–   ø€ Ü‰Ñ˜Ô Ø!×.Ñ.ˆÔØ ×,Ñ,×7Ñ7ˆŒÜ'×3Ñ3°F×4FÑ4FÓGˆÔÜ#%§9¡9Ø×Ñ × 1Ñ 1Ñ1°F×4GÑ4GÑGÈ×I[ÑI[ó$
ˆÔ ð ',ˆÔ#à‰Õr(   c                 ó6   — | j                   j                  «       S ©N)r5   Úget_input_embeddings©r>   s    r)   rB   zFuyuModel.get_input_embeddingsA   s   € Ø×"Ñ"×7Ñ7Ó9Ð9r(   c                 ó:   — | j                   j                  |«       y rA   )r5   Úset_input_embeddings©r>   Úvalues     r)   rE   zFuyuModel.set_input_embeddingsD   s   € Ø×Ñ×0Ñ0°Õ7r(   Úword_embeddingsÚcontinuous_embeddingsÚimage_patch_input_indicesÚreturnc           
      ó  — |j                   d   t        |«      k(  s't        dt        |«      ›d|j                   d   ›«      ‚|j                  «       }t	        |j                   d   «      D ]ž  }t        j                  ||   dk\  d¬«      d   }||   |   }|j                   d   ||   j                   d   kD  r,t        d||   j                   ›d|j                   ›d|› d	«      ‚||   |   j                  |j                  «      |||f<   Œ  |S )
aÙ  This function places the continuous_embeddings into the word_embeddings at the locations
        indicated by image_patch_input_indices. Different batch elements can have different numbers of continuous
        embeddings.

        Args:
            word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Tensor of word embeddings.
            continuous_embeddings (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
                Tensor of continuous embeddings. The length of the list is the batch size. Each entry is shape
                [num_image_embeddings, hidden], and num_image_embeddings needs to match the number of non-negative
                indices in image_patch_input_indices for that batch element.
            image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Tensor of indices of the image patches in the input_ids tensor.
        r   z7Batch sizes must match! Got len(continuous_embeddings)=z and word_embeddings.shape[0]=T)Úas_tuplezGNumber of continuous embeddings continuous_embeddings[batch_idx].shape=zA does not match number of continuous token ids src_indices.shape=z in batch element ú.)	ÚshapeÚlenÚ
ValueErrorÚcloneÚrangeÚtorchÚnonzeroÚtoÚdevice)r>   rH   rI   rJ   Úoutput_embeddingsÚ	batch_idxÚdst_indicesÚsrc_indicess           r)   Úgather_continuous_embeddingsz&FuyuModel.gather_continuous_embeddingsG   sZ  € ð(  ×%Ñ% aÑ(¬CÐ0EÓ,FÒFÜØJ¬sÐ3HÓ/IÐ.KÐKjÐQ`×QfÑQfÐghÑQiÐPkÐlóð ð ,×1Ñ1Ó3ÐÜ˜×4Ñ4°QÑ7Ó8ò 	ˆIô  Ÿ-™-Ð(AÀ)Ñ(LÐPQÑ(QÐ\`ÔaÐbcÑdˆKð 4°IÑ>¸{ÑKˆKà× Ñ  Ñ#Ð&;¸IÑ&F×&LÑ&LÈQÑ&OÒOÜ Ø^Ð7LÈYÑ7W×7]Ñ7]Ð6_ð `IØ6A×6GÑ6GÐ5IÐI[Ð\eÐ[fÐfgðióð ð 9NÈiÑ8XÐYdÑ8e×8hÑ8hØ!×(Ñ(ó9Ð˜i¨Ð4Ò5ð	ð  !Ð r(   Úpixel_valuesÚkwargsc                 ó<   — | j                  |«      }t        |¬«      S )z®
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        )Úlast_hidden_state)r:   r   )r>   r]   r^   Úpatch_embeddingss       r)   Úget_image_featureszFuyuModel.get_image_featuress   s!   € ð  ×3Ñ3°LÓAÐÜ)Ð<LÔMÐMr(   Ú	input_idsÚinputs_embedsÚimage_featuresc                 óN  — |€m| | j                  «       t        j                  | j                  j                  t        j
                  |j                  ¬«      «      k(  }|j                  d«      }n|| j                  j                  k(  }|j                  «       }|j                  d   |j                  d   z  }|j                  d«      j                  |«      j                  |j                  «      }t        ||   j                  «       |j                  «       k(  d|› d|› «       |S )zï
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        ©ÚdtyperW   éÿÿÿÿr   r   z6Image features and image tokens do not match, tokens: z, features: )rB   rT   Útensorr   Úimage_token_idÚlongrW   ÚallÚsumrO   Ú	unsqueezeÚ	expand_asrV   r   Únumel)r>   rc   rd   re   Úspecial_image_maskÚn_image_tokensÚn_image_featuress          r)   Úget_placeholder_maskzFuyuModel.get_placeholder_mask   s  € ð ÐØ!.Ð2M°$×2KÑ2KÓ2MÜ—‘˜TŸ[™[×7Ñ7¼u¿z¹zÐR_×RfÑRfÔgó3ñ "Ðð "4×!7Ñ!7¸Ó!;Ñà!*¨d¯k©k×.HÑ.HÑ!HÐà+×/Ñ/Ó1ˆØ)×/Ñ/°Ñ2°^×5IÑ5IÈ!Ñ5LÑLÐØ/×9Ñ9¸"Ó=×GÑGÈÓV×YÑYÐZg×ZnÑZnÓoÐÜØÐ,Ñ-×3Ñ3Ó5¸×9MÑ9MÓ9OÑOØDÀ^ÐDTÐT`ÐaqÐ`rÐsô	
ð "Ð!r(   NÚimage_patchesÚimage_patches_indicesÚattention_maskÚposition_idsr   Ú	use_cachec	           	      ól  — |du |duz  rt        d«      ‚|€  | j                  j                  «       |«      }|j                  d   }
|€i||j                  n|j                  }||j                  «       nd}t        j                  ||
|z   t        j                  |¬«      }|j                  d«      }|i| j                  |d¬«      j                  }|j                  |j                  |j                  «      }| j                  |||¬«      }|j                  ||«      } | j                  d
|||||d	œ|	¤Ž}|S )aä  
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        Nz:You must specify exactly one of input_ids or inputs_embedsr   r   rg   T)Úreturn_dict)rd   re   )rd   rx   ry   r   rz   r'   )rQ   r5   rB   rO   rW   Úget_seq_lengthrT   Úarangerl   ro   rb   r`   rV   rh   ru   Úmasked_scatter)r>   rc   rv   rw   rx   ry   r   rd   rz   r^   Úseq_lenrW   Úpast_key_values_lengthra   rr   Úoutputss                   r)   ÚforwardzFuyuModel.forward—   sl  € ð, ˜Ð -°tÐ";Ò<ÜÐYÓZÐZàÐ ØF˜D×/Ñ/×DÑDÓFÀyÓQˆMà×%Ñ% aÑ(ˆàÐØ)2Ð)>Y×%Ò%ÀM×DXÑDXˆFØIXÐId _×%CÑ%CÔ%EÐjkÐ"Ü Ÿ<™<Ø&¨Ð2HÑ(HÔPU×PZÑPZÐciôˆLð (×1Ñ1°!Ó4ˆLàÐ$Ø#×6Ñ6°}ÐRVÐ6ÓW×iÑiÐØ/×2Ñ2°=×3GÑ3GÈ×I\ÑI\Ó]ÐØ!%×!:Ñ!:Ø¨ÐGWð ";ó "Ðð *×8Ñ8Ð9KÐM]Ó^ˆMà%$×%Ñ%ð 
Ø'Ø)Ø%Ø+Øñ
ð ñ
ˆð ˆr(   )NNNNNNNN)r   r   r   r   r/   rB   rE   rT   ÚTensorÚlistr\   r   r   ÚFloatTensorr   r   Útupler   rb   Ú
LongTensorru   r   Úboolr   rƒ   Ú__classcell__©r?   s   @r)   r,   r,   .   s¼  ø„ ð˜zõ ò:ò8ð*!àŸ™ð*!ð  $ E§L¡LÑ1ð*!ð $)§<¡<ð	*!ð
 
‰ó*!ðX ØðNØ!×-Ñ-ðNØ9?Ð@RÑ9SðNà	Ð+Ñ	+òNó ó ðNð"Ø×)Ñ)ð"Ø:?×:KÑ:Kð"Ø]b×]nÑ]nó"ð0 Øð .2à-1Ø59Ø.2Ø04Ø(,Ø26Ø!%ñ5à×#Ñ# dÑ*ð5ð —|‘| dÑ*ð	5ð
  %Ÿ|™|¨dÑ2ð5ð Ÿ™ tÑ+ð5ð ×&Ñ&¨Ñ-ð5ð  ™ð5ð ×(Ñ(¨4Ñ/ð5ð ˜$‘;ð5ð Ð+Ñ,ð5ð 
Ð'Ñ	'ò5ó ó ô5r(   r,   zz
    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
    c                   ó†  ‡ — e Zd ZddiZdefˆ fd„Zd„ Zd„ Zee		 	 	 	 	 	 	 	 	 	 dde
j                  dz  d	e
j                  dz  d
e
j                  dz  de
j                  dz  de
j                  dz  dedz  de
j                  dz  dedz  de
j                  dz  dedz  dee   deez  fd„«       «       Z	 	 	 	 	 	 dˆ fd„	Zˆ xZS )ÚFuyuForCausalLMzlm_head.weightz(model.language_model.embed_tokens.weightr   c                 óî   •— t         ‰|   |«       t        |«      | _        t	        j
                  |j                  j                  |j                  j                  d¬«      | _	        | j                  «        y )NF)Úbias)r.   r/   r,   r   r   r6   r2   r9   r3   Úlm_headr<   r=   s     €r)   r/   zFuyuForCausalLM.__init__Ù   sS   ø€ Ü‰Ñ˜Ô Ü˜vÓ&ˆŒ
Ü—y‘y ×!3Ñ!3×!?Ñ!?À×ASÑAS×A^ÑA^ÐejÔkˆŒØ‰Õr(   c                 ó6   — | j                   j                  «       S rA   )r   rB   rC   s    r)   rB   z$FuyuForCausalLM.get_input_embeddingsß   s   € Øz‰z×.Ñ.Ó0Ð0r(   c                 ó:   — | j                   j                  |«       y rA   )r   rE   rF   s     r)   rE   z$FuyuForCausalLM.set_input_embeddingsâ   s   € Ø
‰
×'Ñ'¨Õ.r(   Nrc   rv   rw   rx   ry   r   rd   rz   ÚlabelsÚlogits_to_keepr^   rK   c                 ó‚  —  | j                   d||||||||dœ|¤Ž}|d   }t        |
t        «      rt        |
 d«      n|
}| j	                  |dd…|dd…f   «      }d}|	4 | j
                  d||	| j                  j                  j                  dœ|¤Ž}t        |||j                  |j                  |j                  ¬«      S )a‘  
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Examples:

        ```python
        >>> from transformers import FuyuProcessor, FuyuForCausalLM
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
        >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> prompt = "Generate a coco-style caption.\n"

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=7)
        >>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
        >>> print(generation_text[0])
        A blue bus parked on the side of a road.
        ```)rc   rv   rw   rd   rx   ry   r   rz   r   N)Úlogitsr“   r3   )Úlossr–   r   Úhidden_statesÚ
attentionsr'   )r   Ú
isinstanceÚintÚslicer   Úloss_functionr   r2   r3   r   r   r˜   r™   )r>   rc   rv   rw   rx   ry   r   rd   rz   r“   r”   r^   r‚   r˜   Úslice_indicesr–   r—   s                    r)   rƒ   zFuyuForCausalLM.forwardå   sí   € ðj $—*‘*ð 

ØØ'Ø"7Ø'Ø)Ø%Ø+Øñ

ð ñ

ˆð   ™
ˆä8BÀ>ÔSVÔ8Wœ˜~˜o¨tÔ4Ð]kˆØ—‘˜mªA¨}ºaÐ,?Ñ@ÓAˆàˆØÐØ%4×%Ñ%ð Ø f¸¿¹×9PÑ9P×9[Ñ9[ñØ_eñˆDô &ØØØ#×3Ñ3Ø!×/Ñ/Ø×)Ñ)ô
ð 	
r(   c           
      ór   •— t        ‰
|   |f||||||dœ|¤Ž}	|s|j                  dd«      r
d |	d<   d |	d<   |	S )N)r   rx   rd   rv   rw   Úis_first_iterationrz   Trw   rv   )r.   Úprepare_inputs_for_generationÚget)r>   rc   r   rx   rd   rv   rw   r    r^   Úmodel_inputsr?   s             €r)   r¡   z-FuyuForCausalLM.prepare_inputs_for_generation9  se   ø€ ô ‘wÑ<Øð	
à+Ø)Ø'Ø'Ø"7Ø1ñ	
ð ñ	
ˆñ " f§j¡j°¸dÔ&Cà48ˆLÐ0Ñ1Ø,0ˆL˜Ñ)àÐr(   )
NNNNNNNNNr   )NNNNNF)r   r   r   Ú_tied_weights_keysr   r/   rB   rE   r   r   rT   rˆ   r„   r   r†   r‰   r›   r   r   r‡   r   rƒ   r¡   rŠ   r‹   s   @r)   r   r   Ñ   sc  ø„ ð +Ð,VÐWÐð˜zõ ò1ò/ð Øð .2à-1Ø59Ø.2Ø04Ø(,Ø26Ø!%Ø&*Ø%&ñP
à×#Ñ# dÑ*ðP
ð —|‘| dÑ*ð	P
ð
  %Ÿ|™|¨dÑ2ðP
ð Ÿ™ tÑ+ðP
ð ×&Ñ&¨Ñ-ðP
ð  ™ðP
ð ×(Ñ(¨4Ñ/ðP
ð ˜$‘;ðP
ð —‘˜tÑ#ðP
ð ˜d™
ðP
ð Ð+Ñ,ðP
ð 
Ð'Ñ	'òP
ó ó ðP
ðj ØØØØ"Ø ÷ñ r(   r   )r   r   r,   )Ú__doc__rT   r   Úcache_utilsr   Ú
generationr   Úmodeling_outputsr   r   Úmodeling_utilsr	   Úmodels.auto.modeling_autor
   Úprocessing_utilsr   Úutilsr   r   r   r   r   Úconfiguration_fuyur   Ú
get_loggerr   Úloggerr   r,   r   Ú__all__r'   r(   r)   ú<module>r±      s¸   ðñ ã Ý å  Ý )ß RÝ -Ý 2Ý &ß jÕ jÝ *ð 
ˆ×	Ñ	˜HÓ	%€ð ô
4˜/ó 
4ó ð
4ñ ðôô
[Ð#ó [óð
[ñ| ðôô
@Ð)¨?ó @óð
@òF Br(   