Ë
    ¥ãiÆ<  ã                   ó  — d dl Z d dlmZ d dl mZ ddlmZmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZmZmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z!m"Z"  ejF                  e$«      Z% ed¬«      e G d„ de«      «       «       Z& G d„ de!«      Z' G d„ de"«      Z( ed¬«      e G d„ de«      «       «       Z) G d„ ded¬«      Z* G d„ de«      Z+ G d „ d!e«      Z, G d"„ d#e«      Z- G d$„ d%e «      Z. G d&„ d'e«      Z/g d(¢Z0y))é    N)Ústrict)Únné   )ÚCacheÚDynamicCache)ÚGenerationConfig)ÚFlashAttentionKwargs)ÚBaseModelOutputWithPooling)ÚImagesKwargsÚUnpack)ÚTransformersKwargsÚauto_docstringÚcan_return_tupleÚloggingÚtorch_compilable_checké   )ÚIdefics3ConfigÚIdefics3VisionConfig)ÚIdefics3ImageProcessor)ÚIdefics3ImageProcessorPil)ÚIdefics3BaseModelOutputWithPastÚ Idefics3ForConditionalGenerationÚIdefics3ModelÚIdefics3PreTrainedModelÚIdefics3VisionTransformerz$HuggingFaceTB/SmolVLM2-2.2B-Instruct)Ú
checkpointc                   ó   — e Zd ZdZdZy)ÚSmolVLMVisionConfiga  
    Example:

    ```python
    >>> from transformers.models.smolvlm.modeling_smolvlm import SmolVLMVisionTransformer
    >>> from transformers.models.smolvlm.configuration_smolvlm import SmolVLMVisionConfig

    >>> # Initializing a SmolVLMVisionConfig with google/siglip-so400m-patch14-384 style configuration
    >>> configuration = SmolVLMVisionConfig()

    >>> # Initializing a SmolVLMVisionTransformer (with random weights) from the google/siglip-so400m-patch14-384 style configuration
    >>> model = SmolVLMVisionTransformer(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```Úsmolvlm_visionN©Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ú
model_type© ó    ú|/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/smolvlm/modular_smolvlm.pyr   r   *   s   „ ñð" "Jr'   r   c                   ó   — e Zd Zy)ÚSmolVLMPreTrainedModelN©r!   r"   r#   r&   r'   r(   r*   r*   A   ó   „ Ør'   r*   c                   ó   — e Zd Zy)ÚSmolVLMVisionTransformerNr+   r&   r'   r(   r.   r.   E   r,   r'   r.   c                   ó   — e Zd ZdZdZy)ÚSmolVLMConfigaÆ  
    scale_factor (`int`, *optional*, defaults to 2):
        The scale factor for the image encoder.

    Example:
    ```python
    >>> from transformers import SmolVLMModel, SmolVLMConfig
    >>> # Initializing configuration
    >>> configuration = SmolVLMConfig()
    >>> # Initializing a model from the configuration
    >>> model = SmolVLMModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```ÚsmolvlmNr    r&   r'   r(   r0   r0   I   s   „ ñð Jr'   r0   c                   ó:   — e Zd ZU dZeed<   eeef   ed<   eed<   y)ÚSmolVLMImageProcessorKwargsaz  
    do_image_splitting (`bool`, *optional*, defaults to `True`):
        Whether to split the image into sub-images concatenated with the original image. They are split into patches
        such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`.
    max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`):
        Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge".
    return_row_col_info (`bool`, *optional*, defaults to `False`):
        Whether to return the row and column information of the images.
    Údo_image_splittingÚmax_image_sizeÚreturn_row_col_infoN)	r!   r"   r#   r$   ÚboolÚ__annotations__ÚdictÚstrÚintr&   r'   r(   r3   r3   ^   s#   … ñð ÓØ˜˜c˜‘NÓ"ØÔr'   r3   F)Útotalc                   ó   — e Zd Zy)ÚSmolVLMImageProcessorNr+   r&   r'   r(   r>   r>   n   r,   r'   r>   c                   ó   — e Zd Zy)ÚSmolVLMImageProcessorPilNr+   r&   r'   r(   r@   r@   r   r,   r'   r@   c                   ó   — e Zd Zy)ÚSmolVLMBaseModelOutputWithPastNr+   r&   r'   r(   rB   rB   v   r,   r'   rB   c                   ó  — e Zd ZdZdej
                  dej                  dej                  fd„Ze e	d¬«      	 dd	ej                  d
ej
                  dz  dee   deez  fd„«       «       Ze e	d¬«      e	 	 	 	 	 	 	 	 	 ddej
                  dz  dej                  dz  dej
                  dz  dedz  dej                  dz  d	ej                  dz  d
ej"                  dz  dej                  dz  dedz  dee   deez  fd„«       «       «       Zy)ÚSmolVLMModelz§
    A subclass of Idefics3Model. We do *not* remove or block the call to inputs_merger
    in forward. Instead, we override inputs_merger here with custom logic.
    Ú	input_idsÚinputs_embedsÚimage_hidden_statesc                 ó   — |j                   \  }}}|€a| | j                  «       t        j                  | j                  j
                  t        j                  |j                  ¬«      «      k(  }|d   }n|| j                  j
                  k(  }|j                  d¬«      }t        t        j                  ||z  dk(  «      d«       ||z  }t        j                  j                  j                  |j                  d¬«      dd¬«      }	|	d d	 }
|j                  d	¬«      }|dz
  |z  }|dz
  |z  }|
j                  d«      |z   }t        j                   |«      }|||   ||   d d …f   ||<   t        j"                  |j                  d	«      ||«      }|S )
N©ÚdtypeÚdevice).r   é   ©Údimr   zCAt least one sample has <image> tokens not divisible by patch_size.)rL   r   )Úvalueéÿÿÿÿ)ÚshapeÚget_input_embeddingsÚtorchÚtensorÚconfigÚimage_token_idÚlongrK   Úsumr   Úallr   Ú
functionalÚpadÚcumsumÚ	unsqueezeÚ
zeros_likeÚwhere)ÚselfrE   rF   rG   Ú_Ú
patch_sizeÚ
image_maskÚnum_image_tokensÚblocks_per_sampleÚoffsetsÚblock_offsetÚrow_cumÚ	chunk_idxÚ	local_idxÚ	block_idxÚimage_embedsÚmerged_embedss                    r(   Úinputs_mergerzSmolVLMModel.inputs_merger€   s“  € ð /×4Ñ4Ñˆˆ:qàÐØ&Ð*E¨$×*CÑ*CÓ*EÜ—‘˜TŸ[™[×7Ñ7¼u¿z¹zÐR_×RfÑRfÔgó+ñ ˆJð $ FÑ+‰Jà" d§k¡k×&@Ñ&@Ñ@ˆJà%Ÿ>™>¨a˜>Ó0ÐÜÜI‰IÐ&¨Ñ3°qÑ8Ó9ØQô	
ð -°
Ñ:Ðä—(‘(×%Ñ%×)Ñ)Ð*;×*BÑ*BÀqÐ*BÓ*IÈ6ÐYZÐ)Ó[ˆØ˜s |ˆØ×#Ñ#¨Ð#Ó+ˆØ˜q‘[ ZÑ/ˆ	Ø˜q‘[ JÑ.ˆ	Ø ×*Ñ*¨1Ó-°	Ñ9ˆ	ä×'Ñ'¨Ó6ˆØ#6°yÀÑ7LÈiÐXbÑNcÒefÐ7fÑ#gˆZÑ äŸ™ J×$8Ñ$8¸Ó$<¸lÈMÓZˆØÐr'   zVEncodes images into continuous embeddings that can be forwarded to the language model.)Úcustom_introNÚpixel_valuesÚpixel_attention_maskÚkwargsÚreturnc                 óÖ  — |j                   \  }}}}}|j                  | j                  ¬«      } |j                  ||z  g|j                   dd ¢­Ž }|j                   dd j	                  «       }	|dk(  j                  d¬«      |	k7  }
|
dxx   t        j                  |
«       z  cc<   ||
   j                  «       }|€Lt        j                  d	D cg c]  }|j                   |   ‘Œ c}t        j                  |j                  ¬
«      }n6 |j                  ||z  g|j                   dd ¢­Ž }||
   j                  «       }| j                  j                  j                  }|j                  d||¬«      }|j                  d||¬«      }|j                  d¬«      dkD  j                  «       } | j                   d||ddœ|¤Ž}|j"                  }| j%                  |«      }||_        |S c c}w )a4  
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        pixel_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask indicating padded regions in the image.
        )rJ   r   NrL   g        )rP   éþÿÿÿéýÿÿÿrM   r   )r   r   r   )ÚsizerJ   rK   )Ú	dimensionrw   Ústep)rP   ru   T)rp   Úpatch_attention_maskÚreturn_dictr&   )rQ   ÚtorJ   ÚviewÚnumelrX   rS   ÚanyÚ
contiguousÚonesr7   rK   rU   Úvision_configrb   ÚunfoldÚvision_modelÚlast_hidden_stateÚ	connectorÚpooler_output)r`   rp   rq   rr   Ú
batch_sizeÚ
num_imagesÚnum_channelsÚheightÚwidthÚnb_values_per_imageÚreal_images_indsÚirb   Úpatches_subgridrz   Úimage_outputsrG   Úimage_featuress                     r(   Úget_image_featureszSmolVLMModel.get_image_features¡   s  € ð  ?K×>PÑ>PÑ;ˆ
J ¨f°eØ#—‘¨T¯Z©ZÓ8ˆØ(|×(Ñ(¨°jÑ)@ÐZÀ<×CUÑCUÐVWÐVXÐCYÒZˆð +×0Ñ0°°Ð4×:Ñ:Ó<ÐØ(¨CÑ/×4Ñ4¸Ð4ÓFÐJ]Ñ]Ðð 	˜Ó¤§	¡	Ð*:Ó ;Ð;Ñ;Óà#Ð$4Ñ5×@Ñ@ÓBˆàÐ'Ü#(§:¡:Ø5>Ö?°l×(Ñ(¨Ó+Ò?Ü—j‘jØ#×*Ñ*ô$Ñ ð $=Ð#7×#<Ñ#<¸ZÈ*Ñ=TÐ#vÐWk×WqÑWqÐrsÐrtÐWuÒ#vÐ Ø#7Ð8HÑ#I×#TÑ#TÓ#VÐ Ø—[‘[×.Ñ.×9Ñ9ˆ
Ø.×5Ñ5ÀÈ
ÐYcÐ5ÓdˆØ)×0Ñ0¸1À:ÐT^Ð0Ó_ˆØ /× 3Ñ 3¸Ð 3Ó AÀAÑ E×KÑKÓMÐð *˜×)Ñ)ð 
Ø%Ð<PÐ^bñ
Øflñ
ˆð ,×=Ñ=Ðð Ÿ™Ð(;Ó<ˆØ&4ˆÔ#àÐùò/ @s   ÃG&aØ  
        Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
        the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
        max_num_images is the maximum number of images among the batch_size samples in the batch.
        Padding images are not needed beyond padding the pixel_values at the entrance of the model.
        For efficiency, we only pass through the vision_model's forward the real images by
        discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
        image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
        Úattention_maskÚposition_idsÚpast_key_valuesÚ	use_cachec
           	      ó6  — | j                   r/| j                  j                  r|	rt        j	                  d«       d}	||j
                  \  }}n||j
                  \  }}}nt        d«      ‚|	r|€t        | j                  ¬«      }|€9 | j                  j                  «       |«      j                  |j                  «      }||t        d«      ‚|:| j                  ||d¬«      j                  }|j                  |j                  «      }n)|'|j                  | j                  |j                  ¬«      }|| j                  |||¬	«      } | j                  d|||||	d
œ|
¤Ž}t!        |j"                  |j$                  |j&                  |j(                  |¬«      S )NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz5You have to specify either input_ids or inputs_embeds)rU   zMYou cannot specify both pixel_values and image_hidden_states at the same timeT)r{   rI   )rE   rF   rG   )rF   r”   r•   r–   r—   )r…   r–   Úhidden_statesÚ
attentionsrG   r&   )ÚtrainingÚ
text_modelÚgradient_checkpointingÚloggerÚwarning_oncerQ   Ú
ValueErrorr   rU   rR   r|   rK   r“   r‡   rJ   rn   rB   r…   r–   r™   rš   )r`   rE   r”   r•   r–   rF   rp   rq   rG   r—   rr   rˆ   Ú
seq_lengthra   Úoutputss                  r(   ÚforwardzSmolVLMModel.forwardÙ   s½  € ð4 =Š=˜TŸ_™_×CÒCÉ	Ü×ÑØlôð ˆIàÐ Ø%.§_¡_Ñ"ˆJ™
ØÐ&Ø(5×(;Ñ(;Ñ%ˆJ˜
¡AäÐTÓUÐUá˜Ð0Ü*°$·+±+Ô>ˆOàÐ ØB˜DŸO™O×@Ñ@ÓBÀ9ÓM×PÑPÐQZ×QaÑQaÓbˆMàÐ#Ð(;Ð(GÜÐlÓmÐmàÐ#Ø"&×"9Ñ"9ØÐ2Àð #:ó #ç‰mð  ð #6×"8Ñ"8¸×9MÑ9MÓ"NÑØ Ð,Ø"5×"8Ñ"8¸t¿z¹zÐR_×RfÑRfÐ"8Ó"gÐàÐ*Ø ×.Ñ.Ø#Ø+Ø$7ð /ó ˆMð "$—/‘/ð 
Ø'Ø)Ø%Ø+Øñ
ð ñ
ˆô .Ø%×7Ñ7Ø#×3Ñ3Ø!×/Ñ/Ø×)Ñ)Ø 3ô
ð 	
r'   )N)	NNNNNNNNN)r!   r"   r#   r$   rS   Ú
LongTensorÚTensorrn   r   r   ÚFloatTensorr   r   Útupler
   r“   r   Ú
BoolTensorr7   r	   rB   r£   r&   r'   r(   rD   rD   z   sÇ  „ ñð
Ø×)Ñ)ðØ:?¿,¹,ðØ]b×]iÑ]ióðB ÙØmôð 9=ñ2à×'Ñ'ð2ð $×.Ñ.°Ñ5ð2ð Ð+Ñ,ð	2ð
 
Ð+Ñ	+ò2óó ð2ðh Ùðô
ð ð .2Ø.2Ø04Ø(,Ø26Ø15Ø8<Ø8<Ø!%ñA
à×#Ñ# dÑ*ðA
ð Ÿ™ tÑ+ðA
ð ×&Ñ&¨Ñ-ð	A
ð
  ™ðA
ð ×(Ñ(¨4Ñ/ðA
ð ×'Ñ'¨$Ñ.ðA
ð $×.Ñ.°Ñ5ðA
ð #×.Ñ.°Ñ5ðA
ð ˜$‘;ðA
ð Ð-Ñ.ðA
ð 
Ð/Ñ	/òA
ó ó
ó ñA
r'   rD   c                   ó0   ‡ — e Zd ZddiZˆ fd„Zˆ fd„Zˆ xZS )ÚSmolVLMForConditionalGenerationzlm_head.weightz$model.text_model.embed_tokens.weightc                 óJ  •— t         ‰|   |«       t        |«      | _        t	        j
                  |«      | j                  j                  _        t        j                  |j                  j                  |j                  j                  d¬«      | _        | j                  «        y )NF)Úbias)ÚsuperÚ__init__rD   Úmodelr   Úfrom_model_configrœ   Úgeneration_configr   ÚLinearÚtext_configÚhidden_sizeÚ
vocab_sizeÚlm_headÚ	post_init)r`   rU   Ú	__class__s     €r(   r®   z(SmolVLMForConditionalGeneration.__init__-  sq   ø€ Ü‰Ñ˜Ô Ü! &Ó)ˆŒ
Ü2B×2TÑ2TÐU[Ó2\ˆ
‰
×ÑÔ/Ü—y‘y ×!3Ñ!3×!?Ñ!?À×ASÑAS×A^ÑA^ÐejÔkˆŒØ‰Õr'   c                 ó$   •— t        ‰|   di |¤Ž y)aÔ	  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The hidden states of the image encoder after modality projection.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or `model.image_token_id`. Tokens with indices set to `model.image_token_id` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> import torch
        >>> from PIL import Image
        >>> from io import BytesIO

        >>> from transformers import AutoProcessor, AutoModelForImageTextToText
        >>> from transformers.image_utils import load_image

        >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
        >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
        >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
        >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

        >>> processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
        >>> model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct", dtype=torch.bfloat16, device_map="auto")

        >>> # Create inputs
        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {"type": "video", "path": path/to/video},
        ...             {"type": "text", "text": "What is happening in this video?"},
        ...         ]
        ...     }
        ... ]

        >>> inputs = processor.apply_chat_template([messages], add_generation_prompt=True)

        >>> # Generate
        >>> generated_ids = model.generate(**inputs, max_new_tokens=256)
        >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

        >>> print(generated_texts)
        ```Nr&   )r­   r£   )r`   Úsuper_kwargsr¸   s     €r(   r£   z'SmolVLMForConditionalGeneration.forward4  s   ø€ ôd 	‰‰Ñ'˜,Ó'r'   )r!   r"   r#   Ú_tied_weights_keysr®   r£   Ú__classcell__)r¸   s   @r(   rª   rª   *  s   ø„ Ø*Ð,RÐSÐô÷2(ð 2(r'   rª   )r   r0   r>   r@   rª   r*   rD   r.   )1rS   Úhuggingface_hub.dataclassesr   r   Úcache_utilsr   r   Ú
generationr   Úmodeling_flash_attention_utilsr	   Úmodeling_outputsr
   Úprocessing_utilsr   r   Úutilsr   r   r   r   r   Úidefics3.configuration_idefics3r   r   Ú"idefics3.image_processing_idefics3r   Ú&idefics3.image_processing_pil_idefics3r   Úidefics3.modeling_idefics3r   r   r   r   r   Ú
get_loggerr!   rž   r   r*   r.   r0   r3   r>   r@   rB   rD   rª   Ú__all__r&   r'   r(   ú<module>rÊ      s  ðó" Ý .Ý ç .Ý *Ý BÝ :ß 4ß jÕ jß RÝ GÝ N÷õ ð 
ˆ×	Ñ	˜HÓ	%€ñ ÐAÔBØô"Ð.ó "ó ó Cð"ô*	Ð4ô 	ô	Ð8ô 	ñ ÐAÔBØôNó ó ó Cðô& ,°eõ ô 	Ð2ô 	ô	Ð8ô 	ô	Ð%Dô 	ôm
=ô m
ô`<(Ð&Fô <(ò~	r'   