
    i<                        d dl Z d dlmZ d dl mZ ddlmZmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZmZmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z!m"Z"  ejF                  e$      Z% ed      e G d de                    Z& G d de!      Z' G d de"      Z( ed      e G d de                    Z) G d ded      Z* G d de      Z+ G d  d!e      Z, G d" d#e      Z- G d$ d%e       Z. G d& d'e      Z/g d(Z0y))    N)strict)nn   )CacheDynamicCache)GenerationConfig)FlashAttentionKwargs)BaseModelOutputWithPooling)ImagesKwargsUnpack)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check   )Idefics3ConfigIdefics3VisionConfig)Idefics3ImageProcessor)Idefics3ImageProcessorPil)Idefics3BaseModelOutputWithPast Idefics3ForConditionalGenerationIdefics3ModelIdefics3PreTrainedModelIdefics3VisionTransformerz$HuggingFaceTB/SmolVLM2-2.2B-Instruct)
checkpointc                       e Zd ZdZdZy)SmolVLMVisionConfiga  
    Example:

    ```python
    >>> from transformers.models.smolvlm.modeling_smolvlm import SmolVLMVisionTransformer
    >>> from transformers.models.smolvlm.configuration_smolvlm import SmolVLMVisionConfig

    >>> # Initializing a SmolVLMVisionConfig with google/siglip-so400m-patch14-384 style configuration
    >>> configuration = SmolVLMVisionConfig()

    >>> # Initializing a SmolVLMVisionTransformer (with random weights) from the google/siglip-so400m-patch14-384 style configuration
    >>> model = SmolVLMVisionTransformer(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```smolvlm_visionN__name__
__module____qualname____doc__
model_type     |/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/smolvlm/modular_smolvlm.pyr   r   *   s    " "Jr'   r   c                       e Zd Zy)SmolVLMPreTrainedModelNr!   r"   r#   r&   r'   r(   r*   r*   A       r'   r*   c                       e Zd Zy)SmolVLMVisionTransformerNr+   r&   r'   r(   r.   r.   E   r,   r'   r.   c                       e Zd ZdZdZy)SmolVLMConfiga  
    scale_factor (`int`, *optional*, defaults to 2):
        The scale factor for the image encoder.

    Example:
    ```python
    >>> from transformers import SmolVLMModel, SmolVLMConfig
    >>> # Initializing configuration
    >>> configuration = SmolVLMConfig()
    >>> # Initializing a model from the configuration
    >>> model = SmolVLMModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```smolvlmNr    r&   r'   r(   r0   r0   I   s     Jr'   r0   c                   :    e Zd ZU dZeed<   eeef   ed<   eed<   y)SmolVLMImageProcessorKwargsaz  
    do_image_splitting (`bool`, *optional*, defaults to `True`):
        Whether to split the image into sub-images concatenated with the original image. They are split into patches
        such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`.
    max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`):
        Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge".
    return_row_col_info (`bool`, *optional*, defaults to `False`):
        Whether to return the row and column information of the images.
    do_image_splittingmax_image_sizereturn_row_col_infoN)	r!   r"   r#   r$   bool__annotations__dictstrintr&   r'   r(   r3   r3   ^   s#     cN"r'   r3   F)totalc                       e Zd Zy)SmolVLMImageProcessorNr+   r&   r'   r(   r>   r>   n   r,   r'   r>   c                       e Zd Zy)SmolVLMImageProcessorPilNr+   r&   r'   r(   r@   r@   r   r,   r'   r@   c                       e Zd Zy)SmolVLMBaseModelOutputWithPastNr+   r&   r'   r(   rB   rB   v   r,   r'   rB   c                      e Zd ZdZdej
                  dej                  dej                  fdZe e	d      	 dd	ej                  d
ej
                  dz  dee   deez  fd              Ze e	d      e	 	 	 	 	 	 	 	 	 ddej
                  dz  dej                  dz  dej
                  dz  dedz  dej                  dz  d	ej                  dz  d
ej"                  dz  dej                  dz  dedz  dee   deez  fd                     Zy)SmolVLMModelz
    A subclass of Idefics3Model. We do *not* remove or block the call to inputs_merger
    in forward. Instead, we override inputs_merger here with custom logic.
    	input_idsinputs_embedsimage_hidden_statesc                     |j                   \  }}}|a| | j                         t        j                  | j                  j
                  t        j                  |j                              k(  }|d   }n|| j                  j
                  k(  }|j                  d      }t        t        j                  ||z  dk(        d       ||z  }t        j                  j                  j                  |j                  d      dd      }	|	d d	 }
|j                  d	      }|dz
  |z  }|dz
  |z  }|
j                  d      |z   }t        j                   |      }|||   ||   d d f   ||<   t        j"                  |j                  d	      ||      }|S )
Ndtypedevice).r      dimr   zCAt least one sample has <image> tokens not divisible by patch_size.)rL   r   )value)shapeget_input_embeddingstorchtensorconfigimage_token_idlongrK   sumr   allr   
functionalpadcumsum	unsqueeze
zeros_likewhere)selfrE   rF   rG   _
patch_size
image_masknum_image_tokensblocks_per_sampleoffsetsblock_offsetrow_cum	chunk_idx	local_idx	block_idximage_embedsmerged_embedss                    r(   inputs_mergerzSmolVLMModel.inputs_merger   s    /44:q&*E$*C*C*ET[[77uzzR_RfRfg+ J $F+J"dkk&@&@@J%>>a>0II&3q89Q	
 -
:((%%))*;*B*Bq*B*I6YZ)[s|###+q[Z/	q[J.	 **1-	9	''6#6y7LiXbNcef7f#gZ J$8$8$<lMZr'   zVEncodes images into continuous embeddings that can be forwarded to the language model.)custom_introNpixel_valuespixel_attention_maskkwargsreturnc                    |j                   \  }}}}}|j                  | j                        } |j                  ||z  g|j                   dd  }|j                   dd j	                         }	|dk(  j                  d      |	k7  }
|
dxx   t        j                  |
       z  cc<   ||
   j                         }|Lt        j                  d	D cg c]  }|j                   |    c}t        j                  |j                  
      }n6 |j                  ||z  g|j                   dd  }||
   j                         }| j                  j                  j                  }|j                  d||      }|j                  d||      }|j                  d      dkD  j                         } | j                   d||dd|}|j"                  }| j%                  |      }||_        |S c c}w )a4  
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        pixel_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask indicating padded regions in the image.
        )rJ   r   NrL   g        )rP   rM   r   )r   r   r   )sizerJ   rK   )	dimensionrw   step)rP   ru   T)rp   patch_attention_maskreturn_dictr&   )rQ   torJ   viewnumelrX   rS   any
contiguousonesr7   rK   rU   vision_configrb   unfoldvision_modellast_hidden_state	connectorpooler_output)r`   rp   rq   rr   
batch_size
num_imagesnum_channelsheightwidthnb_values_per_imagereal_images_indsirb   patches_subgridrz   image_outputsrG   image_featuress                     r(   get_image_featureszSmolVLMModel.get_image_features   s     ?K>P>P;
Jfe#TZZ8(|((j)@Z<CUCUVWVXCYZ +004::<(C/444FJ]] 			*: ;;;#$45@@B'#(::5>?l((+?jj#**$  $=#7#<#<Z*=T#vWkWqWqrsrtWu#v #78H#I#T#T#V [[..99
.55
Yc5d)001:T^0_ / 3 3 3 AA EKKM *)) 
%<P^b
fl
 ,== (;<&4#/ @s   G&a  
        Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
        the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
        max_num_images is the maximum number of images among the batch_size samples in the batch.
        Padding images are not needed beyond padding the pixel_values at the entrance of the model.
        For efficiency, we only pass through the vision_model's forward the real images by
        discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
        image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
        attention_maskposition_idspast_key_values	use_cachec
           	      6   | j                   r/| j                  j                  r|	rt        j	                  d       d}	||j
                  \  }}n||j
                  \  }}}nt        d      |	r|t        | j                        }|9 | j                  j                         |      j                  |j                        }||t        d      |:| j                  ||d      j                  }|j                  |j                        }n)|'|j                  | j                  |j                        }|| j                  |||	      } | j                  d|||||	d
|
}t!        |j"                  |j$                  |j&                  |j(                  |      S )NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz5You have to specify either input_ids or inputs_embeds)rU   zMYou cannot specify both pixel_values and image_hidden_states at the same timeT)r{   rI   )rE   rF   rG   )rF   r   r   r   r   )r   r   hidden_states
attentionsrG   r&   )training
text_modelgradient_checkpointingloggerwarning_oncerQ   
ValueErrorr   rU   rR   r|   rK   r   r   rJ   rn   rB   r   r   r   r   )r`   rE   r   r   r   rF   rp   rq   rG   r   rr   r   
seq_lengthra   outputss                  r(   forwardzSmolVLMModel.forward   s   4 ==T__CC	l I %.__"J
&(5(;(;%J
ATUU0*$++>O BDOO@@B9MPPQZQaQabM#(;(Glmm#"&"9"92 #: #m   #6"8"89M9M"N ,"5"8"8tzzR_RfRf"8"g* ..#+$7 / M "$// 
')%+
 
 .%77#33!//)) 3
 	
r'   )N)	NNNNNNNNN)r!   r"   r#   r$   rS   
LongTensorTensorrn   r   r   FloatTensorr   r   tupler
   r   r   
BoolTensorr7   r	   rB   r   r&   r'   r(   rD   rD   z   s   
)):?,,]b]i]iB m 9=2''2 $..52 +,	2
 
+	+2 2h 
  .2.204(,26158<8<!%A
##d*A
 t+A
 &&-	A

 A
 ((4/A
 ''$.A
 $..5A
 #..5A
 $;A
 -.A
 
/	/A
 
 A
r'   rD   c                   0     e Zd ZddiZ fdZ fdZ xZS )SmolVLMForConditionalGenerationzlm_head.weightz$model.text_model.embed_tokens.weightc                 J   t         |   |       t        |      | _        t	        j
                  |      | j                  j                  _        t        j                  |j                  j                  |j                  j                  d      | _        | j                          y )NF)bias)super__init__rD   modelr   from_model_configr   generation_configr   Lineartext_confighidden_size
vocab_sizelm_head	post_init)r`   rU   	__class__s     r(   r   z(SmolVLMForConditionalGeneration.__init__-  sq     !&)
2B2T2TU[2\

/yy!3!3!?!?ASASA^A^ejkr'   c                 $    t        |   di | y)a	  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The hidden states of the image encoder after modality projection.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or `model.image_token_id`. Tokens with indices set to `model.image_token_id` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> import torch
        >>> from PIL import Image
        >>> from io import BytesIO

        >>> from transformers import AutoProcessor, AutoModelForImageTextToText
        >>> from transformers.image_utils import load_image

        >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
        >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
        >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
        >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

        >>> processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
        >>> model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct", dtype=torch.bfloat16, device_map="auto")

        >>> # Create inputs
        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {"type": "video", "path": path/to/video},
        ...             {"type": "text", "text": "What is happening in this video?"},
        ...         ]
        ...     }
        ... ]

        >>> inputs = processor.apply_chat_template([messages], add_generation_prompt=True)

        >>> # Generate
        >>> generated_ids = model.generate(**inputs, max_new_tokens=256)
        >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

        >>> print(generated_texts)
        ```Nr&   )r   r   )r`   super_kwargsr   s     r(   r   z'SmolVLMForConditionalGeneration.forward4  s    d 	','r'   )r!   r"   r#   _tied_weights_keysr   r   __classcell__)r   s   @r(   r   r   *  s    *,RS2( 2(r'   r   )r   r0   r>   r@   r   r*   rD   r.   )1rS   huggingface_hub.dataclassesr   r   cache_utilsr   r   
generationr   modeling_flash_attention_utilsr	   modeling_outputsr
   processing_utilsr   r   utilsr   r   r   r   r   idefics3.configuration_idefics3r   r   "idefics3.image_processing_idefics3r   &idefics3.image_processing_pil_idefics3r   idefics3.modeling_idefics3r   r   r   r   r   
get_loggerr!   r   r   r*   r.   r0   r3   r>   r@   rB   rD   r   __all__r&   r'   r(   <module>r      s  "  .  . * B : 4 j j R G N  
		H	% AB". "  C"*	4 		8 	 ABN   C&,e  	2 		8 		%D 	m
= m
`<(&F <(~	r'   