
    iMO                     L   d Z ddlZddlZddlmc mZ ddlmZ ddlmZ ddl	m
Z ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+ ddl,m-Z-m.Z.m/Z/  ed      e G d de+                    Z0 ed      e G d de*                    Z1 ed      e G d de)                    Z2 G d de/      Z3 G d d e'      Z4 G d! d"e&      Z5 G d# d$ejl                        Z7 G d% d&e#      Z8 G d' d(e-      Z9 G d) d*e      Z: G d+ d,e.      Z; G d- d.ejl                        Z<e G d/ d0e             Z= ed12       G d3 d4e=             Z> ed52       G d6 d7e=             Z?e G d8 d9e"             Z@g d:ZAy);z%Pytorch implementation of AIMv2 Model    N)strict)nn   )initialization)PreTrainedConfig)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )	CLIPModelCLIPTextEmbeddings_get_vector_norm)LlamaMLPLlamaRMSNorm)SiglipConfigSiglipTextConfigSiglipVisionConfig)SiglipAttentionSiglipEncoderSiglipOutputz!apple/aimv2-large-patch14-224-lit)
checkpointc                       e Zd ZU dZdZeed<   dZeed<   dZeed<   dZ	eed	<   d
Z
eee   z  eeef   z  ed<   dZeed<   dZeez  ed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<    e       Zy)Aimv2VisionConfiga  
    use_head (`str`, *optional*, defaults to `True`):
        Whether to use Attention Pooling Head or Not.
    is_native (`str`, *optional*, defaults to `False`):
        Whether to use ckpt trained for image native resolution or not.

    Example:

    ```python
    >>> from transformers import SiglipVisionConfig, SiglipVisionModel

    >>> # Initializing a Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
    >>> configuration = Aimv2VisionConfig()

    >>> # Initializing a Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
    >>> model = Aimv2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```i   hidden_sizei   intermediate_size   num_hidden_layers   num_attention_heads   
patch_sizeh㈵>rms_norm_eps        attention_dropoutFqkv_biasmlp_biassilu
hidden_act{Gz?initializer_rangeTuse_head	is_nativeN)__name__
__module____qualname____doc__r"   int__annotations__r#   r%   r'   r)   listtupler+   floatr-   r.   boolr/   r1   strr3   r4   r5   AttributeErrorlayer_norm_eps     x/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/aimv2/modular_aimv2.pyr!   r!   (   s    * K!s!s  46Jd3i%S/16L%%(us{(HdHdJ#u#HdIt#%NrD   r!   c                       e Zd ZU dZeed<   dZeed<   dZeed<   dZeed<   d	Z	eed
<   dZ
eed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<    e       Z e       Z e       Z e       Zd Zy)Aimv2TextConfigi   
vocab_sizei   r"   i   r#      r%      r'   M   max_position_embeddingsr0   r1   r*   r+   Fr.   r/   r2   r3   c                 .    t        j                  di | y )NrC   )r   __post_init__)selfkwargss     rE   rN   zAimv2TextConfig.__post_init__d   s    &&00rD   N)r6   r7   r8   rH   r:   r;   r"   r#   r%   r'   rL   r1   r@   r+   r>   r.   r?   r/   r3   rA   bos_token_idpad_token_idrB   projection_sizerN   rC   rD   rE   rG   rG   Q   s     JK!s!s  #%S%JL%HdHd#u#!#L!#L#%N$&O1rD   rG   c                   <    e Zd ZU dZdZeed<   dZeed<   dZ	eed<   y)	Aimv2Configa  
    max_logit_scale (`float`, *optional*, defaults to `100.0`):
        The maximum logit scale to use

    Example:

    ```python
    >>> from transformers import Aimv2Config, Aimv2Model

    >>> # Initializing a Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration
    >>> configuration = Aimv2Config()

    >>> # Initializing a Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration
    >>> model = Aimv2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a Aimv2Config from a Aimv2TextConfig and a Aimv2VisionConfig
    >>> from transformers import Aimv2TextConfig, Aimv2VisionConfig

    >>> # Initializing a AIMv2Text and AIMv2Vision configuration
    >>> config_text = Aimv2TextConfig()
    >>> config_vision = Aimv2VisionConfig()

    >>> config = Aimv2Config(text_config=config_text, vision_config=config_vision)
    ```i   projection_dimg/L
F@logit_scale_init_valueg      Y@max_logit_scaleN)
r6   r7   r8   r9   rV   r:   r;   rW   r>   rX   rC   rD   rE   rU   rU   h   s(    8 NC$*E*"OU"rD   rU   c                       e Zd Zy)Aimv2OutputNr6   r7   r8   rC   rD   rE   rZ   rZ          rD   rZ   c                       e Zd Zy)Aimv2RMSNormNr[   rC   rD   rE   r^   r^      r\   rD   r^   c                       e Zd Zy)Aimv2MLPNr[   rC   rD   rE   r`   r`      r\   rD   r`   c                        e Zd Zdef fdZedddej                  fdej                  fd       Z	dej                  dej                  fd	Z
 xZS )
Aimv2VisionEmbeddingsconfigc                 B   t         |           || _        |j                  | _        t	        j
                  |j                  |j                  |j                  |j                        | _        t        |j                  |j                        | _        |j                  |j                  z  dz  }| j                  j                  s%t	        j                  ||j                        | _        | j!                  dt#        j$                  |      j'                  d      d       y )N)kernel_sizestrider   position_ids   F)
persistent)super__init__rc   r)   r   Conv2dnum_channelsr"   patch_embedr^   r+   rms_norm
image_sizer5   	Embeddingposition_embeddingregister_buffertorcharangeexpand)rO   rc   num_patches	__class__s      rE   rm   zAimv2VisionEmbeddings.__init__   s     ++99!3!3ARAR[a[l[l
 %V%7%79L9LM((F,=,==!C{{$$&(ll;@R@R&SD#^U\\+-F-M-Mg-VchirD      g     @cpureturnc                 :   t        j                  t        |      ||      }t        j                  t        |       ||      }t        j                  ||d      \  }}|dz  }t        j                  |||      |z  }	d||	z  z  }	|j	                         d   |	d d d f   z  }
|j	                         d   |	d d d f   z  }t        j
                  |
j                         |
j                         |j                         |j                         gd      d d d d d f   S )	Ndtypedevicexy)indexing   g      ?).Nri   dim)rv   rw   r:   meshgridflattenconcatsincos)heightwidth	embed_dimtemperaturer   r   grid_wgrid_hpos_dimomegaout_hout_ws               rE   "build_2d_sincos_position_embeddingz8Aimv2VisionEmbeddings.build_2d_sincos_position_embedding   s     c%jfEc&kvFFq.WE&AGK{E)* +eD!Gn< +eD!Gn<||UYY[%))+uyy{EIIKPVWXY]_`bcYcddrD   pixel_valuesc                    |j                         \  }}}}| j                  |      j                  d      j                  dd      }| j	                  |      }| j
                  j                  rY| j                  || j                  z  || j                  z  | j
                  j                  |j                  |j                        }n| j                  | j                        }||z   }|S )Nr   ri   )r   r   r   )sizerp   r   	transposerq   rc   r5   r   r)   r"   r   r   rt   rg   )rO   r   _r   r   hidden_states	pos_embeds          rE   forwardzAimv2VisionEmbeddings.forward   s    *//11fe((6>>qAKKAqQm4;;  ??$//)(++11$++#)) @ I //0A0ABI%	1rD   )r6   r7   r8   r!   rm   staticmethodrv   float32Tensorr   r   __classcell__rz   s   @rE   rb   rb      s]    j0 j !$'%u}}e	e e ELL U\\ rD   rb   c                       e Zd Zy)Aimv2TextEmbeddingsNr[   rC   rD   rE   r   r      r\   rD   r   c                        e Zd Z fdZ xZS )Aimv2Attentionc                    t         |   |       t        j                  | j                  | j                  |j
                        | _        t        j                  | j                  | j                  |j
                        | _        t        j                  | j                  | j                  |j
                        | _        t        j                  | j                  | j                  |j
                        | _	        y )Nbias)
rl   rm   r   Linearr   r.   k_projv_projq_projout_projrO   rc   rz   s     rE   rm   zAimv2Attention.__init__   s     iiV__UiiV__UiiV__U		$..$..vWrD   )r6   r7   r8   rm   r   r   s   @rE   r   r      s    X XrD   r   c            	            e Zd Zdef fdZ	 d	dej                  dej                  dz  dee   dej                  fdZ	 xZ
S )
Aimv2EncoderLayerrc   c                     t         |           t        |      | _        t	        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        y N)rl   rm   r   	attentionr`   ffnr^   r"   r+   	rms_norm1	rms_norm2r   s     rE   rm   zAimv2EncoderLayer.__init__   sZ    '/F#%f&8&8&:M:MN%f&8&8&:M:MNrD   Nr   attention_maskrP   r}   c                     | j                  |      } | j                  d||d|\  }}||z   }| j                  |      }| j                  |      }||z   }|S )N)r   r   rC   )r   r   r   r   )rO   r   r   rP   norm_hidden_statesattn_outputr   
mlp_outputs           rE   r   zAimv2EncoderLayer.forward   sl     "^^M:'r6HYgrkqrQ%3!^^M:XX01
%
2rD   r   )r6   r7   r8   r!   rm   rv   r   r   r   r   r   r   s   @rE   r   r      sY    O0 O /3|| t+ +,	
 
rD   r   c                       e Zd Zy)Aimv2EncoderNr[   rC   rD   rE   r   r      r\   rD   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Aimv2AttentionPoolingHeadrc   c                 &   t         |           |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  t        j                  dd| j                              | _        t        j                  | j                  | j                  d      | _        y )Nr   ri   T)rl   rm   r"   r'   	num_headsr   r   r.   r   r   	Parameterrv   zeros	cls_tokenoutput_projr   s     rE   rm   z"Aimv2AttentionPoolingHead.__init__   s    !--33ii 0 0$2B2BYii 0 0$2B2BYekk!Q8H8H&IJ99T%5%5t7G7GdSrD   r   r}   c                    |j                   \  }}}| j                  j                  |dd      }| j                  |      j	                  ||| j
                  || j
                  z        }| j                  |      j	                  ||| j
                  || j
                  z        }|j	                  |d| j
                  || j
                  z        }|j                  dddd      }|j                  dddd      }|j                  dddd      }t        j                  |||      }	|	j                  dd      j	                  |d|      }	|	j                  d      }	| j                  |	      }
|
S )Nrj   ri   r   r   r   r   )shaper   rx   r   reshaper   r   permuteFscaled_dot_product_attentionr   meanr   )rO   r   
batch_sizeseq_len
hidden_dimr   keyvaluequeryr   outputs              rE   r   z!Aimv2AttentionPoolingHead.forward  sH   *7*=*='
GZNN))*b"=	kk-(00WdnnV`dhdrdrVrsM*22:wXbfjftftXtu!!*at~~A]^kk!Q1%aAq)aAq)44UCG!++Aq199*aT!&&1&-!!+.rD   )	r6   r7   r8   r!   rm   rv   r   r   r   r   s   @rE   r   r      s-    	T0 	TU\\ ell rD   r   c                   v     e Zd ZU dZeed<   dZdZdZg dZ	dZ
dZdZ ej                          fd       Z xZS )Aimv2PreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. The model is only intended for inference and doesn't support finetuning.
    rc   aimv2)imageT)r   r   rb   r   c                 $   t         |   |       t        |d      rYt        |j                  t
        j                        r4t        j                  |j                  t        j                  d             y y t        |t              r7t        j                  |j                  d| j                  j                         y t        |t               rZt        j"                  |j$                  t'        j(                  |j$                  j*                  d         j-                  d             y t        |t.              rZt        j"                  |j$                  t'        j(                  |j$                  j*                  d         j-                  d             y y )Nlogit_scaleg$I$I,@r,   )r   stdrj   rh   )rl   _init_weightshasattr
isinstancer   r   r   init	constant_mathlogr   normal_r   rc   r3   rb   copy_rg   rv   rw   r   rx   r   )rO   modulerz   s     rE   r   z"Aimv2PreTrainedModel._init_weights-  s   f%6=)&,,bll;v11488H3EF < 9:LL))9V9VW 56JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 34JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 5rD   )r6   r7   r8   r9   rU   r;   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attnrv   no_gradr   r   r   s   @rE   r   r     sY    
 !&*# NU]]_
i 
irD   r   zL
    The Vision model from AIMv2 without any head or projection on top.
    )custom_introc                        e Zd ZU eed<   dZeedZdef fdZ	de
j                  fdZe ed      ed	ee   defd
                     Z xZS )Aimv2VisionModelrc   r   r   
attentionsc                 6   t         |   |       || _        t        |      | _        t        |      | _        t        |j                  |j                        | _
        |j                  | _        | j                  rt        |      | _        | j                          y r   )rl   rm   rc   rb   
embeddingsr   encoderr^   r"   r+   rq   r4   r   head	post_initr   s     rE   rm   zAimv2VisionModel.__init__H  sq     /7#F+$V%7%79L9LM==1&9DIrD   r}   c                 .    | j                   j                  S r   )r   rp   rO   s    rE   get_input_embeddingsz%Aimv2VisionModel.get_input_embeddingsV  s    ***rD   Ftie_last_hidden_statesrP   c                     | j                  |      } | j                  dd|i|}|j                  }| j                  |      }| j                  r| j                  |      nd}t        ||      S )a3  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Siglip2VisionModel

        >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```inputs_embedsNlast_hidden_statepooler_outputrC   )r   r   r  rq   r4   r   r   )rO   r   rP   r   encoder_outputsr  r  s          rE   r   zAimv2VisionModel.forwardY  sz    < 5+74<< ,
',
,

 ,== MM*;<8<		"344)/'
 	
rD   )r6   r7   r8   r!   r;   main_input_namer   r   _can_record_outputsrm   r   Moduler   r   r   r   r   r   r   r   r   r   s   @rE   r   r   ;  s~     $O*$
0 +bii +  E2*
 +,*
 
$	*
  3  *
rD   r   zJ
    The text model from AIMv2 without any head or projection on top.
    c            
            e Zd ZdZeedZdef fdZde	j                  fdZd Ze ed	      e	 ddej"                  d
z  dee   defd                     Z xZS )Aimv2TextModel	input_idsr   rc   c                     t         |   |       || _        t        |      | _        t        |      | _        t        |j                  |j                        | _
        |j                  | _        | j                          y r   )rl   rm   rc   r   r   r   r   r^   r"   r+   rq   eos_token_idr   r   s     rE   rm   zAimv2TextModel.__init__  sa     -f5#F+$V%7%79L9LM"//rD   r}   c                 .    | j                   j                  S r   r   token_embeddingr   s    rE   r   z#Aimv2TextModel.get_input_embeddings  s    ...rD   c                 &    || j                   _        y r   r  )rO   r   s     rE   set_input_embeddingsz#Aimv2TextModel.set_input_embeddings  s    */'rD   Fr   Nr   rP   c                    | j                  |      }|j                  \  }}}t        j                  |t        j                  |j
                        }|j                  d      j                  |d      }|t        | j                  |||d       } | j                  d	||d|}	|	j                  }
| j                  |
      }
|
t        j                  |
j                  d   |
j
                        |j                  t        j                  |
j
                        | j                  k(  j                         j!                  d      f   }t#        |
|      S )
Nr   r   rj   )rc   r   rg   r   past_key_values)r   r   )r   r   r   rC   )r   r   rv   rw   longr   	unsqueezerx   r   rc   r   r  rq   tor:   r  argmaxr   )rO   r	  r   rP   r   r   r   r   rg   r  r  pooled_outputs               rE   r   zAimv2TextModel.forward  sK    	2!.!4!4
GQ||G5::mFZFZ[#--a077
BG%/{{+)- $N '$,, 
')
 
 ,== MM*;< *LL*003<M<T<TU\\		2C2J2J\KtO`O``eegnnsunvx

 */'
 	
rD   r   )r6   r7   r8   r  r   r   r  rG   rm   r   r  r   r  r   r   r   rv   r   r   r   r   r   r   r   s   @rE   r  r    s     "O +$
	 	/bii /0  E2 /3&
 t+&
 +,	&

 
$&
  3  &
rD   r  c                       e Zd ZdZdefdZee	 	 	 ddej                  dz  dej                  dz  dej                  dz  dee   d	ef
d
              Zy)
Aimv2ModelTrc   c                    t        j                  | |       |j                  | _        |j                  j                  | _        |j                  j                  | _        t        j                  |j                        | _
        t        j                  |j                        | _        t        j                  | j
                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j"                  t%        j&                  | j(                  j*                              | _        t/        j0                  |j2                        | _        | j7                          y )NFr   )r   rm   rV   vision_configr"   vision_embed_dimtext_configtext_embed_dimr   _from_configvision_modelr  
text_modelr   r   visual_projectiontext_projectionr   rv   tensorrc   rW   r   r   r   rX   max_log_logit_scaler   )rO   rc   s     rE   rm   zAimv2Model.__init__  s     v.$33 & 4 4 @ @$00<<,99&:N:NO(55f6H6HI!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY#'88F,B,B#C rD   Nr	  r   r   rP   r}   c                     | j                   dd|i|} | j                  d||d|}|j                  }| j                  |      }|j                  }| j	                  |      }|t        |      z  }|t        |      z  }| j                  j                  d| j                        j                         j                  |j                        }	|	|z  |j                         z  }
|
j                         }t        ||
||||      S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```r   )r	  r   r,   )logits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputrC   )r   r!  r  r"  r#  r   r   clampr%  expr  r   trZ   )rO   r	  r   r   rP   vision_outputstext_outputsr*  r)  r   r(  r'  s               rE   r   zAimv2Model.forward  s'   B 6GT5F5F 6
%6
6

 4C4?? 4
)4
 4
 &33--l;"00**;7 $&6|&DD!$4[$AA&&,,S$2J2JKOOQTTU`UgUgh&48HH*,,.-+#%* .
 	
rD   )NNN)r6   r7   r8   r   rU   rm   r   r   rv   
LongTensorFloatTensorr   r   r   rZ   r   rC   rD   rE   r  r    s    { $  .215.2	?
##d*?
 ''$.?
 t+	?

 +,?
 
?
  ?
rD   r  )rU   r!   rG   r   r  r   r  )Br9   r   rv   torch.nn.functionalr   
functionalr   huggingface_hub.dataclassesr    r   r   configuration_utilsr   masking_utilsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   clip.modeling_clipr   r   r   llama.modeling_llamar   r   siglip.configuration_siglipr   r   r   siglip.modeling_siglipr   r   r   r!   rG   rU   rZ   r^   r`   r  rb   r   r   r   r   r   r   r   r  r  __all__rC   rD   rE   <module>rF     s   ,     .  & 3 / 9 K - & I I 7 5 P P 9 \ \ Q Q >?$&* $&  @$&N >?1& 1  @1* >?#, #  @#D	, 		< 		x 	1BII 1h	, 	X_ X2 2	= 			 D i? i iD 
F
+ F

F
R 
B
) B

B
J V
 V
 V
rrD   