
    i                         d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddl	m
Z dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)  ejT                  e+      Z,dejZ                  dejZ                  fdZ.dejZ                  dejZ                  fdZ/dejZ                  dejZ                  fdZ0e ed       G d de                    Z1e ed       G d  d!e                    Z2ee G d" d#e                    Z3 G d$ d%ejh                        Z5 G d& d'ejh                        Z6	 dOd(ejh                  d)ejZ                  d*ejZ                  d+ejZ                  d,ejZ                  dz  d-e7d.e7d/ee   fd0Z8 G d1 d2ejh                        Z9 G d3 d4ejh                        Z: G d5 d6e      Z;e G d7 d8e             Z< G d9 d:ejh                        Z= G d; d<e<      Z> ed=       G d> d?e<             Z? G d@ dAe<      Z@ edB       G dC dDe<             ZAe G dE dFe<             ZBe G dG dHe<             ZCe G dI dJe<             ZD edK       G dL dMe<             ZEg dNZFy)PzPyTorch CLIP model.    )Callable)	dataclass)AnyN)nn   )initialization)ACT2FN)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )
CLIPConfigCLIPTextConfigCLIPVisionConfiglogitsreturnc                     t         j                  j                  | t        j                  t        |       | j                              S )Ndevice)r   
functionalcross_entropytorcharangelenr"   )r   s    w/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/clip/modeling_clip.pycontrastive_lossr)   /   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)r)   t)r+   caption_loss
image_losss      r(   	clip_lossr0   3   s,    #J/L!*,,.1J:%,,r*   tensorc                     t        j                  | d      }t        j                  |dd      }t        j                  |d      }|S )z
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
    model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
       T)dimkeepdim      ?)r%   powsum)r1   square_tensor
sum_tensornormed_tensors       r(   _get_vector_normr=   9   s<    
 IIfa(M=b$?JIIj#.Mr*   z}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)CLIPVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__rA   r%   FloatTensor__annotations__rB   rC   tuplerD    r*   r(   r@   r@   D   sr    
 .2L%##d*126u((4/6:>M5**C/047>7;Je'',-4;r*   r@   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)CLIPTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsrB   .rC   rD   )rE   rF   rG   rH   rO   r%   rI   rJ   rB   rC   rK   rD   rL   r*   r(   rN   rN   V   sr    
 -1K""T)026u((4/6:>M5**C/047>7;Je'',-4;r*   rN   c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeed<   dZeed	<   d
ee   fdZy)
CLIPOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPVisionModel`].
    Nlosslogits_per_imagelogits_per_textrO   rA   text_model_outputvision_model_outputr   c                 B    t        d | j                         D              S )Nc              3   `   K   | ]&  }t        |t              r|j                         n| ( y wN)
isinstancer   to_tuple).0vs     r(   	<genexpr>z&CLIPOutput.to_tuple.<locals>.<genexpr>   s$     ^1Z;%?QZZ\QF^s   ,.)rK   valuesselfs    r(   r[   zCLIPOutput.to_tuple   s    ^PTP[P[P]^^^r*   )rE   rF   rG   rH   rR   r%   rI   rJ   rS   rT   rO   rA   rU   r   rV   rK   r   r[   rL   r*   r(   rQ   rQ   h   s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148186:3:_%* _r*   rQ   c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )CLIPVisionEmbeddingsconfigc                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebiasr3   r   position_idsr   r4   
persistent)super__init__rd   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr%   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr&   expandra   rd   	__class__s     r(   rp   zCLIPVisionEmbeddings.__init__   s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr*   
embeddingsheightwidthr   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr4   r7   r   r3   bicubicF)sizemodealign_cornersr5   )shaper~   weight	unsqueezer%   jit
is_tracingrk   rt   r   reshapepermuter   r#   interpolateviewcat)ra   r   r   r   r{   r~   r|   class_pos_embedpatch_pos_embedr5   
new_height	new_widthsqrt_num_positionss                r(   interpolate_pos_encodingz-CLIPVisionEmbeddings.interpolate_pos_encoding   sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr*   pixel_valuesc                 `   |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  j                  j
                  }| j                  |j                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }	t        j                  |	|gd	      }
|r|
| j                  |
||      z   }
|
S |
| j                  | j                        z   }
|
S )
NzInput image size (*z) doesn't match model (z).)dtyper3   r   r4   r   )r   rs   
ValueErrorrz   r   r   toflatten	transposerw   r   r%   r   r   r~   rk   )ra   r   r   
batch_size_r   r   target_dtypepatch_embedsclass_embedsr   s              r(   forwardzCLIPVisionEmbeddings.forward   s6   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr*   F)rE   rF   rG   r   rp   r%   Tensorintr   rI   r   __classcell__r   s   @r(   rc   rc      se    q/ q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf r*   rc   c            	            e Zd Zdef fdZ	 	 	 d	dej                  dz  dej                  dz  dej                  dz  dej                  fdZ	 xZ
S )
CLIPTextEmbeddingsrd   c                 N   t         |           |j                  }t        j                  |j
                  |      | _        t        j                  |j                  |      | _        | j                  dt        j                  |j                        j                  d      d       y )Nrk   rl   Frm   )ro   rp   rq   r   r}   
vocab_sizetoken_embeddingmax_position_embeddingsr~   r   r%   r&   r   ra   rd   rr   r   s      r(   rp   zCLIPTextEmbeddings.__init__   s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r*   N	input_idsrk   inputs_embedsr   c                 8   ||j                   d   n|j                   d   }| j                  j                  j                   d   }||kD  rt        d| d|       || j                  d d d |f   }|| j                  |      }| j                  |      }||z   }|S )Nr4   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r   r~   r   r   rk   r   )ra   r   rk   r   
seq_lengthmax_position_embeddingposition_embeddingsr   s           r(   r   zCLIPTextEmbeddings.forward   s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H..d,<=S<TV 
 ,,Q^<L  00;M"55lC"%88
r*   NNN)rE   rF   rG   r   rp   r%   
LongTensorrI   r   r   r   r   s   @r(   r   r      sj    

~ 

 .20426	##d* &&- ((4/	
 
r*   r   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr4   r   )r5   r   )ptrainingr   r3   )r%   matmulr   r   r#   softmaxfloat32r   r   r   r   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputs
             r(   eager_attention_forwardr     s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r*   c                        e Zd ZdZdeez  f fdZ	 d
dej                  dej                  dz  de	e
   deej                  ej                  dz  f   fd	Z xZS )CLIPAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrd   c                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  dz  | _        |j                  | _
        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )N      F)ro   rp   rd   rq   rr   num_attention_heads	num_headshead_dimscaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr   s     r(   rp   zCLIPAttention.__init__  s    ++33$..8]]D(
//ii?ii?ii?		$..$..Ar*   NrC   r   r   r   c                    |j                   dd }g |d| j                  }| j                  |      }| j                  |      }| j	                  |      }|j                  |      j                  dd      }|j                  |      j                  dd      }|j                  |      j                  dd      }t        j                  | j                  j                  t              }	 |	| ||||f| j                  | j                  sdn| j                  d|\  }
} |
j                  g |d j!                         }
| j#                  |
      }
|
|fS )z#Input shape: Batch x Time x ChannelNr4   r   r3           )r   r   )r   r   r   r   r   r   r   r   get_interfacerd   _attn_implementationr   r   r   r   r   r   r   )ra   rC   r   r   input_shapehidden_shapequerieskeysr_   attention_interfacer   r   s               r(   r   zCLIPAttention.forward,  sO    $))#2.88b8$--8++m,{{=)]+,,|,66q!<yy&00A6\*44Q:(?(M(MKK,,.E)
 %8	%
 JJ#}}C$,,	%
 	%
!\ *k));;;;FFHmmK0L((r*   rY   )rE   rF   rG   rH   r   r   rp   r%   r   r   r   rK   r   r   r   s   @r(   r   r     st    GB/.@ B$ /3%)||%) t+%) +,	%)
 
u||U\\D00	1%)r*   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )CLIPMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y rY   )ro   rp   rd   r	   
hidden_actactivation_fnr   r   rq   intermediate_sizefc1fc2r   s     r(   rp   zCLIPMLP.__init__U  sd    #F$5$5699V//1I1IJ99V55v7I7IJr*   rC   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rY   )r   r   r   )ra   rC   s     r(   r   zCLIPMLP.forward\  s4    /**=9/r*   )rE   rF   rG   rp   r%   r   r   r   r   s   @r(   r   r   T  s$    KU\\ ell r*   r   c                        e Zd Zdeez  f fdZdej                  dej                  dee	   dej                  fdZ xZS )CLIPEncoderLayerrd   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y N)eps)ro   rp   rq   rr   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r   s     r(   rp   zCLIPEncoderLayer.__init__d  sl    ++&v.<<F<Q<QR6?<<F<Q<QRr*   rC   r   r   r   c                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)rC   r   rL   )r   r   r   r   )ra   rC   r   r   residualr   s         r(   r   zCLIPEncoderLayer.forwardl  s     !((7)4>> 
')
 
q
 !=0 ((7/ =0r*   )rE   rF   rG   r   r   rp   r%   r   r   r   rI   r   r   r   s   @r(   r   r   c  sV    S/.@ S||  +,	
 
		r*   r   c                   l    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZeedZ ej                          d        Zy)CLIPPreTrainedModelrd   clip)imagetextT)rC   rD   c                    | j                   j                  }t        |t              rt	        j
                  |j                  j                  d|dz         t	        j
                  |j                  j                  d|dz         t	        j                  |j                  t        j                  |j                  j                  d         j                  d             nt        |t              r| j                   j                  }t	        j
                  |j                   d|j"                  dz  |z         t	        j
                  |j$                  j                  |j                   j&                  |z         t	        j
                  |j                  j                  |j                   j&                  |z         t	        j                  |j                  t        j                  |j(                        j                  d             nt        |t*              r| j                   j                  }|j"                  dz  d|j                   j,                  z  dz  z  |z  }|j"                  dz  |z  }t	        j
                  |j.                  j                  |       t	        j
                  |j0                  j                  |       t	        j
                  |j2                  j                  |       t	        j
                  |j4                  j                  |       nt        |t6              r| j                   j                  }|j                   j8                  dz  d|j                   j,                  z  dz  z  |z  }d|j                   j8                  z  dz  |z  }t	        j
                  |j:                  j                  |       t	        j
                  |j<                  j                  |       nt        |t>              rt	        j
                  |j@                  j                  |jB                  dz  | j                   j                  z         t	        j
                  |jD                  j                  |jF                  dz  | j                   j                  z         nGt        |tH              rZt	        j
                  |jD                  j                  | j                   j8                  dz  | j                   j                  z         nt        |tJ              rZt	        j
                  |j@                  j                  | j                   j8                  dz  | j                   j                  z         nst        |tL              rct	        j
                  |jN                  j                  | j                   jP                  j8                  dz  | j                   j                  z         t        |tR        jT                        r>t	        jV                  |jX                         t	        jZ                  |j                         t        |tR        j\                        r-|jX                   t	        jV                  |jX                         y	y	y	)
zInitialize the weightsr   g{Gz?)meanstdr4   rl   r   )r  r3   N)/rd   initializer_factorrZ   r   initnormal_r   r   r~   copy_rk   r%   r&   r   r   rc   rw   rr   rz   initializer_ranger|   r   num_hidden_layersr   r   r   r   r   rq   r   r   	CLIPModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimCLIPVisionModelWithProjectionCLIPTextModelWithProjectionCLIPForImageClassification
classifiervision_configr   r   zeros_rj   ones_r   )ra   r   factorin_proj_stdout_proj_stdfc_stds         r(   _init_weightsz!CLIPPreTrainedModel._init_weights  su    //f01LL//66SftmTLL2299RVWJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 45[[33FLL//cv?O?OQU?UX^?^_LL//66FMM<[<[^d<deLL2299v}}?^?^ag?ghJJv**ELL9M9M,N,U,UV],^_.[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B([[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**<	*LL&&--))4/$++2P2PP LL((//++T1DKK4R4RR  =>LL((//KK++T1DKK4R4RR  ;<LL&&--KK++T1DKK4R4RR  :;LL!!((KK--994?$++B`B``
 fbll+KK$JJv}}%fbii(V[[-DKK$ .E(r*   N)rE   rF   rG   r   rJ   base_model_prefixinput_modalitiessupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr%   no_gradr  rL   r*   r(   r   r     s[    (&*#N"&)#
 U]]_8% 8%r*   r   c                   `     e Zd ZdZdef fdZ	 d	dej                  dz  dee	   de
fdZ xZS )
CLIPEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    rd   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w NF)
ro   rp   rd   r   
ModuleListranger  r   layersgradient_checkpointing)ra   rd   r   r   s      r(   rp   zCLIPEncoder.__init__  sO    mmuVMeMeGf$g!%5f%=$gh&+# %hs   A#Nr   r   r   c                 T    |}| j                   D ]  } |||fi |} t        |      S )a7  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
        )rB   )r-  r   )ra   r   r   r   rC   encoder_layers         r(   r   zCLIPEncoder.forward  sH    ( &![[ 	M) M	 +
 	
r*   rY   )rE   rF   rG   rH   r   rp   r%   r   r   r   r   r   r   r   s   @r(   r(  r(    sK    ,z , /3
 t+
 +,	

 

r*   r(  c                        e Zd ZU eed<   dZddgZdef fdZe e	d      e
	 	 	 dd	ej                  dz  d
ej                  dz  dej                  dz  dee   def
d                     Z xZS )CLIPTextTransformerrd   r  r   r   c                    t         |   |       || _        |j                  }t	        |      | _        t        |      | _        t        j                  ||j                        | _        |j                  | _        | j                          y r   )ro   rp   rd   rq   r   r   r(  encoderr   r   r   final_layer_normeos_token_id	post_initr   s      r(   rp   zCLIPTextTransformer.__init__  sm     &&	,V4"6* "YF<Q<Q R #//r*   Ftie_last_hidden_statesNr   r   rk   r   r   c                 h   |t        d      |j                         }|j                  d|d         }| j                  ||      }t	        | j
                  ||d       }|j                  dd         | j                  d||dd|}|j                  }| j                  |      }| j                  dk(  rm|t        j                  |j                  d	   |j                  
      |j                  t        j                   |j                        j#                  d      f   }	n|t        j                  |j                  d	   |j                  
      |j                  t        j                   |j                        | j                  k(  j!                         j#                  d      f   }	t%        ||	      S )NzYou have to specify input_idsr4   )r   rk   )rd   r   r   past_key_valuesr   T)r   r   r   r3   r   r!   )r   r"   r   rB   pooler_outputrL   )r   r   r   r   r
   rd   popr5  rB   r6  r7  r%   r&   r   r"   r   r   argmaxr   )
ra   r   r   rk   r   r   rC   encoder_outputsrB   pooled_outputs
             r(   r   zCLIPTextTransformer.forward  s    <==nn&NN2{27	),W+;;') 	
 	

;%+74<< ,
'),
 	,
 ,== 112CD! ..44Q7@Q@X@XY5995F5M5MNUUZ\U]_M ..44Q7@Q@X@XY EII6G6N6NOSWSdSddB!M */'
 	
r*   r   )rE   rF   rG   r   rJ   r  _no_split_modulesrp   r   r   r   r%   r   r   r   r   r   r   r   s   @r(   r2  r2    s     -/AB
~ 
  E2 *..2,0	:
<<$&:
 t+:
 llT)	:

 +,:
 
$:
  3  :
r*   r2  zI
    The text model from CLIP without any head or projection on top.
    c                        e Zd ZU eed<   dZddgZdef fdZdej                  fdZ
d Ze	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  dee   def
d       Z xZS )CLIPTextModelrd   r3  r   r   c                 d    t         |   |       t        |      | _        | j	                          y rY   )ro   rp   r2  
text_modelr8  r   s     r(   rp   zCLIPTextModel.__init__\  s&     -f5r*   r   c                 B    | j                   j                  j                  S rY   rG  r   r   r`   s    r(   get_input_embeddingsz"CLIPTextModel.get_input_embeddingsb      ))999r*   c                 :    || j                   j                  _        y rY   rI  ra   r   s     r(   set_input_embeddingsz"CLIPTextModel.set_input_embeddingse      5:""2r*   Nr   r   rk   r   c                 .     | j                   d|||d|S )a9  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPTextModel

        >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   rk   rL   )rG  )ra   r   r   rk   r   s        r(   r   zCLIPTextModel.forwardh  s/    0 t 
)%
 	
 	
r*   r   )rE   rF   rG   r   rJ   r  rC  rp   r   ModulerJ  rN  r   r%   r   r   r   r   r   r   r   s   @r(   rE  rE  Q  s      -/AB~ :bii :;  *..2,0	
<<$&
 t+
 llT)	

 +,
 
$
 
r*   rE  c                        e Zd ZU eed<   dZdZdgZdef fdZe	 e
d      e	 	 ddej                  dz  d	edz  d
ee   defd                     Z xZS )CLIPVisionTransformerrd   r   r  r   c                 B   t         |   |       || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        | j                          y r   )ro   rp   rd   rq   rc   r   r   r   r   pre_layrnormr(  r5  post_layernormr8  r   s      r(   rp   zCLIPVisionTransformer.__init__  sv     &&	.v6LL8M8MN"6* ll9&:O:OPr*   Fr9  Nr   r   r   c                     |t        d      | j                  ||      }| j                  |      } | j                  dd|i|}|j                  }|d d dd d f   }| j                  |      }t        ||      S )Nz You have to specify pixel_values)r   r   r   r=  rL   )r   r   rW  r5  rB   rX  r   )ra   r   r   r   rC   rA  rB   rB  s           r(   r   zCLIPVisionTransformer.forward  s     ?@@Ogh))-8+74<< ,
',
,

 ,==)!Q'2++M:)/'
 	
r*   r*  )rE   rF   rG   r   rJ   main_input_namer  rC  rp   r   r   r   r%   rI   boolr   r   r   r   r   r   s   @r(   rT  rT    s    $O!+,	/ 	  E2 2605
''$.
 #'+
 +,	

 
$
  3  
r*   rT  zK
    The vision model from CLIP without any head or projection on top.
    c            
            e Zd ZU eed<   dZdZdgZdef fdZde	j                  fdZe	 	 ddej                  dz  d	ed
ee   defd       Z xZS )CLIPVisionModelrd   r   rU  r   c                 d    t         |   |       t        |      | _        | j	                          y rY   )ro   rp   rT  vision_modelr8  r   s     r(   rp   zCLIPVisionModel.__init__  s'     1&9r*   r   c                 B    | j                   j                  j                  S rY   r_  r   rz   r`   s    r(   rJ  z$CLIPVisionModel.get_input_embeddings        ++;;;r*   Nr   r   c                 ,     | j                   d||d|S )a(  
        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, CLIPVisionModel

        >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   rL   )r_  )ra   r   r   r   s       r(   r   zCLIPVisionModel.forward  s.    < !t   
%%=
 
 	
r*   r*  )rE   rF   rG   r   rJ   rZ  r  rC  rp   r   rR  rJ  r   r%   rI   r[  r   r   r   r   r   r   s   @r(   r]  r]    s     $O!+,/ <bii <  26).!
''$.!
 #'!
 +,	!

 
$!
 !
r*   r]  c                       e Zd ZU eed<   g dZdef fdZee	 	 dde	j                  de	j                  dz  de	j                  dz  dee   d	eez  f
d
              Zee	 dde	j                   dedee   d	eez  fd              Zee	 	 	 	 	 	 dde	j&                  dz  de	j                   dz  de	j                  dz  de	j&                  dz  dedz  dedee   d	efd              Z xZS )r  rd   )r   r   rc   c                    t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  | _	        |j                  | _        |j                  | _        t        j                  |      }|j                  | _        t         j                  |      }|j"                  | _        t%        j&                  | j                  | j                  d      | _        t%        j&                  | j                  | j                  d      | _        t%        j,                  t/        j0                  | j2                  j4                              | _        | j9                          y )NzKconfig.text_config is expected to be of type CLIPTextConfig but is of type .zOconfig.vision_config is expected to be of type CLIPVisionConfig but is of type Frj   )ro   rp   rZ   text_configr   	TypeErrortyper  r   projection_dimrq   r  r  rE  _from_configrG  r]  r_  r   r   r  r  ru   r%   r1   rd   logit_scale_init_valuelogit_scaler8  )ra   rd   ri  r  rG  r_  r   s         r(   rp   zCLIPModel.__init__  sx    &,,n=++,-Q0 
 &..0@A--./q2 
 ((,,$33)55 - 9 9"//<
$//&33MB(55!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY 	r*   Nr   r   rk   r   r   c                 x     | j                   d|||dd|}|j                  }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```T)r   r   rk   return_dictrL   )rG  r>  r  )ra   r   r   rk   r   text_outputsrB  s          r(   get_text_featureszCLIPModel.get_text_features  sV    0 4C4?? 4
)%	4

 4
 %22%)%9%9-%H"r*   r   r   c                 v     | j                   d||dd|}|j                  }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```T)r   r   rq  rL   )r_  r>  r  )ra   r   r   r   vision_outputsrB  s         r(   get_image_featureszCLIPModel.get_image_features<  sU    6 6GT5F5F 6
%%=6
 	6
 '44'+'='=m'L$r*   return_lossc           	      L    | j                   d||d|} | j                  d|||d|}	|j                  }
| j                  |
      }
|	j                  }| j	                  |      }|
t        |
      z  }
|t        |      z  }t        j                  ||
j                         j                  |j                              }|| j                  j                         j                  |j                        z  }|j                         }d}|rt        |      }t        |||||
|	|      S )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```rd  rQ  N)rR   rS   rT   rO   rA   rU   rV   rL   )r_  rG  r>  r  r  r=   r%   r   r-   r   r"   ro  expr0   rQ   )ra   r   r   r   rk   rw  r   r   ru  rr  rA   rO   rT   rS   rR   s                  r(   r   zCLIPModel.forwardb  sI   L 6GT5F5F 6
%%=6
 6
 4C4?? 4
)%4
 	4
 &33--l;"00**;7 $&6|&DD!$4[$AA  ,,{LNN4D4G4GHZHZ4[\)D,<,<,@,@,B,E,EkFXFX,YY*,,._-D-+#%* .
 	
r*   NNr   )NNNNNF)rE   rF   rG   r   rJ   rC  rp   r   r   r%   r   r   r   rK   r   rs  rI   r[  rv  r   rQ   r   r   r   s   @r(   r  r    s   Z!z !F  /3,0	 <<  t+  llT)	 
 +,  
+	+    D  */"''" #'" +,	"
 
+	+"  "H  .215.204#').M
##d*M
 ''$.M
 t+	M

 &&-M
 D[M
 #'M
 +,M
 
M
  M
r*   r  c                        e Zd ZU eed<   dZddgZdef fdZdej                  fdZ
d Zee	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  dee   def
d              Z xZS )r  rd   r3  r   r   c                     t         |   |       t        j                  |      }|j                  | _        t        j                  |j                  |j                  d      | _	        | j                          y NFrh  )ro   rp   rE  rm  rG  r   r   rq   rl  r  r8  )ra   rd   rG  r   s      r(   rp   z$CLIPTextModelWithProjection.__init__  s[     "//7
$//!yy););V=R=RY^_ 	r*   r   c                 B    | j                   j                  j                  S rY   rI  r`   s    r(   rJ  z0CLIPTextModelWithProjection.get_input_embeddings  rK  r*   c                 :    || j                   j                  _        y rY   rI  rM  s     r(   rN  z0CLIPTextModelWithProjection.set_input_embeddings  rO  r*   Nr   r   rk   r   c                      | j                   d|||d|}|j                  }| j                  |      }t        ||j                  |j
                  |j                        S )a@  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPTextModelWithProjection

        >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```rQ  )rO   rB   rC   rD   rL   )rG  r>  r  rN   rB   rC   rD   )ra   r   r   rk   r   rr  rB  rO   s           r(   r   z#CLIPTextModelWithProjection.forward  su    4 4C4?? 4
)%4
 	4
 %22**=9"#*<<&44#..	
 	
r*   r   )rE   rF   rG   r   rJ   r  rC  rp   r   rR  rJ  rN  r   r   r%   r   r   r   rN   r   r   r   s   @r(   r  r    s     -/AB	~ 	:bii :;  *..2,0	&
<<$&&
 t+&
 llT)	&

 +,&
 
&
  &
r*   r  c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
ee	 	 ddej                  dz  ded	ee   defd
              Z xZS )r  rd   r   rU  c                     t         |   |       t        j                  |      }|j                  | _        t        j                  |j                  |j                  d      | _	        | j                          y r}  )ro   rp   r]  rm  r_  r   r   rq   rl  r  r8  ra   rd   r_  r   s      r(   rp   z&CLIPVisionModelWithProjection.__init__  s\     &33F;(55!#6+=+=v?T?T[`!a 	r*   r   c                 B    | j                   j                  j                  S rY   ra  r`   s    r(   rJ  z2CLIPVisionModelWithProjection.get_input_embeddings  rb  r*   Nr   r   c                      | j                   d||d|}|j                  }| j                  |      }t        ||j                  |j
                  |j                        S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPVisionModelWithProjection
        >>> from transformers.image_utils import load_image

        >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> image_embeds = outputs.image_embeds
        ```rd  )rA   rB   rC   rD   rL   )r_  r>  r  r@   rB   rC   rD   )ra   r   r   r   ru  rB  rA   s          r(   r   z%CLIPVisionModelWithProjection.forward  st    : 6GT5F5F 6
%%=6
 6

 '44--m<$%,>>(66%00	
 	
r*   r*  )rE   rF   rG   r   rJ   rZ  r  rp   r   rR  rJ  r   r   r%   rI   r[  r   r   r@   r   r   r   s   @r(   r  r    s    $O!	/ 	<bii <  26).(
''$.(
 #'(
 +,	(

 
(
  (
r*   r  z
    CLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    c                        e Zd ZdZdZdeddf fdZee	 	 d
de	j                  dz  de	j                  dz  dee   defd	              Z xZS )r  r   rU  rd   r   Nc                 ~   t         |   |       |j                  | _        t        j	                  |j
                        }|j                  | _        |j                  dkD  r4t        j                  |j
                  j                  |j                        nt        j                         | _        | j                          y )Nr   )ro   rp   
num_labelsr]  rm  r  r_  r   r   rq   Identityr  r8  r  s      r(   rp   z#CLIPForImageClassification.__init__B  s      ++&33F4H4HI(55 OUN_N_bcNcBIIf**668I8IJikititiv 	
 	r*   labelsr   c                 0    | j                   |fi |}|j                  }t        j                  |ddddddf   d      }| j	                  |      }d}|| j                  ||| j                        }t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   )rR   r   rC   rD   )
r_  rB   r%   r  r  loss_functionrd   r   rC   rD   )ra   r   r  r   outputssequence_outputr   rR   s           r(   r   z"CLIPForImageClassification.forwardQ  s     /@d.?.?/
/

 "33**_QAX%>AF1%%ffdkkBD$!//))	
 	
r*   rz  )rE   rF   rG   rZ  r  r   rp   r   r   r%   r   r   r   r   r   r   r   s   @r(   r  r  8  s     %O!z d   -1&*
llT)
 t#
 +,	

 

  
r*   r  )r  r   rE  r  r]  r  r  )r   )GrH   collections.abcr   dataclassesr   typingr   r%   r    r   r  activationsr	   masking_utilsr
   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_clipr   r   r   
get_loggerrE   loggerr   r)   r0   r=   r@   rN   rQ   rR  rc   r   floatr   r   r   r   r   r(  r2  rE  rT  r]  r  r  r  r  __all__rL   r*   r(   <module>r     s    $ !    & ! / 9 b b F &  J 5 L L 
		H	%
`U\\ `ell `-%,, -5<< -U\\ ell  
	<K 	< 	< 
	<+ 	< 	< _ _  _@P299 Pf% %^ %II%<<% 
% <<	%
 LL4'% % % '(%*7)BII 7)tbii 1 B G%/ G% G%T-
")) -
`O
- O
d 
/
' /

/
d,
/ ,
^ 
1
) 1

1
h @
# @
 @
F ?
"5 ?
 ?
D =
$7 =
 =
@ 4
!4 4
4
nr*   