
    i                     2   d Z ddlZddlmZ ddlmZ ddlmZ ddlZddl	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+  e"jX                  e-      Z.dej^                  dej^                  fdZ0dej^                  dej^                  fdZ1ee  G d de                    Z2 G d de
jf                        Z4 G d de
jf                        Z5 G d de
jf                        Z6d e5iZ7 G d! d"e
jf                        Z8 G d# d$e
jf                        Z9 G d% d&e
jf                        Z: G d' d(e      Z; G d) d*e
jf                        Z< G d+ d,e
jf                        Z=	 dNd-e
jf                  d.ej^                  d/ej^                  d0ej^                  d1ej^                  dz  d2e>d3e>fd4Z? G d5 d6e
jf                        Z@ G d7 d8e
jf                        ZA G d9 d:e      ZB G d; d<e
jf                        ZC G d= d>e
jf                        ZDe  G d? d@e             ZE G dA dBe
jf                        ZF G dC dDeE      ZG e dEF       G dG dHeE             ZH G dI dJeE      ZI G dK dLeE      ZJg dMZKy)OzPyTorch AltCLIP model.    N)Callable)	dataclass)Any   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling'BaseModelOutputWithPoolingAndProjection)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)merge_with_config_defaults)capture_outputs   )AltCLIPConfigAltCLIPTextConfigAltCLIPVisionConfiglogitsreturnc                     t         j                  j                  | t        j                  t        |       | j                              S )Ndevice)nn
functionalcross_entropytorcharangelenr!   )r   s    }/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/altclip/modeling_altclip.pycontrastive_lossr)   .   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)r)   t)r+   caption_loss
image_losss      r(   	clip_lossr0   2   s,    #J/L!*,,.1J:%,,r*   c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeed<   dZeed	<   d
ee   fdZy)AltCLIPOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AltCLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AltCLIPVisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                 B    t        d | j                         D              S )Nc              3   `   K   | ]&  }t        |t              r|j                         n| ( y wN)
isinstancer   to_tuple).0vs     r(   	<genexpr>z)AltCLIPOutput.to_tuple.<locals>.<genexpr>X   s$     ^1Z;%?QZZ\QF^s   ,.)tuplevaluesselfs    r(   r>   zAltCLIPOutput.to_tupleW   s    ^PTP[P[P]^^^r*   )__name__
__module____qualname____doc__r3   r%   FloatTensor__annotations__r4   r5   r6   r7   r8   r   r9   rB   r   r>    r*   r(   r2   r2   8   s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148186:3:_%* _r*   r2   c                        e Zd ZdZ fdZ	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  ded	ej                  fd
Z
ed        Zedd       Z xZS )AltRobertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 T   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j
                  |j                        | _
        t        j                  |j                        | _        | j                  dt!        j"                  |j$                        j'                  d      d       | j                  dt!        j(                  | j*                  j-                         t         j.                        d       |j                  | _        t        j                  |j$                  |j
                  | j0                        | _        y )	N)padding_idxepsposition_idsr   F
persistenttoken_type_idsdtype)super__init__r"   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr%   r&   max_position_embeddingsexpandzerosrS   sizelongrP   position_embeddingsrE   config	__class__s     r(   r\   zAltRobertaEmbeddings.__init___   s4   !||F,=,=v?Q?Q_e_r_rs%'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
 "..#%<<**F,>,>DL\L\$
 r*   N	input_idsrX   rS   inputs_embedspast_key_values_lengthr   c                    |<|| j                  || j                  |      }n| j                  || j                        }||j                         }n|j                         d d }|\  }}|t	        | d      rT| j
                  j                  |j                  d   d      }	t        j                  |	d|      }	|	j                  ||      }n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j!                  |      }||z   }| j#                  |      }| j%                  |      }|S )NrU   rX   r   r   )dimindexrZ   r!   )"create_position_ids_from_input_idsrP   &create_position_ids_from_inputs_embedsrm   hasattrrX   rk   shaper%   gatherrl   rn   rS   r!   ra   rc   ro   rd   rh   )rE   rs   rX   rS   rt   ru   input_shape
batch_size
seq_lengthbuffered_token_type_idsrc   
embeddingsro   s                r(   forwardzAltRobertaEmbeddings.forwards   sn    $#FFt//1G   $JJ=Z^ZjZjk #..*K',,.s3K!,
J
 !t-.*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
J!W!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D"55
^^J/
\\*-
r*   c                     | j                         dd }|d   }t        j                  |dz   ||z   dz   t        j                  | j                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        NrU   r   ry   r   )rm   r%   r&   rn   r!   	unsqueezerk   )rt   rP   r   sequence_lengthrS   s        r(   r{   z;AltRobertaEmbeddings.create_position_ids_from_inputs_embeds   sp     $((*3B/%a.||!O_{:Q>ejjYfYmYm
 %%a(//<<r*   c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
        are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:

        Returns: torch.Tensor
        r   rw   )neintr%   cumsumtype_asrn   )rs   rP   ru   maskincremental_indicess        r(   rz   z7AltRobertaEmbeddings.create_position_ids_from_input_ids   sW     ||K(,,.$||Da8@@FI__cgg"'')K77r*   )NNNNr   )r   )rF   rG   rH   rI   r\   r%   
LongTensorrJ   r   Tensorr   staticmethodr{   rz   __classcell__rr   s   @r(   rN   rN   \   s    Q
, .2260426&'.##d*. ((4/. &&-	.
 ((4/. !$. 
.` = =" 8 8r*   rN   c            
            e Zd Z fdZ	 ddej
                  dej                  dz  dee   de	ej
                     fdZ
 xZS )	AltRobertaSelfAttentionc                    t         |           || _        |j                  |j                  z  dk7  r2t        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                         | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ())r[   r\   rq   r_   num_attention_headsr|   
ValueErrorr   attention_head_sizeall_head_sizer"   Linearquerykeyvaluerf   attention_probs_dropout_probrh   rp   s     r(   r\   z AltRobertaSelfAttention.__init__   s#    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EFr*   Nhidden_statesattention_maskkwargsr   c                 D   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }t        j                  ||j	                  dd            }	|	t        j                  | j                        z  }	||	|z   }	t        j                  j                  |	d      }
| j                  |
      }
t        j                  |
|      }|j                  dddd      j!                         }|j#                         d d | j$                  fz   }|j                  |      }||
fS )NrU   r      r   r   r   )r}   r   r   view	transposer   r   r%   matmulmathsqrtr"   r#   softmaxrh   permute
contiguousrm   r   )rE   r   r   r   r   hidden_shapequery_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapes                r(   r   zAltRobertaSelfAttention.forward   s    $))#2.CCbC$*B*BCjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR !<<Y5H5HR5PQ+dii8P8P.QQ%/.@ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BCo--r*   r<   )rF   rG   rH   r\   r%   r   rJ   r   r   rB   r   r   r   s   @r(   r   r      sW    G, 48".||". ))D0". +,	".
 
u||	".r*   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )AltRobertaSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y NrQ   )r[   r\   r"   r   r_   denserd   re   rf   rg   rh   rp   s     r(   r\   zAltRobertaSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r*   r   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r<   r   rh   rd   rE   r   r   s      r(   r   zAltRobertaSelfOutput.forward  7    

=1]3}|'CDr*   rF   rG   rH   r\   r%   r   r   r   r   s   @r(   r   r      1    >U\\  RWR^R^ r*   r   eagerc            	            e Zd Z fdZ	 ddej
                  dej                  dz  dee   dej
                  fdZ	 xZ
S )	AltRobertaAttentionc                 |    t         |           t        |j                     |      | _        t        |      | _        y r<   )r[   r\   "ALT_ROBERTA_SELF_ATTENTION_CLASSES_attn_implementationrE   r   outputrp   s     r(   r\   zAltRobertaAttention.__init__  s2    6v7R7RSTZ[	*62r*   Nr   r   r   r   c                 Z     | j                   |fd|i|\  }}| j                  ||      }|S Nr   )rE   r   )rE   r   r   r   attention_output_s         r(   r   zAltRobertaAttention.forward  sI     (dii
)
 
!
  ;;'7Gr*   r<   )rF   rG   rH   r\   r%   r   rJ   r   r   r   r   r   s   @r(   r   r     sQ    3 48 ||  ))D0  +,	 
 
 r*   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AltRobertaIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r<   )r[   r\   r"   r   r_   intermediate_sizer   r=   
hidden_actstrr   intermediate_act_fnrp   s     r(   r\   zAltRobertaIntermediate.__init__*  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r*   r   r   c                 J    | j                  |      }| j                  |      }|S r<   )r   r   rE   r   s     r(   r   zAltRobertaIntermediate.forward2  s&    

=100?r*   r   r   s   @r(   r   r   )  s#    9U\\ ell r*   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )AltRobertaOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r[   r\   r"   r   r   r_   r   rd   re   rf   rg   rh   rp   s     r(   r\   zAltRobertaOutput.__init__:  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r*   r   r   r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r<   r   r   s      r(   r   zAltRobertaOutput.forward@  r   r*   r   r   s   @r(   r   r   9  r   r*   r   c            	            e Zd Z fdZ	 d	dej
                  dej                  dz  dee   dej
                  fdZ	d Z
 xZS )
AltRobertaLayerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y )Nr   )
r[   r\   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   rp   s     r(   r\   zAltRobertaLayer.__init__I  sI    '-'E'E$,V426:&v.r*   Nr   r   r   r   c                      | j                   |fd|i|}t        | j                  | j                  | j                  |      }|S r   )r   r   feed_forward_chunkr   r   )rE   r   r   r   s       r(   r   zAltRobertaLayer.forwardQ  sY     '
)
 
 2##T%A%A4CSCSUb
 r*   c                 L    | j                  |      }| j                  ||      }|S r<   )r   r   )rE   r   intermediate_outputlayer_outputs       r(   r   z"AltRobertaLayer.feed_forward_chunkc  s,    "//0@A{{#68HIr*   r<   )rF   rG   rH   r\   r%   r   rJ   r   r   r   r   r   r   s   @r(   r   r   H  sV    / 48|| ))D0 +,	
 
$r*   r   c            	       n     e Zd Z fdZ	 ddej
                  dej                  dz  dee   de	fdZ
 xZS )	AltRobertaEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w NF)
r[   r\   rq   r"   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)rE   rq   irr   s      r(   r\   zAltRobertaEncoder.__init__k  sN    ]]U6KcKcEd#eOF$;#ef
&+# $f   A#Nr   r   r   r   c                 P    | j                   D ]  } |||fi |} t        |      S )Nlast_hidden_state)r   r
   )rE   r   r   r   layer_modules        r(   r   zAltRobertaEncoder.forwardq  sC     !JJ 	L( M	 +
 	
r*   r<   )rF   rG   rH   r\   r%   r   rJ   r   r   r
   r   r   r   s   @r(   r   r   j  sM    , 48
||
 ))D0
 +,	

 

r*   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AltRobertaPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r<   )r[   r\   r"   r   r_   r   Tanh
activationrp   s     r(   r\   zAltRobertaPooler.__init__  s9    YYv1163E3EF
'')r*   r   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r   )rE   r   first_token_tensorpooled_outputs       r(   r   zAltRobertaPooler.forward  s6     +1a40

#566r*   r   r   s   @r(   r   r     s#    $
U\\ ell r*   r   moduler   r   r   r   scalingrh   c                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )NrU   r   )rw   rZ   )ptrainingr   r   )r%   r   r   r"   r#   r   float32torZ   rh   r  r   )
r   r   r   r   r   r  rh   r   attn_weightsattn_outputs
             r(   eager_attention_forwardr	    s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r*   c                        e Zd ZdZ fdZ	 d	dej                  dej                  dz  dee   de	ej                  ej                  dz  f   fdZ
 xZS )
AltCLIPAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).      F)r[   r\   rq   r_   	embed_dimr   	num_headshead_dimr   scaleattention_dropoutrh   	is_causalr"   r   k_projv_projq_projout_projrp   s     r(   r\   zAltCLIPAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar*   Nr   r   r   r   c                    |j                   dd }g |d| j                  }| j                  |      }| j                  |      }| j	                  |      }|j                  |      j                  dd      }|j                  |      j                  dd      }|j                  |      j                  dd      }t        j                  | j                  j                  t              }	 |	| ||||f| j                  | j                  sdn| j                  d|\  }
} |
j                  g |d j!                         }
| j#                  |
      }
|
|fS )z#Input shape: Batch x Time x ChannelNrU   r   r           )r  rh   )r}   r  r  r  r  r   r   r   get_interfacerq   r   r	  r  r  rh   reshaper   r  )rE   r   r   r   r   r   querieskeysrC   attention_interfacer  r  s               r(   r   zAltCLIPAttention.forward  sO    $))#2.88b8$--8++m,{{=)]+,,|,66q!<yy&00A6\*44Q:(?(M(MKK,,.E)
 %8	%
 JJ#}}C$,,	%
 	%
!\ *k));;;;FFHmmK0L((r*   r<   )rF   rG   rH   rI   r\   r%   r   r   r   rB   r   r   r   s   @r(   r  r    sf    GB. /3$)||$) t+$) +,	$)
 
u||U\\D00	1$)r*   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )
AltCLIPMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r<   )r[   r\   rq   r   r   activation_fnr"   r   r_   r   fc1fc2rp   s     r(   r\   zAltCLIPMLP.__init__  sd    #F$5$5699V//1I1IJ99V55v7I7IJr*   r   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r<   )r$  r#  r%  r   s     r(   r   zAltCLIPMLP.forward  s4    /**=9/r*   r   r   s   @r(   r!  r!    s$    KU\\ ell r*   r!  c                        e Zd Zdef fdZdej                  dej                  dee   de	ej                  ej                  dz  f   fdZ xZS )	AltCLIPEncoderLayerrq   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y r   )r[   r\   r_   r  r  	self_attnr"   rd   re   layer_norm1r!  mlplayer_norm2rp   s     r(   r\   zAltCLIPEncoderLayer.__init__  sm    ++)&1<<F<Q<QRf%<<F<Q<QRr*   r   r   r   r   Nc                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r   r   rL   )r+  r*  r-  r,  )rE   r   r   r   residualr   s         r(   r   zAltCLIPEncoderLayer.forward  s     !((7)4>> 
')
 
q
 !=0 ((7/ =0r*   )rF   rG   rH   r   r\   r%   r   r   r   rB   rJ   r   r   r   s   @r(   r(  r(    sc    S} S||  +,	
 
u  %,,"55	6r*   r(  c                   f     e Zd ZdZdef fdZ	 d	dej                  dz  dee	   de
ez  fdZ xZS )
AltCLIPEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`AltCLIPEncoderLayer`].

    Args:
        config: AltCLIPConfig
    rq   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r   )
r[   r\   rq   r"   r   r   r   r(  layersr   )rE   rq   r   rr   s      r(   r\   zAltCLIPEncoder.__init__#  sP    mm%PVPhPhJi$jQ%8%@$jk&+# %kr   Nr   r   r   c                 T    |}| j                   D ]  } |||fi |} t        |      S )a8  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)

        r   )r3  r
   )rE   rt   r   r   r   encoder_layers         r(   r   zAltCLIPEncoder.forward)  sH    * &![[ 	M) M	 +
 	
r*   r<   )rF   rG   rH   rI   r   r\   r%   r   r   r   rB   r
   r   r   r   s   @r(   r1  r1    sP    ,} , /3
 t+
 +,	

 
	 
r*   r1  c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )AltCLIPVisionEmbeddingsrq   c                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebiasr   r   rS   rT   rV   )r[   r\   rq   r_   r  
image_size
patch_sizer"   	Parameterr%   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positionsr]   position_embeddingri   r&   rk   rp   s     r(   r\   z AltCLIPVisionEmbeddings.__init__M  s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr*   r   heightwidthr   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   NrU   g      ?r   r   bicubicF)rm   modealign_cornersr   )r}   rH  weightr   r%   jit
is_tracingrS   r?  r   r  r   r"   r#   interpolater   cat)rE   r   rI  rJ  rF  rH  rG  class_pos_embedpatch_pos_embedrw   
new_height	new_widthsqrt_num_positionss                r(   interpolate_pos_encodingz0AltCLIPVisionEmbeddings.interpolate_pos_encodingc  sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr*   pixel_valuesc                 `   |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  j                  j
                  }| j                  |j                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }	t        j                  |	|gd	      }
|r|
| j                  |
||      z   }
|
S |
| j                  | j                        z   }
|
S )
NzInput image size (*z) doesn't match model (r  rY   r   r   rU   r   )r}   r>  r   rE  rO  rZ   r  flattenr   rB  rk   r%   rS  rY  rH  rS   )rE   rZ  rY  r   r   rI  rJ  target_dtypepatch_embedsclass_embedsr   s              r(   r   zAltCLIPVisionEmbeddings.forward  s6   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr*   F)rF   rG   rH   r   r\   r%   r   r   rY  rJ   r   r   r   s   @r(   r7  r7  L  se    q2 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf r*   r7  c                   V    e Zd ZU eed<   dZdZdZg Z e	j                         d        Zy)AltCLIPPreTrainedModelrq   altclip)imagetextTc                    | j                   j                  }t        |t              r| j                   j                  }t	        j
                  |j                  d|j                  dz  |z         t	        j
                  |j                  j                  |j                   j                  |z         t	        j
                  |j                  j                  |j                   j                  |z         t	        j                  |j                  t        j                  |j                         j#                  d             yt        |t$              r| j                   j                  }|j                  dz  d|j                   j&                  z  dz  z  |z  }|j                  dz  |z  }t	        j
                  |j(                  j                  |       t	        j
                  |j*                  j                  |       t	        j
                  |j,                  j                  |       t	        j
                  |j.                  j                  |       yt        |t0              r| j                   j                  }|j                   j2                  dz  d|j                   j&                  z  dz  z  |z  }d|j                   j2                  z  dz  |z  }t	        j
                  |j4                  j                  |       t	        j
                  |j6                  j                  |       yt        |t8              rt	        j
                  |j:                  j                  |j<                  dz  | j                   j                  z         t	        j
                  |j>                  j                  |j@                  dz  | j                   j                  z         yt        |tB        jD                        r?t	        jF                  |jH                         t	        jJ                  |j                         yt        |tB        jL                        rct	        j
                  |j                  d| j                   j                         |jH                   t	        jF                  |jH                         yyt        |tB        jN                        rt	        j
                  |j                  d| j                   j                         |jP                  EtS        |j                  dd	      s-t	        jF                  |j                  |jP                            yyyt        |tT              ryt	        j                  |j                  t        j                  |j                  jV                  d
         j#                  d             t	        jF                  |jX                         yy)zInitialize the weightsr  r  )meanstd)ri  rT   r   N_is_hf_initializedFrU   )-rq   initializer_factorr=   r7  initnormal_rB  r  rE  rO  initializer_rangerH  copy_rS   r%   r&   rG  rk   r  r   r  r  r  r  r!  r_   r$  r%  AltCLIPModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimr"   rd   zeros_r=  ones_r   r]   rP   getattrrN   r}   rX   )rE   r   factorin_proj_stdout_proj_stdfc_stds         r(   _init_weightsz$AltCLIPPreTrainedModel._init_weights  s    //f56[[33FLL//cv?O?OQU?UX^?^_LL//66FMM<[<[^d<deLL2299v}}?^?^ag?ghJJv**ELL9M9M,N,U,UV],^_ 01[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B
+[[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**<-LL&&--))4/$++2P2PP LL((//++T1DKK4R4RR -KK$JJv}}%		*LLSdkk6T6TU{{&FKK( '-LLSdkk6T6TU!!-gfmmMach6iFMM&*<*<=> 7j- 45JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 6r*   N)rF   rG   rH   r   rK   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_moduler%   no_gradr|  rL   r*   r(   rc  rc    s:    !(&*#U]]_./ ./r*   rc  c                   x     e Zd Zdef fdZe	 	 d	dej                  dz  dedz  de	e
   deez  fd       Z xZS )
AltCLIPVisionTransformerrq   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r   )r[   r\   rq   r_   r7  r   r"   rd   re   pre_layrnormr1  encoderpost_layernorm)rE   rq   r  rr   s      r(   r\   z!AltCLIPVisionTransformer.__init__  sj    &&	1&9LL8M8MN%f- ll9&:O:OPr*   NrZ  rY  r   r   c                     |t        d      | j                  ||      }| j                  |      } | j                  dd|i|}|d   }|d d dd d f   }| j	                  |      }t        ||      S )Nz You have to specify pixel_values)rY  rt   r   r   pooler_outputrL   )r   r   r  r  r  r   )rE   rZ  rY  r   r   encoder_outputsr   r   s           r(   r   z AltCLIPVisionTransformer.forward  s     ?@@Ogh))-8&$,, 
'


 ,A.)!Q'2++M:)/'
 	
r*   r   )rF   rG   rH   r   r\   r   r%   rJ   boolr   r   rB   r   r   r   r   s   @r(   r  r    so    Q2 Q  2605
''$.
 #'+
 +,	

 
+	+
 
r*   r  c                        e Zd ZU eed<   dZdZeedZ	def fdZ
dej                  fdZe ed	      e	 	 ddej$                  d
z  dedee   defd                     Z xZS )AltCLIPVisionModelrq   rZ  )re  r   
attentionsc                 d    t         |   |       t        |      | _        | j	                          y r<   )r[   r\   r  vision_model	post_initrp   s     r(   r\   zAltCLIPVisionModel.__init__	  s'     4V<r*   r   c                 B    | j                   j                  j                  S r<   )r  r   rE  rD   s    r(   get_input_embeddingsz'AltCLIPVisionModel.get_input_embeddings  s      ++;;;r*   Ftie_last_hidden_statesNrY  r   c                 ,     | j                   d||d|S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, AltCLIPVisionModel

        >>> model = AltCLIPVisionModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```rZ  rY  rL   )r  )rE   rZ  rY  r   s       r(   r   zAltCLIPVisionModel.forward  s.    > !t   
%%=
 
 	
r*   r   )rF   rG   rH   r   rK   main_input_namer~  r(  r  _can_record_outputsr\   r"   Moduler  r   r   r   r%   rJ   r  r   r   r   r   r   r   s   @r(   r  r     s    $O!,&
2 <bii <  E2 26). 
''$. 
 #' 
 +,	 

 
$ 
  3   
r*   r  aE  
    The model behaves as an encoder following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    )custom_introc                       e Zd ZU eed<   eedZd fd	Zd Z	d Z
eee	 	 	 	 	 ddej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dee   defd                     Z xZS )AltRobertaModelrq   r  c                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
r[   r\   rq   rN   r   r   r  r   poolerr  )rE   rq   add_pooling_layerrr   s      r(   r\   zAltRobertaModel.__init__I  sN    
 	 .v6(02C&v. 	r*   c                 .    | j                   j                  S r<   r   ra   rD   s    r(   r  z$AltRobertaModel.get_input_embeddingsY  s    ...r*   c                 &    || j                   _        y r<   r  rE   r   s     r(   set_input_embeddingsz$AltRobertaModel.set_input_embeddings\  s    */'r*   Nrs   r   rX   rS   rt   r   r   c                    ||t        d      |#| j                  ||       |j                         }n!||j                         d d }nt        d      |\  }}	||j                  n|j                  }
|t	        j
                  ||	f|
      }| j                  ||      }| j                  ||||      } | j                  |fd|i|}|d   }| j                  | j                  |      nd }t        ||      S )	NzDYou cannot specify both input_ids and inputs_embeds at the same timerU   z5You have to specify either input_ids or inputs_embedsr    )rs   rS   rX   rt   r   r   r  )r   %warn_if_padding_and_no_attention_maskrm   r!   r%   onesget_extended_attention_maskr   r  r  r   )rE   rs   r   rX   rS   rt   r   r   r   r   r!   extended_attention_maskembedding_outputr  sequence_outputr   s                   r(   r   zAltRobertaModel.forward_  s8     ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN 150P0PQ_al0m??%)'	 + 
 '$,,
2
 

 *!,8<8OO4UY)-'
 	
r*   )TNNNNN)rF   rG   rH   r   rK   r   r   r  r\   r  r  r   r   r   r%   r   r   r   r   r   r   r   s   @r(   r  r  8  s     (- /0   *..2.2,0-1.
<<$&.
 t+.
 t+	.

 llT).
 ||d*.
 +,.
 
$.
    .
r*   r  c                   t    e Zd ZU eed<   dZ fdZdej                  fdZ	dej                  ddfdZdd	edz  dej                  f fd
Zee	 	 	 	 	 ddej"                  dz  dej"                  dz  dej"                  dz  dej"                  dz  dej"                  dz  dee   deez  fd              Z xZS )AltCLIPTextModelrq   )rf  c                 &   t         |   |       t        |d      | _        t	        j
                  |j                  |j                        | _        t	        j                  |j                  |j                        | _        | j                          y )NF)r  rQ   )r[   r\   r  robertar"   r   r_   project_dimtransformationrd   re   pre_LNr  rp   s     r(   r\   zAltCLIPTextModel.__init__  se     &vG ii(:(:F<N<NOll6#5#56;P;PQr*   r   c                 B    | j                   j                  j                  S r<   r  r   ra   rD   s    r(   r  z%AltCLIPTextModel.get_input_embeddings  s    ||&&666r*   r   Nc                 :    || j                   j                  _        y r<   r  r  s     r(   r  z%AltCLIPTextModel.set_input_embeddings  s    27/r*   new_num_tokensc                 "    t         |   |      S r<   )r[   resize_token_embeddings)rE   r  rr   s     r(   r  z(AltCLIPTextModel.resize_token_embeddings  s    w.~>>r*   rs   r   rX   rS   rt   r   c           	           | j                   d|||||d|}|d   }| j                  |      }| j                  |      }	|	dddf   }
t        |	|
|j                  |j
                        S )a+  
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AltCLIPTextModel

        >>> model = AltCLIPTextModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> texts = ["it's a cat", "it's a dog"]

        >>> inputs = processor(text=texts, padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```)rs   r   rX   rS   rt   r   N)r   r  r   r  rL   )r  r  r  r   r   r  )rE   rs   r   rX   rS   rt   r   outputsr  projection_stater  s              r(   r   zAltCLIPTextModel.forward  s    : $,, 
))%'
 
 "!* ++o6  ..?(A.6.'!//))	
 	
r*   r<   r  )rF   rG   rH   r   rK   r~  r\   r"   r  r  r]   r  r   r  r   r   r%   r   r   r   rB   r   r   r   r   s   @r(   r  r    s     7bii 78",, 84 8?cDj ?BLL ?  *..2.2,0-13
<<$&3
 t+3
 t+	3

 llT)3
 ||d*3
 +,3
 
8	83
  3
r*   r  c                   0    e Zd ZU eed<   eedZdef fdZe	e
	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  d	ee   d
eez  fd              Ze ed      e
	 ddej(                  ded	ee   d
eez  fd                     Ze	e
	 	 	 	 	 	 	 ddej.                  dz  dej(                  dz  dej                  dz  dej.                  dz  dej                  dz  dedz  ded	ee   d
eez  fd              Z xZS )rp  rq   r  c                 r   t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  |_	        |j                  | _
        |j                  | _        |j                  | _        t        |      | _        t#        |      | _        t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        t'        j.                  t1        j2                  | j4                  j6                              | _        | j;                          y )NzRconfig.vision_config is expected to be of type AltCLIPVisionConfig but is of type .zNconfig.text_config is expected to be of type AltCLIPTextConfig but is of type F)r=  )r[   r\   r=   vision_configr   	TypeErrortypetext_configr   r   projection_dimr  rr  r_   rt  r  
text_modelr  r  r"   r   rs  rq  r@  r%   tensorrq   logit_scale_init_valuelogit_scaler  )rE   rq   r  r  rr   s       r(   r\   zAltCLIPModel.__init__  se    &..0CD--./q2  &,,.?@++,-Q0 
 ((,,-3-H-H*$33)55 - 9 9*;74]C!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY 	r*   Nrs   r   rS   rX   r   r   c                 x     | j                   d||||d|}|j                  }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```)rs   r   rS   rX   rL   )r  r  rq  )rE   rs   r   rS   rX   r   text_outputsr   s           r(   get_text_featureszAltCLIPModel.get_text_features  sZ    0 AP A
)%)	A

 A
 %22%)%9%9-%H"r*   Fr  rZ  rY  c                 t     | j                   d||d|}|j                  }| j                  |      |_        |S )ao  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AltCLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```r  rL   )r  r  rs  )rE   rZ  rY  r   vision_outputsr   s         r(   get_image_featureszAltCLIPModel.get_image_features,  sQ    6 +** 
%%=
 

 '44'+'='=m'L$r*   return_lossc           	          | j                   d
||||d|}	 | j                  d
||d|}
|
d   }| j                  |      }|	d   }| j                  |      }||j	                  ddd      z  }||j	                  ddd      z  }| j
                  j                         }t        j                  ||j                               |z  }|j                  }d}|rt        |      }t        ||||||	|
	      S )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )
        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```)rs   r   rX   rS   r  r   r   rU   T)r  rw   keepdimN)r3   r4   r5   r6   r7   r8   r9   rL   )r  r  rs  rq  normr  expr%   r   r-   Tr0   r2   )rE   rs   rZ  r   rS   rX   r  rY  r   r  r  r7   r6   r  r5   r4   r3   s                    r(   r   zAltCLIPModel.forwardQ  s:   J 't 
))%	

 
 +** 
%%=
 
 &a(--l;"1o**;7 $l&7&7!T&7&RR!K$4$4qb$$4$OO &&**,,,{LNN4DES*,,_-D-+#%* .
 	
r*   )NNNra  )NNNNNNF)rF   rG   rH   r   rK   r(  r  r  r\   r   r   r%   r   r   r   rB   r   r  r   r   rJ   r  r  r   r2   r   r   r   s   @r(   rp  rp    s   ,&
} B  /3,0.2 <<  t+  llT)	 
 t+  +,  
+	+    D  E2 */ ''  #'  +,	 
 
+	+   3   D  .215.204.2#').L
##d*L
 ''$.L
 t+	L

 &&-L
 t+L
 D[L
 #'L
 +,L
 
	L
  L
r*   rp  )rc  r  r  rp  )r  )LrI   r   collections.abcr   dataclassesr   typingr   r%   torch.nnr"    r   rl  activationsr   modeling_layersr	   modeling_outputsr
   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   r   utils.genericr   utils.output_capturingr   configuration_altclipr   r   r   
get_loggerrF   loggerr   r)   r0   r2   r  rN   r   r   r   r   r   r   r   r   r   floatr	  r  r!  r(  r1  r7  rc  r  r  r  r  rp  __all__rL   r*   r(   <module>r     s     $ !    & ! 9 
 G & 6 j j 7 5 X X 
		H	%
`U\\ `ell `-%,, -5<< - _K _  _Bg8299 g8T6.bii 6.t299  $& "
 "))  ,RYY  ryy 0 D
		 
4ryy . %II%<<% 
% <<	%
 LL4'% % %.;)ryy ;)~ 4 B.
RYY .
dPbii Pf 6/_ 6/ 6/r$
ryy $
N5
/ 5
p P
, P
P
fI
- I
X
) 
D _r*   