
    i!                     6   d Z ddlZddlmZ ddlmZ ddlmZ ddlZddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*  e!jV                  e,      Z-e e d       G d de                    Z.e e d       G d de                    Z/ee  G d de                    Z0dejb                  dejb                  fdZ2d ejb                  dejb                  fd!Z3d"e*d#e4fd$Z5d]d%e4e6z  d&e7fd'Z8 G d( d)e	jr                        Z: G d* d+e	jv                        Z< G d, d-e	jr                        Z= G d. d/e	jr                        Z> G d0 d1e	jr                        Z? G d2 d3e	jr                        Z@ G d4 d5e	jr                        ZA G d6 d7e	jr                        ZB G d8 d9e	jr                        ZC	 d^d:e	jr                  d;ejb                  d<ejb                  d=ejb                  d>ejb                  dz  d?eDd@eDfdAZE G dB dCe	jr                        ZF G dD dEe	jr                        ZG G dF dGe	jr                        ZH G dH dIe	jr                        ZI G dJ dKe	jr                        ZJ G dL dMe      ZK G dN dOe	jr                        ZL G dP dQe	jr                        ZMe  G dR dSe             ZN e dT       G dU dVeN             ZO e dW       G dX dYeN             ZPe  G dZ d[eN             ZQg d\ZRy)_zPyTorch ALIGN model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithNoAttentionBaseModelOutputWithPooling(BaseModelOutputWithPoolingAndNoAttention)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )AlignConfigAlignTextConfigAlignVisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                     dz  ed<   y)AlignVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_statehidden_states)__name__
__module____qualname____doc__r!   torchFloatTensor__annotations__r"   r#   tuple     y/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/align/modeling_align.pyr    r    -   sN    
 .2L%##d*126u((4/659M5**+d29r-   r    ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                     dz  ed<   dZe
ej                     dz  ed<   y)AlignTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsr"   r#   
attentions)r$   r%   r&   r'   r1   r(   r)   r*   r"   r#   r+   r2   r,   r-   r.   r0   r0   >   sh    
 -1K""T)026u((4/659M5**+d2926Je''(4/6r-   r0   c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeed<   dZeed	<   d
ee   fdZy)AlignOutputar  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The output of [`AlignVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AlignTextModel`].
    vision_model_output (`BaseModelOutputWithPoolingAndNoAttention`):
        The output of the [`AlignVisionModel`].
    Nlosslogits_per_imagelogits_per_textr1   r!   text_model_outputvision_model_outputreturnc                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r8   r9   N)getattrto_tuple).0kselfs     r.   	<genexpr>z'AlignOutput.to_tuple.<locals>.<genexpr>o   s=      
  LLDGRYZ^`aRbRkRkRmm
s   -0)r+   keysrA   s   `r.   r>   zAlignOutput.to_tuplen   s#     
YY[
 
 	
r-   )r$   r%   r&   r'   r5   r(   r)   r*   r6   r7   r1   r!   r8   r   r9   r   r+   r   r>   r,   r-   r.   r4   r4   P   s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*14818DHAH
%* 
r-   r4   logitsr:   c                     t         j                  j                  | t        j                  t        |       | j                        d      S )Ndeviceg?)label_smoothing)r   
functionalcross_entropyr(   arangelenrH   )rE   s    r.   contrastive_lossrN   w   s5    ==&&vu||CKPVP]P]/^ps&ttr-   
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)rN   t)rO   caption_loss
image_losss      r.   
align_lossrT   {   s,    #J/L!*,,.1J:%,,r-   confignum_channelsc                     | j                   }|| j                  z  }t        |t        ||dz  z         |z  |z        }|d|z  k  r||z  }t        |      S )z<
    Round number of filters based on depth multiplier.
       g?)depth_divisorwidth_coefficientmaxint)rU   rV   divisornew_dims       r.   round_filtersr_      sf     ""GF,,,L'3|gk9:gEOPG |##7w<r-   kernel_sizeadjustc                     t        | t              r| | f} | d   dz  | d   dz  f}|r|d   dz
  |d   |d   dz
  |d   fS |d   |d   |d   |d   fS )aJ  
    Utility function to get the tuple padding value for the depthwise convolution.

    Args:
        kernel_size (`int` or `tuple`):
            Kernel size of the convolution layers.
        adjust (`bool`, *optional*, defaults to `True`):
            Adjusts padding value to apply to right and bottom sides of the input.
    r   rX   r   )
isinstancer\   )r`   ra   corrects      r.   correct_padre      s}     +s#"K01~"KNa$78G
Q
GAJNGAJGG
GAJ
GAJ??r-   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )AlignVisionEmbeddingszL
    A module that corresponds to the stem module of the original work.
    rU   c                    t         |           t        |d      | _        t	        j
                  d      | _        t	        j                  |j                  | j                  dddd      | _	        t	        j                  | j                  |j                  |j                  	      | _        t        |j                     | _        y )
N    )r   r   r   r   paddingr   rX   validFr`   striderk   bias)epsmomentum)super__init__r_   out_dimr   	ZeroPad2drk   Conv2drV   convolutionBatchNorm2dbatch_norm_epsbatch_norm_momentum	batchnormr	   
hidden_act
activationrA   rU   	__class__s     r.   rs   zAlignVisionEmbeddings.__init__   s    $VR0||L9991QPW^c
 &:O:OZ`ZtZtu !2!23r-   pixel_valuesr:   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S N)rk   rw   r{   r}   )rA   r   featuress      r.   forwardzAlignVisionEmbeddings.forward   sA    <<-##H->>(+??8,r-   )
r$   r%   r&   r'   r   rs   r(   Tensorr   __classcell__r   s   @r.   rg   rg      s0    	40 	4ELL U\\ r-   rg   c                   .     e Zd Z	 	 	 	 	 	 	 d fd	Z xZS )AlignVisionDepthwiseConv2dc	                 @    ||z  }	t         
|   ||	|||||||	       y )N)	in_channelsout_channelsr`   rn   rk   dilationgroupsro   padding_mode)rr   rs   )rA   r   depth_multiplierr`   rn   rk   r   ro   r   r   r   s             r.   rs   z#AlignVisionDepthwiseConv2d.__init__   s=     #%55#%#% 	 
	
r-   )r   r   r   r   r   Tzeros)r$   r%   r&   rs   r   r   s   @r.   r   r      s$     
 
r-   r   c                   l     e Zd ZdZdedededef fdZdej                  dej                  fd	Z
 xZS )
AlignVisionExpansionLayerz_
    This corresponds to the expansion phase of each block in the original implementation.
    rU   in_dimrt   rn   c                     t         |           t        j                  ||ddd      | _        t        j
                  ||j                        | _        t        |j                     | _
        y )Nr   sameFr   r   r`   rk   ro   )num_featuresrp   )rr   rs   r   rv   expand_convrx   ry   	expand_bnr	   r|   
expand_act)rA   rU   r   rt   rn   r   s        r.   rs   z"AlignVisionExpansionLayer.__init__   sZ    99 
 W&BWBWX !2!23r-   r#   r:   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   rA   r#   s     r.   r   z!AlignVisionExpansionLayer.forward   s4    ((7}56r-   )r$   r%   r&   r'   r   r\   rs   r(   r)   r   r   r   r   s   @r.   r   r      sH    
40 
4# 
4 
4UX 
4U%6%6 5<< r-   r   c            
       p     e Zd ZdZdededededef
 fdZdej                  d	ej                  fd
Z xZS )AlignVisionDepthwiseLayerzk
    This corresponds to the depthwise convolution phase of each block in the original implementation.
    rU   r   rn   r`   adjust_paddingc                 b   t         |           || _        | j                  dk(  rdnd}t        ||      }t	        j
                  |      | _        t        ||||d      | _        t	        j                  ||j                  |j                        | _        t        |j                     | _        y )	NrX   rl   r   )ra   rj   Frm   r   rp   rq   )rr   rs   rn   re   r   ru   depthwise_conv_padr   depthwise_convrx   ry   rz   depthwise_normr	   r|   depthwise_act)	rA   rU   r   rn   r`   r   conv_padrk   r   s	           r.   rs   z"AlignVisionDepthwiseLayer.__init__   s     	"kkQ.7Fk.A"$,,w"?8FHSX
 !nnV%:%:VE_E_
 $F$5$56r-   r#   r:   c                     | j                   dk(  r| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S )NrX   )rn   r   r   r   r   r   s     r.   r   z!AlignVisionDepthwiseLayer.forward  sT    ;;! 33MBM++M:++M:**=9r-   r$   r%   r&   r'   r   r\   boolrs   r(   r)   r   r   r   r   s   @r.   r   r      sZ    7!7 7 	7
 7 7,	U%6%6 	5<< 	r-   r   c            	       n     e Zd ZdZd
dedededef fdZdej                  dej                  fd	Z xZS )AlignVisionSqueezeExciteLayerzl
    This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
    rU   r   
expand_dimexpandc                    t         |           |r|n|| _        t        dt	        ||j
                  z              | _        t        j                  d      | _	        t        j                  | j                  | j                  dd      | _        t        j                  | j                  | j                  dd      | _        t        |j                     | _        t        j                          | _        y )Nr   )output_sizer   )r   r   r`   rk   )rr   rs   dimr[   r\   squeeze_expansion_ratiodim_ser   AdaptiveAvgPool2dsqueezerv   reducer   r	   r|   
act_reduceSigmoid
act_expand)rA   rU   r   r   r   r   s        r.   rs   z&AlignVisionSqueezeExciteLayer.__init__$  s    !':V!S&*H*H!HIJ++:ii	
 ii	
 !!2!23**,r-   r#   r:   c                     |}| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }t        j                  ||      }|S r   )r   r   r   r   r   r(   mul)rA   r#   inputss      r.   r   z%AlignVisionSqueezeExciteLayer.forward9  sc    ]3M26M26		&-8r-   )Fr   r   s   @r.   r   r     sH    '0 '# '3 'X\ '*
U%6%6 
5<< 
r-   r   c                        e Zd ZdZdedededededef fdZd	e	j                  d
e	j                  de	j                  fdZ xZS )AlignVisionFinalBlockLayerz[
    This corresponds to the final phase of each block in the original implementation.
    rU   r   rt   rn   	drop_rateid_skipc                    t         |           |dk(  xr | | _        t        j                  ||ddd      | _        t        j                  ||j                  |j                        | _	        t        j                  |      | _        y )Nr   r   Fr   r   )p)rr   rs   apply_dropoutr   rv   project_convrx   ry   rz   
project_bnDropoutdropout)rA   rU   r   rt   rn   r   r   r   s          r.   rs   z#AlignVisionFinalBlockLayer.__init__K  sz     	#q[8[II 
 .. f&;&;fF`F`
 zzI.r-   
embeddingsr#   r:   c                     | j                  |      }| j                  |      }| j                  r| j                  |      }||z   }|S r   )r   r   r   r   )rA   r   r#   s      r.   r   z"AlignVisionFinalBlockLayer.forward\  sG    ))-86 LL7M)J6Mr-   r$   r%   r&   r'   r   r\   floatr   rs   r(   r)   r   r   r   r   s   @r.   r   r   F  sj    /'/14/?B/LO/\a/lp/"%"3"3 EDUDU Z_ZfZf r-   r   c                        e Zd ZdZdededededededed	ed
ef fdZde	j                  de	j                  fdZ xZS )AlignVisionBlocka  
    This corresponds to the block module of original the EfficientNet vision encoder implementation.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
        in_dim (`int`):
            Number of input channels.
        out_dim (`int`):
            Number of output channels.
        stride (`int`):
            Stride size to be used in convolution layers.
        expand_ratio (`int`):
            Expand ratio to set the output dimensions for the expansion and squeeze-excite layers.
        kernel_size (`int`):
            Kernel size for the depthwise convolution layer.
        drop_rate (`float`):
            Dropout rate to be used in the final phase of each block.
        id_skip (`bool`):
            Whether to apply dropout and sum the final hidden states with the input embeddings during the final phase
            of each block. Set to `True` for the first block of each stage.
        adjust_padding (`bool`):
            Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution
            operation, set to `True` for inputs with odd input sizes.
    rU   r   rt   rn   expand_ratior`   r   r   r   c
                 l   t         |           || _        | j                  dk7  | _        ||z  }
| j                  rt	        |||
|      | _        t        || j                  r|
n||||	      | _        t        |||
| j                        | _	        t        || j                  r|
n|||||      | _        y )Nr   )rU   r   rt   rn   )rU   r   rn   r`   r   )rU   r   r   r   )rU   r   rt   rn   r   r   )rr   rs   r   r   r   	expansionr   r   r   squeeze_exciter   
projection)rA   rU   r   rt   rn   r   r`   r   r   r   expand_in_dimr   s              r.   rs   zAlignVisionBlock.__init__  s     	(''1,-;;6fmFDN 8$(KK=V#)
 <&]4;;
 5$(KK=V
r-   r#   r:   c                     |}| j                   dk7  r| j                  |      }| j                  |      }| j                  |      }| j	                  ||      }|S Nr   )r   r   r   r   r   )rA   r#   r   s      r.   r   zAlignVisionBlock.forward  s[    "
! NN=9M++M: ++M:
MBr-   r   r   s   @r.   r   r   g  s    4'
!'
 '
 	'

 '
 '
 '
 '
 '
 '
R
U%6%6 
5<< 
r-   r   c                   V     e Zd ZdZdef fdZdej                  dee	   de
fdZ xZS )AlignVisionEncoderz
    Forward propagates the embeddings through each vision encoder (EfficientNet) block.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
    rU   c                     t                    |j                   _         fdt        |j                        }t        fd|j                  D              }d}g }t        |      D ]  }t        ||j                  |         }t        ||j                  |         }|j                  |   }	|j                  |   }
|j                  |   }t         |j                  |               D ]c  }|dk(  }|dkD  rdn|	}	|dkD  r|n|}||j                  v}|j                  |z  |z  }t        ||||	|
||||	      }|j!                  |       |dz  }e  t#        j$                  |       _        y )Nc                 Z    t        t        j                  j                  | z              S r   )r\   mathceildepth_coefficient)repeatsrA   s    r.   round_repeatsz2AlignVisionEncoder.__init__.<locals>.round_repeats  s"    tyy!7!7'!ABCCr-   c              3   .   K   | ]  } |        y wr   r,   )r?   nr   s     r.   rB   z.AlignVisionEncoder.__init__.<locals>.<genexpr>  s     Laq)Ls   r   r   )	rU   r   rt   rn   r`   r   r   r   r   )rr   rs   r   rM   r   sumnum_block_repeatsranger_   r   strideskernel_sizesexpand_ratiosdepthwise_paddingdrop_connect_rater   appendr   
ModuleListblocks)rA   rU   num_base_blocks
num_blockscurr_block_numr   ir   rt   rn   r`   r   jr   r   r   blockr   r   s   `                @r.   rs   zAlignVisionEncoder.__init__  s   !'!9!9	D f001L63K3KLL
' 	$A"66+=+=a+@AF#FF,?,?,BCG^^A&F --a0K!//2L=)A)A!)DEF $q&!e$%Ev!/v7O7O!O"44~E
R	(!!#! +!-'##1
 e$!#'$	$8 mmF+r-   r#   kwargsr:   c                 L    | j                   D ]
  } ||      } t        |      S N)r"   )r   r   )rA   r#   r   r   s       r.   r   zAlignVisionEncoder.forward  s3    
 [[ 	1E!-0M	1 .+
 	
r-   )r$   r%   r&   r'   r   rs   r(   r)   r   r   r   r   r   r   s   @r.   r   r     sC    ),0 ),V

((

 +,

 
(	

r-   r   c                        e Zd ZdZ fdZ	 	 	 	 d
dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  f
d	Z	 xZ
S )AlignTextEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       | j#                  dt%        j*                  | j,                  j/                         t$        j0                        d       y )	N)padding_idxrp   position_idsr   F)
persistenttoken_type_ids)dtype)rr   rs   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr   hidden_dropout_probr   register_bufferr(   rL   r   r   r   sizelongr~   s     r.   rs   zAlignTextEmbeddings.__init__  s   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
r-   N	input_idsr   r   inputs_embedsr:   c                 6   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }|}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }	||	z   }
| j                  |      }|
|z  }
| j                  |
      }
| j                  |
      }
|
S )Nr   r   r   r   )r   rH   )r  r   hasattrr   r   r(   r   r  rH   r  r  r  r	  r   )rA   r  r   r   r  input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr  r   r  s               r.   r   zAlignTextEmbeddings.forward  s1     #..*K',,.s3K ^
,,Q^<L
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D))
^^J/
\\*-
r-   )NNNN)r$   r%   r&   r'   rs   r(   
LongTensorr)   r   r   r   r   s   @r.   r   r     s~    Q
$ .2260426&##d*& ((4/& &&-	&
 ((4/& 
&r-   r   modulequerykeyvalueattention_maskscalingr   c                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )NrX   r   r   )r   r   )r   trainingr   )r(   matmul	transposer   rJ   softmaxfloat32tor   r   r  
contiguous)
r  r  r  r  r  r  r   r   attn_weightsattn_outputs
             r.   eager_attention_forwardr(  5  s     <<s}}Q':;gEL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r-   c                        e Zd Z fdZ	 ddej
                  dej                  dz  dee   de	ej
                  ej
                  dz  f   fdZ
 xZS )	AlignTextSelfAttentionc                 $   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                         | _        |j                   | _        | j                  dz  | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()g      )rr   rs   r  num_attention_headsr  
ValueErrorrU   r\   attention_head_sizeall_head_sizer   Linearr  r  r  r   attention_probs_dropout_probr   attention_dropoutr  r~   s     r.   rs   zAlignTextSelfAttention.__init__L  sC    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF!'!D!D//5r-   Nr#   r  r   r:   c                 x   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }t        j                  | j                  j                  t              }	 |	| ||||f| j                  sdn| j                  | j                  d|\  }
} |
j                  g |d j!                         }
|
|fS )Nr   r   rX           )r   r  )shaper0  r  viewr!  r  r  r   get_interfacerU   _attn_implementationr(  r  r4  r  reshaper%  )rA   r#   r  r   r  hidden_shapequery_states
key_statesvalue_statesattention_interfacer'  r&  s               r.   r   zAlignTextSelfAttention.forwarda  s>    $))#2.CCbC$*B*BCzz-055lCMMaQRSXXm,11,?II!QO
zz-055lCMMaQRS(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHL((r-   r   )r$   r%   r&   rs   r(   r   r)   r   r   r+   r   r   r   s   @r.   r*  r*  K  sd    60 48)||) ))D0) +,	)
 
u||U\\D00	1)r-   r*  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )AlignTextSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr   )rr   rs   r   r2  r  denser	  r
  r   r  r   r~   s     r.   rs   zAlignTextSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r-   r#   input_tensorr:   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   rE  r   r	  rA   r#   rF  s      r.   r   zAlignTextSelfOutput.forward  7    

=1]3}|'CDr-   r$   r%   r&   rs   r(   r   r   r   r   s   @r.   rB  rB    1    >U\\  RWR^R^ r-   rB  c            	            e Zd Z fdZ	 ddej
                  dej                  dz  dee   dej
                  fdZ	 xZ
S )	AlignTextAttentionc                 b    t         |           t        |      | _        t	        |      | _        y r   )rr   rs   r*  rA   rB  outputr~   s     r.   rs   zAlignTextAttention.__init__  s&    *62	)&1r-   Nr#   r  r   r:   c                 ^    |} | j                   |fd|i|\  }}| j                  ||      }|S Nr  )rA   rP  )rA   r#   r  r   residual_s         r.   r   zAlignTextAttention.forward  sK     !$499
)
 
q
 M8<r-   r   )r$   r%   r&   rs   r(   r   r)   r   r   r   r   r   s   @r.   rN  rN    sQ    2 48|| ))D0 +,	
 
r-   rN  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AlignTextIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )rr   rs   r   r2  r  intermediate_sizerE  rc   r|   strr	   intermediate_act_fnr~   s     r.   rs   zAlignTextIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r-   r#   r:   c                 J    | j                  |      }| j                  |      }|S r   )rE  rZ  r   s     r.   r   zAlignTextIntermediate.forward  s&    

=100?r-   rK  r   s   @r.   rV  rV    s#    9U\\ ell r-   rV  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )AlignTextOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y rD  )rr   rs   r   r2  rX  r  rE  r	  r
  r   r  r   r~   s     r.   rs   zAlignTextOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r-   r#   rF  r:   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   rH  rI  s      r.   r   zAlignTextOutput.forward  rJ  r-   rK  r   s   @r.   r]  r]    rL  r-   r]  c            	            e Zd Z fdZ	 d	dej
                  dej                  dz  dee   dej
                  fdZ	d Z
 xZS )
AlignTextLayerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y r   )
rr   rs   chunk_size_feed_forwardseq_len_dimrN  	attentionrV  intermediater]  rP  r~   s     r.   rs   zAlignTextLayer.__init__  sI    '-'E'E$+F31&9%f-r-   Nr#   r  r   r:   c                      | j                   |fd|i|}t        | j                  | j                  | j                  |      }|S rR  )re  r   feed_forward_chunkrc  rd  )rA   r#   r  r   s       r.   r   zAlignTextLayer.forward  sY     '
)
 
 2##T%A%A4CSCSUb
 r-   c                 L    | j                  |      }| j                  ||      }|S r   )rf  rP  )rA   attention_outputintermediate_outputlayer_outputs       r.   rh  z!AlignTextLayer.feed_forward_chunk  s,    "//0@A{{#68HIr-   r   )r$   r%   r&   rs   r(   r   r)   r   r   r   rh  r   r   s   @r.   ra  ra    sV    . 48|| ))D0 +,	
 
$r-   ra  c            	       n     e Zd Z fdZ	 ddej
                  dej                  dz  dee   de	fdZ
 xZS )	AlignTextEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
rr   rs   rU   r   r   r   num_hidden_layersra  layergradient_checkpointing)rA   rU   r   r   s      r.   rs   zAlignTextEncoder.__init__  sN    ]]E&JbJbDc#dqN6$:#de
&+# $es   A#Nr#   r  r   r:   c                 P    | j                   D ]  } |||fi |} t        |      S r   )rq  r   )rA   r#   r  r   layer_modules        r.   r   zAlignTextEncoder.forward  sC     !JJ 	L( M	 +
 	
r-   r   )r$   r%   r&   rs   r(   r   r)   r   r   r   r   r   r   s   @r.   rn  rn    sM    , 48
||
 ))D0
 +,	

 

r-   rn  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AlignTextPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )rr   rs   r   r2  r  rE  Tanhr}   r~   s     r.   rs   zAlignTextPooler.__init__  s9    YYv1163E3EF
'')r-   r#   r:   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )rE  r}   )rA   r#   first_token_tensorpooled_outputs       r.   r   zAlignTextPooler.forward  s6     +1a40

#566r-   rK  r   s   @r.   rv  rv     s#    $
U\\ ell r-   rv  c                   l    e Zd ZU eed<   dZdZdZ ej                         de
j                  fd       Zy)AlignPreTrainedModelrU   align)imagetextTr  c                 "   | j                   j                  }t        |t        j                  t        j
                  f      rPt        j                  |j                  d|       |j                  Ct        j                  |j                         n"t        |t              rt        j                  |j                  j                         t        j                  |j                  j                         t        j                  |j                  | j                   j                          nt        |t        j"                        rqt        j                  |j                  d|       |j$                  Ct'        |j                  dd      s,t        j                  |j                  |j$                            t        |t        j(                  t        j*                  f      rt        j                  |j                         t        j,                  |j                         t'        |dd      ^t        j                  |j.                         t        j,                  |j0                         t        j                  |j2                         yyt        |t4              ryt        j6                  |j8                  t;        j<                  |j8                  j>                  d         jA                  d             t        j                  |jB                         yy)	zInitialize the weightsr6  )meanstdN_is_hf_initializedFrunning_meanr   r   )"rU   initializer_rangerc   r   r2  rv   initnormal_weightro   zeros_
AlignModelxavier_uniform_text_projection	constant_temperaturetemperature_init_valuer   r   r=   r	  rx   ones_r  running_varnum_batches_trackedr   copy_r   r(   rL   r7  r   r   )rA   r  r  s      r.   _init_weightsz"AlignPreTrainedModel._init_weights  s    kk++fryy"))45LLSc:{{&FKK(
+  !7!7!>!>?KK..334NN6--t{{/Q/QR-LLSc:!!-gfmmMach6iFMM&*<*<=>fr||R^^<=KK$JJv}}%v~t4@F//0

6--.F667 A  34JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 5r-   N)r$   r%   r&   r   r*   base_model_prefixinput_modalitiessupports_gradient_checkpointingr(   no_gradr   Moduler  r,   r-   r.   r}  r}    s?    (&*#U]]_/BII / /r-   r}  zJ
    The text model from ALIGN without any head or projection on top.
    c                   2    e Zd ZU eed<   dZdgZeedZ	ddede
f fdZd Zd Zeee	 	 	 	 	 dd
ej$                  d	z  dej$                  d	z  dej$                  d	z  dej$                  d	z  dej$                  d	z  dee   deez  fd                     Z xZS )AlignTextModelrU   )r  r   )r#   r2   add_pooling_layerc                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rr   rs   rU   r   r   rn  encoderrv  pooler	post_init)rA   rU   r  r   s      r.   rs   zAlignTextModel.__init__A  sM    
 	 -f5'/1Bof- 	r-   c                 .    | j                   j                  S r   r   r  rD   s    r.   get_input_embeddingsz#AlignTextModel.get_input_embeddingsQ  s    ...r-   c                 &    || j                   _        y r   r  )rA   r  s     r.   set_input_embeddingsz#AlignTextModel.set_input_embeddingsT  s    */'r-   Nr  r  r   r   r  r   r:   c                    ||t        d      |#| j                  ||       |j                         }n!||j                         dd }nt        d      |\  }}	||j                  n|j                  }
|t	        j
                  ||	f|
      }| j                  ||      }| j                  ||||      } | j                  |fd|i|}|d   }| j                  | j                  |      nd}t        ||	      S )
a-  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AlignTextModel

        >>> model = AlignTextModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsrG   )r  r   r   r  r  r   r"   pooler_output)r/  %warn_if_padding_and_no_attention_maskr  rH   r(   onesget_extended_attention_maskr   r  r  r   )rA   r  r  r   r   r  r   r  
batch_sizer  rH   extended_attention_maskembedding_outputencoder_outputssequence_outputr{  s                   r.   r   zAlignTextModel.forwardW  s8   6  ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN 150P0PQ_al0m??%)'	 + 
 '$,,
2
 

 *!,8<8OO4UY)-'
 	
r-   TNNNNN)r$   r%   r&   r   r*   r  _no_split_modulesra  r*  _can_record_outputsr   rs   r  r  r   r   r   r(   r   r   r   r+   r   r   r   r   s   @r.   r  r  3  s      ./',
 4  /0   *..2.2,0-1=
<<$&=
 t+=
 t+	=

 llT)=
 ||d*=
 +,=
 
+	+=
    =
r-   r  zL
    The vision model from ALIGN without any head or projection on top.
    c                        e Zd ZU eed<   dZdZdZdZdgZ	de
iZdef fdZeee	 ddej"                  d	z  d
ee   deez  fd                     Z xZS )AlignVisionModelrU   r   )r  Frw   r   r#   c                    t         |   |       || _        t        |      | _        t        |      | _        |j                  dk(  r't        j                  |j                  d      | _        nN|j                  dk(  r't        j                  |j                  d      | _        nt        d|j                         | j                          y )Nr  T)	ceil_moder[   z2config.pooling must be one of ['mean', 'max'] got )rr   rs   rU   rg   r   r   r  pooling_typer   	AvgPool2d
hidden_dimr  	MaxPool2dr/  poolingr  r~   s     r.   rs   zAlignVisionModel.__init__  s     /7)&1 &(,,v'8'8DIDK  E),,v'8'8DIDKQRXR`R`Qabcc 	r-   Nr   r:   c                     |t        d      | j                  |      } | j                  |fi |}|d   }| j                  |      }|j	                  |j
                  dd       }t        ||      S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, AlignVisionModel

        >>> model = AlignVisionModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```Nz You have to specify pixel_valuesr   rX   r  )r/  r   r  r  r;  r7  r   )rA   r   r   r  r  r"   r{  s          r.   r   zAlignVisionModel.forward  s    < ?@@??<8&$,,

 ,A.$56%--m.A.A"1.EF7/'
 	
r-   r   )r$   r%   r&   r   r*   main_input_namer  r  _input_embed_layerr  r   r  rs   r   r   r   r(   r)   r   r   r+   r   r   r   r   s   @r.   r  r    s     $O!&+#&+,)0 "   26*
''$.*
 +,*
 
9	9	*
    *
r-   r  c                   H    e Zd ZU eed<   def fdZee	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e
e   d
eez  fd              Zeedej                  d	e
e   d
eez  fd              Zee	 	 	 	 	 	 	 ddej"                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  d	e
e   d
eez  fd              Z xZS )r  rU   c                    t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  | _	        |j                  | _        t        |      | _        t        |      | _        t!        j"                  | j                  | j                        | _        t!        j&                  t)        j*                  | j,                  j.                              | _        | j3                          y )NzLconfig.text_config is expected to be of type AlignTextConfig but is of type .zPconfig.vision_config is expected to be of type AlignVisionConfig but is of type )rr   rs   rc   text_configr   	TypeErrortypevision_configr   projection_dimr  text_embed_dimr  
text_modelr  vision_modelr   r2  r  	Parameterr(   tensorrU   r  r  r  )rA   rU   r  r  r   s       r.   rs   zAlignModel.__init__  s#    &,,o>++,-Q0 
 &..0AB--./q2 
 ((,,$33)55(5,];!yy)<)<d>Q>QR<<T[[5W5W(XY 	r-   Nr  r  r   r   r  r   r:   c           	           | j                   d|||||d|}|d   dddddf   }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```r  r  r   r   r  r   Nr,   )r  r  r  )	rA   r  r  r   r   r  r   text_outputsr"   s	            r.   get_text_featureszAlignModel.get_text_features  sd    2 4C4?? 4
))%'4
 4
 )OAq!G4%)%9%9:K%L"r-   r   c                 *     | j                   dd|i|S )a}  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AlignModel
        >>> from transformers.image_utils import load_image

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```r   r,   )r  )rA   r   r   s      r.   get_image_featureszAlignModel.get_image_features3  s    . !t  ElEfEEr-   return_lossc           	          | j                   dd|i|}	 | j                  d|||||d|}
|	d   }|
d   dddddf   }| j                  |      }||j                  ddd	      z  }||j                  ddd	      z  }t	        j
                  ||j                               | j                  z  }|j                         }d}|rt        |      }t        ||||||
|	
      S )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AlignModel
        >>> from transformers.image_utils import load_image

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     images=image, text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```r   r  r   r   NrX   r   T)r   r   keepdim)r5   r6   r7   r1   r!   r8   r9   r,   )
r  r  r  normr(   r   rQ   r  rT   r4   )rA   r  r   r  r   r   r  r  r   vision_outputsr  r!   r1   r7   r6   r5   s                   r.   r   zAlignModel.forwardL  s(   N +** 
%


 't 
))%'
 
 &a("1oaAg.**;7 $l&7&7!T&7&RR!K$4$4qb$$4$OO  ,,{LNN4DEHXHXX*,,.o.D-+#%* .
 	
r-   r  )NNNNNNN)r$   r%   r&   r   r*   rs   r   r   r(   r   r   r   r+   r   r  r)   r  r  r   r4   r   r   r   s   @r.   r  r    s   { <  *..2.2,0-1"<<$&" t+" t+	"
 llT)" ||d*" +," 
+	+"  "H F!--F9?@R9SF	+	+F  F.  .215.2.2,0-1#'K
##d*K
 ''$.K
 t+	K

 t+K
 llT)K
 ||d*K
 D[K
 +,K
 
	K
  K
r-   r  )r}  r  r  r  r  )r6  )Sr'   r   collections.abcr   dataclassesr   typingr   r(   r    r   r  activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_alignr   r   r   
get_loggerr$   loggerr    r0   r4   r   rN   rT   r\   r_   r+   r   re   r  rg   rv   r   r   r   r   r   r   r   r   r   r(  r*  rB  rN  rV  r]  ra  rn  rv  r}  r  r  r  __all__r,   r-   r.   <module>r     sZ     $ !    & ! 9  G & 6 M M I 5 P P 
		H	% 
:[ : : 
	7; 	7 	7  
+  
   
JuU\\ uell u-5<< -ELL -+ 3  @S5[ @$ @*BII 4
 
6		 6$		 $P$BII $N BNryy Nb>
 >
B9")) 9F %II%<<% 
% <<	%
 LL4'% % %,3)RYY 3)n"))  .BII  bii / B
ryy 
4bii   /?  /  /F 
_
) _

_
D 
I
+ I

I
X m
% m
 m
` Wr-   