
    i(                       d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddl	m
Z
 dd	lmZ dd
lmZmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4  e*jj                  e6      Z7dZ8e e)d       G d de                    Z9e e)d       G d de                    Z: G d d ejv                        Z< G d! d"ejv                        Z= G d# d$ejv                        Z> G d% d&ejv                        Z? G d' d(ejv                        Z@ G d) d*ejv                        ZA G d+ d,ejv                        ZB G d- d.ejv                        ZC G d/ d0ejv                        ZD	 	 ddd1ejv                  d2ej                  d3ej                  d4ej                  d5ej                  dz  d6eFdz  d7eFd8e$e(   fd9ZG G d: d;ejv                        ZH G d< d=ejv                        ZI G d> d?ejv                        ZJ G d@ dAejv                        ZK G dB dCe      ZL G dD dEejv                        ZM G dF dGejv                        ZNe) G dH dIe"             ZO G dJ dKeO      ZP e)dL       G dM dNeO             ZQ e)dO       G dP dQeO             ZR G dR dSejv                        ZS G dT dUejv                        ZT G dV dWejv                        ZU e)dX       G dY dZeO             ZV e)d[       G d\ d]eO             ZW G d^ d_ejv                        ZX e)d`       G da dbeO             ZYg dcZZy)ezPyTorch BridgeTower Model    )OrderedDict)Callable)	dataclassN)nn)CrossEntropyLoss   )initialization)ACT2FNQuickGELUActivation)CacheDynamicCacheEncoderDecoderCache)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputModelOutputSequenceClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)TransformersKwargsauto_docstringlogging	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )BridgeTowerConfigBridgeTowerTextConfigBridgeTowerVisionConfigRobertaTokenizerz.
    Output type of [`BridgeTowerModel`].
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
eej                     dz  ed<   dZeej                     dz  ed<   y)BridgeTowerModelOutputa  
    text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_size)`):
        Sequence of hidden-states at the text output of the last layer of the model.
    image_features (`torch.FloatTensor` of shape `(batch_size, image_sequence_length, hidden_size)`):
        Sequence of hidden-states at the image output of the last layer of the model.
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size x 2)`):
        Concatenation of last layer hidden-state of the first token of the text and image sequence (classification
        token), respectively, after further processing through layers used for auxiliary pretraining tasks.
    Ntext_featuresimage_featurespooler_outputhidden_states
attentions)__name__
__module____qualname____doc__r*   torchFloatTensor__annotations__r+   r,   r-   tupler.        /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/bridgetower/modeling_bridgetower.pyr)   r)   2   s|     /3M5$$t+2/3NE%%,3.2M5$$t+259M5**+d2926Je''(4/6r8   r)   z>
    Output type of ['BridgeTowerForContrastiveLearning']
    c                   H   e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                     dz  ed<   dZe
ej                     dz  ed<   dZe
ej                     dz  ed<   dZe
ej                     dz  ed<   dZe
ej                     dz  ed	<   y)
BridgeTowerContrastiveOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Image-text contrastive loss.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    text_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    image_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    cross_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
        The text-image cross-modal embeddings obtained by applying the projection layer to the pooler_output.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
    Nlosslogitstext_embedsimage_embedscross_embedsr-   r.   )r/   r0   r1   r2   r<   r3   r4   r5   r=   r>   r6   r?   r@   r-   r.   r7   r8   r9   r;   r;   J   s      &*D%

d
")'+FE$+37Ku(()D0748L%))*T1848L%))*T1859M5**+d2926Je''(4/6r8   r;   c                        e Zd Z fdZdej
                  dej
                  fdZddej
                  dej
                  dz  fdZ xZS )BridgeTowerResidualAttentionc                 h   t         |           t        j                  |j                  |j                  dz        | _        t        j                  |j                  |j                        | _        t        j                  t        dt        j                  |j                  |j                  dz        fdt               fdt        j                  |j                  dz  |j                        fg            | _        t        j                  |j                  |j                        | _        d | _        y )N@   epsc_fc   geluc_proj)super__init__r   MultiheadAttentionhidden_sizeattn	LayerNormlayer_norm_epsln_1
ModuleDictr   Linearr   mlpln_2	attn_maskselfconfig	__class__s     r9   rL   z%BridgeTowerResidualAttention.__init__k   s    ))&*<*<f>P>PTV>VW	LL!3!39N9NO	==RYYv'9'96;M;MPQ;QRS023ryy););a)?ASASTU
 LL!3!39N9NO	r8   hidden_stateattention_maskc                 ,   |+|j                  t        j                  |j                        }| j                  1| j                  j                  |j
                  |j                        nd | _        | j                  |||d| j                  |      d   S )NdtypedeviceF)need_weightsrW   key_padding_maskr   )tor3   boolra   rW   r`   rO   )rY   r\   r]   s      r9   	attentionz&BridgeTowerResidualAttention.attention|   s    %+..UZZH[H[.\N ~~) NNL$6$6|?R?RS 	
 yynn+  
  	r8   Nc                     || j                  | j                  |      |      z   }| j                  |      }| j                  j	                         D ]
  } ||      } ||z   }|S N)rf   rR   rV   rU   values)rY   r\   r]   residual_statelayers        r9   forwardz$BridgeTowerResidualAttention.forward   sc    %tyy7NP^(__yy0XX__& 	/E .L	/%4r8   rh   )	r/   r0   r1   rL   r3   Tensorrf   rl   __classcell__r[   s   @r9   rB   rB   j   sD    "ell ELL "ELL %,,QUBU r8   rB   c                   ^     e Zd Z fdZddej
                  dej
                  dz  fdZ xZS )BridgeTowerTransformerc                    t         |           |j                  | _        |j                  | _        |j                  rHt        j                  t        | j                  dz
        D cg c]  }t        |       c}      | _	        nDt        j                  t        | j                        D cg c]  }t        |       c}      | _	        |j                  | _
        y c c}w c c}w )Nr"   )rK   rL   rN   num_hidden_layersremove_last_layerr   
ModuleListrangerB   	resblocksstop_gradientrY   rZ   _r[   s      r9   rL   zBridgeTowerTransformer.__init__   s    !--!'!9!9##]]?DTE[E[^_E_?`a!-f5aDN  ]]?DTE[E[?\]!-f5]DN $11 b ^s   'C,C!Nr\   r]   c                     g }| j                   D ]H  } |||      }| j                  r |j                  |j                                8|j                  |       J |S rh   )rw   rx   appenddetach)rY   r\   r]   r-   blocks        r9   rl   zBridgeTowerTransformer.forward   s\    ^^ 	3E ~>L!!$$\%8%8%:;$$\2	3 r8   rh   r/   r0   r1   rL   r3   rm   rl   rn   ro   s   @r9   rq   rq      s)    2ELL %,,QUBU r8   rq   c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )BridgeTowerVisionEmbeddingsrZ   c                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebias   r"   position_idsr"   
persistent)rK   rL   rZ   rN   	embed_dim
image_size
patch_sizer   	Parameterr3   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferarangeexpandrX   s     r9   rL   z$BridgeTowerVisionEmbeddings.__init__   s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr8   
embeddingsheightwidthreturnc                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r"   r   Nr         ?r   r   bicubicF)sizemodealign_cornersdim)shaper   weight	unsqueezer3   jit
is_tracingr   r   r   reshapepermuter   
functionalinterpolateviewcat)rY   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r9   interpolate_pos_encodingz4BridgeTowerVisionEmbeddings.interpolate_pos_encoding   sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr8   pixel_valuesc                 `   |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  j                  j
                  }| j                  |j                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }	t        j                  |	|gd	      }
|r|
| j                  |
||      z   }
|
S |
| j                  | j                        z   }
|
S )
NzInput image size (*z) doesn't match model (z).r`   r   r"   r   r   )r   r   
ValueErrorr   r   r`   rd   flatten	transposer   r   r3   r   r   r   r   )rY   r   r   
batch_sizerz   r   r   target_dtypepatch_embedsclass_embedsr   s              r9   rl   z#BridgeTowerVisionEmbeddings.forward   s6   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr8   F)r/   r0   r1   r%   rL   r3   rm   intr   r4   rl   rn   ro   s   @r9   r   r      se    q6 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf r8   r   c                        e Zd Z fdZ	 ddej
                  defdZ	 ddej
                  defdZdej
                  fdZ	 xZ
S )	BridgeTowerVisionTransformerc           	      0   t         |           t        |      | _        t	        j
                  |j                  |j                        | _        t        |      | _
        t	        j
                  |j                  |j                        | _        |j                  | _        |j                  set	        j                  t        |j                        D cg c]-  }t	        j
                  |j                  |j                        / c}      | _        y y c c}w NrE   )rK   rL   r   r   r   rP   rN   rQ   ln_prerq   transformerln_postshare_layernormru   rv   rs   ln_separatery   s      r9   rL   z%BridgeTowerVisionTransformer.__init__  s    5f=ll6#5#56;P;PQ1&9||F$6$6F<Q<QR%55%%!}}V[\b\t\tVuvQRf00f6K6KLv D &vs   2Dr   r   c                    | j                  ||      }| j                  |      }|j                  ddd      }| j                  ||      }t	        j
                  |d      }|j                  dddd      }| j                  r| j                  |      }|S g }t        || j                        D ]  \  }} ||      }|j                  |         t	        j
                  |d      }|S )Nr"   r   r   r   r   )r   r   r   r   r3   stackr   r   zipr   r|   )rY   r   r]   r   r-   hidden_states_stacklns          r9   rl   z$BridgeTowerVisionTransformer.forward  s     6NOM2%--aA6((GMq9%--aAq9 LL7M  #%%(8H8H%I :!r "= 1#**=9: "KK(;CMr8   c                 t    | j                  ||      }| j                  |      }|j                  ddd      }|S )Nr   r"   r   r   )r   r   r   )rY   r   r   r-   s       r9   forward_prez(BridgeTowerVisionTransformer.forward_pre-  s?    
 OghM2%--aA6r8   r\   c                 N    |j                  ddd      }| j                  |      }|S )Nr"   r   r   )r   r   )rY   r\   visual_output_posts      r9   forward_postz)BridgeTowerVisionTransformer.forward_post8  s-    )11!Q:!\\*<=!!r8   r   )r/   r0   r1   rL   r3   rm   re   rl   r   r   rn   ro   s   @r9   r   r     sX    " */	ll #'	< */	ll	 #'	" "r8   r   c                   $     e Zd Z fdZd Z xZS )BridgeTowerLinkTowerc                    t         |           |j                  | _        |j                  | _        |j                  dv r|j                  dk(  r.t	        j
                  t        j                  d            | _        n<|j                  dk(  r-t	        j
                  t        j                  d            | _	        t	        j                  | j                  |j                        | _
        y t        d|j                   d      )	N)add
scaled_addr   r   g      ?r   r   rE   link_tower_type  is not implemented)rK   rL   link_tower_typerN   r   r   r3   tensorscaled_factorbetarP   rQ   NotImplementedErrorrX   s     r9   rL   zBridgeTowerLinkTower.__init__?  s    %55!--!!%II%%5%'\\%,,s2C%D"''=8LLc):;	\\$*:*:@U@UVDN%(89O9O8PPc&deer8   c                 Z   | j                   dk(  r| j                  ||z         S | j                   dk(  r!| j                  || j                  z  |z         S | j                   dk(  r1| j                  |d| j                  z
  z  || j                  z  z         S t	        d| j                    d      )Nr   r   r   r"   r   r   )r   rP   r   r   r   )rY   r-   cross_modal_hidden_statesr]   s       r9   rl   zBridgeTowerLinkTower.forwardL  s    5(>>-2K"KLL!!\1>>-$2D2D"DG`"`aa!!]2>>-1tyy="AD]`d`i`iDi"ijj%(89M9M8NNa&bccr8   r/   r0   r1   rL   rl   rn   ro   s   @r9   r   r   >  s    fdr8   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )BridgeTowerSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y r   )rK   rL   r   rT   rN   denserP   rQ   Dropouthidden_dropout_probdropoutrX   s     r9   rL   zBridgeTowerSelfOutput.__init__Y  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r8   r-   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rh   r   r   rP   rY   r-   r   s      r9   rl   zBridgeTowerSelfOutput.forward_  7    

=1]3}|'CDr8   r   ro   s   @r9   r   r   X  1    >U\\  RWR^R^ r8   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )BridgeTowerIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y rh   )rK   rL   r   rT   rN   intermediate_sizer   
isinstance
hidden_actstrr
   intermediate_act_fnrX   s     r9   rL   z BridgeTowerIntermediate.__init__h  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r8   r-   r   c                 J    | j                  |      }| j                  |      }|S rh   )r   r   rY   r-   s     r9   rl   zBridgeTowerIntermediate.forwardp  s&    

=100?r8   r   ro   s   @r9   r   r   g  s#    9U\\ ell r8   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )BridgeTowerOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )rK   rL   r   rT   r   rN   r   rP   rQ   r   r   r   rX   s     r9   rL   zBridgeTowerOutput.__init__x  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r8   r-   r   r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rh   r   r   s      r9   rl   zBridgeTowerOutput.forward~  r   r8   r   ro   s   @r9   r  r  w  r   r8   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )BridgeTowerPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y rh   )rK   rL   r   rT   rN   r   Tanh
activationrX   s     r9   rL   zBridgeTowerPooler.__init__  s9    YYv1163E3EF
'')r8   r-   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r
  )rY   r-   first_token_tensorpooled_outputs       r9   rl   zBridgeTowerPooler.forward  s6     +1a40

#566r8   r   ro   s   @r9   r  r    s#    $
U\\ ell r8   r  modulequerykeyvaluer]   scalingr   kwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr         r   r   r   )ptrainingr"   )
r   r3   matmulr   r   r   softmaxr   r  
contiguous)
r  r  r  r  r]   r  r   r  attn_weightsattn_outputs
             r9   eager_attention_forwardr    s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r8   c                        e Zd Zd	 fd	Z	 	 d
dej
                  dej                  dz  dedz  dee	   de
ej
                     f
dZ xZS )BridgeTowerSelfAttentionNc                 @   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        | j                  dz  | _
        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                   |j"                        | _        |j&                  | _        || _        || _        y Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r  )rK   rL   rN   num_attention_headshasattrr   rZ   r   attention_head_sizeall_head_sizer  r   rT   r  r  r  r   attention_probs_dropout_probr   
is_decoder	is_causal	layer_idxrY   rZ   r*  r+  r[   s       r9   rL   z!BridgeTowerSelfAttention.__init__  sP    : ::a?PVXhHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF ++""r8   r-   r]   past_key_valuesr  r   c                     |j                   d d }g |d| j                  } | j                  |      j                  | j	                  dd      } | j                  |      j                  | j	                  dd      } | j                  |      j                  | j	                  dd      }	|>|}
t        |t              r|j                  }
|
j                  ||	| j                        \  }}	t        j                  | j                  j                  t               } || |||	|f| j"                  sdn| j$                  j&                  | j(                  d|\  }} |j*                  g |d j-                         }||fS )Nr   r"   r           r   r  )r   r&  r  r   r   r  r  r   r   self_attention_cacheupdater+  r   get_interfacerZ   _attn_implementationr  r  r   r  r  r   r  )rY   r-   r]   r-  r  input_shapehidden_shapequery_layer	key_layervalue_layercurrent_past_key_valuesattention_interfacer  r  s                 r9   rl   z BridgeTowerSelfAttention.forward  s    $))#2.CCbC$*B*BC 5djj/44lCMMaQRS0DHH]+00,?II!QO	4djj/44lCMMaQRS&&5#/+>?*9*N*N' &=%C%CI{\`\j\j%k"I{(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
!\ *k));;;;FFHL((r8   FN)NNr/   r0   r1   rL   r3   rm   r4   r   r   r   r6   rl   rn   ro   s   @r9   r  r    sg    #6 48(,	')||') ))D0') 	')
 +,') 
u||	')r8   r  c                        e Zd Zd
 fd	Z	 	 	 ddej
                  dej                  dz  dej                  dz  dedz  dee	   de
ej
                     fd	Z xZS )BridgeTowerCrossAttentionNc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        | j                  dz  | _
        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                   |j"                        | _        || _        || _        y r!  )rK   rL   rN   r$  r%  r   rZ   r   r&  r'  r  r   rT   r  r  r  r   r(  r   r*  r+  r,  s       r9   rL   z"BridgeTowerCrossAttention.__init__  sC    : ::a?PVXhHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF""r8   r-   encoder_hidden_statesr]   r-  r  r   c                 f   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }|%|j
                  j                  | j                        nd}	|]|	r[|j                  j                  | j                     j                  }
|j                  j                  | j                     j                  }ng |j                   d d d| j                  }| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|C|j                  j                  |
|| j                        \  }
}d|j
                  | j                  <   t        j                   | j"                  j$                  t&              } || ||
||f| j(                  sdn| j*                  j,                  | j.                  d|\  }} |j0                  g |d j3                         }||fS )Nr   r"   r   FTr/  r0  )r   r&  r  r   r   
is_updatedgetr+  cross_attention_cachelayerskeysri   r  r  r2  r   r3  rZ   r4  r  r  r   r  r  r   r  )rY   r-   rA  r]   r-  r  r5  r6  r7  rC  r8  r9  kv_shaper;  r  r  s                   r9   rl   z!BridgeTowerCrossAttention.forward  s    $))#2.CCbC$*B*BC jj/44\BLLQPQRGVGb_//33DNNChm
&:'==DDT^^TYYI)??FFt~~V]]KX.44Sb9X2Xt?W?WXH!67<<XFPPQRTUVI**%:;@@JTTUVXYZK*)8)N)N)U)U{DNN*&	; >B**4>>:(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
!\ *k));;;;FFHL((r8   r<  NNN)r/   r0   r1   rL   r3   rm   r4   r   r   r   r6   rl   rn   ro   s   @r9   r?  r?    s    #4 ;?376:1)||1)  %00471) ))D0	1)
 -t31) +,1) 
u||	1)r8   r?  c                        e Zd Zd fd	Z	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dedz  dee	   d	e
ej
                     fd
Z xZS )BridgeTowerAttentionNc                     t         |           || _        |rt        nt        } ||||      | _        t        |      | _        y )Nr*  r+  )rK   rL   is_cross_attentionr?  r  rY   r   output)rY   rZ   r*  r+  rN  attention_classr[   s         r9   rL   zBridgeTowerAttention.__init__E  s=    "47I3Og#Fi9U	+F3r8   r-   r]   rA  encoder_attention_maskr-  r  r   c                     | j                   s|n|} | j                  |f|||d|\  }}| j                  ||      }||fS )N)rA  r]   r-  )rN  rY   rO  )	rY   r-   r]   rA  rQ  r-  r  attention_outputr  s	            r9   rl   zBridgeTowerAttention.forwardL  sd     04/F/FLb)2*
"7)+	*

 *
&,  ;;'7G--r8   )FNFNNNNr=  ro   s   @r9   rK  rK  D  s    4 48:>;?(,.||. ))D0.  %0047	.
 !& 1 1D 8. . +,. 
u||	.r8   rK  c                   @     e Zd Zd fd	Z	 	 	 ddee   fdZd Z xZS )BridgeTowerBertCrossLayerc                 $   t         |           |j                  | _        d| _        t	        |d|      | _        |j                  | _        |j                  | _        t	        |d|d      | _        t        |      | _
        t        |      | _        y )Nr"   TrM  Fr*  r+  rN  )rK   rL   chunk_size_feed_forwardseq_len_dimrK  rf   r)  add_cross_attentioncrossattentionr   intermediater  rO  rY   rZ   r+  r[   s      r9   rL   z"BridgeTowerBertCrossLayer.__init__b  s    '-'E'E$-fPYZ ++#)#=#= 2#	
 4F;'/r8   r  c                      | j                   |f|d d|\  }}|}	 | j                  |	f||||d|\  }
}|
}	t        | j                  | j                  | j
                  |	      }|||fS )N)r]   r-  )r]   rA  rQ  r-  )rf   r\  r   feed_forward_chunkrY  rZ  )rY   r-   rA  r]   rQ  r-  r  self_attention_outputself_attn_weightsrS  cross_attention_outputcross_attn_weightslayer_outputs                r9   rl   z!BridgeTowerBertCrossLayer.forwardr  s     4B4>>4
) 4
 	4
00 15HT5H5H6
)"7#9+6
 6
2 2 20##T%A%A4CSCSUe
 
 	
r8   c                 L    | j                  |      }| j                  ||      }|S rh   r]  rO  rY   rS  intermediate_outputre  s       r9   r`  z,BridgeTowerBertCrossLayer.feed_forward_chunk  ,    "//0@A{{#68HIr8   rh   rI  )	r/   r0   r1   rL   r   r   rl   r`  rn   ro   s   @r9   rV  rV  a  s.    0( #"
 +,"
Hr8   rV  c                        e Zd Zd fd	Z	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dedz  dee	   d	ej
                  fd
Z
d Z xZS )BridgeTowerTextLayerNc                    t         |           |j                  | _        d| _        t	        ||j
                  |      | _        |j
                  | _        |j                  | _        | j                  r.| j
                  st        |  d      t	        |d|d      | _	        t        |      | _        t        |      | _        y )Nr"   rM  z> should be used as a decoder model if cross attention is addedFTrX  )rK   rL   rY  rZ  rK  r)  rf   r[  r   r\  r   r]  r  rO  r^  s      r9   rL   zBridgeTowerTextLayer.__init__  s    '-'E'E$-f@Q@Q]fg ++#)#=#= ##?? D6)g!hii"6##'	#D 4F;'/r8   r-   r]   rA  rQ  r-  r  r   c                      | j                   ||fd|i|\  }}|}	| j                  r:|8t        | d      st        d|  d       | j                  |d ||fd|i|\  }
}|
}	t        | j                  | j                  | j                  |	      }|S )Nr-  r\  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)	rf   r)  r%  r   r\  r   r`  rY  rZ  )rY   r-   r]   rA  rQ  r-  r  ra  rz   rS  rc  re  s               r9   rl   zBridgeTowerTextLayer.forward  s     $24>>$
 ,$
 	$
 q 1??4@4!12 =dV DD D 
 )<(;(;%%&	)
 !0) )%"A  60##T%A%A4CSCSUe
 r8   c                 L    | j                  |      }| j                  ||      }|S rh   rg  rh  s       r9   r`  z'BridgeTowerTextLayer.feed_forward_chunk  rj  r8   rh   rT  )r/   r0   r1   rL   r3   rm   r4   r   r   r   rl   r`  rn   ro   s   @r9   rl  rl    s    0. 48:>;?(,%||% ))D0%  %0047	%
 !& 1 1D 8% % +,% 
%Nr8   rl  c                        e Zd Z fdZ	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  d	e	e
   d
efdZ xZS )BridgeTowerTextEncoderc           	          t         |           || _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        y c c}w )Nr+  )	rK   rL   rZ   r   ru   rv   rs   rl  rk   )rY   rZ   ir[   s      r9   rL   zBridgeTowerTextEncoder.__init__  sI    ]]@EfF^F^@_`1!&A6`

`s   ANr-   r]   rA  rQ  r-  	use_cacher  r   c                 l    | j                   D ]  } ||||f||d|} t        ||r|      S d       S )N)rQ  r-  )last_hidden_stater-  )rk   r   )	rY   r-   r]   rA  rQ  r-  ru  r  layer_modules	            r9   rl   zBridgeTowerTextEncoder.forward  se     !JJ 	L(% (> / M	 9+/8O
 	
>B
 	
r8   )NNNNN)r/   r0   r1   rL   r3   rm   r4   r   re   r   r   r   rl   rn   ro   s   @r9   rq  rq    s    
 48:>;?(,!%
||
 ))D0
  %0047	

 !& 1 1D 8
 
 $;
 +,
 
3
r8   rq  c                        e Zd ZdZ fdZ	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  ded	ej                  fd
Z
ed        Zedd       Z xZS )BridgeTowerTextEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 T   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j
                  |j                        | _
        t        j                  |j                        | _        | j                  dt!        j"                  |j$                        j'                  d      d       | j                  dt!        j(                  | j*                  j-                         t         j.                        d       |j                  | _        t        j                  |j$                  |j
                  | j0                        | _        y )	N)padding_idxrE   r   r   Fr   token_type_idsr   )rK   rL   r   r   
vocab_sizerN   pad_token_idword_embeddingstype_vocab_sizetoken_type_embeddingsrP   rQ   r   r   r   r   r3   r   max_position_embeddingsr   zerosr   r   longr|  position_embeddingsrX   s     r9   rL   z"BridgeTowerTextEmbeddings.__init__  s4   !||F,=,=v?Q?Q_e_r_rs%'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
 "..#%<<**F,>,>DL\L\$
 r8   N	input_idsr}  r   inputs_embedspast_key_values_lengthr   c                    |<|| j                  || j                  |      }n| j                  || j                        }||j                         }n|j                         d d }|\  }}|t	        | d      rT| j
                  j                  |j                  d   d      }	t        j                  |	d|      }	|	j                  ||      }n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j!                  |      }||z   }| j#                  |      }| j%                  |      }|S )Nr   r}  r   r"   )r   indexr_   )"create_position_ids_from_input_idsr|  &create_position_ids_from_inputs_embedsr   r%  r}  r   r   r3   gatherr  r  r   ra   r  r  r  rP   r   )rY   r  r}  r   r  r  r5  r   
seq_lengthbuffered_token_type_idsr  r   r  s                r9   rl   z!BridgeTowerTextEmbeddings.forward  sn    $#FFt//1G   $JJ=Z^ZjZjk #..*K',,.s3K!,
J
 !t-.*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
J!W!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D"55
^^J/
\\*-
r8   c                     | j                         dd }|d   }t        j                  |dz   ||z   dz   t        j                  | j                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr   r"   r_   r   )r   r3   r   r  ra   r   r   )r  r|  r5  sequence_lengthr   s        r9   r  z@BridgeTowerTextEmbeddings.create_position_ids_from_inputs_embedsI  sp     $((*3B/%a.||!O_{:Q>ejjYfYmYm
 %%a(//<<r8   c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
        are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:

        Returns: torch.Tensor
        r"   r   )ner   r3   cumsumtype_asr  )r  r|  r  maskincremental_indicess        r9   r  z<BridgeTowerTextEmbeddings.create_position_ids_from_input_ids[  sW     ||K(,,.$||Da8@@FI__cgg"'')K77r8   )NNNNr   )r   )r/   r0   r1   r2   rL   r3   
LongTensorr4   r   rm   rl   staticmethodr  r  rn   ro   s   @r9   rz  rz    s    Q
, .2260426&'.##d*. ((4/. &&-	.
 ((4/. !$. 
.` = =" 8 8r8   rz  c                       e Zd ZU eed<   dZdZdZddgZdZ	e
eedZ ej                         d	ej"                  fd
       Zy)BridgeTowerPreTrainedModelrZ   bridgetower)imagetextFr  rB   r-  )r-   r.   cross_attentionsr  c                    | j                   j                  }t        |t              r| j                   j                  dz  d| j                   j
                  z  dz  z  }| j                   j                  dz  }d| j                   j                  z  dz  }|j                  j                  D ]  }t        j                  |j                  j                  ||z         t        j                  |j                  j                         t        j                  |j                  j                  j                  ||z         t        j                  |j                   j"                  j                  ||z         t        j                  |j                   j$                  j                  ||z          t        j                  |j&                  j(                  ||z         t        j                  |j&                  j*                  j                  ||z         nt        |t,        j.                  t,        j0                  t,        j2                  f      r't        j                  |j                  dd|z         nt        |t,        j4                        r@t        j                  |j6                         t        j8                  |j                         n*t        |t:              r5t        j<                  |j>                  | j                   j@                         nt        |tB              rMt        jD                  |jF                  tI        jJ                  |jL                        jO                  d             nt        |tP              rxt        jD                  |jF                  tI        jJ                  |jF                  jR                  d         jO                  d             t        j                  |jT                         t        |t,        j.                  tV        f      r-|j6                   t        j                  |j6                         y y y )	Nr  r   )stdr/  g?)meanr  r   r   ),rZ   initializer_factorr   r   rN   rs   r   rw   initnormal_rO   in_proj_weightzeros_in_proj_biasout_projr   rU   rG   rJ   r   r   r   r   rT   r   r   rP   r   ones_!BridgeTowerForContrastiveLearning	constant_logit_scalelogit_scale_init_valuer   copy_r   r3   r   r   r   rz  r   r}  BridgeTowerMLMHead)rY   r  r  proj_stdattn_stdfc_stdr~   s          r9   _init_weightsz(BridgeTowerPreTrainedModel._init_weightsz  s   kk,,f:;//51t{{?\?\;\ae:efH{{..4H$++111d:F++55 JUZZ66HsNKEJJ334UZZ0077X^LUYY^^22EUYY--44(S.IJ LL**::3OLL**==DD(UX.YBIIr|| DELLSdSjA-KK$JJv}}% ABNN6--t{{/Q/QR ;<JJv**ELL9M9M,N,U,UV],^_ 9:JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--.fryy*<=>6;;CZKK$ D[>r8   N)r/   r0   r1   r#   r5   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placementrl  r  r?  _can_record_outputsr3   no_gradr   Moduler  r7   r8   r9   r  r  l  sd    %(&+#35ST"3-.5 U]]_%BII % %r8   r  c                   F     e Zd ZU eed<   dZ fdZed        ZddZ	 xZ
S )BridgeTowerVisionModelrZ   )r  c                 d    t         |   |       t        |      | _        | j	                          y rh   )rK   rL   r   visual	post_initrX   s     r9   rL   zBridgeTowerVisionModel.__init__  s&     26:r8   c                 j    | j                   j                  j                  j                  j                  S rh   )r  r   r   r   r`   rY   s    r9   r`   zBridgeTowerVisionModel.dtype  s$    {{%%55<<BBBr8   c                 Z    | j                  |j                  | j                        ||      S rh   )r  typer`   )rY   r  
image_maskr   r  s        r9   rl   zBridgeTowerVisionModel.forward  s#    {{5::djj1:?WXXr8   )NF)r/   r0   r1   r%   r5   r  rL   propertyr`   rl   rn   ro   s   @r9   r  r    s0    ##!
 C CYr8   r  a0  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    c                   p    e Zd ZU eed<   dZd fd	Zd Zd Ze	e
e	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dee   defd                     Zd Z xZS )BridgeTowerTextModelrZ   )r  c                     t         |   |       || _        d| _        t	        |      | _        t        |      | _        |rt        |      nd| _	        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        FN)rK   rL   rZ   gradient_checkpointingrz  r   rq  encoderr  poolerr  )rY   rZ   add_pooling_layerr[   s      r9   rL   zBridgeTowerTextModel.__init__  sV    
 	 &+#3F;-f53D'/$ 	r8   c                 .    | j                   j                  S rh   r   r  r  s    r9   get_input_embeddingsz)BridgeTowerTextModel.get_input_embeddings  s    ...r8   c                 &    || j                   _        y rh   r  rY   r  s     r9   set_input_embeddingsz)BridgeTowerTextModel.set_input_embeddings  s    */'r8   Nr  r]   r}  r   r  rA  rQ  r-  ru  r  r   c
           
         |d u |d uz  rt        d      | j                  j                  sd}	|	r6|4t        t	        | j                        t	        | j                              }||j                         nd}| j                  |||||      }| j                  |||||      \  }} | j                  |f|||||	|d|
}|d   }| j                  | j                  |      nd }t        |||j                        S )	Nz:You must specify exactly one of input_ids or inputs_embedsF)rZ   r   )r  r   r}  r  r  )r]   rQ  embedding_outputrA  r-  )r]   rA  rQ  r-  ru  r   )rw  r,   r-  )r   rZ   r)  r   r   get_seq_lengthr   _create_attention_masksr  r  r   r-  )rY   r  r]   r}  r   r  rA  rQ  r-  ru  r  r  r  encoder_outputssequence_outputr  s                   r9   rl   zBridgeTowerTextModel.forward  s=   & -t";<YZZ{{%%I01,dkk2RT`hlhshsTtuOETE`!?!?!Afg??%)'#9 + 
 261M1M)#9-"7+ 2N 2
.. '$,,	
)"7#9+%	
 	
 *!,8<8OO4UY;-'+;;
 	
r8   c                     | j                   j                  rt        | j                   |||      }nt        | j                   ||      }|t        | j                   |||      }||fS )N)rZ   r  r]   r-  )rZ   r  r]   )rZ   r  r]   rA  )rZ   r)  r   r   )rY   r]   rQ  r  rA  r-  s         r9   r  z,BridgeTowerTextModel._create_attention_masks  su     ;;!!/{{.- /	N 7{{.-N "-%>{{.5&;	&" 555r8   )T)	NNNNNNNNN)r/   r0   r1   r$   r5   r  rL   r  r  r    r!   r   r3   rm   r   re   r   r   r   rl   r  rn   ro   s   @r9   r  r    s     "! "/0   *..2.2,0-1596:(,!%9
<<$&9
 t+9
 t+	9

 llT)9
 ||d*9
  %||d29
 !&t 39
 9
 $;9
 +,9
 
69
	    9
x6r8   r  zv
    The bare BridgeTower Model transformer outputting BridgeTowerModelOutput object without any specific head on
    c                       e Zd Z fdZd Zd Zdej                  dedej                  fdZ	dej                  dedej                  fdZ
ee	 	 	 	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  dej                  d	z  dej                  d	z  dej                  d	z  dej                  d	z  ded	z  dej                  d	z  dedee   deej                     ez  fd              Zd Z xZS )BridgeTowerModelc           	         t         |   |       || _        |j                  }|j                  }|j
                  r_t        j                  |j                  |j                        | _	        t        j                  |j                  |j                        | _
        nt        j                  t        |j                        D cg c],  }t        j                  |j                  |j                        . c}      | _	        t        j                  t        |j                        D cg c],  }t        j                  |j                  |j                        . c}      | _
        t        j                  d|j                        | _        t!        |      | _        t%        |      | _        |j(                  s|j*                  r| j"                  j,                  j.                  D ]  }| j"                  j,                  j0                  j2                  j4                  |j2                  _        | j"                  j,                  j0                  j6                  j4                  |j6                  _         t        j                  t        |j                        D cg c]  }t9        |       c}      | _        t        j                  t        |j                        D cg c]  }t9        |       c}      | _        t?        |      | _         t?        |      | _!        t        jD                  |j                  |jF                        | _$        t        jD                  |j                  |jF                        | _%        |jL                  r!tO        |      | _(        tO        |      | _)        nt        j                  t        |j                  dz
        D cg c]  }tO        |       c}      | _(        t        j                  t        |j                  dz
        D cg c]  }tO        |       c}      | _)        | jU                          y c c}w c c}w c c}w c c}w c c}w c c}w )Nr   rE   r"   )+rK   rL   rZ   vision_configtext_config$share_cross_modal_transformer_layersr   rT   rN   cross_modal_text_transformcross_modal_image_transformru   rv   rs   r   r  r  vision_modelr  
text_modelr   "init_layernorm_from_vision_encoderr  cross_modal_ln_separater   r   datar   rV  cross_modal_image_layerscross_modal_text_layersr  cross_modal_image_poolercross_modal_text_poolerrP   rQ   cross_modal_text_layernormcross_modal_image_layernormshare_link_tower_layersr   cross_modal_text_link_towercross_modal_image_link_towerr  )rY   rZ   r  r  rz   r   r[   s         r9   rL   zBridgeTowerModel.__init__>  s9    ,,((66.0ii8O8OQWQcQc.dD+/1yy9R9RTZTfTf/gD,.0mmQVW]WoWoQpqA;22F4F4FGq/D+ 02}}SXY_YqYqSrsa=44f6H6HIs0D, &(\\!V5G5G%H"2=A.{;,,1Z1Z''..FF J!%!2!2!9!9!A!A!H!H!M!M		#0077??DDIIJ )+=B6C[C[=\]&{3])
% (*}}=B6C[C[=\]&{3](
$
 ):&(A%'8'@$ +-,,v7I7IvOdOd*e'+-<<8J8JPVPePe+f())/CF/KD,0DV0LD-/1}}7<V=U=UXY=Y7Z[!%f-[0D, 137<V=U=UXY=Y7Z[!%f-[1D- 	W r t ^ ^  \ \s$   1P=$1QQQQQc                 6    | j                   j                         S rh   )r  r  r  s    r9   r  z%BridgeTowerModel.get_input_embeddingsv  s    3355r8   c                 :    | j                   j                  |       y rh   )r  r  r  s     r9   r  z%BridgeTowerModel.set_input_embeddingsy  s    ,,U3r8   r-   r+  r   c                 z    | j                   j                  r| j                  |      S  | j                  |   |      S rh   )rZ   r  r  rY   r-   r+  s      r9   _apply_text_transformz&BridgeTowerModel._apply_text_transform|  s8    ;;;;22=AA9t..y9-HHr8   c                 z    | j                   j                  r| j                  |      S  | j                  |   |      S rh   )rZ   r  r  r  s      r9   _apply_image_transformz'BridgeTowerModel._apply_image_transform  s8    ;;;;33MBB:t//	:=IIr8   Nr  r]   r}  r   
pixel_maskr  r?   image_token_type_idxlabelsr   r  c           
         g }g }g }g }||t        d      |xs d}|j                         }| j                  j                  |      }|j	                  |       |0t        j                  |t
        j                  |j                        }| j                  j                  ||      j                  |j                        }t        | j                  j                  j                        | j                  j                  z
  dz   }| j                  j                  j                  d| D ]  } |||      }|j	                  |        |K| j                   j"                  j%                  |j'                  | j                   j(                        |
      }n|j+                  ddd      }|j	                  |       | j                   j"                  j,                  j.                  d| D ]  } ||      }|j	                  |        | j                   j"                  j1                  |j'                  | j                   j(                              }| j3                  |d	      }| j5                  t        j6                  dt
        j                  |j                              j9                  |      }| j;                  ||z         }| j=                  |d	      }| j5                  t        j>                  d
|t
        j                  |j                              j9                  |      }||z   }| jA                  |      }t        j                  |j                  d      |j                  d      ft
        j                  |j                        }| j                  j                  ||j                               j                  |j                        } | jB                  d   ||||      }|d   } | jD                  d   ||||      }|d   }|j	                  ||f       |j	                  |d   |d   f       d} tG        |t        | j                  j                  j                              D ]  }! | j                  j                  j                  |!   ||      } | j                   j"                  j,                  j.                  |!   |      j'                  | j                   j(                        }| j=                  | j                   j"                  j1                  |      | dz         |z   }| jH                  |    }"| jJ                  |    }#| j3                  || dz         }$ |"|$|z   ||      }% |#|||      }& | jB                  | dz      |%|&||      }|d   } | jD                  | dz      |&|%||      }|d   }| dz  } |j	                  |       |j	                  |       |j	                  ||f       |j	                  |d   |d   f        ||}(}'| jM                  |'|(      })tO        |'|(|)tQ        |      tQ        |      tQ        |      ftQ        |            S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        image_token_type_idx (`int`, *optional*):
            - The token type ids for images.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels are currently not supported.

        Examples:

        ```python
        >>> from transformers import BridgeTowerProcessor, BridgeTowerModel
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> # prepare image and text
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> text = "hello world"
        >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base")
        >>> model = BridgeTowerModel.from_pretrained("BridgeTower/bridgetower-base")

        >>> inputs = processor(image, text, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> outputs.keys()
        odict_keys(['text_features', 'image_features', 'pooler_output'])
        ```NzYBridgeTowerModel does not use `inputs_embeds`.  Make sure to pass in `input_ids` instead.r"   )r  r_   r   r   r   rs  r"   )r]   rQ  )r*   r+   r,   r-   r.   ))r   r   r  r   r|   r3   onesr  ra   get_extended_attention_maskrd   lenr  rk   rZ   rs   r  r  r   r  r`   r   r   rw   r   r  r  r  	expand_asr  r  fullr  r  r  rv   r  r  get_cls_featuresr)   r6   )*rY   r  r]   r}  r   r  r  r?   r  r  r   r  all_hidden_states_textall_hidden_states_imageall_hidden_states_crossall_self_attentionsr5  r>   extend_text_maskssplit_indexrk   r~   image_embeds_with_lncross_modal_texttext_token_type_embeddingsimage_token_type_embeddingscross_modal_imageextend_image_maskslayer_outputs_textcross_text_featureslayer_outputs_imagecross_image_featureslink_layer_indexrt  text_link_towerimage_link_towertransformed_text_embedscross_text_features_cross_image_features_r*   r+   cls_featuress*                                             r9   rl   zBridgeTowerModel.forward  s   \ "$"$"$ $):%k   48qnn&oo0090E%%k2!"ZZ5::iN^N^_N OOGGXcdgg

 $//117784;;;X;XX[\\ __,,22<K@ 	7E->?K"))+6	7 ,,33??!!$"3"3"9"9:Um @ L
 (//1a8L&&|4 &&--99CCL[Q 	9E .L#**<8	9  $0077DD\EVEVW[WhWhWnWnEop  55kQ5O%)%?%?KKI4D4DE&

)$
% 	#  ::;KNh;hi#::;O[\:]&*&@&@JJt1IL\L\]'

)(
) 	$  46QQ <<=QRZZ##A&(9(>(>q(AB**##


 "__HHU_UdUdUfgjj
 =T99!<,#5	
 13>d;;A>-#4	
  315&&(;=Q'RS""$6q$9;Nq;Q#RS {C(?(?(E(E$FG -	XA:$//1177:;HYZKL4,,33??II!L\Z__!!''L ++D,=,=,D,D,Q,QR^,_aqtuauv-. !
 #>>?OPO#@@AQR '+&@&@N^abNb&c##2'*DD#!$ 
 %55IK_as$t! "T!=!=>NQR>R!S$%0'9	" #5Q"7"U$"?"?@PST@T"U%$1'8	# $7q#9 !"))+6#**<8#**,?AU+VW&&(:1(=?RST?U'VW[-	X` )<=Q~,,]NK%')&,--.-.
 01

 
	
r8   c                 x    | j                  |      }| j                  |      }t        j                  ||gd      S )Nr   r   )r  r  r3   r   )rY   r*   r+   cls_features_textcls_features_images        r9   r  z!BridgeTowerModel.get_cls_featuresS  s<     88G!::>Jyy+-?@bIIr8   )
NNNNNNNNNF)r/   r0   r1   rL   r  r  r3   rm   r   r  r  r   r   r  r4   re   r   r   r6   r)   rl   r  rn   ro   s   @r9   r  r  8  s   6p64I5<< IC ITYT`T` I
JELL JS JUZUaUa J
  .2372615.22615+/*.).I
##d*I
 ))D0I
 ((4/	I

 ''$.I
 $$t+I
 ((4/I
 ''$.I
 "DjI
   4'I
 #'I
 +,I
 
u||	5	5I
  I
VJr8   r  c                   $     e Zd Z fdZd Z xZS )"BridgeTowerPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r   )rK   rL   r   rT   rN   r   r   r   r   r
   transform_act_fnrP   rQ   rX   s     r9   rL   z+BridgeTowerPredictionHeadTransform.__init__[  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr8   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rh   )r   r"  rP   r  s     r9   rl   z*BridgeTowerPredictionHeadTransform.forwardd  s4    

=1--m<}5r8   r   ro   s   @r9   r   r   Z  s    Ur8   r   c                   &     e Zd Zd fd	Zd Z xZS )r  c                 p   t         |           || _        t        |      | _        t        j                  |j                  |j                  j                  d      | _
        t        j                  t        j                  |j                  j                              | _        ||| j                  _        y y )NF)r   )rK   rL   rZ   r   	transformr   rT   rN   r  r~  decoderr   r3   r  r   r   )rY   rZ   r   r[   s      r9   rL   zBridgeTowerMLMHead.__init__l  s    ;FCyy!3!3V5G5G5R5RY^_LLV-?-?-J-J!KL	"(DLL r8   c                 d    | j                  |      }| j                  |      | j                  z   }|S rh   )r&  r'  r   )rY   x	mlm_scores      r9   rl   zBridgeTowerMLMHead.forwardu  s-    NN1%	LL+dii7	r8   rh   r   ro   s   @r9   r  r  k  s    )r8   r  c                   $     e Zd Z fdZd Z xZS )BridgeTowerITMHeadc                 X    t         |           t        j                  |d      | _        y Nr   rK   rL   r   rT   fc)rY   rN   r[   s     r9   rL   zBridgeTowerITMHead.__init__|  s     ))K+r8   c                 (    | j                  |      }|S rh   r0  )rY   r)  	itm_scores      r9   rl   zBridgeTowerITMHead.forward  s    GGAJ	r8   r   ro   s   @r9   r,  r,  {  s    ,r8   r,  z\
    BridgeTower Model with a language modeling head on top as done during pretraining.
    c                   ^    e Zd ZddiZ fdZd Zd Zee	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dee   defd              Z xZS )BridgeTowerForMaskedLMzmlm_score.decoder.weightz8bridgetower.text_model.embeddings.word_embeddings.weightc                     t         |   |       t        |      | _        t	        |      | _        | j                          y rh   )rK   rL   r  r  r  r*  r  rX   s     r9   rL   zBridgeTowerForMaskedLM.__init__  s5     +F3+F3 	r8   c                 .    | j                   j                  S rh   r*  r'  r  s    r9   get_output_embeddingsz,BridgeTowerForMaskedLM.get_output_embeddings  s    ~~%%%r8   c                 &    || j                   _        y rh   r8  )rY   new_embeddingss     r9   set_output_embeddingsz,BridgeTowerForMaskedLM.set_output_embeddings  s    !/r8   Nr  r]   r}  r   r  r  r?   r  r  r   c	                     | j                   d|||||||d|	}
| j                  |
j                        }d}|kt               }|j	                  |j
                        } ||j                  d| j                  j                  j                        |j                  d            }t        |||
j                  |
j                        S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import BridgeTowerProcessor, BridgeTowerForMaskedLM
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000360943.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read())).convert("RGB")
        >>> text = "a <mask> looking out of the window"

        >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
        >>> model = BridgeTowerForMaskedLM.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)

        >>> results = processor.decode(outputs.logits.argmax(dim=-1).squeeze(0).tolist())

        >>> print(results)
        .a cat looking out of the window.
        ```r  r]   r}  r   r  r  r?   Nr   r<   r=   r-   r.   r7   )r  r*  r*   r   rd   ra   r   rZ   r  r~  r   r-   r.   )rY   r  r]   r}  r   r  r  r?   r  r  outputs
mlm_logitsmasked_lm_lossloss_fcts                 r9   rl   zBridgeTowerForMaskedLM.forward  s    d #$"" 	
))%!'%	
 	
 ^^G$9$9:
')HYYz001F%joob$++:Q:Q:\:\&]_e_j_jkm_noN!//))	
 	
r8   NNNNNNNN)r/   r0   r1   _tied_weights_keysrL   r9  r<  r   r   r3   r  r4   r   r   r   rl   rn   ro   s   @r9   r5  r5    s"    56pq&0  .2372615.22615*.H
##d*H
 ))D0H
 ((4/	H

 ''$.H
 $$t+H
 ((4/H
 ''$.H
   4'H
 +,H
 
H
  H
r8   r5  z
    BridgeTower Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the
    [CLS] token) for image-to-text matching.
    c                   J    e Zd Z fdZee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  de	e
   defd              Z xZS )#BridgeTowerForImageAndTextRetrievalc                     t         |   |       t        |      | _        t	        |j
                  dz        | _        | j                          y r.  )rK   rL   r  r  r,  rN   r3  r  rX   s     r9   rL   z,BridgeTowerForImageAndTextRetrieval.__init__  s@     +F3+F,>,>,BC 	r8   Nr  r]   r}  r   r  r  r?   r  r  r   c	                     | j                   d|||||||d|	}
|
j                  }| j                  |      }d}|.t               }|j	                  |j
                        } |||      }t        |||
j                  |
j                        S )a^  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
            Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match.
            The pairs with 0 will be skipped for calculation.

        Examples:

        ```python
        >>> from transformers import BridgeTowerProcessor, BridgeTowerForImageAndTextRetrieval
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]

        >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
        >>> model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")

        >>> # forward pass
        >>> scores = dict()
        >>> for text in texts:
        ...     # prepare inputs
        ...     encoding = processor(image, text, return_tensors="pt")
        ...     outputs = model(**encoding)
        ...     scores[text] = outputs.logits[0, 1].item()
        ```r>  Nr?  r7   )	r  r,   r3  r   rd   ra   r   r-   r.   )rY   r  r]   r}  r   r  r  r?   r  r  r@  r,   r=   itm_lossrC  s                  r9   rl   z+BridgeTowerForImageAndTextRetrieval.forward  s    \ #$"" 	
))%!'%	
 	
  --.')HYYv}}-F/H'!//))	
 	
r8   rD  )r/   r0   r1   rL   r   r   r3   r  r4   r   r   r   rl   rn   ro   s   @r9   rG  rG    s
     .2372615.22615*.G
##d*G
 ))D0G
 ((4/	G

 ''$.G
 $$t+G
 ((4/G
 ''$.G
   4'G
 +,G
 
"G
  G
r8   rG  c                   $     e Zd Z fdZd Z xZS )BridgeTowerContrastiveHeadc                 X    t         |           t        j                  ||      | _        y rh   r/  )rY   rN   
embed_sizer[   s      r9   rL   z#BridgeTowerContrastiveHead.__init__G  s     ))K4r8   c                 (    | j                  |      }|S rh   r2  )rY   r)  s     r9   rl   z"BridgeTowerContrastiveHead.forwardK  s    GGAJr8   r   ro   s   @r9   rL  rL  F  s    5r8   rL  zl
    BridgeTower Model with a image-text contrastive head on top computing image-text contrastive loss.
    c                   6    e Zd Z fdZee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e	dz  de
e   defd              Z xZS )r  c                    t         |   |       t        |      | _        t	        |j
                  |j                        | _        t	        |j
                  |j                        | _        t	        |j
                  dz  |j                        | _	        t        j                  t        j                  | j                  j                              | _        | j#                          y r.  )rK   rL   r  r  rL  rN   contrastive_hidden_sizeitc_text_headitc_image_headitc_cross_modal_headr   r   r3   r   rZ   r  r  r  rX   s     r9   rL   z*BridgeTowerForContrastiveLearning.__init__V  s     +F378J8JFLjLjk89K9KVMkMkl$>v?Q?QTU?UW]WuWu$v!<<T[[5W5W(XYr8   Nr  r]   r}  r   r  r  r?   return_lossr  r   c	                    |	j                  dd        | j                  d|||||||d|	}
|
j                  }|
j                  \  }}}|d   }|d   }| j                  j                  j
                  j                  |      }| j                  j                  t        j                  ddt        j                  | j                  j                  j                  j                              j                  |      }| j                  j                  |      |z   }t        j                   j#                  | j%                  |ddd	ddf         dd
      }t        j                   j#                  | j'                  |ddd	ddf         dd
      j)                  |j                        }t        j                   j#                  | j+                  |      dd
      j)                  |j                        }t        j,                  |||gd      }| j.                  j1                         j)                  |j                        }t        j2                  ||j5                               |z  }t        j2                  ||j5                               |z  }t        j2                  ||j5                               |z  }d}|rt        j6                  t9        |      |j                        }t        j                   j;                  ||      }t        j                   j;                  ||      }t        j                   j;                  ||      }||z   |z   dz  }t=        ||||||
j                  |
j>                        S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from transformers import BridgeTowerProcessor, BridgeTowerForContrastiveLearning
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> import torch

        >>> image_urls = [
        ...     "https://farm4.staticflickr.com/3395/3428278415_81c3e27f15_z.jpg",
        ...     "http://images.cocodataset.org/val2017/000000039769.jpg",
        ... ]
        >>> texts = ["two dogs in a car", "two cats sleeping on a couch"]

        >>> with httpx.stream("GET", urls[0]) as response:
        ...     image1 = Image.open(BytesIO(response.read()))

        >>> with httpx.stream("GET", urls[1]) as response:
        ...     image2 = Image.open(BytesIO(response.read()))

        >>> images = [image1, image2]

        >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")
        >>> model = BridgeTowerForContrastiveLearning.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")

        >>> inputs = processor(images, texts, padding=True, return_tensors="pt")
        >>> loss = model(**inputs, return_loss=True).loss

        >>> inputs = processor(images, texts[::-1], padding=True, return_tensors="pt")
        >>> loss_swapped = model(**inputs, return_loss=True).loss

        >>> print("Loss", round(loss.item(), 4))
        Loss 0.0019

        >>> print("Loss with swapped images", round(loss_swapped.item(), 4))
        Loss with swapped images 2.126
        ```output_hidden_statesTr>  r   r  r"   r_   Nr   r   )r   r  )ra   r   g      @)r<   r=   r>   r?   r@   r-   r.   r7   ) 
setdefaultr  r,   r-   r  r  r   r  r3   r  r  r   ra   r  r  r   r   	normalizerS  rT  rd   rU  r   r  expr  tr   r  cross_entropyr;   r.   )rY   r  r]   r}  r   r  r  r?   rV  r  r@  r,   hidden_states_txthidden_states_imghidden_states_cross_modalr>   r  r  r@   r=   r  logits_text_to_imagelogits_text_to_crosslogits_image_to_crossitc_lossr  text_to_image_losstext_to_cross_lossimage_to_cross_losss                                r9   rl   z)BridgeTowerForContrastiveLearning.forwardc  s+   v 	0$7"$"" 	
))%!'%	
 	
  --JQJ_J_G,.G'+(,#//<<CCPPQ]^&*&6&6&L&LJJtQejj9I9I9_9_9f9f9m9mn'

)(
) 	$ ''CCDXY\ww mm--d.@.@QPQSTWAU.V\^bc-d}}..t/B/B<PQSTVWPWCX/Y_aef.gjj%% k 
 }}..t/H/H/W]_cd.ehh%% i 
 k<FBO&&**,//{7I7I/J$||K9IJ[X$||K9IJ[X %\<>>;K L{ Z\\#f+fmmDF!#!<!<=QSY!Z!#!<!<=QSY!Z"$--"="=>SU["\*-??BUUY\\H+#%%!//))
 	
r8   rD  )r/   r0   r1   rL   r   r   r3   r  r4   re   r   r   r;   rl   rn   ro   s   @r9   r  r  P  s     .2372615.22615#'s
##d*s
 ))D0s
 ((4/	s

 ''$.s
 $$t+s
 ((4/s
 ''$.s
 D[s
 +,s
 
&s
  s
r8   r  )r  rG  r5  r  r  )Nr/  )[r2   collectionsr   collections.abcr   dataclassesr   r3   r   torch.nnr    r	   r  activationsr
   r   cache_utilsr   r   r   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   utils.genericr   r    utils.output_capturingr!   configuration_bridgetowerr#   r$   r%   
get_loggerr/   logger_TOKENIZER_FOR_DOCr)   r;   r  rB   rq   r   r   r   r   r   r  r  rm   floatr  r  r?  rK  rV  rl  rq  rz  r  r  r  r  r   r  r,  r5  rG  rL  r  __all__r7   r8   r9   <module>r     s     # $ !   % & 6 C C J 9  G & 6 K K I 5 h h 
		H	%'  
7[ 7 7$ 
7; 7 74)299 )XRYY 6P")) Pf7"299 7"td299 d4BII bii  		 		 , !%II%<<% 
% <<	%
 LL4'% T\% % '(%:@)ryy @)HI)		 I)Z.299 .:8		 8v?5 ?F
RYY 
Fg8		 g8T +% +% +%\Y7 Y" {65 {6{6| 
YJ1 YJ
YJz "    
\
7 \

\
~ T
*D T
T
n  
C
(B C

C
Lr8   