
    i#                       d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z	ddl
mc mZ ddl	mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z-  e$j\                  e/      Z0d Z1d Z2d Z3de	jh                  de	jh                  fdZ5e e"d       G d de                     Z6e e"d       G d d e                     Z7ee" G d! d"e                     Z8 G d# d$ejr                        Z: G d% d&ejr                        Z; G d' d(ejr                        Z< G d) d*ejr                        Z= G d+ d,ejr                        Z> G d- d.ejr                        Z? G d/ d0ejr                        Z@ G d1 d2ejr                        ZA G d3 d4ejr                        ZB G d5 d6e      ZC G d7 d8ejr                        ZD G d9 d:ejr                        ZE G d; d<ejr                        ZF G d= d>ejr                        ZG	 ded?ejr                  d@e	jh                  dAe	jh                  dBe	jh                  dCe	jh                  dz  dDeHdEeHfdFZI G dG dHejr                        ZJ G dI dJejr                        ZK G dK dLejr                        ZL G dM dNejr                        ZM G dO dPejr                        ZN G dQ dRe      ZO G dS dTejr                        ZP G dU dVejr                        ZQe" G dW dXe             ZR G dY dZeR      ZS e"d[       G d\ d]eR             ZTe" G d^ d_eR             ZUe" G d` daeR             ZVe" G db dceR             ZWg ddZXy)fzPyTorch CLAP model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)merge_with_config_defaults)capture_outputs   )ClapAudioConfig
ClapConfigClapTextConfigc                     | j                   \  }}}| dddddddf   j                  dd|d      }|j                  |||z  |      }|S )ae  
    Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN.

    Args:
        hidden_states (`torch.FloatTensor` of shape (batch_size, time_length, classes_num)):
            Input hidden states
        ratio (`int`):
            The ratio of the length of the output to the length of the input.
    Nr   )shaperepeatreshape)hidden_statesratio
batch_sizetime_lengthclasses_num	upsampleds         w/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/clap/modeling_clap.pyinterpolater(   .   sX     .;-@-@*ZkaD!m,33Aq%CI!!*kE.A;OI    c                     | j                   \  }}}}| j                  |||z  |||z  ||      } | j                  dddddd      j                         j                  d|||      }|S )aR  
    Returns the resized hidden states. The output shape should be `(batch_size * num_windows, window_size, window_size,
    num_channels)`

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`):
            Input hidden states
        window_size (`int`):
            Window size
    r   r   r            r   viewpermute
contiguous)r!   window_sizer#   heightwidthnum_channelswindowss          r'   window_partitionr8   ?   s}     /<.A.A+J|!&&Fk);8Lk[gM ##Aq!Q15@@BGGKYdfrsGNr)   c                     | j                   d   }| j                  d||z  ||z  |||      } | j                  dddddd      j                         j                  d|||      } | S )a  
    Merges windows to produce higher resolution features.
    Args:
        windows (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`):
            Input windows
        window_size (`int`):
            Window size
        height (`int`):
            Height of the resized audio
        width (`int`):
            Width of the resized audio
    r.   r   r   r   r+   r,   r-   r/   )r7   r3   r4   r5   r6   s        r'   window_reverser:   T   sn     ==$Lll2v4e{6JKYdfrsGooaAq!Q/::<AA"feUabGNr)   logitsreturnc                     t        j                  t        |       | j                        }t        j
                  j                  | |      S )Ndevice)torcharangelenr?   r   
functionalcross_entropy)r;   labelss     r'   contrastive_lossrF   i   s1    \\#f+fmm<F==&&vv66r)   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)ClapTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedslast_hidden_state.r!   
attentions)__name__
__module____qualname____doc__rJ   r@   FloatTensor__annotations__rK   r!   tuplerL    r)   r'   rI   rI   n   sr    
 -1K""T)026u((4/6:>M5**C/047>7;Je'',-4;r)   rI   zT
    ClapAudio model output to mimic the output of the original implementation.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)ClapAudioModelOutputz
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        The Audio embeddings obtained by applying the projection layer to the pooler_output.
    Naudio_embedsrK   .r!   rL   )rM   rN   rO   rP   rW   r@   rQ   rR   rK   r!   rS   rL   rT   r)   r'   rV   rV      sr    
 .2L%##d*126u((4/6:>M5**C/047>7;Je'',-4;r)   rV   c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeed<   dZeed	<   d
ee   fdZy)
ClapOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for audio-text similarity.
    logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
        The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
        The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapTextModel`].
    audio_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapAudioModel`].
    Nlosslogits_per_audiologits_per_textrJ   rW   text_model_outputaudio_model_outputr<   c                 B    t        d | j                         D              S )Nc              3   `   K   | ]&  }t        |t              r|j                         n| ( y wN)
isinstancer   to_tuple).0vs     r'   	<genexpr>z&ClapOutput.to_tuple.<locals>.<genexpr>   s$     ^1Z;%?QZZ\QF^s   ,.)rS   valuesselfs    r'   rc   zClapOutput.to_tuple   s    ^PTP[P[P]^^^r)   )rM   rN   rO   rP   rZ   r@   rQ   rR   r[   r\   rJ   rW   r]   r   r^   rS   r   rc   rT   r)   r'   rY   rY      s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148185929_%* _r)   rY   c                   *     e Zd ZdZd fd	Zd Z xZS )ClapDropPathz
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a slightly
    refactored version of the `SwinDropPath` implementation.
    c                 0    t         |           || _        y ra   )super__init__	drop_prob)ri   ro   	__class__s     r'   rn   zClapDropPath.__init__   s    "r)   c                 J   | j                   dk(  s| j                  s|S d| j                   z
  }|j                  d   fd|j                  dz
  z  z   }|t	        j
                  ||j                  |j                        z   }|j                          |j                  |      |z  }|S )N        r   r   )r   dtyper?   )
ro   trainingr   ndimr@   randrt   r?   floor_div)ri   r!   	keep_probr   random_tensoroutputs         r'   forwardzClapDropPath.forward   s    >>S   &	$$Q')DM4F4F4J,KK!EJJuM<O<OXeXlXl$mm""9-=r)   ra   )rM   rN   rO   rP   rn   r}   __classcell__rp   s   @r'   rk   rk      s    
#r)   rk   c                   .     e Zd ZdZdef fdZd Z xZS )ClapAudioAFFBlockz
    ATTENTIONAL FEATURE FUSION Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement
    the 1D version.
    configc                    t         |           |j                  }|j                  }t	        ||z        }t        j                  t        j                  ||ddd      t        j                  |      t        j                  d      t        j                  ||ddd      t        j                  |            | _
        t        j                  t        j                  d      t        j                  ||ddd      t        j                  |      t        j                  d      t        j                  ||ddd      t        j                  |            | _        t        j                         | _        y )Nr   r   kernel_sizestridepaddingT)inplace)rm   rn   patch_embeds_hidden_sizeaff_block_rintr   
SequentialConv2dBatchNorm2dReLU	local_attAdaptiveAvgPool2d
global_attSigmoidsigmoid)ri   r   channelsdownsize_ratiointer_channelsrp   s        r'   rn   zClapAudioAFFBlock.__init__   s   22++X78IIhAaQRSNN>*GGD!IInhAaQRSNN8$
 --  #IIhAaQRSNN>*GGD!IInhAaQRSNN8$
 zz|r)   c                     ||z   }| j                  |      | j                  |      z   }| j                  |      }d|z  |z  d|z  d|z
  z  z   }|S )Nr+   r   )r   r   r   )ri   r!   residualattention_inputfused_layer_outputr|   s         r'   r}   zClapAudioAFFBlock.forward   sb    '(2!^^O<t?__!\\*<=]"%77!h,!N`J`:aar)   rM   rN   rO   rP   r   rn   r}   r~   r   s   @r'   r   r      s    
$ $0r)   r   c                   0     e Zd ZdZdef fdZddZ xZS )ClapAudioPatchEmbedz
    This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the
    Transformer block.
    r   c                    t         |           t        |j                  t              r|j                  |j                  fn|j                  }t        |j
                  t              r|j
                  |j
                  fn|j
                  }t        |j                  t              r|j                  |j                  fn|j                  }|| _        || _        |d   |d   z  |d   |d   z  f| _        | j                  d   | j                  d   z  | _	        |j                  | _        |j                  | _        |d   |d   z
  dz  |d   |d   z
  dz  f}| j                  r|j                  dk(  rdnd}t        j                  |j                   |z  |j"                  |||      | _        |j&                  rt        j(                  |j"                        nt        j*                         | _        | j                  rZt/        |      | _        t        j                  |j                   |j"                  |d   |d   dz  f|d   |d   dz  f|      | _        y y )Nr   r   r+   channel_mapr,   r   r   )rm   rn   rb   	spec_sizer   
patch_sizepatch_strideimg_size	grid_sizenum_patchesflatten_patch_embedsflattenenable_fusionfusion_typer   r   patch_embed_input_channelsr   projenable_patch_layer_norm	LayerNormIdentitynormr   fusion_model
mel_conv2d)ri   r   r   r   r   r   scale_factorrp   s          r'   rn   zClapAudioPatchEmbed.__init__   s+   ;EfFVFVX[;\F$$f&6&67bhbrbr6@ARARTW6XV 1 12^d^o^o 	 ;EVEXEXZ]:^V  &"5"56djdwdw 	 !("1+a8(1+VW:XY>>!,t~~a/@@22#11qMLO39JqMLYZO<[`a;ab ..63E3E3Vq\]II--<++"
	 FLEcEcBLL!@!@Aikititiv	 1& 9D ii11//']JqMA,=>$Qa1)<=DO r)   c                    | j                   r|d d ddd d d d f   }|j                  \  }}}}|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j	                  |      }|j                  d      }t        |      dkD  r||dd d d d d f   j                         }	|	j                  \  }}}}|	j                  ||z  d||      }	| j                  |	      }	|	j                  \  }
}}}|	j                  |||||      }	|	j                  d      j                         j                  d	      }	|	j                  d      }t        j                  j                  j                  |	d||z
  fd
d      }	| j!                  ||   |	      ||<   |}nx|j                  \  }
}
}}|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j	                  |      }| j                  r!|j                  d      j#                  dd      }| j%                  |      }|S )Nr   r   zInput audio size (*z) doesn't match model (z).r.   )r   r+   r   r   r,   r   constantr+   )r   r   r   
ValueErrorr   sizerB   r2   r0   r   r1   r   r@   r   rC   padr   	transposer   )ri   r!   is_longer_idxglobal_hidden_statesr#   r6   r4   r5   output_widthlocal_hidden_states_featureslocal_widths                r'   r}   zClapAudioPatchEmbed.forward(  s   #0AaCA#>  7K6P6P3Jfeq))UdmmA6F-F (%8OPTP]P]^_P`Oaabcgcpcpqrcsbttvw  $(99-A#B /44R8L=!A%&3M12q!4K&L&W&W&Y#:M:S:S7
L&%&9&>&>zL?XZ[]cej&k#&*oo6I&J#-@-F-F*8VU&9&>&>z<Yacikp&q#&9&A&A/&R&]&]&_&g&ghi&j#166r:&+hh&9&9&=&='!\K-G)H*VW'# 7;6G6G(79L7$]3 1M"/"5"5Aq&%q))UdmmA6F-F (%8OPTP]P]^_P`Oaabcgcpcpqrcsbttvw  !IIm4M<<)11!4>>q!DM		-0r)   ra   r   r   s   @r'   r   r      s    
( (T/r)   r   c            
            e Zd Z fdZ	 	 d	dej
                  dej                  dz  dedz  deej
                     fdZ	d Z
 xZS )
ClapAudioSelfAttentionc                    t         |           ||z  dk7  rt        d| d| d      || _        t	        ||z        | _        | j                  | j
                  z  | _        t        |t        j                  j                        r|n||f| _        t        j                  t        j                  d| j                  d   z  dz
  d| j                  d   z  dz
  z  |            | _        | j#                  d| j%                                t        j&                  | j                  | j                  |j(                        | _        t        j&                  | j                  | j                  |j(                        | _        t        j&                  | j                  | j                  |j(                        | _        t        j0                  |j2                        | _        y )	Nr   The hidden size (6) is not a multiple of the number of attention heads ()r+   r   relative_position_indexbias)rm   rn   r   num_attention_headsr   attention_head_sizeall_head_sizerb   collectionsabcIterabler3   r   	Parameterr@   zerosrelative_position_bias_tableregister_buffercreate_relative_position_indexLinearqkv_biasquerykeyvalueDropoutattention_probs_dropout_probdropoutri   r   dim	num_headsr3   rp   s        r'   rn   zClapAudioSelfAttention.__init__\  s   ?a#C5(^_h^iijk  $- #&sY#7 !558P8PP%k;??3K3KLKS^`kRl 	 -/LLKKT--a0014T=M=Ma=P9PST9TUW`a-
) 	68[8[8]^YYt1143E3EFOO\
99T//1C1C&//ZYYt1143E3EFOO\
zz&"E"EFr)   Nr!   attention_maskoutput_attentionsr<   c                    |j                   \  }}}||d| j                  f}| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
t        j                  ||	j	                  dd            }|t        j                  | j                        z  }| j                  | j                  j                  d         }|j                  | j                  d   | j                  d   z  | j                  d   | j                  d   z  d      }|j                  ddd      j                         }||j!                  d      z   }|r|j                   d   }|j                  ||z  || j"                  ||      }||j!                  d      j!                  d      z   }|j                  d| j"                  ||      }t$        j&                  j)                  |d      }| j+                  |      }t        j                  ||
      }|j                  dddd      j                         }|j-                         d d | j.                  fz   }|j                  |      }|r||f}|S |f}|S )Nr.   r   r+   r   r   r   )r   r   r   r0   r   r   r   r@   matmulmathsqrtr   r   r3   r1   r2   	unsqueezer   r   rC   softmaxr   r   r   )ri   r!   r   r   r#   r   r6   hidden_shapequery_layer	key_layervalue_layerattention_scoresrelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputss                     r'   r}   zClapAudioSelfAttention.forwardv  s    )6(;(;%
C"CT-E-EFjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR !<<Y5H5HR5PQ+dii8P8P.QQ!%!B!B4C_C_CdCdegCh!i!7!<!<Q$"2"21"55t7G7G7JTM]M]^_M`7`bd"
 "8!?!?1a!H!S!S!U+.D.N.Nq.QQ%'--a0J/44j(*d6N6NPSUX   0.2J2J12M2W2WXY2ZZ/44R9Q9QSVX[\ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2 O\M]r)   c                    t        j                  | j                  d         }t        j                  | j                  d         }t        j                  t        j                  ||gd            }t        j
                  |d      }|d d d d d f   |d d d d d f   z
  }|j                  ddd      j                         }|d d d d dfxx   | j                  d   dz
  z  cc<   |d d d d dfxx   | j                  d   dz
  z  cc<   |d d d d dfxx   d| j                  d   z  dz
  z  cc<   |j                  d      }|S )Nr   r   ij)indexingr+   r.   )	r@   rA   r3   stackmeshgridr   r1   r2   sum)ri   coords_hcoords_wcoordscoords_flattenrelative_coordsr   s          r'   r   z5ClapAudioSelfAttention.create_relative_position_index  s-   << 0 0 34<< 0 0 34U^^Xx,@4PQvq1(At4~aqj7QQ)11!Q:EEG1a D$4$4Q$7!$;; 1a D$4$4Q$7!$;; 1a A(8(8(;$;a$?? "1"5"5b"9&&r)   NF)rM   rN   rO   rn   r@   TensorrQ   boolrS   r}   r   r~   r   s   @r'   r   r   [  s^    G: 48).	1||1 ))D01  $;	1
 
u||	1f'r)   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ClapAudioSelfOutputc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y ra   )rm   rn   r   r   denser   r   r   ri   r   r   rp   s      r'   rn   zClapAudioSelfOutput.__init__  s6    YYsC(
zz&"E"EFr)   r!   input_tensorr<   c                 J    | j                  |      }| j                  |      }|S ra   r  r   ri   r!   r	  s      r'   r}   zClapAudioSelfOutput.forward  s$    

=1]3r)   rM   rN   rO   rn   r@   r  r}   r~   r   s   @r'   r  r    s2    G
U\\  RWR^R^ r)   r  c            
            e Zd Z fdZ	 	 ddej
                  dej                  dz  dedz  deej
                     fdZ	 xZ
S )	ClapAudioAttentionc                 j    t         |           t        ||||      | _        t	        ||      | _        y ra   )rm   rn   r   ri   r  r|   r   s        r'   rn   zClapAudioAttention.__init__  s.    *63	;O	)&#6r)   Nr!   r   r   r<   c                 h    | j                  |||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   ri   r|   )ri   r!   r   r   self_outputsattention_outputr   s          r'   r}   zClapAudioAttention.forward  sE     yy@QR;;|AF#%QR(88r)   r  )rM   rN   rO   rn   r@   r  rQ   r  rS   r}   r~   r   s   @r'   r  r    sW    7 48).		||	 ))D0	  $;		
 
u||		r)   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapAudioIntermediatec                    t         |           t        j                  |t	        |j
                  |z              | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y ra   )rm   rn   r   r   r   	mlp_ratior  rb   
hidden_actstrr	   intermediate_act_fnr  s      r'   rn   zClapAudioIntermediate.__init__  sa    YYsC(8(83(>$?@
f''-'-f.?.?'@D$'-'8'8D$r)   r!   r<   c                 J    | j                  |      }| j                  |      }|S ra   r  r  ri   r!   s     r'   r}   zClapAudioIntermediate.forward  &    

=100?r)   r  r   s   @r'   r  r    #    9U\\ ell r)   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapAudioOutputc                     t         |           t        j                  t	        |j
                  |z        |      | _        t        j                  |j                        | _	        y ra   )
rm   rn   r   r   r   r  r  r   hidden_dropout_probr   r  s      r'   rn   zClapAudioOutput.__init__  sF    YYs6#3#3c#9:C@
zz&"<"<=r)   r!   r<   c                 J    | j                  |      }| j                  |      }|S ra   r  r  s     r'   r}   zClapAudioOutput.forward  s$    

=1]3r)   r  r   s   @r'   r"  r"    s#    >
U\\ ell r)   r"  c                        e Zd Zd fd	Zd Zd Zd Z	 	 ddej                  de	e
e
f   dedz  d	edz  d
e	ej                  ej                  f   f
dZ xZS )ClapAudioLayerc                    t         |           |j                  | _        || _        |j                  | _        || _        t        j                  ||j                        | _	        t        |||| j                        | _        |dkD  rt        |      nt        j                         | _        t        j                  ||j                        | _        t!        ||      | _        t%        ||      | _        y )Neps)r3   rr   )rm   rn   chunk_size_feed_forward
shift_sizer3   input_resolutionr   r   layer_norm_epslayernorm_beforer  	attentionrk   r   	drop_pathlayernorm_afterr  intermediater"  r|   )ri   r   r   r-  r   drop_path_rater,  rp   s          r'   rn   zClapAudioLayer.__init__  s    '-'E'E$$!-- 0 "Sf6K6K L+FCPTP`P`a9G#9Mn5SUS^S^S`!||CV5J5JK1&#>%fc2r)   c                    t        |      | j                  k  rgt        d      | _        t        j
                  j                         r(t	        j                   t	        j                  |            n
t        |      | _        y y Nr   )minr3   r   r,  r@   jit
is_tracingtensor)ri   r-  s     r'   set_shift_and_window_sizez(ClapAudioLayer.set_shift_and_window_size  s\     D$4$44'lDO=BYY=Q=Q=S		%,,'789Y\]mYn  5r)   c           	         | j                   dkD  rht        j                  d||df||      }t        d| j                         t        | j                   | j                          t        | j                    d       f}t        d| j                         t        | j                   | j                          t        | j                    d       f}d}|D ]  }	|D ]  }
||d d |	|
d d f<   |dz  }  t        || j                        }|j                  d| j                  | j                  z        }|j                  d      |j                  d      z
  }|j                  |dk7  d      j                  |dk(  d      }|S d }|S )Nr   r   rs   r.   r+   g      Yrr   )	r,  r@   r   slicer3   r8   r0   r   masked_fill)ri   r4   r5   rt   r?   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_masks                r'   get_attn_maskzClapAudioLayer.get_attn_mask  s   ??Q{{Avua#8fUHa$***+t'''$//)9:t&-M a$***+t'''$//)9:t&-L
 E - #/ K@EHQk1<=QJE
 ,Hd6F6FGL',,R1A1ADDTDT1TUL$..q1L4J4J14MMI!--i1nfEQQR[_`R`befI  Ir)   c                     | j                   || j                   z  z
  | j                   z  }| j                   || j                   z  z
  | j                   z  }ddd|d|f}t        j                  j                  ||      }||fS r6  )r3   r   rC   r   )ri   r!   r4   r5   	pad_right
pad_bottom
pad_valuess          r'   	maybe_padzClapAudioLayer.maybe_pad)  s    %%0@0@(@@DDTDTT	&&$2B2B)BBdFVFVV
Ay!Z8
))-Dj((r)   r!   input_dimensionsr   Nalways_partitionr<   c                    |s| j                  |       n	 |\  }}|j                         \  }}}	|}
| j                  |      }|j                  ||||	      }| j	                  |||      \  }}|j
                  \  }}}}| j                  dkD  r1t        j                  || j                   | j                   fd      }n|}t        || j                        }|j                  d| j                  | j                  z  |	      }| j                  |||j                  |j                        }| j                  |||      }|d   }|j                  d| j                  | j                  |	      }t        || j                  ||      }| j                  dkD  r/t        j                  || j                  | j                  fd      }n|}|d   dkD  xs |d   dkD  }|r|d d d |d |d d f   j!                         }|j                  |||z  |	      }|
| j#                  |      z   }| j%                  |      }| j'                  |      }|| j)                  |      z   }|r	||d	   f}|S |f}|S )
Nr   )r   r+   )shiftsdimsr.   rs   )r   r   r-   r   )r;  r   r/  r0   rL  r   r,  r@   rollr8   r3   rG  rt   r?   r0  r:   r2   r1  r2  r3  r|   )ri   r!   rM  r   rN  r4   r5   r#   r   r   shortcutrK  
height_pad	width_padshifted_hidden_stateshidden_states_windowsrF  attention_outputsr  attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputss                           r'   r}   zClapAudioLayer.forward0  s     **+;<("/"4"4"6
Ax --m<%**:vuhO %)NN=&%$P!z&3&9&9#:y!??Q$)JJ}tFVY]YhYhXhEipv$w!$1! !11FHXHX Y 5 : :2t?O?ORVRbRb?bdl m&&	)<)<EZEaEa ' 
	 !NN+@)_pNq,Q/,11"d6F6FHXHXZbc():D<L<LjZcd ??Q %

?DOOUYUdUdCelr s /]Q&;*Q-!*;
 1!WfWfufa2G H S S U-22:v~xX 4>>2C#DD++M:((6$t{{<'@@@Q'8';< YeWfr)   )rr   r   FF)rM   rN   rO   rn   r;  rG  rL  r@   r  rS   r   r  r}   r~   r   s   @r'   r'  r'    sz    38) */(->||>  S/>  $;	>
 +> 
u||U\\)	*>r)   r'  c                        e Zd Z fdZ	 	 d	dej
                  deeef   dedz  dedz  deej
                     f
dZ	 xZ
S )
ClapAudioStagec                 h   t         	|           || _        || _        t	        j
                  t        |      D cg c]-  }t        ||||||   |dz  dk(  rdn|j                  dz        / c}      | _	        |& |||t        j                        | _        d| _        y d | _        d| _        y c c}w )Nr+   r   )r   r   r-  r   r4  r,  )r   
norm_layerF)rm   rn   r   r   r   
ModuleListranger'  r3   blocksr   
downsamplepointing)
ri   r   r   r-  depthr   r1  rf  irp   s
            r'   rn   zClapAudioStage.__init__s  s    mm u
  !%5'#,Q<%&UaZqf6H6HA6M

 !()9sr||\DO  #DO'
s   2B/r!   rM  r   NrN  r<   c                    |\  }}t        | j                        D ]  \  }} |||||      }	|	d   } |}
| j                  )|dz   dz  |dz   dz  }}||||f}| j                  |
|      }n||||f}||
|f}|r|	dd  z  }|S )Nr   r   r+   )	enumeratere  rf  )ri   r!   rM  r   rN  r4   r5   ri  layer_moduler]  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledoutput_dimensionsstage_outputss                  r'   r}   zClapAudioStage.forward  s     )(5 	-OA|(8HJ[]mnM)!,M	-
 -:)??&5;aZA4EPQ	VWGW 1!'0BDU V OO,MO_`M!' >&(IK\]]12..Mr)   r^  )rM   rN   rO   rn   r@   r  rS   r   r  r}   r~   r   s   @r'   r`  r`  r  sb    < */(-||  S/  $;	
 + 
u||	r)   r`  c            	            e Zd ZdZej
                  fdee   dedej                  ddf fdZ	d Z
d	ej                  d
eeef   dej                  fdZ xZS )ClapAudioPatchMerginga'  
    Patch Merging Layer.

    Args:
        input_resolution (`tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    r-  r   rb  r<   Nc                     t         |           || _        || _        t	        j
                  d|z  d|z  d      | _         |d|z        | _        y )Nr,   r+   Fr   )rm   rn   r-  r   r   r   	reductionr   )ri   r-  r   rb  rp   s       r'   rn   zClapAudioPatchMerging.__init__  sI     01s7AG%@q3w'	r)   c                     |dz  dk(  xs |dz  dk(  }|r.ddd|dz  d|dz  f}t         j                  j                  ||      }|S )Nr+   r   r   )r   rC   r   )ri   input_featurer4   r5   
should_padrK  s         r'   rL  zClapAudioPatchMerging.maybe_pad  sU    qjAo:519>
Q519a!<JMM--mZHMr)   rw  rM  c                    |\  }}|j                   \  }}}|j                  ||||      }| j                  |||      }|d d dd ddd dd d f   }|d d dd ddd dd d f   }	|d d dd ddd dd d f   }
|d d dd ddd dd d f   }t        j                  ||	|
|gd      }|j                  |dd|z        }| j                  |      }| j                  |      }|S )Nr   r+   r   r.   r,   )r   r0   rL  r@   catr   ru  )ri   rw  rM  r4   r5   r#   r   r6   input_feature_0input_feature_1input_feature_2input_feature_3s               r'   r}   zClapAudioPatchMerging.forward  s   ((5(;(;%
C%**:vulS}feD'14a4Aq(89'14a4Aq(89'14a4Aq(89'14a4Aq(89		?O_Ve"fhjk%**:r1|;KL		-0}5r)   )rM   rN   rO   rP   r   r   rS   r   Modulern   rL  r@   r  r}   r~   r   s   @r'   rs  rs    sr    
 XZWcWc (s (# (299 (hl (U\\ U3PS8_ Y^YeYe r)   rs  c                        e Zd Z fdZd Ze	 	 	 	 	 	 ddej                  dz  dedz  dedz  dedz  dedz  d	edz  d
e	e
z  fd       Z xZS )ClapAudioEncoderc                    t         |           t        |j                        | _        || _        t        |      | _        |j                  | _        | j                  j                  | _	        |j                  | _
        |j                  |j                  z  | _        t        |j                  d| j                  dz
  z  z        | _        t!        j"                  d|j$                  t'        |j                        d      D cg c]  }|j)                          }}| j                  j*                  }t-        | j                        D cg c]  }|d   d|z  z  |d   d|z  z  f c}| _        t1        j2                  t-        | j                        D cg c]  }t5        |t        |j                  d|z  z        | j.                  |   |j                  |   |j6                  |   |t'        |j                  d |       t'        |j                  d |dz           || j                  dz
  k  rt8        nd        c}      | _        d| _        t1        j>                  |j                        | _         t1        jB                  | j                        | _"        |j                  | _        t1        jF                  d      | _$        y c c}w c c}w c c}w )Nr+   r   r   cpur>   )r   r   r-  rh  r   r1  rf  F)%rm   rn   rB   depths
num_layersr   r   patch_embedr   r   r   num_mel_bins
freq_ratior   r   num_featuresr@   linspacer4  r   itemr   rd  input_resolutionsr   rc  r`  r   rs  layersgradient_checkpointingr   
batch_normr   r   AdaptiveAvgPool1davgpool)ri   r   xr4  r   ri  i_layerrp   s          r'   rn   zClapAudioEncoder.__init__  sW   fmm,.v6#11 ,,99)) **f.A.AA ? ?!Z[H[B\ \],1NN1f>S>SUXY_YfYfUgpu,vwq!&&(ww$$..	\abfbqbq\r!sWX9Q<AqD#99Q<AqD;Q"R!smm  %T__5  !F;;ajHI%)%;%;G%D --0$88A,Sx1H-ICPVP]P]^k`gjk`kPlLmn9@4??UVCV9V4]a
 ',#..)<)<=LL!2!23	mm++A.3 x "ts   J<KB#Kc                    |j                   \  }}}}t        | j                  | j                  z        }| j                  | j                  z  }||kD  s||kD  rt	        d      ||k  r%t
        j                  j                  |||fdd      }||k  r%t
        j                  j                  |||fdd      }|j                   \  }}}	}
|j                  ||| j                  z  |	| j                  z  |
      }|j                  dddd      j                         }|j                  |||
| j                  z  |	| j                  z        }|S )	z
        The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel
        should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`].
        z@the wav size should be less than or equal to the swin input sizebicubicT)modealign_cornersr   r   r   r+   )r   r   r   r  r   r   rC   r(   r    r1   r2   )ri   normalized_input_featuresr   r$   freq_length
spec_widthspec_heightbatchr   timefreqs              r'   reshape_mel2imgz ClapAudioEncoder.reshape_mel2img	  s`   
 *C)H)H&1k;$//9:
nn7#{['@_`` #(*(A(A)J+D9dh )B )% $(*(A(A)K+EIei )B )% '@&E&E#xt %>$E$E8doo-tt/F%
! %>$E$EaAq$Q$\$\$^!$=$E$E8TDOO3TT__5L%
! )(r)   N	is_longerr   output_hidden_states(output_hidden_states_before_downsamplingrN  return_dictr<   c                 J   |xs | j                   j                  }|xs | j                   j                  }|j                  dd      }| j	                  |      }|j                  dd      }d }	| j
                  r6|j                  |j                        }
t        j                  |
dk(        d   }	| j                  |      }|j                  d   }| j                  ||	      }|rdnd }|rdnd }|rdnd }| j                  d   }|rE|j                  \  }}} |j                  |g|| }|j                  dddd      }||fz  }||fz  }t!        | j"                        D ]  \  }}| j                  |   } |||||      }|d   }|d   }|d   }|d   |d   f}|rP|rN|j                  \  }}} |j                  |g|d   |d   f| }|j                  dddd      }||fz  }||fz  }nI|rG|sE|j                  \  }}} |j                  |g|| }|j                  dddd      }||fz  }||fz  }|s||dd  z  } | j%                  |      }|j                  \  }}}|dt'        | j(                        dz
  z  z  | j*                  d   z  }|dt'        | j(                        dz
  z  z  | j*                  d   z  }|j                  ddd      j-                         j/                  ||||      }|j                  \  }}}}|| j0                  z  } |j/                  |||| z  | |      }|j                  ddddd      j-                         j/                  ||| d      }| j3                  t        j4                  |d            }!t        j4                  |!d      }!t7        ||!||	      S )
Nr   r   r   r+   rT   r   r.   r,   )rK   pooler_outputr!   rL   )r   r  r   r   r  r   tor?   r@   wherer  r   r  r  r0   r1   rk  r  r   rB   r  r   r2   r    r  r  r   r   )"ri   input_featuresr  r   r  r  rN  r  r  is_longer_list_idxis_longer_listr!   
frames_numall_hidden_statesall_reshaped_hidden_statesall_self_attentionsrM  r#   r   hidden_sizereshaped_hidden_stateri  rl  r]  rm  rp  rK   
n_channels
freq_shapetemporal_shapen_frequenciesn_temp
c_freq_binlatent_outputs"                                     r'   r}   zClapAudioEncoder.forward-  sc     4Wt{{7W7W-N1N1N'11!Q7$(OON$C!$=$G$G1$M!!&\\.*?*?@N!&^q-@!A!!D,,-FG"((+
((8JK"6BD+?RT"$5b411!4)6)<)<&J;$6M$6$6z$bDT$bVa$b!$9$A$A!Q1$M!-!11&+@*BB&(5 	9OA|#55a8(8HJ[]mnM)!,M0=a0@- -a 0 1" 57H7LM#(P-N-T-T*
A{ )O(I(N(N)"3A"68I!8L!M)OZ)% )>(E(EaAq(Q%!&G%II!*/D.FF*%.V-:-@-@*
A{(:(:(::(fHX(fZe(f%(=(E(EaAq(Q%!m%55!*/D.FF* #}QR'88#?	9B !IIm4$5$;$;!
AzA#dkk*:Q*>$?@DDUDUVWDXX
#c$++.>.B(CDHYHYZ[H\\ %%aA.99;CCJPZ\fhvw 	 9J8O8O5
Jv"doo5
-55
MZ$?V
 %%aAq!4??AII*V`blnpq 	 U]]3Da%HImQ7)/'4*	
 	
r)   )NFFFFT)rM   rN   rO   rn   r  r   r@   rQ   r  rS   rV   r}   r~   r   s   @r'   r  r    s    &/P")H  /3).,1@E(-#'h
 $$t+h
  $;	h

 #Tkh
 37+h
 +h
 D[h
 
%	%h
 h
r)   r  c                   0     e Zd Zdeez  f fdZd Z xZS )ClapProjectionLayerr   c                     t         |           || _        |j                  }|j                  }t        j                  ||      | _        t        |j                     | _
        t        j                  ||      | _        y ra   )rm   rn   r   r  projection_dimr   r   linear1r	   projection_hidden_act
activationlinear2)ri   r   r  r  rp   s       r'   rn   zClapProjectionLayer.__init__  sa    ((..yyn= !=!=>yy@r)   c                 l    | j                  |      }| j                  |      }| j                  |      }|S ra   )r  r  r  r  s     r'   r}   zClapProjectionLayer.forward  s2    ]36]3r)   )rM   rN   rO   r   r   rn   r}   r~   r   s   @r'   r  r    s    A? Ar)   r  c                        e Zd ZdZ fdZ	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  ded	ej                  fd
Z
ed        Zedd       Z xZS )ClapTextEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 T   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j
                  |j                        | _
        t        j                  |j                        | _        | j                  dt!        j"                  |j$                        j'                  d      d       | j                  dt!        j(                  | j*                  j-                         t         j.                        d       |j                  | _        t        j                  |j$                  |j
                  | j0                        | _        y )	N)padding_idxr)  position_idsr   r.   T)
persistenttoken_type_ids)rt   )rm   rn   r   	Embedding
vocab_sizer  pad_token_idword_embeddingstype_vocab_sizetoken_type_embeddingsr   r.  r   r$  r   r   r@   rA   max_position_embeddingsexpandr   r  r   longr  position_embeddingsri   r   rp   s     r'   rn   zClapTextEmbeddings.__init__  s4   !||F,=,=v?Q?Q_e_r_rs%'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXei 	 	
 	ekk$*;*;*@*@*B%**Ubf 	 	
 "..#%<<**F,>,>DL\L\$
 r)   N	input_idsr  r  inputs_embedspast_key_values_lengthr<   c                    |<|| j                  || j                  |      }n| j                  || j                        }||j                         }n|j                         d d }|\  }}|t	        | d      rT| j
                  j                  |j                  d   d      }	t        j                  |	d|      }	|	j                  ||      }n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j!                  |      }||z   }| j#                  |      }| j%                  |      }|S )Nr.   r  r   r   )r   indexrs   )"create_position_ids_from_input_idsr  &create_position_ids_from_inputs_embedsr   hasattrr  r  r   r@   gatherr   r  r  r?   r  r  r  r   r   )ri   r  r  r  r  r  input_shaper#   
seq_lengthbuffered_token_type_idsr  
embeddingsr  s                r'   r}   zClapTextEmbeddings.forward  sn    $#FFt//1G   $JJ=Z^ZjZjk #..*K',,.s3K!,
J
 !t-.*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
J!W!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D"55
^^J/
\\*-
r)   c                     | j                         dd }|d   }t        j                  |dz   ||z   dz   t        j                  | j                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr.   r   rs   r   )r   r@   rA   r  r?   r   r  )r  r  r  sequence_lengthr  s        r'   r  z9ClapTextEmbeddings.create_position_ids_from_inputs_embeds  sp     $((*3B/%a.||!O_{:Q>ejjYfYmYm
 %%a(//<<r)   c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
        are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:

        Returns: torch.Tensor
        r   r   )ner   r@   cumsumtype_asr  )r  r  r  maskincremental_indicess        r'   r  z5ClapTextEmbeddings.create_position_ids_from_input_ids  sW     ||K(,,.$||Da8@@FI__cgg"'')K77r)   )NNNNr   )r   )rM   rN   rO   rP   rn   r@   
LongTensorrQ   r   r  r}   staticmethodr  r  r~   r   s   @r'   r  r    s    Q
, .2260426&'.##d*. ((4/. &&-	.
 ((4/. !$. 
.` = =" 8 8r)   r  moduler   r   r   r   scalingr   c                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr+   r   r.   )r   rt   )pru   r   )r@   r   r   r   rC   r   float32r  rt   r   ru   r2   )
r  r   r   r   r   r  r   kwargsattn_weightsattn_outputs
             r'   eager_attention_forwardr    s     <<s}}Q':;gEL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r)   c                        e Zd Z fdZ	 ddej
                  dej                  dz  dee   de	ej
                  ej
                  dz  f   fdZ
 xZS )	ClapTextSelfAttentionc                 $   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                         | _        |j                   | _        | j                  dz  | _        y )Nr   embedding_sizer   r   r         )rm   rn   r  r   r  r   r   r   r   r   r   r   r   r   r   r   r   r   attention_dropoutr  r  s     r'   rn   zClapTextSelfAttention.__init__/  sC    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF!'!D!D//5r)   Nr!   r   r  r<   c                 x   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }t        j                  | j                  j                  t              }	 |	| ||||f| j                  sdn| j                  | j                  d|\  }
} |
j                  g |d j!                         }
|
|fS )Nr.   r   r+   rr   )r   r  )r   r   r   r0   r   r   r   r   get_interfacer   _attn_implementationr  ru   r  r  r    r2   )ri   r!   r   r  r  r   query_states
key_statesvalue_statesattention_interfacer  r  s               r'   r}   zClapTextSelfAttention.forwardD  s>    $))#2.CCbC$*B*BCzz-055lCMMaQRSXXm,11,?II!QO
zz-055lCMMaQRS(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHL((r)   ra   )rM   rN   rO   rn   r@   r  rQ   r   r   rS   r}   r~   r   s   @r'   r  r  .  sd    60 48)||) ))D0) +,	)
 
u||U\\D00	1)r)   r  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ClapTextSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr)  )rm   rn   r   r   r  r  r   r.  r   r$  r   r  s     r'   rn   zClapTextSelfOutput.__init__f  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r)   r!   r	  r<   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S ra   r  r   r   r  s      r'   r}   zClapTextSelfOutput.forwardl  7    

=1]3}|'CDr)   r  r   s   @r'   r  r  e  1    >U\\  RWR^R^ r)   r  c            	            e Zd Z fdZ	 ddej
                  dej                  dz  dee   dej
                  fdZ	 xZ
S )	ClapTextAttentionc                 b    t         |           t        |      | _        t	        |      | _        y ra   )rm   rn   r  ri   r  r|   r  s     r'   rn   zClapTextAttention.__init__u  s&    )&1	(0r)   Nr!   r   r  r<   c                 ^    |} | j                   |fd|i|\  }}| j                  ||      }|S Nr   r  )ri   r!   r   r  r   r   s         r'   r}   zClapTextAttention.forwardz  sK     !$499
)
 
q
 M8<r)   ra   )rM   rN   rO   rn   r@   r  rQ   r   r   r}   r~   r   s   @r'   r
  r
  t  sQ    1 48|| ))D0 +,	
 
r)   r
  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapTextIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y ra   )rm   rn   r   r   r  intermediate_sizer  rb   r  r  r	   r  r  s     r'   rn   zClapTextIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r)   r!   r<   c                 J    | j                  |      }| j                  |      }|S ra   r  r  s     r'   r}   zClapTextIntermediate.forward  r  r)   r  r   s   @r'   r  r    r   r)   r  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ClapTextOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r  )rm   rn   r   r   r  r  r  r   r.  r   r$  r   r  s     r'   rn   zClapTextOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r)   r!   r	  r<   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S ra   r  r  s      r'   r}   zClapTextOutput.forward  r  r)   r  r   s   @r'   r  r    r  r)   r  c            	            e Zd Z fdZ	 d	dej
                  dej                  dz  dee   dej
                  fdZ	d Z
 xZS )
ClapTextLayerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y )Nr   )
rm   rn   r+  seq_len_dimr
  r0  r  r3  r  r|   r  s     r'   rn   zClapTextLayer.__init__  sI    '-'E'E$*6208$V,r)   Nr!   r   r  r<   c                      | j                   |fd|i|}t        | j                  | j                  | j                  |      }|S r  )r0  r   feed_forward_chunkr+  r  )ri   r!   r   r  s       r'   r}   zClapTextLayer.forward  sY     '
)
 
 2##T%A%A4CSCSUb
 r)   c                 L    | j                  |      }| j                  ||      }|S ra   )r3  r|   )ri   r  intermediate_outputr\  s       r'   r  z ClapTextLayer.feed_forward_chunk  s,    "//0@A{{#68HIr)   ra   )rM   rN   rO   rn   r@   r  rQ   r   r   r}   r  r~   r   s   @r'   r  r    sV    - 48|| ))D0 +,	
 
$r)   r  c            	       n     e Zd Z fdZ	 ddej
                  dej                  dz  dee   de	fdZ
 xZS )	ClapTextEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r  )
rm   rn   r   r   rc  rd  num_hidden_layersr  layerr  )ri   r   ri  rp   s      r'   rn   zClapTextEncoder.__init__  sN    ]]5IaIaCb#caM&$9#cd
&+# $ds   A#Nr!   r   r  r<   c                 P    | j                   D ]  } |||fi |} t        |      S )N)rK   )r#  r   )ri   r!   r   r  rl  s        r'   r}   zClapTextEncoder.forward  sC     !JJ 	L( M	 +
 	
r)   ra   )rM   rN   rO   rn   r@   r  rQ   r   r   r   r}   r~   r   s   @r'   r   r     sM    , 48
||
 ))D0
 +,	

 

r)   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapTextPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y ra   )rm   rn   r   r   r  r  Tanhr  r  s     r'   rn   zClapTextPooler.__init__  s9    YYv1163E3EF
'')r)   r!   r<   c                 \    |d d df   }| j                  |      }| j                  |      }|S r6  )r  r  )ri   r!   first_token_tensorpooled_outputs       r'   r}   zClapTextPooler.forward  s6     +1a40

#566r)   r  r   s   @r'   r&  r&    s#    $
U\\ ell r)   r&  c                   l    e Zd ZU eed<   dZdZdZ ej                         de
j                  fd       Zy)ClapPreTrainedModelr   clap)audiotextFr  c                    | j                   j                  }t        |t              rt	        j
                  |j                  j                  d|dz         t	        j
                  |j                  j                  d|dz         t	        j                  |j                  t        j                  |j                  j                  d         j                  d             t	        j                  |j                          yt        |t"              rt	        j$                  |j&                  t)        j*                  | j                   j,                               t	        j$                  |j.                  t)        j*                  | j                   j,                               yt        |t0        j2                        r&t	        j
                  |j                  d|dz         yt        |t0        j4                  t0        j6                  f      rt	        j                  |j8                         t	        j:                  |j                         t=        |dd      ^t	        j                  |j>                         t	        j:                  |j@                         t	        j                  |jB                         yyt        |t0        jD                  t0        jF                  f      r| j                   jH                  dz  d	| j                   jJ                  z  dz  z  |z  }t	        j
                  |j                  |
       |j8                   t	        j                  |j8                         yyt        |tL              rNt	        j                  |jN                         t	        j                  |jP                  |jS                                yy)zInitialize the weightsrr   g{Gz?)meanstdr.   r  running_meanNr  r+   )r3  )*r   initializer_factorrb   r  initnormal_r  weightr  copy_r  r@   rA   r   r  zeros_r  	ClapModel	constant_logit_scale_ar   loglogit_scale_init_valuelogit_scale_tr   r  r   r   r   ones_getattrr4  running_varnum_batches_trackedr   r   r  r"  r   r   r   r   )ri   r  factorin_proj_stds       r'   _init_weightsz!ClapPreTrainedModel._init_weights  sj    //f01LL33::&SW-XLL55<<3FUYMZJJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--.	*NN6//$++:\:\1]^NN6//$++:\:\1]^-LLSftmDr~~ >?KK$JJv}}%v~t4@F//0

6--.F667 A BII 67;;22D8a$++B_B_>_dh=hilrrKLLK8{{&FKK( ' 67KK;;<JJv55v7\7\7^_ 8r)   N)rM   rN   rO   r   rR   base_model_prefixinput_modalitiessupports_gradient_checkpointingr@   no_gradr   r  rG  rT   r)   r'   r-  r-    sB    (&+#U]]_`BII ` `r)   r-  c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
e	 	 ddej                  dz  dej                  dz  d	ee   deez  fd
       Z xZS )ClapAudioModelr   r  r/  c                 d    t         |   |       t        |      | _        | j	                          y ra   )rm   rn   r  audio_encoder	post_initr  s     r'   rn   zClapAudioModel.__init__!  s'     -f5r)   r<   c                 B    | j                   j                  j                  S ra   )rO  r  r   rh   s    r'   get_input_embeddingsz#ClapAudioModel.get_input_embeddings'  s    !!--222r)   Nr  r  c                 ,     | j                   d||d|S )ad  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapAudioModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> inputs = processor(audio=audio_sample, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```r  r  rT   )rO  )ri   r  r  r  s       r'   r}   zClapAudioModel.forward*  s.    : "t!! 
)
 
 	
r)   NN)rM   rN   rO   r   rR   main_input_namerI  rn   r   r  rR  r   r@   rQ   
BoolTensorr   r   rS   r   r}   r~   r   s   @r'   rM  rM    s    &O 3bii 3  48-1 
))D0 
 ##d* 
 +,	 

 
+	+ 
  
r)   rM  a0  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    c                       e Zd ZU eed<   dZeedZd fd	Z	d Z
d Zeee	 	 	 	 	 ddej                   dz  d	ej                   dz  d
ej                   dz  dej                   dz  dej                   dz  dee   defd                     Z xZS )ClapTextModelr   r0  r!   rL   c                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rm   rn   r   r  r  r   encoderr&  poolerrP  )ri   r   add_pooling_layerrp   s      r'   rn   zClapTextModel.__init__d  sM    
 	 ,V4&v.0AnV,t 	r)   c                 .    | j                   j                  S ra   r  r  rh   s    r'   rR  z"ClapTextModel.get_input_embeddingst  s    ...r)   c                 &    || j                   _        y ra   ra  ri   r   s     r'   set_input_embeddingsz"ClapTextModel.set_input_embeddingsw  s    */'r)   Nr  r   r  r  r  r  r<   c                    ||t        d      |#| j                  ||       |j                         }n!||j                         d d }nt        d      |\  }}	||j                  n|j                  }
|t	        j
                  ||	f|
      }| j                  ||      }| j                  ||||      } | j                  |fd|i|}|d   }| j                  | j                  |      nd }t        ||      S )	NzDYou cannot specify both input_ids and inputs_embeds at the same timer.   z5You have to specify either input_ids or inputs_embedsr>   )r  r  r  r  r   r   )rK   r  )r   %warn_if_padding_and_no_attention_maskr   r?   r@   onesget_extended_attention_maskr  r]  r^  r   )ri   r  r   r  r  r  r  r  r#   r  r?   extended_attention_maskembedding_outputencoder_outputssequence_outputr+  s                   r'   r}   zClapTextModel.forwardz  s8     ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN 150P0PQ_al0m??%)'	 + 
 '$,,
2
 

 *!,8<8OO4UY)-'
 	
r)   )T)NNNNN)rM   rN   rO   r   rR   rI  r  r  _can_record_outputsrn   rR  rd  r   r   r   r@   r  r   r   r   r}   r~   r   s   @r'   rY  rY  N  s      &+
 /0   *..2.2,0-1.
<<$&.
 t+.
 t+	.

 llT).
 ||d*.
 +,.
 
$.
    .
r)   rY  c                   "    e Zd ZU eed<   def fdZee	 	 ddej                  dej                  dz  dej                  dz  de
e   deez  f
d	              Zee	 	 dd
ej                  dej                  dz  dej                  dz  de
e   deez  f
d              Zee	 	 	 	 	 	 ddej                   dz  d
ej"                  dz  dej$                  dz  dej                  dz  dej                   dz  dedz  de
e   deez  fd              Z xZS )r;  r   c                 .   t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }t        j                  t        j                  t        j                  |j                                    | _        t        j                  t        j                  t        j                  |j                                    | _        |j$                  | _        t'        |      | _        t+        |      | _        t/        |      | _        t+        |      | _        | j5                          y )NzKconfig.text_config is expected to be of type ClapTextConfig but is of type .zMconfig.audio_config is expected to be of type ClapAudioConfig but is of type )rm   rn   rb   text_configr   	TypeErrortypeaudio_configr   r   r   r@   r:  r   r>  r?  r=  r@  r  rY  
text_modelr  text_projectionrM  audio_modelaudio_projectionrP  )ri   r   rq  rt  rp   s       r'   rn   zClapModel.__init__  s=    &,,n=++,-Q0 
 &--?,,-.a1 
 ((**\\%,,txx@]@]7^*_`\\%,,txx@]@]7^*_`$33'42;?),7 3L A 	r)   Nr  r   r  r  r<   c                      | j                   d|||d|}| j                  |j                        }t        j                  |d      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, ClapModel

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```r  r   r  r.   r   rT   )ru  rv  r  F	normalize)ri   r  r   r  r  text_outputstext_featuress          r'   get_text_featureszClapModel.get_text_features  s^    . 4C4?? 4
)%4
 	4
 ,,\-G-GH%&[[B%G"r)   r  r  c                      | j                   d||d|}| j                  |j                        }t        j                  |d      |_        |S )a  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, ClapModel

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
        >>> random_audio = torch.rand((16_000))

        >>> inputs = feature_extractor(random_audio, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     audio_features = model.get_audio_features(**inputs)
        ```rT  r.   r   rT   )rw  rx  r  r{  r|  )ri   r  r  r   r  audio_outputsaudio_featuress          r'   get_audio_featureszClapModel.get_audio_features  sZ    8 5ED4D4D 5
)Y5
BH5
 ..}/J/JK&'kk.b&I#r)   return_lossc           	          | j                   d
||d|} | j                  d
|||d|}	|j                  }
| j                  |
      }
|	j                  }| j	                  |      }|
|
j                  ddd      z  }
||j                  ddd      z  }| j                  j                         }| j                  j                         }t        j                  ||
j                               |z  }t        j                  |
|j                               |z  }d}|r,t        |      }t        |j                               }||z   dz  }t        |||||
|	|	      S )a  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")

        >>> input_text = ["Sound of a dog", "Sound of vacuum cleaner"]

        >>> inputs = processor(text=input_text, audio=audio_sample, return_tensors="pt", padding=True)

        >>> outputs = model(**inputs)
        >>> logits_per_audio = outputs.logits_per_audio  # this is the audio-text similarity score
        >>> probs = logits_per_audio.softmax(dim=-1)  # we can take the softmax to get the label probabilities
        ```rT  rz  r+   r.   T)r  r   keepdimNg       @)rZ   r[   r\   rJ   rW   r]   r^   rT   )rw  ru  r  rx  rv  r   r@  expr=  r@   r   trF   rY   )ri   r  r  r  r   r  r  r  r  r}  rW   rJ   logit_scale_textlogit_scale_audior\   r[   rZ   caption_loss
audio_losss                      r'   r}   zClapModel.forward  s   N )(( 
)
 
 't 
)%
 	
 %22,,\:"00**;7 $l&7&7!T&7&RR!K$4$4qb$$4$OO  --113 ..224,,{LNN4DEHXX <<kmmoFIZZ+O<L)*:*<*<*>?J :-4D-+#%*,
 	
r)   rU  )NNNNNN)rM   rN   rO   r   rR   rn   r   r   r@   r  r   r   rS   r   r  r  r  rQ   rW  r  rY   r}   r~   r   s   @r'   r;  r;    s   z @  /3,0	<< t+ llT)	
 +, 
+	+  @  *..2	   <<$&  t+	 
 +,  
+	+    D  .237-1.204#'P
##d*P
 ))D0P
 ##d*	P

 t+P
 &&-P
 D[P
 +,P
 
	P
  P
r)   r;  c                        e Zd ZU eed<   dZeedZdef fdZ	de
j                  fdZd Zee	 	 	 dd	ej"                  dz  d
ej"                  dz  dej"                  dz  dee   deez  f
d              Z xZS )ClapTextModelWithProjectionr   rZ  r[  c                     t         |   |       t        |      | _        t	        |      | _        | j                          y ra   )rm   rn   rY  ru  r  rv  rP  r  s     r'   rn   z$ClapTextModelWithProjection.__init__v  s3     '/26:r)   r<   c                 B    | j                   j                  j                  S ra   ru  r  r  rh   s    r'   rR  z0ClapTextModelWithProjection.get_input_embeddings}  s    ))999r)   c                 :    || j                   j                  _        y ra   r  rc  s     r'   rd  z0ClapTextModelWithProjection.set_input_embeddings  s    5:""2r)   Nr  r   r  r  c                      | j                   d|||d|}|j                  }| j                  |      }t        ||j                  |j
                  |j                        S )a  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, ClapTextModelWithProjection

        >>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["a sound of a cat", "a sound of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```rz  )rJ   rK   r!   rL   rT   )ru  r  rv  rI   rK   r!   rL   )ri   r  r   r  r  r}  r+  rJ   s           r'   r}   z#ClapTextModelWithProjection.forward  su    . 4C4?? 4
)%4
 	4
 %22**=9"#*<<&44#..	
 	
r)   )NNN)rM   rN   rO   r   rR   rI  r  r  rm  rn   r   r  rR  rd  r   r   r@   r  r   r   rS   rI   r}   r~   r   s   @r'   r  r  m  s     &+
~ :bii :;  *..2,0	#
<<$&#
 t+#
 llT)	#

 +,#
 
$	$#
  #
r)   r  c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
ee	 	 ddej                  dz  dej                  dz  d	ee   deez  fd
              Z xZS )ClapAudioModelWithProjectionr   r  r/  c                     t         |   |       t        |      | _        t	        |      | _        | j                          y ra   )rm   rn   rM  rw  r  rx  rP  r  s     r'   rn   z%ClapAudioModelWithProjection.__init__  s4     )&1 3F ;r)   r<   c                 V    | j                   j                  j                  j                  S ra   )rw  rO  r  r   rh   s    r'   rR  z1ClapAudioModelWithProjection.get_input_embeddings  s     --99>>>r)   Nr  r  c                      | j                   d||d|}| j                  |j                        }t        ||j                  |j
                  |j                        S )au  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import ClapAudioModelWithProjection, ClapProcessor

        >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused")
        >>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> inputs = processor(audio=audio_sample, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> audio_embeds = outputs.audio_embeds
        ```rT  )rW   rK   rL   r!   rT   )rw  rx  r  rV   rK   rL   r!   )ri   r  r  r  r  rW   s         r'   r}   z$ClapAudioModelWithProjection.forward  so    : 5ED4D4D 5
)5
 5
 ,,]-H-HI#%+==$//'55	
 	
r)   rU  )rM   rN   rO   r   rR   rV  rI  rn   r   r  rR  r   r   r@   rQ   rW  r   r   rS   rV   r}   r~   r   s   @r'   r  r    s    &O ?bii ?  48-1(
))D0(
 ##d*(
 +,	(

 
%	%(
  (
r)   r  )r;  r-  rY  r  rM  r  )rr   )YrP   r   r   collections.abcr   dataclassesr   typingr   r@   torch.nn.functionalr   rC   r{   r   r6  activationsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   r   utils.genericr   utils.output_capturingr   configuration_clapr   r   r   
get_loggerrM   loggerr(   r8   r:   r  rF   rI   rV   rY   r  rk   r   r   r   r  r  r  r"  r'  r`  rs  r  r  r  floatr  r  r  r
  r  r  r  r   r&  r-  rM  rY  r;  r  r  __all__rT   r)   r'   <module>r     s      $ !      & ! 9 G & 6 j j 7 5 K K 
		H	%"**7U\\ 7ell 7
 	<+ 	< 	< 
	<; 	< 	< _ _  _B299 2%		 %P_")) _FZ'RYY Z'|
")) 
 &BII  	bii 	wRYY wv4/ 4p3BII 3lv
ryy v
r")) &g8 g8d %II%<<% 
% <<	%
 LL4'% % %.3)BII 3)n 		 .299  RYY . D
bii 
4RYY  #`/ #` #`L/
( /
d O
' O
O
d {
# {
 {
| :
"5 :
 :
z 9
#6 9
 9
xr)   