
    id                        d dl Z d dlmZ d dlZd dlmZ ddlmZ ddlmZm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z(  ejR                  e*      Z+	 	 d-dejX                  dejZ                  dejZ                  dejZ                  dejZ                  dz  de.dz  de.fdZ/ G d dejX                        Z0 G d d e      Z1e G d! d"e             Z2 ed#$       G d% d&e2             Z3 G d' d(ejX                        Z4 ed)$       G d* d+e2e             Z5g d,Z6y).    N)Callable)nn   )ACT2FN)CacheEncoderDecoderCache)GenerationMixin)create_bidirectional_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)capture_outputs   )	AutoModelAutoModelForCausalLM   )AudioFlamingo3ConfigAudioFlamingo3EncoderConfigmodulequerykeyvalueattention_maskscalingdropoutc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )N      r   r   )dimptrainingr   )
sizetorchmatmul	transposer   
functionalsoftmaxr$   r+   
contiguous)
r   r   r    r!   r"   r#   r$   kwargsattn_weightsattn_outputs
             /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/audioflamingo3/modeling_audioflamingo3.pyeager_attention_forwardr7   /   s     **R.D(<<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$    c                   :    e Zd ZdZ	 	 	 	 	 	 ddedededededed	edz  d
edz  f fdZ	 	 	 	 dde	j                  de	j                  dz  dedz  de	j                  dz  dedee   dee	j                  e	j                  dz  ee	j                     dz  f   fdZ xZS )AudioFlamingo3Attentionz=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsr$   
is_decoderbias	is_causal	layer_idxconfigc	                 z   t         	|           || _        || _        || _        ||z  | _        || _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _	        || _
        |/|r-t        j                  d| j                  j                   d       || _        t!        j"                  ||d      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        y )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r'   zInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr>   )super__init__r;   r<   r$   head_dimrA   
ValueErrorr#   r=   r?   loggerwarning_once	__class____name__r@   r   Lineark_projv_projq_projout_proj)
selfr;   r<   r$   r=   r>   r?   r@   rA   rJ   s
            r6   rE   z AudioFlamingo3Attention.__init__L   s     	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"*4>>+B+B*C D, ,
 #ii	95Aii	94@ii	94@		)YTBr8   hidden_stateskey_value_statespast_key_valuesr"   output_attentionsr3   returnc                    |du}|j                   dd }g |d| j                  }	| j                  |      | j                  z  j	                  |	      j                  dd      j                         }
|it        |t              rY|j                  j                  | j                        }|r&d|j                  | j                  <   |j                  }n|j                  }||n|}|rK|rIrG|j                  | j                     j                  }|j                  | j                     j                   }n|d   d| j"                  | j                  f}| j%                  |      j	                  |      j                  dd      j                         }| j'                  |      j	                  |      j                  dd      j                         }| |j)                  ||| j                        \  }}t+        j,                  | j.                  j0                  t2              } || |
|||f| j4                  sdn| j6                  d|d	|\  }} |j8                  g |d j                         }| j;                  |      }||fS )
z#Input shape: Batch x Time x ChannelNr&   r   r   Tr                 ?)r$   r#   rU   )shaperF   rO   r#   viewr/   r2   
isinstancer   
is_updatedgetr@   cross_attention_cacheself_attention_cachelayerskeysvaluesr<   rM   rN   updater   get_interfacerA   _attn_implementationr7   r+   r$   reshaperP   )rQ   rR   rS   rT   r"   rU   r3   is_cross_attentioninput_shapehidden_shapequery_statesr]   current_states
key_statesvalue_stateskv_shapeattention_interfacer5   r4   s                      r6   forwardzAudioFlamingo3Attention.forwardt   s`    .T9#))#2.88b8$--8 M2T\\AGGU__`acdeppr &:oGZ+[(3377GJ!=A**4>>:"1"G"G"1"F"F .>-I)}/j(//?DDJ*11$..AHHL
 $ADNNDMMJH^499(CMMaQRS^^`J;;~6;;HEOOPQSTU``bL*+:+A+A*l\`\j\j+k(
L(?(M(MKK,,.E)
 %8
%
  $}}C$,,/
%
 
%
!\ *k));;;;FFHmmK0L((r8   )rX   FTFNN)NNNF)rK   
__module____qualname____doc__intfloatboolr   rE   r-   Tensorr   r   r   tuplerq   __classcell__rJ   s   @r6   r:   r:   I   s&   G   $.2&C&C &C 	&C
 &C &C &C :&C %t+&CV 15(,.2"'H)||H)  ,,-H) 	H)
 t+H)  H) -.H) 
u||U\\D0%2E2LL	MH)r8   r:   c                   ~     e Zd Zdef fdZdej                  dej                  dee   dej                  fdZ	 xZ
S )AudioFlamingo3EncoderLayerrA   c                 h   t         |           |j                  | _        t	        | j                  |j
                  |j                  |      | _        t        j                  | j                        | _
        |j                  | _        t        |j                     | _        |j                  | _        t        j                   | j                  |j"                        | _        t        j                   |j"                  | j                        | _        t        j                  | j                        | _        y )N)r;   r<   r$   rA   )rD   rE   d_modelr;   r:   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normr$   r   activation_functionactivation_fnactivation_dropoutrL   encoder_ffn_dimfc1fc2final_layer_normrQ   rA   rJ   s     r6   rE   z#AudioFlamingo3EncoderLayer.__init__   s    0nn44,,	
 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r8   rR   r"   r3   rV   c                     |}| j                  |      } | j                  d||d|\  }}t        j                  j	                  || j                  | j
                        }||z   }|}| j                  |      }| j                  | j                  |            }t        j                  j	                  || j                  | j
                        }| j                  |      }t        j                  j	                  || j                  | j
                        }||z   }|j                  t        j                  k(  rEt        j                  |j                        j                  dz
  }t        j                   || |      }|S )a>  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        )rR   r"   r)   i  )minmax )r   r   r   r0   r$   r+   r   r   r   r   r   dtyper-   float16finfor   clamp)rQ   rR   r"   r3   residual_clamp_values          r6   rq   z"AudioFlamingo3EncoderLayer.forward   sT    !11-@)4>> 
')
 
q
 --mt||VZVcVc-d =0 --m<**488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0%--/++m&9&9:>>EK!KKK<[YMr8   )rK   rr   rs   r   rE   r-   rx   r   r   rq   rz   r{   s   @r6   r}   r}      sM    =3 =$"||" " +,	"
 
"r8   r}   c                   6    e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZy)AudioFlamingo3PreTrainedModelrA   model)audiotextTr:   rT   N)rK   rr   rs   r   __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpar   r8   r6   r   r      s4      (&*#23"3Nr8   r   zT
    The audio model from AudioFlamingo3 without any head or projection on top.
    custom_introc            
           e Zd ZU dZeed<   dZdZdgZe	e
dZdef fdZd Zd	ej                  fd
Zdej                  fdZee	 ddej*                  dej*                  dz  d	eez  fd              Zdej2                  fdZ xZS )AudioFlamingo3EncoderzY
    AudioFlamingo3 encoder: Whisper encoder, average pool (time/2), then LayerNorm.
    rA   input_featuresr   r}   )rR   
attentionsc                 b   t         |   |       |j                  | _        |j                  | _        |j
                  }|j                  | _        |j                  | _        |j                  rt        j                  |      nd| _        t        j                  | j                  |dd      | _        t        j                  ||ddd      | _        t        j                   | j                  |      | _        | j"                  j%                  d       t        j&                  t)        |j*                        D cg c]  }t-        |       c}      | _        t        j0                  |j
                        | _        t        j4                  dd      | _        d| _        | j;                          y c c}w )	NrY   r   r   )kernel_sizepaddingr   )r   strider   F)r   )rD   rE   r$   encoder_layerdrop	layerdropr   num_mel_binsmax_source_positionsscale_embeddingmathsqrtembed_scaler   Conv1dconv1conv2	Embeddingembed_positionsrequires_grad_
ModuleListrangeencoder_layersr}   ra   r   
layer_norm	AvgPool1d
avg_poolergradient_checkpointing	post_init)rQ   rA   r;   r   rJ   s       r6   rE   zAudioFlamingo3Encoder.__init__  s7    ~~11NN	"//$*$?$?!393I3I499Y/sYYt00)TUV
YYy)1VWX
!||D,E,EyQ++E2mmQVW]WlWlQm$nA%?%G$no,,v~~6,,q3&+# %os   6F,c                 J    | j                         D ]	  }d|_         d| _        y )NF)
parametersrequires_grad_requires_grad)rQ   params     r6   _freeze_parametersz(AudioFlamingo3Encoder._freeze_parameters1  s(    __& 	(E"'E	(#r8   rV   c                     | j                   S Nr   rQ   s    r6   get_input_embeddingsz*AudioFlamingo3Encoder.get_input_embeddings6  s    zzr8   r!   c                     || _         y r   r   rQ   r!   s     r6   set_input_embeddingsz*AudioFlamingo3Encoder.set_input_embeddings9  s	    
r8   Ninput_features_maskc                    |j                   d   dz
  dz  dz   }|j                  d      }|dz
  dz  dz   }t        j                  ||j                        |dddf   k  }t
        j                  j                  | j                  |            }t
        j                  j                  | j                  |            }|j                  ddd      }|| j                  j                  z   }t
        j                  j                  || j                  | j                        }t        | j                   ||      }| j"                  D ]>  }	| j                  xr" t        j$                  g       | j&                  k  }
|
r6 |	||      }@ |j                  ddd      }| j)                  |      j                  ddd      }| j+                  |      }t-        |	      S )
ap  
        Args:
            input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Log-Mel features extracted from raw audio. Use the processor/feature extractor to compute and pad
                these features from waveform input.
            input_features_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
        r&   r   r   deviceNr   r)   )rA   inputs_embedsr"   )last_hidden_state)rZ   sumr-   aranger   r   r0   gelur   r   permuter   weightr$   r+   r
   rA   ra   randr   r   r   r   )rQ   r   r   r3   seq_leninput_features_lengthsr   rR   r"   layerdrops              r6   rq   zAudioFlamingo3Encoder.forward<  s   ( "''+a/A59!4!8!8!<"81"<!BQ!F#ll7>;P;PQTjklnrkrTss **4::n+EF**4::m+DE%--aA6 &(<(<(C(CC--mt||VZVcVc-d2;;'.
 [[ 	EE==DUZZ^dnn%DD %m^ D	E &--aA66>>q!QG6)+
 	
r8   input_lengthsc                 6    |dz
  dz  dz   }|dz
  dz  dz   }||fS )zs
        Computes the output length of the convolutional layers and the output length of the audio encoder
        r   r   r   )rQ   r   output_lengthss      r6    _get_feat_extract_output_lengthsz6AudioFlamingo3Encoder._get_feat_extract_output_lengthst  s7     '*q014'!+1A5n,,r8   r   )rK   rr   rs   rt   r   r   main_input_namer   r   r}   r:   _can_record_outputsrE   r   r   Moduler   r   r   r   r-   rx   ry   r   rq   
LongTensorr   rz   r{   s   @r6   r   r     s    
 ('&O56 4-
: 2$
bii "))    483
3
 #\\D03

 
+	+3
   3
l-e>N>N -r8   r   c                   .     e Zd ZdZdef fdZd Z xZS )!AudioFlamingo3MultiModalProjectorz
    Audio adaptor (small MLP) that projects AudioFlamingo3Encoder features
    to the LLM embedding space so they can replace `<sound>` tokens.
    rA   c                    t         |           t        j                  |j                  j
                  |j                  j
                  |j                        | _        t        |j                     | _        t        j                  |j                  j
                  |j                  j
                  |j                        | _        y )NrC   )rD   rE   r   rL   audio_confighidden_sizetext_configprojector_biaslinear_1r   projector_hidden_actactlinear_2r   s     r6   rE   z*AudioFlamingo3MultiModalProjector.__init__  s    		++V-?-?-K-KRXRgRg
 &556		**F,>,>,J,JQWQfQf
r8   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   )rQ   audio_featuresrR   s      r6   rq   z)AudioFlamingo3MultiModalProjector.forward  s2    n5/m4r8   )rK   rr   rs   rt   r   rE   rq   rz   r{   s   @r6   r   r   }  s    

3 
r8   r   z
    The AudioFlamingo3 model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Qwen2 language model.
    c                       e Zd ZdZdZdZ fdZd Zd Zd Z	d Z
d Zd Ze ed	
      dej                   dej"                  dee   deez  fd              Zee	 	 	 	 	 	 	 	 	 	 ddej.                  dz  dej                   dz  dej"                  dz  dej"                  dz  dej.                  dz  dedz  dej                   dz  dej.                  dz  dedz  deej"                  z  dee   defd              Zdddef fdZ xZS )&AudioFlamingo3ForConditionalGenerationNc                 *   t         |   |       |j                  j                  | _        t	        j
                  |j                        | _        t        j
                  |j                        | _	        t        |      | _        | j                          y r   )rD   rE   r   
vocab_sizer   from_configr   audio_towerr   language_modelr   multi_modal_projectorr   r   s     r6   rE   z/AudioFlamingo3ForConditionalGeneration.__init__  sn      ,,77$001D1DE2>>v?Q?QR%Fv%N" 	r8   c                 6    | j                   j                         S r   )r   r   r   s    r6   r   z;AudioFlamingo3ForConditionalGeneration.get_input_embeddings  s    ""7799r8   c                 :    | j                   j                  |       y r   )r   r   r   s     r6   r   z;AudioFlamingo3ForConditionalGeneration.set_input_embeddings  s    007r8   c                 6    | j                   j                         S r   )r   get_output_embeddingsr   s    r6   r  z<AudioFlamingo3ForConditionalGeneration.get_output_embeddings  s    ""88::r8   c                 :    | j                   j                  |       y r   )r   set_output_embeddings)rQ   new_embeddingss     r6   r  z<AudioFlamingo3ForConditionalGeneration.set_output_embeddings  s    11.Ar8   c                 :    | j                   j                  |       y r   )r   set_decoder)rQ   decoders     r6   r  z2AudioFlamingo3ForConditionalGeneration.set_decoder  s    ''0r8   c                 6    | j                   j                         S r   )r   get_decoderr   s    r6   r  z2AudioFlamingo3ForConditionalGeneration.get_decoder  s    ""..00r8   zThis method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.r   r   r   r3   rV   c                     | j                   |f|dd|}| j                  |j                        }|j                  d      j	                  t
        j                        }| j                   j                  |      \  }}t        j                  |j                  d   |j                        dddf   |dddf   k  }	||	j	                  |j                           |_        |S )a
  
        input_features (`torch.FloatTensor`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padded feature indices.
        T)r   return_dictr&   r   r   N)r   r   r   r   tor-   longr   r   rZ   r   pooler_output)
rQ   r   r   r3   audio_outputaudio_embedsr   r   post_lengths
valid_masks
             r6   get_audio_featuresz9AudioFlamingo3ForConditionalGeneration.get_audio_features  s    * (t''
0CQU
Y_
 11,2P2PQ ,//366uzzB**KKMZ<\\,"4"4Q"7@S@STUY[\U\]`lmnptmt`uu
%1*--@S@S2T%U"r8   	input_idsr"   position_idsrT   r   labels	use_cachelogits_to_keepc                    | | j                         |      }||| j                  ||d      j                  }|| j                  j                  k(  j                  d      }|j                  |j                  |j                        |j                  |j                              } | j                  d||||||	|
d|}|S )a+  
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

        >>> model_id = "nvidia/audio-flamingo-3-hf"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")

        >>> conversations = [
        >>>     [
        >>>         {
        >>>             "role": "user",
        >>>             "content": [
        >>>                 {"type": "text", "text": "Transcribe the input speech."},
        >>>                 {
        >>>                     "type": "audio",
        >>>                     "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
        >>>                 },
        >>>             ],
        >>>         }
        >>>     ],
        >>>     [
        >>>         {
        >>>             "role": "user",
        >>>             "content": [
        >>>                 {
        >>>                     "type": "text",
        >>>                     "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
        >>>                 },
        >>>                 {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
        >>>             ],
        >>>         }
        >>>     ],
        >>> ]

        >>> inputs = processor.apply_chat_template(
        >>>     conversations,
        >>>     tokenize=True,
        >>>     add_generation_prompt=True,
        >>>     return_dict=True,
        >>> ).to(model.device)

        >>> outputs = model.generate(**inputs, max_new_tokens=500)

        >>> decoded_outputs = processor.batch_decode(
        >>>     outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
        >>> )
        >>> print(decoded_outputs)
        ["The spoken content of the audio is...", "The track's calming and meditative feel can be attributed to..."]
        ```T)r  r&   )r   r"   r  rT   r  r  r  r   )
r   r  r  rA   audio_token_id	unsqueezemasked_scatterr  r   r   )rQ   r  r   r   r"   r  rT   r   r  r  r  r3   r  audio_token_maskoutputss                  r6   rq   z.AudioFlamingo3ForConditionalGeneration.forward  s    ^  7D557	BM%)*?22>CVdh2iwwL !*T[[-G-G GRRSUV)88 ##M$8$89<??=K_K_;`M +>$*=*= 	+
')%+)	+
 	+
 r8   F)is_first_iterationr!  c                    |j                  dd       }|j                  dd       }t        |   |i |}|s|j                  dd      s|||d<   |||d<   |S )Nr   r   r  F)poprD   prepare_inputs_for_generationr^   )rQ   r!  argsr3   r   r   model_inputsrJ   s          r6   r$  zDAudioFlamingo3ForConditionalGeneration.prepare_inputs_for_generationC  st    $4d;$jj)>Ew<dMfM\%5%5k5%I)1?-.".6I23r8   )
NNNNNNNNNr   )rK   rr   rs   _keep_in_fp32_modules_strict_tp_plan_pp_planrE   r   r   r  r  r  r  r   r   r-   FloatTensorrx   r   r   ry   r   r  r   r   rw   ru   r   rq   r$  rz   r{   s   @r6   r   r     s    $( HH:8;B11  w)) #\\ +,	
 
+	+ <  .23737.204(,26*.!%-.c##d*c ))D0c #\\D0	c
 t+c &&-c c ((4/c   4'c $;c ell*c +,c 
 c  cJ OT t  r8   r   )r   r   r   )NrX   )7r   collections.abcr   r-   r   activationsr   cache_utilsr   r   
generationr	   masking_utilsr
   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   autor   r   configuration_audioflamingo3r   r   
get_loggerrK   rH   r   rx   rv   r7   r:   r}   r   r   r   r   __all__r   r8   r6   <module>r<     sj  ,  $   ! 5 ) 6 B 9 R F & R R 7 5 2 [ 
		H	% !%II%<<% 
% <<	%
 LL4'% T\% %4s)bii s)l5!; 5p O   
r-9 r-
r-j		 . 
v-JO v
vr or8   