
    i~I                        d dl mZ d dlmZ d dlmZ d dlmZmZm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z%  e        rd dlZ G d de	jL                        Z'e G d de             Z( G d de	jL                        Z)d Z*d Z+ ed       G d de(e             Z,ddgZ-y)     )Callable)pi)Optional)Tensorbroadcast_tensorsnn   )initialization)ACT2FN)Cache)GenerationMixin)BaseModelOutputWithPoolingCausalLMOutputWithPast)ROPE_INIT_FUNCTIONS)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_available   )	AutoModelAutoModelForCausalLM   )MusicFlamingoConfigNc                        e Zd ZU dZej
                  ed<   ddef fdZe		 	 	 ddedz  de
d   dedz  d	ed
ef   fd       Z ej                         deded	eeef   fd       Zd Z xZS )MusicFlamingoRotaryEmbeddinga  Rotary time embedding module used by MusicFlamingo checkpoints.

    This is a checkpoint-faithful integration, not a direct implementation of the RoTE formulation described in
    (Goel et al., 2024): https://arxiv.org/abs/2410.12109. It applies axial rotary embeddings over the window index
    within each audio sample and the encoder time index within each window, then modulates both axes with absolute
    timestamps in seconds.
    inv_freqNconfigc                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       | j                  | j                        }| j                  d|d       y )N	rope_typedefaultr   F)
persistentoriginal_inv_freqposition_angles)super__init__max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr   rope_parametersr!   compute_default_rope_parametersr   attention_scalingregister_bufferclone_compute_position_anglesr   )selfr   devicerope_init_fnr   r%   	__class__s         /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/musicflamingo/modeling_musicflamingo.pyr'   z%MusicFlamingoRotaryEmbedding.__init__8   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuU77F.ER    r2   ztorch.deviceseq_lenreturnztorch.Tensorc                 n   | j                   d   }| j                   j                  dd      }t        | dd      xs | j                  | j                  z  }t        ||z        }d}d|t        j                  d|dt        j                        j                  |t        j                  	      |z  z  z  }||fS )
a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetapartial_rotary_factorg      ?head_dimNr   r   dtyper2   r>   )r+   getgetattrhidden_sizenum_attention_headsinttorcharangeint64tofloat)	r   r2   r7   baser;   r<   dimattention_factorr   s	            r5   r,   z<MusicFlamingoRotaryEmbedding.compute_default_rope_parametersJ   s    & %%l3 & 6 6 : :;RTW X6:t4h8J8JfNhNh8h(223 U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r6   
timestampsc                    |dddf   j                  | j                  j                  | j                  j                        }| j                  j
                  dz  |z  }t        j                  ||z        | j                  z  }|j                  d      | j                  z  }t        j                  |dd      }|dddddf   }| j                  d| dddddf   }t        ||      \  }}t        j                  ||fd      }| dz  t        z  j                  |      }	||	j                  d      z  }|j                         |j!                         fS )zBCompute 2D axial rotary embeddings for window and time dimensions.Nr   r?      r   rK   )rH   r   r2   r>   r   audio_frame_steprE   roundr)   	unsqueezerepeat_interleaver%   r   catr   cossin)
r1   rM   r7   window_startswindow_durationwindow_positionswindow_freqs
time_freqsfreqsangles
             r5   forwardz$MusicFlamingoRotaryEmbedding.forwardj   s5   
 #1a4(++4==3G3Gt}}ObOb+c++66:WD ;;}'FG$JaJaa'11"5E..|QBG $AtQJ/))(73D!QJ?
#4\:#N j		<4"=q2%))%0++yy{EIIK''r6   c                 B   t        j                  t        | j                        |j                  |j
                        }|| j                  z  dt        z  z  }|j                  d      |z  }t        j                  |dd      }|j                  |j
                        S )Nr?   r   rP   rQ   r=   )
rE   rF   rD   r)   r2   r>   r   rT   rU   rH   )r1   r   	positionsr%   s       r5   r0   z5MusicFlamingoRotaryEmbedding._compute_position_angles~   s    LLT%<%<!=hoo]e]k]kl	 7 771r6B	#--b1H<11/1"M!!!77r6   N)NNN)__name__
__module____qualname____doc__rE   r   __annotations__r   r'   staticmethodr   rD   tuplerI   r,   no_gradr`   r0   __classcell__r4   s   @r5   r   r   -   s     llS2 S$ -1+/"*#d**(* t* 
~u$	%	* *> U]]_(& (3 (5;P ( (&8r6   r   c                   n     e Zd ZU eed<   dZdZdZdZdZ	dZ
dZ ej                          fd       Z xZS )MusicFlamingoPreTrainedModelr   model)audiotextTNpast_key_valuesc                     t         |   |       t        |t              r<|j	                  |j
                        }t        j                  |j                  |       y y rc   )	r&   _init_weights
isinstancer   r0   r   initcopy_r%   )r1   modulebuffer_valuer4   s      r5   ru   z*MusicFlamingoPreTrainedModel._init_weights   sH    f%f:;!::6??KLJJv--|< <r6   )rd   re   rf   r   rh   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdparE   rk   ru   rl   rm   s   @r5   ro   ro      sL    (&*#"3NU]]_= =r6   ro   c                   .     e Zd ZdZdef fdZd Z xZS ) MusicFlamingoMultiModalProjectorz
    Audio adaptor (small MLP) that projects MusicFlamingoEncoder features
    to the LLM embedding space so they can replace `<sound>` tokens.
    r   c                    t         |           t        j                  |j                  j
                  |j                  j
                  |j                        | _        t        |j                     | _        t        j                  |j                  j
                  |j                  j
                  |j                        | _        y )N)bias)r&   r'   r   Linearaudio_configrB   text_configprojector_biaslinear_1r   projector_hidden_actactlinear_2r1   r   r4   s     r5   r'   z)MusicFlamingoMultiModalProjector.__init__   s    		++V-?-?-K-KRXRgRg
 &556		**F,>,>,J,JQWQfQf
r6   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rc   )r   r   r   )r1   audio_featureshidden_statess      r5   r`   z(MusicFlamingoMultiModalProjector.forward   s2    n5/m4r6   )rd   re   rf   rg   r   r'   r`   rl   rm   s   @r5   r   r      s    

2 
r6   r   c                      | j                   g | j                  d d dd } | j                  d      \  }}t        j                  | |fd      } | j                  d      S )NrP   r   rQ   )reshapeshapeunbindrE   stackflatten)xx1x2s      r5   rotate_halfr      sa    		'1773B<''Q'AXX"XFBbS"I2&A99R=r6   c                 V   | j                   }| j                  t        j                        } |j                  |       }|j                  |       }|j                  d   }| d|d f   }| dd |f   }||z  t        |      |z  z   }t        j                  ||fd      j                  |      S )NrP   .rQ   )r>   rH   rE   float64r   r   rV   )r   rW   rX   original_dtyperot_dimpassthroughrotateds          r5   apply_rotary_time_embr      s    "((N!$$U]]3M
&&
C
&&
CiimGWX.KC'M*G}W!5!;<G99g{+477GGr6   z
    The MusicFlamingo model which consists of a fine-tuned Whisper encoder, rotary time embedding, a multi-modal projector, and a Qwen2 language model.
    custom_introc                       e Zd ZdZdZdZdef fdZd Zd Z	d Z
d Zd Zd	 Ze ed
      dej"                  dej$                  dej&                  dee   deez  f
d              Zee	 	 	 	 	 	 	 	 	 	 d!dej&                  dz  dej"                  dz  dej$                  dz  dej$                  dz  dej&                  dz  dedz  dej"                  dz  dej&                  dz  dedz  deej$                  z  dee   defd              Zdddef fdZdej&                  dej&                  dedej"                  fd Z xZ S )"%MusicFlamingoForConditionalGenerationNr   c                 J   t         |   |       |j                  j                  | _        t	        j
                  |j                        | _        t        j
                  |j                        | _	        t        |      | _        t        |      | _        | j                          y rc   )r&   r'   r   
vocab_sizer   from_configr   audio_towerr   language_modelr   multi_modal_projectorr   pos_emb	post_initr   s     r5   r'   z.MusicFlamingoForConditionalGeneration.__init__   sz      ,,77$001D1DE2>>v?Q?QR%Ef%M"3F; 	r6   c                 6    | j                   j                         S rc   )r   get_input_embeddingsr1   s    r5   r   z:MusicFlamingoForConditionalGeneration.get_input_embeddings   s    ""7799r6   c                 :    | j                   j                  |       y rc   )r   set_input_embeddings)r1   values     r5   r   z:MusicFlamingoForConditionalGeneration.set_input_embeddings   s    007r6   c                 6    | j                   j                         S rc   )r   get_output_embeddingsr   s    r5   r   z;MusicFlamingoForConditionalGeneration.get_output_embeddings   s    ""88::r6   c                 :    | j                   j                  |       y rc   )r   set_output_embeddings)r1   new_embeddingss     r5   r   z;MusicFlamingoForConditionalGeneration.set_output_embeddings   s    11.Ar6   c                 :    | j                   j                  |       y rc   )r   set_decoder)r1   decoders     r5   r   z1MusicFlamingoForConditionalGeneration.set_decoder   s    ''0r6   c                 6    | j                   j                         S rc   )r   get_decoderr   s    r5   r   z1MusicFlamingoForConditionalGeneration.get_decoder   s    ""..00r6   zThis method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.r   input_featuresinput_features_mask	input_idskwargsr8   c                     | j                   |f|dd|}|j                  }| j                   j                  |j                  d      j	                  t
        j                              \  }}| j                  |||j                  d         }	| j                  |	j	                  |j                        |j                  d         \  }
}t        ||
|      }| j                  |      }t        j                  |j                  d   |j                        dddf   |dddf   k  }||j	                  |j                           |_        |S )	az  
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padded feature indices.
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Token ids containing the audio token ID placeholders, for reconstructing rotary time embedding timestamps.
        T)r   return_dictrP   r   )r7   r   r2   N)r   last_hidden_state _get_feat_extract_output_lengthssumrH   rE   long_build_audio_timestampsr   r   r2   r   r   rF   pooler_output)r1   r   r   r   r   audio_outputr   _post_lengthsaudio_timestampsrW   rX   audio_embeds
valid_masks                 r5   get_audio_featuresz8MusicFlamingoForConditionalGeneration.get_audio_features   sL   " (t''
 3
 	
 %66**KKL_LcLcdfLgLjLjkpkukuLvw<77	<Q^QdQdegQhi<< 0 3 3M4H4H IS`SfSfgiSj<kS-mS#F11-@ \\,"4"4Q"7@S@STUY[\U\]`lmnptmt`uu
%1*--@S@S2T%U"r6   attention_maskposition_idsrs   inputs_embedslabels	use_cachelogits_to_keepc                    | | j                         |      }||| j                  |||d      j                  }|| j                  j                  k(  j                  d      }|j                  |j                  |j                        |j                  |j                              } | j                  d||||||	|
d|}|S )a&	  
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`, *optional*):
            Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import MusicFlamingoForConditionalGeneration, AutoProcessor

        >>> model_id = "nvidia/music-flamingo-2601-hf"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = MusicFlamingoForConditionalGeneration.from_pretrained(model_id, device_map="auto")

        >>> conversation = [
        >>>     {
        >>>         "role": "user",
        >>>         "content": [
        >>>             {
        >>>                 "type": "text",
        >>>                 "text": "Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
        >>>             },
        >>>             {
        >>>                 "type": "audio",
        >>>                 "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/song_1.mp3",
        >>>             },
        >>>         ],
        >>>     }
        >>> ]

        >>> inputs = processor.apply_chat_template(
        >>>     conversation,
        >>>     tokenize=True,
        >>>     add_generation_prompt=True,
        >>>     return_dict=True,
        >>> ).to(model.device, model.dtype)

        >>> outputs = model.generate(**inputs, max_new_tokens=100)

        >>> decoded_outputs = processor.batch_decode(
        >>>     outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
        >>> )
        >>> print(decoded_outputs)
        ["This track is an uplifting Eurodance-style Trance-Pop anthem..."]
        ```T)r   r   rP   )r   r   r   rs   r   r   r    )
r   r   r   r   audio_token_idrT   masked_scatterrH   r2   r   )r1   r   r   r   r   r   rs   r   r   r   r   r   r   audio_token_maskoutputss                  r5   r`   z-MusicFlamingoForConditionalGeneration.forward  s    F  7D557	BM%)*?22 3yVZ 3 m 
 !*T[[-G-G GRRSUV)88 ##M$8$89<??=K_K_;`M +>$*=*= 	+
')%+)	+
 	+
 r6   F)is_first_iterationr   c                    |j                  dd       }|j                  dd       }t        |   |i |}|s|j                  dd      s|||d<   |||d<   |S )Nr   r   r   F)popr&   prepare_inputs_for_generationr@   )r1   r   argsr   r   r   model_inputsr4   s          r5   r   zCMusicFlamingoForConditionalGeneration.prepare_inputs_for_generationl  st    $4d;$jj)>Ew<dMfM\%5%5k5%I)1?-.".6I23r6   r   max_post_lengthc                 4   || j                   j                  k(  }t        j                  t        j                  j
                  j                  |j                         dd      d      }t        j                  |dk(        \  }}t        j                  |dk(        \  }}||z
  j                  t        j                        }	| j                   j                  dz  }
t        j                  ||j                  t        j                        |
z  }t        j                  t        j                   d|j                  	      t        j"                  |d      d d g      }t        j"                  |	d      }t        j$                  ||d
      }t        j$                  |t        j                  |	j&                  d   |j                  	            }t        j                  |j&                  d   |j                  	      ||   z
  }|j)                  d      |z  |
z  |z   S )N)r   r   r   )r   r   rQ   rP   rO   r?   r   T)right)r   r   rE   diffr   
functionalpadrD   whererH   r   rR   rF   r2   float32rV   zeroscumsumsearchsortedr   rT   )r1   r   r   r   r   r   r   startsendssample_lengthsaudio_embed_frame_stepframe_offsetscumsum_postcumsum_samplessample_indicessample_start_rowswindow_indicess                    r5   r   z=MusicFlamingoForConditionalGeneration._build_audio_timestampsz  s    %(B(BBzz%((--112B2F2F2H&XY1Z`abKK	*	6++dbj)4-++EJJ7 "&!=!=!ALL1D1DEMMZ]ss 	
 iiQ|7J7J!KU\\ZflmMnorprMs tun!<++NKtT "..ELL)=)=a)@I\I\]
 LL++A.|7J7JKN_`nNoo 	
 ''*_<?UUXeeer6   )
NNNNNNNNNr   )!rd   re   rf   _keep_in_fp32_modules_strict_tp_plan_pp_planr   r'   r   r   r   r   r   r   r   r   rE   FloatTensorr   
LongTensorr   r   rj   r   r   r   boolrD   r   r`   r   r   rl   rm   s   @r5   r   r      s+    $( HH	2 	:8;B11  w)) #\\ ##	
 +, 
+	+ @  .23737.204(,26*.!%-.Y##d*Y ))D0Y #\\D0	Y
 t+Y &&-Y Y ((4/Y   4'Y $;Y ell*Y +,Y 
 Y  Yv OT t  f## f && f 	 f
 
		 fr6   r   ).collections.abcr   mathr   typingr   rE   r   r   r    r
   rw   activationsr   cache_utilsr   
generationr   modeling_outputsr   r   modeling_rope_utilsr   modeling_utilsr   processing_utilsr   utilsr   r   r   r   autor   r   configuration_musicflamingor   Moduler   ro   r   r   r   r   __all__r   r6   r5   <module>r     s   , %   / / & !   ) R 6 - & ] ] 2 < V8299 V8r =? = =$ryy .
H 
Qf,H/ Qf
Qfh 34R
Sr6   