
    iH                        d dl Z d dlmZ d dlmZ d dlmZmZ ddlm	Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZ ddlmZmZ ddlmZ ddl m!Z!  e       rd dlZ ed      e G d de                    Z" G d de      Z#d Z$d Z% G d de!      Z& G d de      Z' ed       G d d e             Z(g d!Z)y)"    N)pi)strict)Tensorbroadcast_tensors   )initialization)Cache)BaseModelOutputWithPoolingCausalLMOutputWithPast)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_available   )AudioFlamingo3Config)&AudioFlamingo3ForConditionalGenerationAudioFlamingo3PreTrainedModel)AudioFlamingo3Processor)MoonshineRotaryEmbeddingznvidia/music-flamingo-2601-hf)
checkpointc                   b     e Zd ZU dZdZeed<   dZeed<   dZe	ed<   dZ
edz  ed	<    fd
Z xZS )MusicFlamingoConfiga  
    audio_bos_token_id (`int`, *optional*, defaults to 151670):
        The beginning-of-audio token index used to mark the start of audio spans.
    audio_eos_token_id (`int`, *optional*, defaults to 151671):
        The end-of-audio token index used to mark the end of audio spans.
    audio_frame_step (`float`, *optional*, defaults to 0.01):
        Duration in seconds of one input mel frame (trained with hop_length 160 at sampling_rate 16000).

    Example:

    ```python
    >>> from transformers import MusicFlamingoForConditionalGeneration, MusicFlamingoConfig, AudioFlamingo3EncoderConfig, Qwen2Config

    >>> # Initializing an MusicFlamingoEncoder config
    >>> audio_config = AudioFlamingo3EncoderConfig()

    >>> # Initializing a Qwen2 config
    >>> text_config = Qwen2Config()

    >>> # Initializing an MusicFlamingo configuration
    >>> configuration = MusicFlamingoConfig(audio_config, text_config)

    >>> # Initializing a model from the musicflamingo style configuration
    >>> model = MusicFlamingoForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```ivP audio_bos_token_idiwP audio_eos_token_idg{Gz?audio_frame_stepNrope_parametersc                     t        |   di | | j                  dddd| _        | j                  d   | _        | j                  j
                  | _        y )Ndefault  g?)	rope_type
rope_thetapartial_rotary_factorr#    )super__post_init__r   max_position_embeddingsaudio_confighidden_sizehead_dim)selfkwargs	__class__s     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/musicflamingo/modular_musicflamingo.pyr'   z!MusicFlamingoConfig.__post_init__N   sU    '''1:$il#mD '+';';L'I$))55    )__name__
__module____qualname____doc__r   int__annotations__r   r   floatr   dictr'   __classcell__r.   s   @r/   r   r   )   sC    : %$$$"e"#'OTD['6 6r0   r   c                   R     e Zd ZdZ	 	 	 	 	 d	 fd	Zd Zd Zd Zd Zd Z	d Z
 xZS )
MusicFlamingoProcessorah  
    Constructs an MusicFlamingo processor which wraps an MusicFlamingo feature extractor and an MusicFlamingo
    tokenizer into a single processor.

    [`MusicFlamingoProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
    [`Qwen2TokenizerFast`]. See the [`~MusicFlamingoProcessor.__call__`] for more information.

    Args:
        feature_extractor ([`WhisperFeatureExtractor`]):
            The feature extractor is a required input.
        tokenizer ([`Qwen2TokenizerFast`]):
            The tokenizer is a required input.
        chat_template (`Optional[str]`, *optional*):
            The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
            template will be used.
        audio_token (`Optional[str]`, *optional*, defaults to `"<sound>"`):
            Special token used to represent audio inputs in the chat template.
        audio_bos_token (`Optional[str]`, *optional*, defaults to `"<|sound_bos|>"`):
            Special token used to represent the beginning of audio.
        audio_eos_token (`Optional[str]`, *optional*, defaults to `"<|sound_eos|>"`):
            Special token used to represent the end of audio.
        max_audio_len (`int`, *optional*, defaults to 1200):
            Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
    c                     t         |   |||||       | `|| _        || _        |j                  |      | _        |j                  |      | _        y )N)chat_templateaudio_tokenmax_audio_len)r&   __init__default_transcription_promptaudio_bos_tokenaudio_eos_tokenconvert_tokens_to_idsr   r   )	r,   feature_extractor	tokenizerr>   r?   rC   rD   r@   r.   s	           r/   rA   zMusicFlamingoProcessor.__init__q   sd     	'#' 	 	
 -.."+"A"A/"R"+"A"A/"Rr0   c                    t        j                  t        j                  |j                  d      |      D cg c]  }|j                          c}      }| j	                  |      }t        j                  t        j                  | j                              }t        |      D ]D  \  }}	|j                  | j                  | j                  |	z  z   | j                  z   ||         ||<   F |S c c}w )N)torchstacksplitsum_get_audio_token_lengthrecompileescaper?   	enumeratesubrC   rD   )
r,   textpadding_maskper_sample_windowssaudio_lengthsaudio_tokens_lengthsaudio_token_patterniaudio_lengths
             r/   _expand_audio_tokensz+MusicFlamingoProcessor._expand_audio_tokens   s    ekk,BRBRSUBVXj6k$lQUUW$lm#;;MJ jj43C3C)DE()=> 	OA|)--$$t'7'7,'FFI]I]]QDG	
  %ms   C,c                 `    || j                   k(  || j                  k(  z  || j                  k(  z  S N)audio_token_idr   r   )r,   	input_idss     r/   _get_audio_tokens_maskz-MusicFlamingoProcessor._get_audio_tokens_mask   s;    $---D3335D3335	
r0   c                     t        d      Nz/This method is not supported for MusicFlamingo.NotImplementedErrorr,   argsr-   s      r/   apply_transcription_requestz2MusicFlamingoProcessor.apply_transcription_request       !"STTr0   c                     t        d      Nz5MusicFlamingo does not need to overwrite this method.re   rg   s      r/   decodezMusicFlamingoProcessor.decode       !"YZZr0   c                     t        d      rl   re   rg   s      r/   batch_decodez#MusicFlamingoProcessor.batch_decode   rn   r0   c                     t        d      rd   re   rg   s      r/   "_strip_assistant_prefix_and_quotesz9MusicFlamingoProcessor._strip_assistant_prefix_and_quotes   rj   r0   )Nz<sound>z<|sound_bos|>z<|sound_eos|>r!   )r1   r2   r3   r4   rA   r]   rb   ri   rm   rp   rr   r9   r:   s   @r/   r<   r<   W   sA    : ''S.	
U[[Ur0   r<   c                      | j                   g | j                  d d dd } | j                  d      \  }}t        j                  | |fd      } | j                  d      S )NrI   r   dim)reshapeshapeunbindrJ   rK   flatten)xx1x2s      r/   rotate_halfr~      sa    		'1773B<''Q'AXX"XFBbS"I2&A99R=r0   c                 V   | j                   }| j                  t        j                        } |j                  |       }|j                  |       }|j                  d   }| d|d f   }| dd |f   }||z  t        |      |z  z   }t        j                  ||fd      j                  |      S )NrI   .rt   )dtypetorJ   float64rx   r~   cat)hidden_statescossinoriginal_dtyperot_dimpassthroughrotateds          r/   apply_rotary_time_embr      s    "((N!$$U]]3M
&&
C
&&
CiimGWX.KC'M*G}W!5!;<G99g{+477GGr0   c            	       v     e Zd ZdZd	def fdZd Z ej                         de	de
dee	e	f   fd       Z xZS )
MusicFlamingoRotaryEmbeddinga  Rotary time embedding module used by MusicFlamingo checkpoints.

    This is a checkpoint-faithful integration, not a direct implementation of the RoTE formulation described in
    (Goel et al., 2024): https://arxiv.org/abs/2410.12109. It applies axial rotary embeddings over the window index
    within each audio sample and the encoder time index within each window, then modulates both axes with absolute
    timestamps in seconds.
    configc                     t         |   ||       | j                  | j                        }| j	                  d|d       y )Ndeviceposition_anglesF)
persistent)r&   rA   _compute_position_anglesinv_freqregister_buffer)r,   r   r   r   r.   s       r/   rA   z%MusicFlamingoRotaryEmbedding.__init__   s?    /77F.ERr0   c                 B   t        j                  t        | j                        |j                  |j
                        }|| j                  z  dt        z  z  }|j                  d      |z  }t        j                  |dd      }|j                  |j
                        S )Nr   r   r   rI   rt   )r   )
rJ   aranger5   max_seq_len_cachedr   r   r   	unsqueezerepeat_interleaver   )r,   r   	positionsr   s       r/   r   z5MusicFlamingoRotaryEmbedding._compute_position_angles   s    LLT%<%<!=hoo]e]k]kl	 7 771r6B	#--b1H<11/1"M!!!77r0   
timestampsseq_lenreturnc                    |dddf   j                  | j                  j                  | j                  j                        }| j                  j
                  dz  |z  }t        j                  ||z        | j                  z  }|j                  d      | j                  z  }t        j                  |dd      }|dddddf   }| j                  d| dddddf   }t        ||      \  }}t        j                  ||fd      }| dz  t        z  j                  |      }	||	j                  d      z  }|j                         |j!                         fS )zBCompute 2D axial rotary embeddings for window and time dimensions.Nr   r      rI   r   rt   )r   r   r   r   r   r   rJ   roundr   r   r   r   r   r   r   r   r   )
r,   r   r   window_startswindow_durationwindow_positionswindow_freqs
time_freqsfreqsangles
             r/   forwardz$MusicFlamingoRotaryEmbedding.forward   s5   
 #1a4(++4==3G3Gt}}ObOb+c++66:WD ;;}'FG$JaJaa'11"5E..|QBG $AtQJ/))(73D!QJ?
#4\:#N j		<4"=q2%))%0++yy{EIIK''r0   r_   )r1   r2   r3   r4   r   rA   r   rJ   no_gradr   r5   tupler   r9   r:   s   @r/   r   r      sV    S2 S
8 U]]_(& (3 (5;P ( (r0   r   c                   >    e Zd ZdZ ej
                         d        Zy)MusicFlamingoPreTrainedModelNc                     t        j                  | |       t        |t              r<|j	                  |j
                        }t        j                  |j                  |       y y r_   )	r   _init_weights
isinstancer   r   r   initcopy_r   )r,   modulebuffer_values      r/   r   z*MusicFlamingoPreTrainedModel._init_weights   sJ    %%dF3f:;!::6??KLJJv--|< <r0   )r1   r2   r3   _no_split_modulesrJ   r   r   r%   r0   r/   r   r      s"    U]]_= =r0   r   z
    The MusicFlamingo model which consists of a fine-tuned Whisper encoder, rotary time embedding, a multi-modal projector, and a Qwen2 language model.
    custom_introc                   B    e Zd Zdef fdZdej                  dej                  dedej                  fdZ	e
 ed	      d
ej                  dej                  dej                  dee   deez  f
d              Ze
e	 	 	 	 	 	 	 	 	 	 ddej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  dej                  dz  dedz  deej                  z  dee   defd              Z xZS )%MusicFlamingoForConditionalGenerationr   c                 D    t         |   |       t        |      | _        y r_   )r&   rA   r   pos_emb)r,   r   r.   s     r/   rA   z.MusicFlamingoForConditionalGeneration.__init__   s     3F;r0   ra   post_lengthsmax_post_lengthr   c                 4   || j                   j                  k(  }t        j                  t        j                  j
                  j                  |j                         dd      d      }t        j                  |dk(        \  }}t        j                  |dk(        \  }}||z
  j                  t        j                        }	| j                   j                  dz  }
t        j                  ||j                  t        j                        |
z  }t        j                  t        j                   d|j                  	      t        j"                  |d      d d g      }t        j"                  |	d      }t        j$                  ||d
      }t        j$                  |t        j                  |	j&                  d   |j                  	            }t        j                  |j&                  d   |j                  	      ||   z
  }|j)                  d      |z  |
z  |z   S )N)   r   r   )valuer   rt   rI   r   r   r   T)right)r   r`   rJ   diffnn
functionalpadr5   wherer   longr   r   r   float32r   zeroscumsumsearchsortedrx   r   )r,   ra   r   r   audio_token_maskr   _startsendssample_lengthsaudio_embed_frame_stepframe_offsetscumsum_postcumsum_samplessample_indicessample_start_rowswindow_indicess                    r/   _build_audio_timestampsz=MusicFlamingoForConditionalGeneration._build_audio_timestamps   s    %(B(BBzz%((--112B2F2F2H&XY1Z`abKK	*	6++dbj)4-++EJJ7 "&!=!=!ALL1D1DEMMZ]ss 	
 iiQ|7J7J!KU\\ZflmMnorprMs tun!<++NKtT "..ELL)=)=a)@I\I\]
 LL++A.|7J7JKN_`nNoo 	
 ''*_<?UUXeeer0   zThis method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.r   input_featuresinput_features_maskr-   c                     | j                   |f|dd|}|j                  }| j                   j                  |j                  d      j	                  t
        j                              \  }}| j                  |||j                  d         }	| j                  |	j	                  |j                        |j                  d         \  }
}t        ||
|      }| j                  |      }t        j                  |j                  d   |j                        dddf   |dddf   k  }||j	                  |j                           |_        |S )	az  
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padded feature indices.
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Token ids containing the audio token ID placeholders, for reconstructing rotary time embedding timestamps.
        T)r   return_dictrI   rv   )r   r   r   N)audio_towerlast_hidden_state _get_feat_extract_output_lengthsrM   r   rJ   r   r   rx   r   r   r   multi_modal_projectorr   pooler_output)r,   r   r   ra   r-   audio_outputr   r   r   audio_timestampsr   r   audio_embeds
valid_masks                 r/   get_audio_featuresz8MusicFlamingoForConditionalGeneration.get_audio_features  sL   " (t''
 3
 	
 %66**KKL_LcLcdfLgLjLjkpkukuLvw<77	<Q^QdQdegQhi<< 0 3 3M4H4H IS`SfSfgiSj<kS-mS#F11-@ \\,"4"4Q"7@S@STUY[\U\]`lmnptmt`uu
%1*--@S@S2T%U"r0   Nattention_maskposition_idspast_key_valuesinputs_embedslabels	use_cachelogits_to_keepc                    | | j                         |      }||| j                  |||d      j                  }|| j                  j                  k(  j                  d      }|j                  |j                  |j                        |j                  |j                              } | j                  d||||||	|
d|}|S )a&	  
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`, *optional*):
            Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import MusicFlamingoForConditionalGeneration, AutoProcessor

        >>> model_id = "nvidia/music-flamingo-2601-hf"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = MusicFlamingoForConditionalGeneration.from_pretrained(model_id, device_map="auto")

        >>> conversation = [
        >>>     {
        >>>         "role": "user",
        >>>         "content": [
        >>>             {
        >>>                 "type": "text",
        >>>                 "text": "Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
        >>>             },
        >>>             {
        >>>                 "type": "audio",
        >>>                 "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/song_1.mp3",
        >>>             },
        >>>         ],
        >>>     }
        >>> ]

        >>> inputs = processor.apply_chat_template(
        >>>     conversation,
        >>>     tokenize=True,
        >>>     add_generation_prompt=True,
        >>>     return_dict=True,
        >>> ).to(model.device, model.dtype)

        >>> outputs = model.generate(**inputs, max_new_tokens=100)

        >>> decoded_outputs = processor.batch_decode(
        >>>     outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
        >>> )
        >>> print(decoded_outputs)
        ["This track is an uplifting Eurodance-style Trance-Pop anthem..."]
        ```T)ra   r   rI   )r   r   r   r   r   r   r   r%   )
get_input_embeddingsr   r   r   r`   r   masked_scatterr   r   language_model)r,   ra   r   r   r   r   r   r   r   r   r   r-   r   r   outputss                  r/   r   z-MusicFlamingoForConditionalGeneration.forward@  s    F  7D557	BM%)*?22 3yVZ 3 m 
 !*T[[-G-G GRRSUV)88 ##M$8$89<??=K_K_;`M +>$*=*= 	+
')%+)	+
 	+
 r0   )
NNNNNNNNNr   )r1   r2   r3   r   rA   rJ   
LongTensorr5   FloatTensorr   r   r   r   r   r   r   r
   r   r	   boolr   r   r9   r:   s   @r/   r   r      s   <2 < f## f && f 	 f
 
		 fD  w)) #\\ ##	
 +, 
+	+ @  .23737.204(,26*.!%-.Y##d*Y ))D0Y #\\D0	Y
 t+Y &&-Y Y ((4/Y   4'Y $;Y ell*Y +,Y 
 Y  Yr0   r   )r   r<   r   r   )*rO   mathr   huggingface_hub.dataclassesr   rJ   r   r    r   r   cache_utilsr	   modeling_outputsr
   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   +audioflamingo3.configuration_audioflamingo3r   &audioflamingo3.modeling_audioflamingo3r   r   (audioflamingo3.processing_audioflamingo3r   moonshine.modeling_moonshiner   r   r<   r~   r   r   r   r   __all__r%   r0   r/   <module>r     s     
  . + &   R - & ] ] N O C  :;)6. )6  <)6XMU4 MU`
H'(#; '(T=#@ = 
f,R f
fRr0   