
    i'                         d dl Z d dlZddlmZmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZ  e       rd dlZ ej$                  e      Z G d d	e	d
      Z G d de
      ZdgZy)    N   )
AudioInputmake_list_of_audio)BatchFeature)ProcessingKwargsProcessorMixinUnpack)	TextInput)is_torch_availableloggingc                   *    e Zd Zddidddddddd	Zy
)MusicFlamingoProcessorKwargspaddingTi>  
max_length)sampling_ratereturn_attention_maskr   ptleft)return_tensorspadding_side)text_kwargsaudio_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/musicflamingo/processing_musicflamingo.pyr   r   (   s2     t
 #%)#
 #"
Ir   r   F)totalc                        e Zd ZdZ	 	 	 	 	 d fd	Zd Zd Zd Z	 	 ddee	e   z  de
dz  d	edz  d
ee   def
dZede	e   fd       Z xZS )MusicFlamingoProcessorah  
    Constructs an MusicFlamingo processor which wraps an MusicFlamingo feature extractor and an MusicFlamingo
    tokenizer into a single processor.

    [`MusicFlamingoProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
    [`Qwen2TokenizerFast`]. See the [`~MusicFlamingoProcessor.__call__`] for more information.

    Args:
        feature_extractor ([`WhisperFeatureExtractor`]):
            The feature extractor is a required input.
        tokenizer ([`Qwen2TokenizerFast`]):
            The tokenizer is a required input.
        chat_template (`Optional[str]`, *optional*):
            The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
            template will be used.
        audio_token (`Optional[str]`, *optional*, defaults to `"<sound>"`):
            Special token used to represent audio inputs in the chat template.
        audio_bos_token (`Optional[str]`, *optional*, defaults to `"<|sound_bos|>"`):
            Special token used to represent the beginning of audio.
        audio_eos_token (`Optional[str]`, *optional*, defaults to `"<|sound_eos|>"`):
            Special token used to represent the end of audio.
        max_audio_len (`int`, *optional*, defaults to 1200):
            Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
    Nc                     || _         |j                  |      | _        || _        t        |   |||       || _        || _        |j                  |      | _        |j                  |      | _	        y )N)chat_template)
audio_tokenconvert_tokens_to_idsaudio_token_idmax_audio_lensuper__init__audio_bos_tokenaudio_eos_tokenaudio_bos_token_idaudio_eos_token_id)	selffeature_extractor	tokenizerr%   r&   r,   r-   r)   	__class__s	           r    r+   zMusicFlamingoProcessor.__init__S   ss     ''==kJ**I]S.."+"A"A/"R"+"A"A/"Rr   c                 2    |dz
  dz  dz   }|dz
  dz  dz   }|S )N      r   )r0   audio_lengthsconv_output_lengthsaudio_tokens_lengthss       r    _get_audio_token_lengthz.MusicFlamingoProcessor._get_audio_token_lengthf   s2    ,q0Q6: 3a 7A=A##r   c                    t        j                  t        j                  |j                  d      |      D cg c]  }|j                          c}      }| j	                  |      }t        j                  t        j                  | j                              }t        |      D ]D  \  }}	|j                  | j                  | j                  |	z  z   | j                  z   ||         ||<   F |S c c}w )N)torchstacksplitsumr:   recompileescaper&   	enumeratesubr,   r-   )
r0   textpadding_maskper_sample_windowssr7   r9   audio_token_patterniaudio_lengths
             r    _expand_audio_tokensz+MusicFlamingoProcessor._expand_audio_tokensk   s    ekk,BRBRSUBVXj6k$lQUUW$lm#;;MJ jj43C3C)DE()=> 	OA|)--$$t'7'7,'FFI]I]]QDG	
  %ms   C,c                 `    || j                   k(  || j                  k(  z  || j                  k(  z  S N)r(   r.   r/   )r0   	input_idss     r    _get_audio_tokens_maskz-MusicFlamingoProcessor._get_audio_tokens_maskv   s;    $---D3335D3335	
r   rF   audiooutput_labelskwargsreturnc           
          | j                   t        fd| j                  j                  i|}|d   }|d   }|j	                  d      }|dk7  r"t        | j                  j                   d      t        |t              r|g}n3t        |t        t        f      rt        d |D              st        d      i }	|t        |      }t        |      t        |      k7  r$t        d	t        |       d
t        |       d      t        |d   | j                   j"                  z        }
t        | j$                  | j                   j"                  z        }g }g }|D ]  }t        |j&                  d         }t)        d||
z   dz
  |
z        }||kD  r<t*        j-                  d||d   z  dd| j$                   d| j$                   d       |}|j/                  |       t1        |||
z        }t3        |      D ]-  }||
z  }t1        |dz   |
z  |      }|j/                  |||        /   | j                   |fi |}	|	j5                  d      }||	d<   | j7                  |||      } | j                  |fi |}i ||	}|rH|d   j9                         }d|| j;                  |      <   d||| j                  j<                  k(  <   ||d<   t?        ||      S )a=  
        Main method to prepare one or several text sequence(s) and audio waveform(s) for the model. This
        method expands `<sound>` placeholders in the text based on the post-pool frame counts of the
        audio windows, then tokenizes the provided strings as-is, and extracts log-mel features
        with [`WhisperFeatureExtractor`]. If `audio` is `None`, no audio processing is performed and
        the text is tokenized as-is (LM-only behavior).

        Args:
            text (`str` or `list[str]`):
                Input sequence or batch of sequences.
            audio (`np.ndarray` or `list[np.ndarray]`):
                Input audio or batch of audios as NumPy arrays. If provided, there must be as many `text` inputs as
                `audio` inputs.
            output_labels (bool, *optional*, default=False):
                Whether to return labels for training.

        Returns:
            [`BatchFeature`]: A dictionary with tokenized text (`input_ids`, `attention_mask`) and
            audio features (`input_features`, `input_features_mask`).
        tokenizer_init_kwargsr   r   r   r   z% only supports `return_tensors='pt'`.c              3   <   K   | ]  }t        |t                y wrO   )
isinstancestr).0ts     r    	<genexpr>z2MusicFlamingoProcessor.__call__.<locals>.<genexpr>   s     9[QR*Q:L9[s   zAInvalid input text. Please provide a string, or a list of stringszGot z
 text but z audios; they must match 1:1.r   r   r5   zAudio duration (z.1fzs) exceeds zs; truncating to first zs.attention_maskinput_features_maskrP   ilabels)datatensor_type) _merge_kwargsr   r2   init_kwargsget
ValueErrorr3   r   rY   rZ   listtupleallr   lenintr1   chunk_lengthr)   shapemaxloggerwarningappendminrangepoprM   clonerQ   pad_token_idr   )r0   rF   rR   rS   rT   call_kwargsr   r   r   audio_inputswindow_sizemax_windowsrH   flat_chunksaudio_el	n_samplesn_wintime_caprK   startendrG   text_inputsra   r`   s                            r    __call__zMusicFlamingoProcessor.__call__}   s,   : )d(((
"&.."<"<
 
 "-0">2$)9:T! 7 788]^__dC 6DTD%=1c9[VZ9[6[`aa&u-E4yCJ& 4D	{*SZLHe!fgg l?;d>T>T>a>aabKd00D4J4J4W4WWXK,.,.K! <q 12	A	K 7! ;KL;&NN*9|O7T+TUX*YYdeiewewdx  yP  QU  Qc  Qc  Pd  df  g (E"))%0y%+*=>u <AOEq1u3X>C&&xc':;<<" 2411+NNL'++,<=L2>L./ ,,T<ASTD %dnnT9[9.+..+&,,.F:>F4..v67<@F6T^^8889#DN>BBr   c                     | j                   j                  }| j                  j                  }t        t        j                  ||z   dgz               S )Nr_   )r2   model_input_namesr1   rg   dictfromkeys)r0   	tok_names	fea_namess      r    r   z(MusicFlamingoProcessor.model_input_names   sD    NN44	**<<	DMM)i"7;P:Q"QRSSr   )Nz<sound>z<|sound_bos|>z<|sound_eos|>i  )NF)r   r   r   __doc__r+   r:   rM   rQ   r
   rg   r   boolr	   r   r   r   propertyrZ   r   __classcell__)r3   s   @r    r#   r#   9   s    : ''S&$
	
 $(%*	]C$y/)]C D ]C d{	]C
 56]C 
]C~ T49 T Tr   r#   )rA   numpynpaudio_utilsr   r   feature_extraction_utilsr   processing_utilsr   r   r	   tokenization_utils_baser
   utilsr   r   r=   
get_loggerr   ro   r   r#   __all__r   r   r    <module>r      sh   , 
  9 4 H H 0 0  
		H	%#35 "gT^ gTT $
$r   