
    i9                         d dl Z d dlZddlmZmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZ  e       rd dlZ ej$                  e      Z G d d	e	d
      Z G d de
      ZdgZy)    N   )
AudioInputmake_list_of_audio)BatchFeature)ProcessingKwargsProcessorMixinUnpack)	TextInput)is_torch_availableloggingc                   *    e Zd Zddidddddddd	Zy
)GlmAsrProcessorKwargspaddingTi>  
max_length)sampling_ratereturn_attention_maskr   ptleft)return_tensorspadding_side)text_kwargsaudio_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults     }/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/glmasr/processing_glmasr.pyr   r   (   s2     t
 #%)#
 #"
Ir   r   F)totalc                       e Zd ZdZ	 	 	 	 d fd	ZddZd Zd Z	 	 dd	ee	e   z  d
e
dz  dedz  dee   def
dZede	e   fd       Z	 dd
ee	e   z  e
z  dee	e   z  dz  dee   defdZdddZd Zd	edefdZ xZS )GlmAsrProcessora  
    Constructs an GlmAsr processor which wraps an GlmAsr feature extractor and an GlmAsr
    tokenizer into a single processor.

    [`GlmAsrProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
    [`Qwen2TokenizerFast`]. See the [`~GlmAsrProcessor.__call__`] for more information.

    Args:
            feature_extractor ([`WhisperFeatureExtractor`]):
                The feature extractor is a required input.
            tokenizer ([`Qwen2TokenizerFast`]):
                The tokenizer is a required input.
            chat_template (`Optional[str]`, *optional*):
                The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
                template will be used.
            audio_token (`Optional[str]`, *optional*, defaults to `"<|pad|>`"):
                Special token used to represent audio inputs in the chat template.
            default_transcription_prompt (`str`, *optional*, defaults to `"Please transcribe this audio into text"`):
                Default prompt to use for transcription tasks when applying transcription requests.
            max_audio_len (`int`, *optional*, defaults to 655):
                Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
                655 gives approximately 8192 tokens, corresponding to the maximum sequence length of the text model.
    Nc                     || _         |j                  |      | _        || _        || _        t
        |   |||       y )N)chat_template)audio_tokenconvert_tokens_to_idsaudio_token_iddefault_transcription_promptmax_audio_lensuper__init__)selffeature_extractor	tokenizerr%   r&   r)   r*   	__class__s          r    r,   zGlmAsrProcessor.__init__R   sE     ''==kJ,H)**I]Sr   returnc                 d    d}dD ]  \  }}}|d|z  z   |dz
  z
  dz
  |z  dz   } ||z
  |z  dz   }|S )N   ))   r   r4   )r4   r      r5   r4   r   )r-   audio_lengthsmerge_factorr   kernel_sizestride
num_tokenss          r    _get_audio_token_lengthz'GlmAsrProcessor._get_audio_token_lengtha   sc    ,B 	`(G[&*Q[8K!OLqPU[[^__M	` $l2|CaG
r   c                    t        j                  t        j                  |j                  d      |      D cg c]  }|j                          c}      }| j	                  |      }t        j                  t        j                  | j                              }t        |      D ]*  \  }}	|j                  | j                  |	z  ||         ||<   , |S c c}w )N)torchstacksplitsumr;   recompileescaper&   	enumeratesub)
r-   textpadding_maskper_sample_windowssr6   audio_tokens_lengthsaudio_token_patterniaudio_lengths
             r    _expand_audio_tokensz$GlmAsrProcessor._expand_audio_tokensi   s    ekk,BRBRSUBVXj6k$lQUUW$lm#;;MJ jj43C3C)DE()=> 	XOA|)--d.>.>.MtTUwWDG	X %ms   Cc                      || j                   k(  S N)r(   )r-   	input_idss     r    _get_audio_tokens_maskz&GlmAsrProcessor._get_audio_tokens_maskq   s    D////r   FrG   audiooutput_labelskwargsc           
          | j                   t        fd| j                  j                  i|}|d   }|d   }|j	                  d      }|dk7  r"t        | j                  j                   d      t        |t              r|g}n3t        |t        t        f      rt        d |D              st        d      i }	|t        |      }t        |      t        |      k7  r$t        d	t        |       d
t        |       d      t        |d   | j                   j"                  z        }
t        | j$                  | j                   j"                  z        }g }g }|D ]  }t        |j&                  d         }t)        d||
z   dz
  |
z        }||kD  r<t*        j-                  d||d   z  dd| j$                   d| j$                   d       |}|j/                  |       t1        |||
z        }t3        |      D ]-  }||
z  }t1        |dz   |
z  |      }|j/                  |||        /   | j                   |fi |}	|	j5                  d      }||	d<   | j7                  |||      } | j                  |fi |}i ||	}|rH|d   j9                         }d|| j;                  |      <   d||| j                  j<                  k(  <   ||d<   t?        ||      S )a=  
        Main method to prepare one or several text sequence(s) and audio waveform(s) for the model. This
        method expands `<sound>` placeholders in the text based on the post-pool frame counts of the
        audio windows, then tokenizes the provided strings as-is, and extracts log-mel features
        with [`WhisperFeatureExtractor`]. If `audio` is `None`, no audio processing is performed and
        the text is tokenized as-is (LM-only behavior).

        Args:
            text (`str` or `list[str]`):
                Input sequence or batch of sequences.
            audio (`np.ndarray` or `list[np.ndarray]`):
                Input audio or batch of audios as NumPy arrays. If provided, there must be as many `text` inputs as
                `audio` inputs.
            output_labels (bool, *optional*, default=False):
                Whether to return labels for training.

        Returns:
            [`BatchFeature`]: A dictionary with tokenized text (`input_ids`, `attention_mask`) and
            audio features (`input_features`, `input_features_mask`).
        tokenizer_init_kwargsr   r   r   r   z% only supports `return_tensors='pt'`.c              3   <   K   | ]  }t        |t                y wrQ   
isinstancestr).0ts     r    	<genexpr>z+GlmAsrProcessor.__call__.<locals>.<genexpr>   s     9[QR*Q:L9[   zAInvalid input text. Please provide a string, or a list of stringszGot z
 text but z audios; they must match 1:1.r   r   r4   zAudio duration (z.1fzs) exceeds zs; truncating to first zs.attention_maskinput_features_maskrR   ilabels)datatensor_type) _merge_kwargsr   r/   init_kwargsget
ValueErrorr0   r   r[   r\   listtupleallr   lenintr.   chunk_lengthr*   shapemaxloggerwarningappendminrangepoprO   clonerS   pad_token_idr   )r-   rG   rT   rU   rV   call_kwargsr   r   r   audio_inputswindow_sizemax_windowsrI   flat_chunksaudio_el	n_samplesn_wintime_caprM   startendrH   text_inputsrd   rc   s                            r    __call__zGlmAsrProcessor.__call__t   s,   : )d((!
"&.."<"<
 
 "-0">2$)9:T! 7 788]^__dC 6DTD%=1c9[VZ9[6[`aa&u-E4yCJ& 4D	{*SZLHe!fgg l?;d>T>T>a>aabKd00D4J4J4W4WWXK,.,.K! <q 12	A	K 7! ;KL;&NN*9|O7T+TUX*YYdeiewewdx  yP  QU  Qc  Qc  Pd  df  g (E"))%0y%+*=>u <AOEq1u3X>C&&xc':;<<" 2411+NNL'++,<=L2>L./ ,,T<ASTD %dnnT9[9.+..+&,,.F:>F4..v67<@F6T^^8889#DN>BBr   c                     | j                   j                  }| j                  j                  }t        t        j                  ||z   dgz               S )Nrb   )r/   model_input_namesr.   rj   dictfromkeys)r-   	tok_names	fea_namess      r    r   z!GlmAsrProcessor.model_input_names   sD    NN44	**<<	DMM)i"7;P:Q"QRSSr   promptc           	         t        |t              r|g}nt        |t        t        f      r |rt	        d |D              rt        |      }nst        t        |            }t               rU|D cg c]J  }t        |t        j                        r,|j                         j                         j                         n|L }}t        |      }|dk(  rt        d      || j                  g|z  }nt        |t              r|g|z  }nt        |t        t        f      r}t        |      |k7  rt        dt        |       d| d      g }|D ]L  }||j                  | j                         !t        |t              r|j                  |       Ct!        d       nt!        d      t#        ||      D 	
cg c](  \  }	}
d	t        |
t              rd
|
dnd
|
dd|	dgdg* }}	}
 | j$                  |fdddd|S c c}w c c}
}	w )a	  
        Prepare inputs for automatic speech recognition without manually writing the default transcription prompt.

        Args:
            audio (`str`, `list[str]`, `np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                Audio to transcribe. Strings are interpreted as local paths or URLs and will be loaded automatically by
                the chat template loader; NumPy arrays and PyTorch tensors are forwarded directly.
            prompt (`str` or `list[str]`, *optional*):
                Custom prompt(s) to include in the user turn. A list must be the same length as the batch. When `None`,
                each sample uses `"Transcribe the input speech."`.
            **kwargs:
                Additional keyword arguments forwarded to [`~GlmAsrProcessor.apply_chat_template`] (for example
                `text_kwargs`, `audio_kwargs`, ...).

        Returns:
            [`BatchFeature`]: Processor outputs ready to be passed to [`GlmAsrForConditionalGeneration.generate`].

        c              3   <   K   | ]  }t        |t                y wrQ   rZ   )r]   els     r    r_   z>GlmAsrProcessor.apply_transcription_request.<locals>.<genexpr>   s     ?dXZ
2s@S?dr`   r   z)`audio` must contain at least one sample.z	Received z prompt(s) for z$ audio sample(s); counts must match.z'Each prompt must be a string or `None`.z<`prompt` must be a string, a sequence of strings, or `None`.userrT   )typepath)r   rT   rG   )r   rG   )rolecontentT)tokenizeadd_generation_promptreturn_dict)r[   r\   rj   rk   rl   r   r   r>   Tensordetachcpunumpyrm   ri   r)   rt   	TypeErrorzipapply_chat_template)r-   rT   r   rV   audio_itemsr   
batch_sizepromptsitemprompt_text
audio_itemconversationss               r    apply_transcription_requestz+GlmAsrProcessor.apply_transcription_request   s   2 eS!38'Ke}-%C?d^c?d<du+K1%89K!#kvwegJr5<<<Xryy{0668^``ww%
?HII>889JFG$h+Gu.6{j( F}OJ<Gkl  G O<NN4#D#DEc*NN4(#$MNNO Z[[ ,/w+D
 (Z # &j#6 ")*=&-
C!'=	 

 
 (t''
"&	

 
 	
S x4
s   -AG70-G<)strip_prefixc                     | j                   j                  |i |}|r|D cg c]  }| j                  |       }}|S c c}w )aj  
        Forward arguments to [`~PreTrainedTokenizer.decode`] and optionally remove the assistant framing the model
        was trained to produce.

        AF3 transcription requests respond with sentences such as `"The spoken content of the audio is "..."."`.
        Setting `strip_prefix=True` trims the fixed prefix for just the transcription text.
        )r/   decode"_strip_assistant_prefix_and_quotes)r-   r   argsrV   decodedrG   s         r    r   zGlmAsrProcessor.decode*  sK     ($..''88QXYt>>tDYGY Zs   ?c                 &     | j                   |i |S )z)BC as previous examples used batch_decode)r   )r-   r   rV   s      r    batch_decodezGlmAsrProcessor.batch_decode7  s    t{{D+F++r   c                 @   |j                         }dD ]1  }|j                  |      s|t        |      d j                         } n |j                  d      r|dd j                         }t        |      dk\  r%|d   |d   k(  r|d   dv r|dd j                         }|S )	zi
        Remove the assistant prefix and surrounding quotes from a decoded transcription string.
        )z"The spoken content of the audio isz!The transcription of the audio isz!The content of the input audio isN.r=   r5   r   >   "'r4   )strip
startswithrm   endswith)r-   rG   strippedprefixs       r    r   z2GlmAsrProcessor._strip_assistant_prefix_and_quotes;  s    
 ::<
 	F
 ""6*#CKM288:	 S!}**,Hx=A(1+""=(1+Q[B["~++-Hr   )Nz<|pad|>z&Please transcribe this audio into texti  )r6   torch.Tensorr1   r   )NFrQ   )r   r   r   __doc__r,   r;   rO   rS   r
   rj   r   boolr	   r   r   r   propertyr\   r   r   r   r   r   __classcell__)r0   s   @r    r#   r#   9   s'   8 %MT0 $(%*	]C$y/)]C D ]C d{	]C
 ./]C 
]C~ T49 T T *.O
T#Y+O
 d3i$&O
 ./	O

 
O
b */ ,s s r   r#   )rB   r   npaudio_utilsr   r   feature_extraction_utilsr   processing_utilsr   r   r	   tokenization_utils_baser
   utilsr   r   r>   
get_loggerr   rr   r   r#   __all__r   r   r    <module>r      sf   , 
  9 4 H H 0 0  
		H	%,E "Xn Xv 
r   