
    i                     &   d dl mZ d dlmZ d dlZd dlmZ d dlmZ ddl	m
Z
 ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4 ddl5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z;  e+jx                  e=      Z> e)d      e G d de                    Z?e e)d       G d de                    Z@ G d  d!ej                        ZB G d" d#ej                        ZC G d$ d%e3      ZD G d& d'e2      ZE G d( d)e6      ZF G d* d+e      ZGe) G d, d-e$             ZH G d. d/eH      ZI G d0 d1e7      ZJ G d2 d3e:      ZK e)d4       G d5 d6eHe             ZLg d7ZMy)8    )Callable)	dataclassN)strict   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)PreTrainedConfig)GenerationMixin)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)RopeParameters)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )GlmAttentionGlmRotaryEmbeddingapply_rotary_pos_emb)LlamaDecoderLayer
LlamaModeleager_attention_forward)WhisperModelshift_tokens_rightzUsefulSensors/moonshine-tiny)
checkpointc                       e Zd ZU dZdZdgZdddddZd	Zee	d
<   dZ
ee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZedz  e	d<   dZedz  e	d<   dZedz  e	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d <   dZeez  dz  e	d!<   dZee	d"<   d#Z ee	d$<   d%Z!eez  e	d&<   dZ"edz  e	d'<   d(Z#ee$e   z  dz  e	d)<   dZ%edz  e	d*<   dZ&ee	d+<    fd,Z' xZ(S )-MoonshineConfiga	  
    encoder_num_key_value_heads (`int`, *optional*):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `encoder_num_key_value_heads=encoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `encoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by meanpooling all the original heads within that group. For more details, check out [this
        paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
        `num_attention_heads`.
    decoder_num_key_value_heads (`int`, *optional*):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `decoder_num_key_value_heads=decoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `decoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by meanpooling all the original heads within that group. For more details, check out [this
        paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
        `decoder_num_attention_heads`.
    pad_head_dim_to_multiple_of (`int`, *optional*):
        Pad head dimension in encoder and decoder to the next multiple of this value. Necessary for using certain
        optimized attention implementations.
    encoder_hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder.
    decoder_hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the decoder.

    Example:

    ```python
    >>> from transformers import MoonshineModel, MoonshineConfig

    >>> # Initializing a Moonshine style configuration
    >>> configuration = MoonshineConfig().from_pretrained("UsefulSensors/moonshine-tiny")

    >>> # Initializing a model from the configuration
    >>> model = MoonshineModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```	moonshinepast_key_valuesdecoder_num_key_value_headsdecoder_num_attention_headsdecoder_num_hidden_layersdecoder_hidden_act)num_key_value_headsnum_attention_headsnum_hidden_layers
hidden_acti   
vocab_sizei   hidden_sizei  intermediate_size   encoder_num_hidden_layers   encoder_num_attention_headsNencoder_num_key_value_headspad_head_dim_to_multiple_ofgeluencoder_hidden_actsilui   max_position_embeddingsg{Gz?initializer_range   decoder_start_token_idT	use_cacherope_parametersis_encoder_decoderFattention_bias        attention_dropoutbos_token_idr!   eos_token_idpad_token_idtie_word_embeddingsc                     | j                   | j                  | _         | j                  | j                  | _        |j	                  dd       t        |   di | y )Npartial_rotary_factorg? )r>   r=   r/   r0   
setdefaultsuper__post_init__)selfkwargs	__class__s     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/moonshine/modular_moonshine.pyrV   zMoonshineConfig.__post_init__   sX    ++3/3/O/OD,++3/3/O/OD,137''    ))__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapr7   int__annotations__r8   r9   r;   r1   r=   r0   r>   r/   r?   rA   strr2   rC   rD   floatrF   rG   boolrH   r   dictrI   rJ   rL   rM   rN   listrO   rP   rV   __classcell__rY   s   @rZ   r,   r,   2   sg   &P J#4"5<<8*	M JK!s!%&s&%&s&'(('((.2t2.2t2.2t2$$$$#&S&#u#"#C#It48O^d*T18## ND %(us{( L#* +,L#S	/D(,#L#*# $$( (r[   r,   z
    Extends [~modeling_outputs.BaseModelOutput] to include the output attention mask since sequence length is not preserved in the model's forward.
    )custom_introc                   6    e Zd ZU dZej
                  dz  ed<   y)MoonshineEncoderModelOutputNattention_mask)r\   r]   r^   ro   torchTensorrd   rS   r[   rZ   rn   rn      s     +/NELL4'.r[   rn   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MoonshineEncoderMLPc                    t         |           || _        t        |   | _        t        j                  |j                  |j                        | _	        t        j                  |j                  |j                        | _
        y NrU   __init__configr   activation_fnnnLinearr8   r9   fc1fc2rW   rx   r6   rY   s      rZ   rw   zMoonshineEncoderMLP.__init__   s^    #J/99V//1I1IJ99V55v7I7IJr[   hidden_statesreturnc                 l    | j                  |      }| j                  |      }| j                  |      }|S ru   )r|   ry   r}   )rW   r   s     rZ   forwardzMoonshineEncoderMLP.forward   s4    /**=9/r[   r\   r]   r^   rw   rp   rq   r   rj   rk   s   @rZ   rs   rs      s$    KU\\ ell r[   rs   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MoonshineDecoderMLPc                    t         |           || _        t        |   | _        t        j                  |j                  |j                  dz        | _	        t        j                  |j                  |j                        | _
        y )Nr!   rv   r~   s      rZ   rw   zMoonshineDecoderMLP.__init__   sc    #J/99V//1I1IA1MN99V55v7I7IJr[   r   r   c                     | j                  |      }|j                  dd      \  }}| j                  |      |z  }| j                  |      }|S )Nr!   )dim)r|   chunkry   r}   )rW   r   gates      rZ   r   zMoonshineDecoderMLP.forward   sS    /+11!1<t**40=@/r[   r   rk   s   @rZ   r   r      s$    KU\\ ell r[   r   c                       e Zd Zy)MoonshineRotaryEmbeddingN)r\   r]   r^   rS   r[   rZ   r   r      s    r[   r   c                   H    e Zd Zdededededef
 fdZ	 	 	 	 ddej                  d	e	ej                  ej                  f   dz  d
ej                  dz  de
dz  dej                  dz  dee   de	ej                  ej                  dz  e	ej                     dz  f   fdZ xZS )MoonshineAttentionrx   	layer_idx	is_causalr4   r3   c                 n   |j                  ||d       t        | 	  ||       || _        t	        |d|j
                  |j                  z        | _        | j                  j                  C| j                  j                  }|| j                  |z   dz
  |z  z  }|| j                  z
  | _
        y d| _
        y )N)r4   r3   head_dimrE   r   )updaterU   rw   r   getattrr8   r4   r   rx   r?   head_dim_padding)	rW   rx   r   r   r4   r3   target_multipletarget_head_dimrY   s	           rZ   rw   zMoonshineAttention.__init__   s     	.AZmno+"
F4F4F&JdJd4de ;;22>"kkEEO-$--/2QTU2UZi1ijO$3dmm$CD!$%D!r[   Nr   position_embeddingsro   r.   key_value_statesrX   r   c                 :   |j                   d d \  }}| j                  |      j                  ||| j                  j                  | j
                        j                  dd      }	|d u}
|Y|j                  j                  | j                        }|
r&d|j                  | j                  <   |j                  }n|j                  }||n|}|
rK|rIrG|j                  | j                     j                  }|j                  | j                     j                  }n| j                  |      j                  |d| j                  j                  | j
                        j                  dd      }| j!                  |      j                  |d| j                  j                  | j
                        j                  dd      }|
r"| |j#                  ||| j                        \  }}|
s8|\  }}t%        |	|||      \  }	}| |j#                  ||| j                        \  }}t'        j(                  | j                  j*                  t,              }| j.                  xr |d u xr |dkD  }| j0                  dkD  rt2        j4                  j6                  j9                  |	d| j0                  f      }	t2        j4                  j6                  j9                  |d| j0                  f      }t2        j4                  j6                  j9                  |d| j0                  f      } || |	|||f| j:                  sdn| j<                  | j>                  |d|\  }}| j0                  dkD  r|dd | j0                   f   }|jA                  ||d      jC                         }| jE                  |      }||fS )	Nr   rE   r!   Tr   rK   )dropoutscalingr   .)#shapeq_projviewrx   r3   r   	transpose
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesk_projv_projr   r$   r   get_interface_attn_implementationr'   r   r   rp   rz   
functionalpadtrainingrL   r   reshape
contiguouso_proj)rW   r   r   ro   r.   r   rX   bszq_lenquery_statesis_cross_attentionr   current_states
key_statesvalue_statescossinattention_interfacer   attn_outputattn_weightss                        rZ   r   zMoonshineAttention.forward   s]    #(("-
U KK&++C8W8WY]YfYfgqqrsuvw 	 .T9&(3377GJ!=A**4>>:"1"G"G"1"F"F .>-I)}/j(//?DDJ*11$..AHHL N+c2t{{>>N1a  N+c2t{{>>N1a 
 "o&A+:+A+A*l\`\j\j+k(
L!*HC';L*VY[^'_$L**+:+A+A*l\`\j\j+k(
L(?(M(MKK,,.E)
 NNK~'=K%!)	  1$ 88..22<!TEZEZA[\L,,00aAVAV=WXJ 88..22<!TEZEZA[\L$7
%
  $}}C$2H2HLL
%
 
%
!\   1$%c+Cd.C.C-C+C&CDK!))#ub9DDFkk+.L((r[   )NNNN)r\   r]   r^   r,   rc   rg   rw   rp   rq   tupler   r   r   r   rj   rk   s   @rZ   r   r      s    && & 	&
 !& !&0 IM.2(,04O)||O) #5<<#=>EO) t+	O)
 O)  ,,-O) -.O) 
u||U\\D0%2E2LL	MO)r[   r   c                   (     e Zd Zdedef fdZ xZS )MoonshineEncoderLayerrx   r   c                 F   t         |   ||       t        ||d|j                  |j                        | _        t        ||j                        | _        t        j                  |j                  d      | _        t        j                  |j                  d      | _        y )NFrx   r   r   r4   r3   bias)rU   rw   r   r=   r>   	self_attnrs   rA   mlprz   	LayerNormr8   input_layernormpost_attention_layernormrW   rx   r   rY   s      rZ   rw   zMoonshineEncoderLayer.__init__   s    ++ & B B & B B
 'vv/H/HI!||F,>,>UK(*V5G5Ge(T%r[   )r\   r]   r^   r,   rc   rw   rj   rk   s   @rZ   r   r     s    U U3 U Ur[   r   c                       e Zd Zddededz  f fdZ	 	 	 	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  de	dz  de
dz  deej                  ej                  f   dz  deej                  ej                  f   dz  dee   deej                  eej                  ej                  f   dz  f   fdZ xZS )MoonshineDecoderLayerNrx   r   c                    t         |           |j                  | _        t        ||d|j                  |j
                        | _        t        ||d|j                  |j
                        | _        t        ||j                        | _
        t        j                  |j                  d      | _        t        j                  |j                  d      | _        t        j                  |j                  d      | _        y )NTr   Fr   )rU   rw   r8   r   r4   r3   r   encoder_attnr   r6   r   rz   r   r   r   final_layernormr   s      rZ   rw   zMoonshineDecoderLayer.__init__1  s    !--+ & : : & : :
 / & : : & : :
 'vv/@/@A!||F,>,>UK(*V5G5Ge(T%!||F,>,>UKr[   r   ro   encoder_hidden_statesencoder_attention_maskposition_idsencoder_position_idsr.   rG   r   encoder_position_embeddingsrX   r   c           
      &   |}| j                  |      } | j                  d||||||	d|\  }}||z   }|1|}| j                  |      }| j                  |||||      \  }}||z   }|}| j	                  |      }| j                  |      }||z   }|S )N)r   ro   r   r.   rG   r   )r   r   ro   r.   rG   rS   )r   r   r   r   r   r   )rW   r   ro   r   r   r   r   r.   rG   r   r   rX   residual_s                 rZ   r   zMoonshineDecoderLayer.forwardI  s     !,,];)4>> 
')%+ 3
 
q !=0 ,$H 99-HM#00+!65 /#  1  M1 %}4M ,,];/ =0r[   ru   )	NNNNNNFNN)r\   r]   r^   r,   rc   rw   rp   rq   
LongTensorr   rg   r   r   r   FloatTensorr   rj   rk   s   @rZ   r   r   0  sR   L L3: L6 /3596:048<(,!&HLPT,||, t+,  %||d2	,
 !&t 3, &&-, $..5, , $;, #5<<#=>E, &+5<<+E%F%M, +,, 
u  %(9(95;L;L(L"MPT"TT	U,r[   r   c                   \    e Zd ZU eed<   dZdZdZdZddgZ	dZ
dZdZdej                  fd	Zy
)MoonshinePreTrainedModelrx   modelinput_valuesaudioTr   r   input_lengthsc                 ~    t        |dz
  dz  dz         }t        |dz
  dz  dz         }t        |dz
  dz  dz         }|S )zH
        Computes the output length of the convolutional layers
           @   rE      r   r!   )rc   )rW   r   output_conv1_lengthoutput_conv2_lengthoutput_conv3_lengths        rZ    _get_feat_extract_output_lengthsz9MoonshinePreTrainedModel._get_feat_extract_output_lengths  sZ     "=3#6""<q"@A!#6#:a"?!"CD!#6#:a"?!"CD""r[   N)r\   r]   r^   r,   rd   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_can_compile_fullgraphrp   r   r   rS   r[   rZ   r   r   x  sN    $O&*#02IJN!#e>N>N #r[   r   c                        e Zd ZdZdZeedZdef fdZ	de
j                  fdZde
j                  fd	Zee	 ddej"                  dej$                  d
z  dee   deez  fd              Z xZS )MoonshineEncoderz
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    r   )
attentionsr   rx   c           	      b   t         |   |       || _        |j                  }t	        j
                  d|ddd      | _        t	        j
                  |d|z  dd	      | _        t	        j
                  d|z  |dd	      | _        t	        j                  d|d
      | _
        t	        j                  t        |j                        D cg c]  }t        ||       c}      | _        t	        j                   |d      | _        t%        |      | _        d| _        | j+                          y c c}w )NrE   r   r   F)kernel_sizestrider   r!   r   r   )r   r   gh㈵>)
num_groupsnum_channelsepsr   rx   )rU   rw   rx   r8   rz   Conv1dconv1conv2conv3	GroupNorm	groupnorm
ModuleListranger;   r   r   r   
layer_normr   
rotary_embgradient_checkpointing	post_init)rW   rx   	embed_dimidxrY   s       rZ   rw   zMoonshineEncoder.__init__  s     &&	YYq)ReT
YYy!i-QqQ
YYq9}iQqQ
PTUmm;@AaAa;bcC"63/c
 ,,yu=2&A&+# ds   D,r   c                     | j                   S ru   r   rW   s    rZ   get_input_embeddingsz%MoonshineEncoder.get_input_embeddings  s    zzr[   valuec                     || _         y ru   r
  )rW   r  s     rZ   set_input_embeddingsz%MoonshineEncoder.set_input_embeddings  s	    
r[   Nro   rX   c                 h   |j                  d      }t        j                  j                  | j	                  |            }| j                  |      }t        j                  j                  | j                  |            }t        j                  j                  | j                  |            }|j                  ddd      }d}|3| j                  |j                  d         }d}|ddd|f   dd|f   }|}t        | j                  |||      }t        j                  d|j                  d   |j                   	      j                  d      }| j#                  ||
      }	| j$                  D ]  }
 |
|f|||	d|} | j'                  |      }t)        |||j+                               S d      S )a.  
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
                `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
                the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        rE   r   r!   Nr   i  .rx   inputs_embedsro   r   devicer   )ro   r   r   )last_hidden_statero   )	unsqueezerz   r   tanhr   r   r@   r   r   permuter   r   r   rx   rp   aranger  r  r   r  rn   rc   )rW   r   ro   rX   r   output_attention_maskmask_lendownsample_strider   r   encoder_layers              rZ   r   zMoonshineEncoder.forward  s   . $--a0**4::l+CD}5**4::m+DE**4::m+DE%--aA6 !%%<<^=Q=QRT=UVH *+C1D3D1D,DEc9H9nUN$2!2;;')"/	
 ||A}':':1'=mFZFZ[eefgh"oom,oW![[ 	M)-)$7	
 M	 6*+:O:[0446
 	
ae
 	
r[   ru   )r\   r]   r^   r_   r   r   r   _can_record_outputsr,   rw   rz   Moduler  r  r   r    rp   r   rq   r   r   r   r   r   rj   rk   s   @rZ   r   r     s     %O(.
 $bii "))    /3<
''<
 t+<
 +,	<

 
(	(<
   <
r[   r   c                   b    e Zd ZdZ eedd      e eedd      dZdef fdZ	e
e	 	 	 	 	 	 	 	 ddej                  d	z  d
ej                  d	z  dej                  d	z  ded	z  dej                   d	z  ded	z  dej                   d	z  dej                  d	z  dee   deez  fd              Z xZS )MoonshineDecoder	input_idsrE   r   )index
layer_namer   )r   r   cross_attentionsrx   c           	         t         |   |       t        j                  |j                  d      | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _
        y c c}w NFr   )rU   rw   rz   r   r8   normr  r  r5   r   r   )rW   rx   r  rY   s      rZ   rw   zMoonshineDecoder.__init__   s[     LL!3!3%@	mmSXY_YqYqSr$sC%:63%G$st$ss   A=Nro   r   r.   r  rG   r   r   rX   r   c	           
         |du |duz  rt        d      || j                  |      }|r6|4t        t        | j                        t        | j                              }|V||j                         nd}
t        j                  |j                  d   |j                        |
z   }|j                  d      }t        | j                  ||||      }t        | j                  |||      }|}| j                  ||	      }| j                  D ]  } ||||f|||||d
|	} | j                  |      }t!        ||r|      S d      S )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        Nz:You must specify exactly one of input_ids or inputs_embedsr   r   rE   r  )rx   r  ro   r.   r   r  r  )r   r   r.   rG   r   )r  r.   )
ValueErrorembed_tokensr
   r	   rx   get_seq_lengthrp   r  r   r  r  r   r   r  r   r)  r   )rW   r#  ro   r   r.   r  rG   r   r   rX   past_seen_tokenscausal_maskr   r   decoder_layers                  rZ   r   zMoonshineDecoder.forward  s   0 -t";<YZZ  --i8M01,dkk2RT`hlhshsTtuOCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 ";;;'1"7	"
 &"oom,oW![[ 	M)%
 (>) /#$7
 
M	 		-08+/8O
 	
>B
 	
r[   )NNNNNNNN)r\   r]   r^   r   r   r   r   r  r,   rw   r   r    rp   r   rq   r   r   rg   r   r   r   r   r   rj   rk   s   @rZ   r"  r"    s2   !O$%7q[Y.*+=QSabu u
   .2.204(,26!%:>6:G
##d*G
 t+G
 &&-	G

 G
 ((4/G
 $;G
  %0047G
 !&t 3G
 +,G
 
(	(G
   G
r[   r"  c                   N   e Zd Zd Zee	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  de	e	ej                        dz  de
dz  d	e	ej                     dz  d
e	ej                     dz  dedz  dee   defd              Zy)MoonshineModelc                     t        d      )NzNot needed for Moonshine)AttributeErrorr  s    rZ   _mask_input_featuresz#MoonshineModel._mask_input_featuresR  s    788r[   Nr   ro   decoder_input_idsdecoder_attention_maskencoder_outputsr.   decoder_inputs_embedsdecoder_position_idsrG   rX   r   c
                 T   | | j                   |fd|i|
} | j                  d|||j                  |j                  ||||	d|
}t	        |j                  |j
                  |j                  |j                  |j                  |j                  |j                  |j                        S )a
  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        ro   )r#  ro   r   r   r.   r  r   rG   )r  r.   decoder_hidden_statesdecoder_attentionsr&  encoder_last_hidden_stater   encoder_attentionsrS   )	encoderdecoderr  ro   r   r.   r   r   r&  )rW   r   ro   r6  r7  r8  r.   r9  r:  rG   rX   decoder_outputss               rZ   r   zMoonshineModel.forwardU  s    Z "/;t||L/rYg/rkq/rOEQT\\ 
F
'1"1"C"C#2#A#A+/-
F
 
F
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r[   )	NNNNNNNNN)r\   r]   r^   r5  r   r   rp   r   r   r   r
   rg   r   r   r   r   rS   r[   rZ   r2  r2  Q  s)   9  262659:>BF6:AE?C!%C
''$.C
 ((4/C
 !++d2	C

 !& 0 04 7C
 uU%6%6784?C
 -t3C
  %U%6%67$>C
 $E$4$45<C
 $;C
 +,C
 
C
  C
r[   r2  zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    c                       e Zd ZddiZdef fdZd Zd Zdej                  fdZ
ee	 	 	 	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  dej                  d	z  deeej                        d	z  ded	z  deej                     d	z  deej                     d	z  ded	z  dej                  d	z  dee   defd              Z xZS )!MoonshineForConditionalGenerationzproj_out.weightz!model.decoder.embed_tokens.weightrx   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y r(  )
rU   rw   r2  r   rz   r{   r8   r7   proj_outr  )rW   rx   rY   s     rZ   rw   z*MoonshineForConditionalGeneration.__init__  sH     #F+
		&"4"4f6G6GeT 	r[   c                     | j                   S ru   rF  r  s    rZ   get_output_embeddingsz7MoonshineForConditionalGeneration.get_output_embeddings  s    }}r[   c                     || _         y ru   rH  )rW   new_embeddingss     rZ   set_output_embeddingsz7MoonshineForConditionalGeneration.set_output_embeddings  s	    &r[   r   c                 6    | j                   j                         S ru   )r   r  r  s    rZ   r  z6MoonshineForConditionalGeneration.get_input_embeddings  s    zz..00r[   Nr   ro   r6  r7  r8  r.   r9  r:  rG   labelsrX   c                    |
9|7|5t        |
| j                  j                  | j                  j                        } | j                  |f||||||||	d|}| j                  |j                        }d}|
(| j                  ||
| j                  j                        }t        |||j                  |j                  |j                  |j                  |j                  |j                  |j                   	      S )a0  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```N)ro   r6  r8  r7  r.   r9  r:  rG   )logitsrN  r7   )	lossrP  r.   r<  r=  r&  r>  r   r?  )r)   rx   rO   rF   r   rF  r  loss_functionr7   r   r.   r<  r=  r&  r>  r   r?  )rW   r   ro   r6  r7  r8  r.   r9  r:  rG   rN  rX   outputsrP  rQ  s                  rZ   r   z)MoonshineForConditionalGeneration.forward  s   d  (-B-J$6DKK44dkk6X6X%! '1djj'
)/+#9+"7!5'
 '
 w889%%VFt{{OeOe%fD#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r[   )
NNNNNNNNNN)r\   r]   r^   _tied_weights_keysr,   rw   rI  rL  rz   r   r  r   r   rp   r   r   r   r
   rg   r   r   r   r   rj   rk   s   @rZ   rD  rD    sr    ,-PQ '1bii 1  262659:>BF6:AE?C!%*.R
''$.R
 ((4/R
 !++d2	R

 !& 0 04 7R
 uU%6%6784?R
 -t3R
  %U%6%67$>R
 $E$4$45<R
 $;R
   4'R
 +,R
 
R
  R
r[   rD  )r,   r2  r   rD  )Ncollections.abcr   dataclassesr   rp   torch.nnrz   huggingface_hub.dataclassesr   activationsr   cache_utilsr   r	   r
   configuration_utilsr   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   r    glm.modeling_glmr"   r#   r$   llama.modeling_llamar%   r&   r'   whisper.modeling_whisperr(   r)   
get_loggerr\   loggerr,   rn   r   rs   r   r   r   r   r   r   r   r"  r2  rD  __all__rS   r[   rZ   <module>rm     s   % !   . ! C C 3 ) J B 9  2 F & R R 7 E U U Y Y G 
		H	% 9:S(& S(  ;S(l 
// / /")) "))  	1 	e) e)PU- U"E6 EP # # #0d
/ d
NV
z V
rI
\ I
X 
h
(@/ h

h
Vr[   