
    i3                        d Z ddlZddlmZ ddlZddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/  e'j`                  e1      Z2 G d dejf                        Z4dejj                  de6dejj                  fdZ7	 d<dejf                  dejj                  dejj                  d ejj                  d!ejj                  dz  d"e8d#e8fd$Z9 G d% d&ejf                        Z: G d' d(ejf                        Z; G d) d*ejf                        Z< G d+ d,ejf                        Z= G d- d.e      Z> G d/ d0e      Z?e% G d1 d2e              Z@e% G d3 d4e@             ZA G d5 d6e@e      ZB e%d78       G d9 d:e@             ZCg d;ZDy)=zPyTorch Zamba model.    N)Callable)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)lazy_load_kernel)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast SequenceClassifierOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)resolve_internal_import)capture_outputs   )ZambaConfigc                   h     e Zd Zddeddf fdZdej                  dej                  fdZd Z xZ	S )	ZambaRMSNormepsreturnNc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z;
        ZambaRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer"   	__class__s      y/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/zamba/modeling_zamba.pyr&   zZambaRMSNorm.__init__2   s1     	ll5::k#:; #    hidden_statesc                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )N   T)keepdim)	dtypetor(   float32powmeanrsqrtr+   r*   )r,   r1   input_dtypevariances       r/   forwardzZambaRMSNorm.forward:   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r0   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler*   shaper+   )r,   s    r/   
extra_reprzZambaRMSNorm.extra_reprA   s*    ))*+6$2G2G1HIIr0   )gư>)
__name__
__module____qualname__floatr&   r(   Tensorr>   rB   __classcell__r.   s   @r/   r!   r!   1   s7    $ $$ $;U\\ ;ell ;Jr0   r!   r1   n_repr#   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rA   expandreshape)r1   rJ   batchnum_key_value_headsslenhead_dims         r/   	repeat_kvrR   F   so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr0   modulequerykeyvalueattention_maskscalingdropoutc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
||
|z   }
t
        j                  j                  |
dt        j                        j                  |j                        }
t
        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr3   r   r4   )dimr6   )ptrainingr   )rR   num_key_value_groupsr(   matmul	transposer   
functionalsoftmaxr8   r7   r6   rY   r]   
contiguous)rS   rT   rU   rV   rW   rX   rY   kwargs
key_statesvalue_statesattn_weightsattn_outputs               r/   eager_attention_forwardri   R   s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r0   c                        e Zd ZdZdedef fdZ	 ddej                  dedej                  dz  de	dz  d	e
e   d
eej                  ej                  dz  eej                     dz  f   fdZ xZS )ZambaAttentionaA  
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://huggingface.co/papers/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    config	layer_idxc                 .   t         |           || _        || _        |j                  | _        |j
                  | _        |j                  |j                  z  | _	        |j                  | _
        | j                  dz  dz  | _        d| _        |j                  | _        t        j                  |j                  |j                  | j                  z  d      | _        t        j                  |j                  |j                  | j                  z  d      | _        t        j                  |j                  |j                  | j                  z  d      | _        t        j                  |j                  | j                  z  |j&                  d      | _        y )Nr3         TFbias)r%   r&   rl   rm   attention_hidden_sizeattention_head_dimrQ   num_attention_headsrO   r^   max_position_embeddingsrX   	is_causalattention_dropoutr   Linearq_projk_projv_projr-   o_projr,   rl   rm   r.   s      r/   r&   zZambaAttention.__init__y   s9   "%+%A%A"11$*$>$>&B\B\$\!'-'E'E$)d2!'!9!9ii < <f>X>X[_[h[h>hotuii < <f>X>X[_[h[h>hotuii < <f>X>X[_[h[h>hotuii : :T]] JFL^L^ejkr0   Nr1   rW   past_key_valuesrd   r#   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
||j                  |	|
|      \  }	}
t        j                  | j                  j                  t              } || ||	|
|f| j                  sdn| j                  | j                  d|\  }} |j                   g |d j#                         }| j%                  |      }||fS )Nr4   r   r3           )rY   rX   )rA   rQ   ry   viewr`   rz   r{   updater   get_interfacerl   _attn_implementationri   r]   rw   rX   rM   rc   r|   )r,   r1   rm   rW   r~   rd   input_shapehidden_shapequery_statesre   rf   attention_interfacerh   rg   s                 r/   r>   zZambaAttention.forward   sk    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&'6'='=j,Xa'b$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r0   N)rC   rD   rE   __doc__r   intr&   r(   rG   r   r   r   r@   r>   rH   rI   s   @r/   rk   rk   k   s    l{ ls l. )-#)||#) #) t+	#)
 #) +,#) 
u||U\\D0%2E2LL	M#)r0   rk   c                   ~     e Zd ZdZdef fdZ	 d
dej                  dedz  fdZ	d
dedz  fdZ
d
dedz  fd	Z xZS )ZambaMambaMixeruE  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    This module differs from `transformers.models.mamba.modeling_mamba.MambaMixer` in two ways:
    - Added multi-head: the output of `self.in_proj` is split into `self.n_mamba_heads` heads, and each head
    undergoes an independent forward pass, identical to the original `MambaMixer`, up until the pre-activations of
    `self.out_proj`. The pre-activations, coming from different mamba heads, are then concatenated and fed into `self.out_proj`.
    rl   c           	      	   t         |           || _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  |j                  z  | _
        |j                  | _        |j                  | _        | j                  | j                  z  | _        |j                  | _        |j"                  | _        t'        j(                  | j                  | j                  | j                   | j                  | j                  | j                  dz
        | _        |j,                  | _        t0        |j,                     | _        |j4                  | _        t'        j8                  | j                  | j                  dz  | j$                        | _        t'        j<                  t?        j@                  | j                  | j                  | j                  dz  z   | j                              | _!        t'        j<                  t?        j@                  | j                  | j                  | j                        dz
  dz  | j                  dz  z        | _"        t'        j<                  t?        j@                  | j                  | j                              | _#        t?        jH                  d| j                  dz   t>        jJ                        d d d f   }|jM                  | j                  d      jO                         }t'        j<                  t?        jP                  |      jS                  | j                  | j                  d            | _*        t'        j<                  t?        jV                  | j                  | j                              | _,        t'        j8                  | j                  | j                  | j$                        | _-        t]        d      a/ta        t^        d	d       a1ta        t^        d
d       a2t]        d      a3ti        tf        d      a5ta        tf        dd       a6ta        tf        dd       a7tq        tj        tl        td        tb        tn        f      a9tr        stt        jw                  d       y y )Nr   )in_channelsout_channelsrq   kernel_sizegroupspaddingr3   rp   g      ?r6   r4   zcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathselective_scan_fnmamba_inner_fnaq  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config)<r%   r&   rl   rm   r-   mamba_d_statessm_state_sizemamba_d_convconv_kernel_sizemamba_expandintermediate_sizemamba_dt_ranktime_step_rankn_mamba_headsmamba_head_dimmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr   Conv1dconv1dhidden_mamba_act
activationr
   actuse_mamba_kernelsuse_fast_kernelsrx   in_projr'   r(   zerosx_proj_weightdt_proj_weightdt_proj_biasaranger8   rL   rc   logrM   A_logr)   Dout_projr   causal_conv1dgetattrr   r   	mamba_ssmr   selective_state_updater   r   allis_fast_path_availableloggerwarning_once)r,   rl   rm   Ar.   s       r/   r&   zZambaMambaMixer.__init__   s   "!--$22 & 3 3!'!4!4v7I7I!I$22#11"448J8JJ#33..ii..//##--))))A-
 !11&112 & 8 8 yy!1!143I3IA3MTXTaTab  \\KK""##d&9&9A&==##
 !ll[[++T-@-@$BUBUVY\\!!3&'

 LLT5G5GI\I\)]^ LLD//!35==I$PQ'RHHT++R0;;=\\%))A,"6"6t7I7I4K^K^`b"cd
ejj););T=P=PQR		$"8"8$:J:JQUQ^Q^_ )9&}6LdS"=2DdK %[1	!8$^"
 $I/BDI ,<dC "%#%68HJ^`no"
 &^ &r0   Nr1   cache_paramsc                    |j                   \  }}}|d uxr |j                  xr |dk(  }| j                  |      j                  dd      }|j	                  |dd|      j                  dd      \  }}	|j                  d      j                         }|	j                  d      }	|	j                  || j                  d|      j                  dd      }	| j                  j                  j	                  | j                  j                  j                  d      | j                  j                  j                  d            }
|rot        |j                  d      |j                  | j                     j                   |
| j                  j"                  | j$                        }|j'                  d      }n|,t)        j*                  |dk(        s||j'                  d      z  }|Xt,        j.                  j1                  || j2                  |j                   d   z
  df      }|j5                  || j                        }t7        ||
| j                  j"                  | j$                        }|,t)        j*                  |dk(        s||j'                  d      z  }|j                  d| j                  | j8                  |      j                  dd      }| j:                  d d d d d d d f   |z  j                  dd      }t)        j<                  || j>                  | j@                  | j@                  gd      \  }}}| jB                  d d d f   |j                  dd      z  }t)        jD                  | jF                  jI                                }| jJ                  | jJ                  jI                         nd }t)        jL                  |d|f|jN                  |jP                        }|rtS        | j                        D ]  }tU        |j                  | j                     jV                  d d |f   ||d	df   ||d	df   ||   ||d d df   ||d d df   | jX                  |   |	|d	df   ||   d

      j'                  d      }t)        jZ                  ||fd      } n5t)        jL                  |d| j8                  | j@                  f|jN                  |jP                        }tS        | j                        D ]  }t]        ||   ||   ||   ||   j                  dd      ||   j                  dd      | jX                  |   jI                         |	|   ||   d
d

      \  }}t)        jZ                  ||fd      j                         }t)        jZ                  ||j'                  d      fd      } |||j_                  || j                         | ja                  |j                  dd            }|S )Nr   r3   r4   r[   r   )r   devicer6   .T)dt_softplus)delta_softplusreturn_last_state)1rA   has_previous_stater   r`   r   chunksqueezerc   rM   r   r   r*   sizer   layersrm   conv_statesrq   r   	unsqueezer(   r   r   ra   padr   update_conv_stater   r   r   splitr   r   r   expr   rF   r   emptyr   r6   ranger   recurrent_statesr   catr   update_recurrent_stater   )r,   r1   r   rW   
batch_sizeseq_len_use_precomputed_statesprojected_statesgateconv_weightsr   ssm_parameters	time_stepBCdiscrete_time_stepr   time_proj_biasscan_outputsnscan_outputs_	ssm_state
ssm_state_contextualized_statess                            r/   cuda_kernels_forwardz$ZambaMambaMixer.cuda_kernels_forward  s    "/!4!4
GQ!-T!9!nl>]>]!nbimnbn  <<6@@AF.33JAwOUUVW]^U_t%--a0;;=||A||J(:(:BHRRSTVWX {{))..t{{/A/A/F/Fq/I4;;K]K]KbKbcdKef!0%%b)##DNN3??  M *33B7M)%))Na<O2P -0H0H0K K' mm//@U@UXeXkXklnXo@oqr?st*<<[$..Y,]L$++JZJZgkgvgvwM)%))Na<O2P -0H0H0K K
 &--b$2D2DdFYFY[bcmmnoqrs,,Qa];mKVVWY[]^++T00$2E2EtGZGZ[ac
	1a "00D9I<O<OPRTV<WWYYtzz'')** 7;6G6G6S**002Y]{{J7#;MDXDX`m`s`st!4--. O 6 ''7HHAN!!S!),&q#qy1aDaAgJaAgJFF1ICO"1% $! )B-   %yy,)FANO  Q 3 3T5H5HI$++#))I
 4--. S,=!!$&q)aDaDNN1a(aDNN1a(FF1IOO%G"1%#'&*-)z  %yy,)FANYY[!IIy*2F2Fq2I&JPQR	S $)A33It~~N !%l.D.DQ.J K$$r0   c           
         |j                   \  }}}|j                  }| j                  |      j                  dd      }|j	                  |dd|      j                  dd      \  }	}
|	j                  d      j                         }	|
j                  d      }
|
j                  || j                  d|      j                  dd      }
|M|j                  | j                        r2|j                  | j                     j                  j                         }nDt        j                   || j                  | j"                  | j$                  f|	j&                  |      }||j                  | j                        r|dk(  r|j)                  |	| j                        }t        j*                  || j,                  j.                  d d dd d f   z  d      }	| j0                  r|	| j,                  j2                  z  }	| j5                  |	      j7                  |      j9                  d      }	n-|+|	|d d |	j                   d    d f   j9                  d      z  }	t:        j<                  j?                  |	| j@                  |	j                   d   z
  df      }|j)                  || j                        }| j5                  | j-                  |	      dd |f         }	||	|d d |	j                   d    d f   j9                  d      z  }	nS||	|j9                  d      z  }	| j5                  | j-                  |	      dd |f         }	||	|j9                  d      z  }	|	j                  d| j                  | j"                  |      j                  dd      }	| jB                  d d d d d d d f   |	z  j                  dd      }t        jD                  || jF                  | j$                  | j$                  gd      \  }}}| jH                  d d d f   |j                  dd      z  | jJ                  d d d d d d f   z   }t:        j<                  jM                  |      }t        jN                  | jP                  jS                                }t        jN                  |d d d d d d d d f   |d d d d d d d d d f   z        }|d d d d d d d d d f   |d d d d d d d d d f   jS                         z  }||	d d d d d d d d d f   jS                         z  }g }tU        |      D ]  }|d d d d d d |d d f   j                  dd      |z  |d d d d d d |d d f   j                  dd      z   }t        jV                  |j                  dd      j7                  |      |d d d d |d d f   j9                  d            }|jY                  |d d d d d d df           t        jZ                  |d      }||	| j\                  d d d d d d f   z  z   }|| j5                  |
      z  }||j_                  || j                         | ja                  |j                  dd      j                  |d|      j                  dd            }|S )	Nr   r3   r4   r   r   r   .r   )1rA   r6   r   r`   r   r   r   rc   rM   r   r   rm   r   r   cloner(   r   r   r   r   r   sumr   r*   r   rq   r   r7   r   r   ra   r   r   r   r   r   r   r   softplusr   r   rF   r   r_   appendstackr   r   r   )r,   input_statesr   rW   r   r   r   r6   r   r1   r   r   
conv_stater   r   r   r   r   r   
discrete_A
discrete_BdeltaB_ur   iscan_outputr   s                             r/   slow_forwardzZambaMambaMixer.slow_forwardn  s   !-!3!3
GQ""<<5??1E.33JAwOUUVW]^U_t%--a0;;=||A||J(:(:BHRRSTVWX#(G(G(W$++DNN;LLRRTIT//1D1DdFYFYZ$++I #..t~~>7a<);;M4>>Z
 %		*t{{7I7I!QPQ'7R*RXZ [%%!T[[%5%55M $ 7 : :5 A K KB O!-$1N1}GZGZ[]G^F^F`C`4a4k4klm4n$nM]]..}t?T?TWdWjWjkmWn?npq>rs
);;JW
 $])CC'M)R S!-$1N1}GZGZ[]G^F^F`C`4a4k4klm4n$nM) -0H0H0K K HHT[[%?XgX%NOM) -0H0H0K K &--b$2D2DdFYFY[bcmmnoqrs,,Qa];mKVVWY[]^++T00$2E2EtGZGZ[ac
	1a #11!T':Y=P=PQSUW=XX\`\m\mtQ]
 
  ]]334FG YYtzz'')**YYqD!T1!458J1aQRTUW[K[8\\]
'1aD(89AaD!Q>N<O<U<U<WW
aAq$.> ? E E GGw 	9A"1aAq=1;;AqAIMPXYZ\]_`bcefYfPgPqPqrsuvPwwI,,y':':1a'@'C'CE'JAaQRTUWXjMLcLcdfLghKAq!QJ 78	9 kk,B7!]TVVAtQ<L5M%MN!DHHTN2#//	4>>J !%!!!Q'//
BHRRSTVWX!
 %$r0   c                    t        t        t        t        t        t
        f      }| j                  rC|r"d| j                  j                  j                  vrt        d      | j                  |||      S | j                  |||      S )NcudazFast Mamba kernels are not available. Make sure to they are installed and that the mamba module is on a CUDA device. lease run 'pip install causal-conv1d>=1.2.0' and 'pip install mamba-ssm', or set use_mamba_kernels=False in the model's config.)rW   )r   r   r   r   r   r   r   r   r   type
ValueErrorr   r   )r,   r1   r   rW   rd   r   s         r/   r>   zZambaMambaMixer.forward  s    !$#%68HJ^`no"
   )V4;M;M;T;T;Y;Y-Y i 
 ,,]LYg,hh  ^ \\r0   )NN)rC   rD   rE   r   r   r&   r(   rG   r   r   r   r>   rH   rI   s   @r/   r   r      sd    
M{ M` ^b_%"\\_%9>_%BP%ut| P%d]54< ]r0   r   c                   $     e Zd Z fdZd Z xZS )ZambaMLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        y NFrp   )r%   r&   rl   r-   r   r   rx   	gate_projup_proj	down_projr
   
hidden_actact_fnr,   rl   r.   s     r/   r&   zZambaMLP.__init__  s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../r0   c                     | j                  | j                  | j                  |            | j                  |      z        }|S r   )r  r	  r  r  )r,   xr  s      r/   r>   zZambaMLP.forward  s6    NN4;;t~~a/@#ADLLQRO#ST	r0   )rC   rD   rE   r&   r>   rH   rI   s   @r/   r  r    s    0r0   r  c                   
    e Zd Zddededz  f fdZ	 	 	 ddej                  dej                  dedej                  dz  dedz  d	e	dz  d
e
e   deej                  eej                  ej                  f   dz  f   fdZ xZS )ZambaAttentionDecoderLayerNrl   rm   c                     t         |           t        ||      | _        t	        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _        y )Nr"   )r%   r&   rk   	self_attnr  feed_forwardr!   rr   rms_norm_epsinput_layernormr-   pre_ff_layernormr}   s      r/   r&   z#ZambaAttentionDecoderLayer.__init__  s_    '	:$V,+F,H,HfNaNab ,V-?-?VEXEX Yr0   r1   original_hidden_statesrW   r~   	use_cacherd   r#   c           	          t        j                  ||gd      }| j                  |      } | j                  d|||||d|\  }}| j	                  |      }| j                  |      }|S )ac  
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://huggingface.co/papers/2405.16712).
            layer_idx (`int`): layer_idx in the forward pass. Used to distinguish Zamba's tied transformer layers.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        r4   r   )r1   rm   rW   r~   r   )r(   concatenater  r  r  r  )	r,   r1   r  rm   rW   r~   r  rd   r   s	            r/   r>   z"ZambaAttentionDecoderLayer.forward  s    2 ))=:P*QWYZ,,];)4>> 
')+
 
q --m<))-8r0   r   )NNF)rC   rD   rE   r   r   r&   r(   rG   r   boolr   r   r@   FloatTensorr>   rH   rI   s   @r/   r  r    s    Z{ ZsTz Z /3(,!&'||' !&' 	'
 t+' ' $;' +,' 
u  %(9(95;L;L(L"MPT"TT	U'r0   r  c                   r    e Zd Zdedef fdZ	 	 	 	 	 	 	 	 ddej                  dej                  dz  dedz  dej                  dz  dej                  dz  d	edz  d
e	dz  dej                  dz  dej                  dz  dee   deej                  eej                  ej                  f   dz  f   fdZ xZS )ZambaMambaDecoderLayerrl   rm   c                     t         |           t        ||      | _        t	        |j
                  |j                        | _        || _        y )N)rl   rm   r  )	r%   r&   r   mambar!   r-   r  r  rm   r}   s      r/   r&   zZambaMambaDecoderLayer.__init__  s>    $FiH
+F,>,>FDWDWX"r0   Nr1   r  rW   causal_maskr~   r  position_idstransformer_hidden_statesrd   r#   c
                 t    |}|	||	z   n|}| j                  |      } | j                  d|||d|
}||z   }|S )aX  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        )r1   r   rW   r  )r  r   )r,   r1   r  rm   rW   r!  r~   r  r"  r#  rd   residuals               r/   r>   zZambaMambaDecoderLayer.forward  sn    0 !
 :S9^M55dq 	 ,,];"

 
'()
 	
 !=0r0   )NNNNNFNN)rC   rD   rE   r   r   r&   r(   rG   r   r  
LongTensorr   r   r@   r  r>   rH   rI   s   @r/   r  r    s   #{ #s # 7; $.2+/(,!&049=*||* !&t 3* :	*
 t+* \\D(* * $;* &&-* $)<<$#6* +,* 
u  %(9(95;L;L(L"MPT"TT	U*r0   r  c                   J    e Zd Zdedej
                  def fdZ	 	 	 	 	 	 ddej                  dej                  dz  de
dz  d	ej                  dz  d
ej                  dz  dedz  dedz  dee   deej                   eej                   ej                   f   dz  f   fdZ xZS )ZambaHybridLayershared_transflinearr   c                 L    t         |           || _        || _        || _        y r   )r%   r&   r)  r*  mamba_decoder)r,   r)  r*  r   r.   s       r/   r&   zZambaHybridLayer.__init__I  s%    *"r0   Nr1   r  rm   rW   r!  r~   r  rd   r#   c           	           | j                   |f|||||d|}	| j                  |	      }	 | j                  |f|	|||d|}|S )aF  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
            hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        )r  rm   rW   r~   r  )r#  rW   r~   r  )r)  r*  r,  )
r,   r1   r  rm   rW   r!  r~   r  rd   r#  s
             r/   r>   zZambaHybridLayer.forwardO  s    2 %7D$6$6%
#9&+%
 %
! %)KK0I$J!***
&?)+
 
 r0   )NNNNNF)rC   rD   rE   r  r   rx   r  r&   r(   rG   r   r   r  r   r   r@   r  r>   rH   rI   s   @r/   r(  r(  H  s    #&@ #")) #\r # 7; $.2+/(,!&-||- !&t 3- :	-
 t+- \\D(- - $;- +,- 
u  %(9(95;L;L(L"MPT"TT	U-r0   r(  c                   |     e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZeedZ ej                           fd	       Z xZS )
ZambaPreTrainedModelrl   modelTr(  r  r~   F)r1   
attentionsc                    | j                   j                  }t        |   |       t	        |t
              rt        j                  |j                  d|       | j                   j                  dz  }t        j                  |j                  | |       | j                   j                  | j                   j                  z  | j                   j                  z  }t        j                   t        j"                  | j                   j                  |      t%        j&                  | j                   j(                        t%        j&                  | j                   j*                        z
  z  t%        j&                  | j                   j*                        z         j-                  | j                   j.                        }|t        j&                  t        j0                  |              z   }t        j2                  |j4                  |       t        j6                  d|j8                  dz   t        j:                        d d d f   }|j=                  |j>                  d      jA                         }t        j2                  |jB                  t        j&                  |      jE                  |j                  |jF                  d             t        jH                  |jJ                         y y )Nr   )r:   stdro   )minr   r   r4   )&rl   initializer_ranger%   _init_weights
isinstancer   initnormal_r   r   uniform_r   r   r-   r   r(   r   randmathr   time_step_maxtime_step_minclamptime_step_floorexpm1copy_r   r   r   r8   rL   r   rc   r   rM   r   ones_r   )	r,   rS   r3  dt_init_stdr   dtinv_dtr   r.   s	           r/   r6  z"ZambaPreTrainedModel._init_weights  s   kk++f%fo.LL--CSA++33T9KMM&//+{K![[558O8OOSWS^S^SlSllN

4;;44nE88DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566FJJv**F3Q 5 5 9OPTVWPWXA1126AACAJJv||UYYq\%9%9&:N:NPVPePegi%jkJJvxx % /r0   )rC   rD   rE   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr  rk   _can_record_outputsr(   no_gradr6  rH   rI   s   @r/   r/  r/    sa    &*#+-EF"3 NL/$
 U]]_! !r0   r/  c                        e Zd ZdZdef fdZeee	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  dedz  d	e	j                  dz  d
edz  dee   deez  fd                     Z xZS )
ZambaModelz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ZambaDecoderLayer`]

    Args:
        config: ZambaConfig
    rl   c                 F   t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        |j                  | _	        g }d | _
        t        | j                        D ]  \  }}t        ||      }|dk(  rt        j                  | j                  j                  | j                  j                  d      }|j                  t!        t#        |      ||             | j                  d| dd| di| _
        |j                  |        t        j$                  |      | _        t)        |j                  |j*                  	      | _        d| _        | j1                          y )
N)rm   hybridFrp   z
layers.(?!z\.)\d+.shared_transfzlayers.z.shared_transfr  )r%   r&   pad_token_idpadding_idx
vocab_sizer   	Embeddingr-   embed_tokenslayers_block_type_tied_weights_keys	enumerater  rx   rl   r   r(  r  
ModuleListr   r!   r  final_layernormgradient_checkpointing	post_init)r,   rl   r   layer_id
layer_typer   r*  r.   s          r/   r&   zZambaModel.__init__  sc    !.. ++LL):):F<N<NPTP`P`a!'!9!9"&$-d.D.D$E 
	% Hj*6XFEX%4;;#:#:DKK<S<SZ_`./I&/QSY[`ab**2%hZ/CDPXzYgFh/D+ e$
	% mmF++F,>,>FDWDWX&+#r0   N	input_idsrW   r"  r~   inputs_embedsr  rd   r#   c                 >   |d u |d uz  rt        d      || j                  |      }|}t        j                  |      }	|r|t	        | j
                        }|V||j                         nd}
t        j                  |j                  d   |j                        |
z   }|j                  d      }t        | j
                  ||||      }t        | j                        D ]  \  }} |||	|||f||d|} | j                  |      }t        ||r|      S d       S )	NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either one)rl   r   r   r   )rl   rd  rW   r~   r"  )r~   r  )last_hidden_stater~   )r   rY  r(   r   r   rl   get_seq_lengthr   rA   r   r   r   r\  r   r^  r   )r,   rc  rW   r"  r~   rd  r  rd   r1   r  past_seen_tokensr!  rm   layers                 r/   r>   zZambaModel.forward  s^    -t";<s    --i8M%!&]!; 0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 !*$++ 6 
	Iu!&	 !0#	 	M
	 ,,];&+/8O
 	
>B
 	
r0   )NNNNNN)rC   rD   rE   r   r   r&   r   r   r   r(   r&  rG   r   r  r  r   r   r@   r   r>   rH   rI   s   @r/   rR  rR    s    { 8   .2.204(,26!%8
##d*8
 t+8
 &&-	8

 8
 ((4/8
 $;8
 +,8
 
(	(8
    8
r0   rR  c                   N    e Zd ZddiZdef fdZee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	edz  d
ej                  dz  dej                  dz  dedz  deej                  z  dee   deez  fd              Z	 	 	 	 	 	 d fd	Z xZS )ZambaForCausalLMzlm_head.weightzmodel.embed_tokens.weightrl   c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y r  )
r%   r&   rR  r0  rW  r   rx   r-   lm_headr`  r
  s     r/   r&   zZambaForCausalLM.__init__  sU     '
 ++yy!3!3V5F5FUS 	r0   Nrc  rW   r"  r~   rd  labelsr  logits_to_keeprd   r#   c	           
      b    | j                   d||||||d|	}
|
j                  }t        |t              rt	        | d      n|}| j                  |dd|ddf         }d}| | j                  ||| j                  fi |	}t        |||
j                  |
j                  |
j                        S )ah  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ZambaForCausalLM

        >>> model = ZambaForCausalLM.from_pretrained("Zyphra/Zamba-7B-v1")
        >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)rc  rW   r"  r~   rd  r  Nlosslogitsr~   r1   r1  r  )r0  rg  r7  r   slicern  loss_functionrW  r   r~   r1   r1  )r,   rc  rW   r"  r~   rd  ro  r  rp  rd   outputsr1   slice_indicesrt  rs  s                  r/   r>   zZambaForCausalLM.forward  s    H ,64:: ,
)%+',
 ,
  118B>SV8W~ot4]kmA}a,?@A%4%% 	D &#33!//))
 	
r0   c           
      h    | j                   j                  |d<   t        
|   |f||||||d|}	|	S )Nrp  )r~   rW   rd  r"  r  is_first_iteration)rl   num_logits_to_keepr%   prepare_inputs_for_generation)r,   rc  r~   rW   rd  r"  r  rz  rd   model_inputsr.   s             r/   r|  z.ZambaForCausalLM.prepare_inputs_for_generation[  sU     $(;;#A#A w<	
+)'%1	
 	
 r0   )NNNNNNNr   )NNNNTF)rC   rD   rE   r[  r   r&   r   r   r(   r&  rG   r   r  r  r   r   r   r@   r   r>   r|  rH   rI   s   @r/   rl  rl    s0   *,GH{   .2.204(,26*.!%-.@
##d*@
 t+@
 &&-	@

 @
 ((4/@
   4'@
 $;@
 ell*@
 +,@
 
'	'@
  @
J   r0   rl  a  
    The Zamba Model with a sequence classification head on top (linear layer).

    [`ZambaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                       e Zd Z fdZee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de	dz  dej                  dz  dej                  dz  d	edz  d
ee   deez  fd              Z xZS )ZambaForSequenceClassificationc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  | j                  d      | _        | j                          y r  )
r%   r&   
num_labelsrR  r0  r   rx   r-   scorer`  r
  s     r/   r&   z'ZambaForSequenceClassification.__init__  sS      ++'
YYv114??O
 	r0   Nrc  rW   r"  r~   rd  ro  r  rd   r#   c           	          | j                   |f|||||d|}	|	j                  }
| j                  |
      }||j                  d   }n|j                  d   }| j                  j
                  |dk7  rt        d      | j                  j
                  d}n||| j                  j
                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                  j                    d       |t        j                  ||j                  	      |f   }d}||j                  |j                        }| j                  j"                  | j$                  dk(  rd
| j                  _        nl| j$                  dkD  rL|j&                  t        j(                  k(  s|j&                  t        j*                  k(  rd| j                  _        nd| j                  _        | j                  j"                  d
k(  rIt-               }| j$                  dk(  r& ||j/                         |j/                               }n |||      }n| j                  j"                  dk(  r=t1               } ||j3                  d| j$                        |j3                  d            }n,| j                  j"                  dk(  rt5               } |||      }t7        |||	j8                  |	j:                  |	j<                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )rW   r"  r~   rd  r  Nr   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r4   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rf  
regressionsingle_label_classificationmulti_label_classificationrr  )r0  rg  r  rA   rl   rU  r   r7   r   r(   int32r   argmaxr   r   r.   rC   problem_typer  r6   longr   r   r   r   r   r   r   r~   r1   r1  )r,   rc  rW   r"  r~   rd  ro  r  rd   transformer_outputsr1   rt  r   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsrs  loss_fcts                      r/   r>   z&ZambaForSequenceClassification.forward  s   & 8Btzz8
)%+'8
 8
 ,==M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaabYYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6/ /??-;;*55
 	
r0   )NNNNNNN)rC   rD   rE   r&   r   r   r(   r&  rG   r   r  r  r   r   r@   r   r>   rH   rI   s   @r/   r  r  u  s      .2.204(,26*.!%R
##d*R
 t+R
 &&-	R

 R
 ((4/R
   4'R
 $;R
 +,R
 
1	1R
  R
r0   r  )rl  r  rR  r/  )r   )Er   r<  collections.abcr   r(   r   torch.nnr   r   r    r	   r8  activationsr
   cache_utilsr   r   
generationr   integrations.hub_kernelsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.import_utilsr   utils.output_capturingr   configuration_zambar   
get_loggerrC   r   Moduler!   rG   r   rR   rF   ri   rk   r   r  r  r  r(  r/  rR  rl  r  __all__r  r0   r/   <module>r     s  &   $   A A & ! . ) 8 / 9 q q F & R R 7 9 5 , 
		H	%J299 J*	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % %2C)RYY C)L\]bii \]@	ryy  0 0f17 1h41 4n $!? $! $!N _
% _
 _
Fg+_ gT ^
%9 ^
^
B gr0   