
    iN                        d Z ddlZddlmZ ddlZddlmZ ddlmZ ddlm	Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZmZmZm Z  ddl!m"Z"  ejF                  e$      Z% ed      rddl&m'Z' ndZ' e       rddl(m)Z) ndZ) G d dejT                        Z+ G d dejT                        Z, G d de      Z-e G d de             Z.e ed       G d d e                    Z/e ed!       G d" d#e                    Z0e G d$ d%e.             Z1 ed&       G d' d(e.e             Z2g d)Z3y)*zPyTorch MAMBA model.    N)	dataclass)nn)CrossEntropyLoss   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)lazy_load_kernel)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging)is_mambapy_availableis_torch_greater_or_equal
is_tracingresolve_internal_import   )MambaConfigz2.9.0)associative_scan)pscanc                       e Zd ZdZddededef fdZ ej                         d        Z
d Z	 	 dd	ej                  d
edz  dej                  dz  fdZdd
edz  dej                  dz  fdZ	 	 dd
edz  dej                  dz  fdZ xZS )
MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    config	layer_idxinitialize_mixer_weightsc           	         t         |           || _        |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        |j                        | _
        || _        |j                  | _        t        j                  | j                  | j                  |j                  |j                  | j                  |j                  dz
        | _        |j                   | _        t$        |j                      | _        |j(                  | _        |j*                  | _        t        j,                  | j                  | j                  dz  |j.                        | _        t        j,                  | j                  | j                  | j
                  dz  z   d      | _        t        j,                  | j                  | j                  d      | _        t        j6                  t9        j:                  | j                  | j
                              | _        t        j6                  t9        j:                  | j                              | _        |r=| j4                  j@                  jB                  jD                  dk7  r| jG                          t        j,                  | j                  | j                  |j.                        | _$        |j.                  | _        tK        d      a&tO        tL        d	d       a(tO        tL        d
d       a)tK        d      a*tW        tT        d      a,tO        tT        dd       a-tO        tT        dd       a.| j_                          y )Nr   )in_channelsout_channelsbiaskernel_sizegroupspadding   r"   FTmetazcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathselective_scan_fnmamba_inner_fn)0super__init__r   hidden_size
state_sizessm_state_sizeconv_kernelconv_kernel_sizeintermediate_sizeinttime_step_rankr   use_conv_biasr   Conv1dconv1d
hidden_act
activationr   actuse_mambapyuse_associative_scanLinearuse_biasin_projx_projdt_proj	ParametertorchemptyA_logDweightdevicetypeinit_mamba_weightsout_projr   causal_conv1dgetattrr)   r*   	mamba_ssmr   selective_state_updater,   r-   warn_slow_implementation)selfr   r   r   	__class__s       y/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/mamba/modeling_mamba.pyr/   zMambaMixer.__init__B   s   !--$// & 2 2!'!9!9!&"7"78"#11ii..//%%**))&&*
 !++&++,!--$*$?$?! yy!1!143I3IA3MTZTcTcdii 6 68K8KdNaNadeNe8elqryy!4!4d6L6LSWX \\%++d.D.DdFYFY"Z[
ekk$*@*@AB#(;(;(B(B(G(G6(Q##%		$"8"8$:J:JQWQ`Q`a )9&}6LdS"=2DdK %[1	!8$^"
 $I/BDI ,<dC%%'    c                    t        j                  d| j                  dz   t         j                  | j                  j
                        d d d f   }|j                  | j                  d      j                         }t        j                  | j                  t        j                  |             t        j                  | j                         | j                  j                  dz  | j                  j                   z  }| j                  j"                  dk(  r+t        j$                  | j&                  j(                  |       nE| j                  j"                  dk(  r,t        j*                  | j&                  j(                  | |       t        j,                  t        j.                  | j                  | j&                  j0                  j
                  t         j                        t3        j                  | j                  j4                        t3        j                  | j                  j6                        z
  z  t3        j                  | j                  j6                        z         j9                  | j                  j:                        }|t        j                  t        j<                  |              z   }t        j                  | j&                  j0                  |       y )	Nr   )dtyperK   g      constantrandomrK   rY   )min)rF   aranger2   float32rH   rK   expandr5   
contiguousinitcopy_logones_rI   r   r7   time_step_scaletime_step_init_scheme	constant_rD   rJ   uniform_exprandr"   mathtime_step_maxtime_step_minclamptime_step_floorexpm1)rT   Adt_init_stddtinv_dts        rV   rM   zMambaMixer.init_mamba_weightsz   s   LLD//!35==QUQ[Q[QbQbcdhjkdklHHT++R0;;=

4::uyy|,

466kk00$69T9TT;;,,
:NN4<<..<[[..(:MM$,,--|[IYYJJt--dll6G6G6N6NV[VcVcdxx112TXXdkk>W>W5XXZhht{{0012
 %DKK//%
0	 	 eiibS!1 122

4<<$$f-rW   c                     t        t        t        t        t        t
        f      }|sM| j                  r+t               rt        j                  d       y t        d      t        j                  d       y y )Na  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation for mamba-ssm and install the kernels library using `pip install kernels` or https://github.com/Dao-AILab/causal-conv1d for causal-conv1dzuse_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py.a  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation for mamba-ssm and install the kernels library using `pip install kernels` or https://github.com/Dao-AILab/causal-conv1d for causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py.)allrR   r,   r*   r)   r-   r>   r   loggerwarning_onceImportError)rT   is_fast_path_availables     rV   rS   z#MambaMixer.warn_slow_implementation   sw    !$#%68HJ^`no"
 &')''S & Z  ##W &rW   Nhidden_statescache_paramsattention_maskc                 
   | j                  |      j                  dd      }| j                  r%|"t        || j                  j
                  | j                  r| j                  j                  nd | j                  j
                  | j                  j
                  | j                  j
                  | j                  r$| j                  j                  j                         nd t        j                  | j                  j                                d d | j                   j                         | j                  j                  j                         d      }|S |j#                  dd      \  }}|||j%                  d      z  }|d uxr |j'                  | j(                        }| j                  j
                  j+                  | j                  j
                  j-                  d      | j                  j
                  j-                  d            }|rot/        |j1                  d      |j2                  | j(                     j4                  || j                  j                  | j6                        }|j%                  d      }n|Xt8        j:                  j=                  || j>                  |j@                  d   z
  df      }	|jC                  |	| j(                         tE        ||| j                  j                  | j6                        }|||j%                  d      z  }| j                  |j                  dd            }
t        jF                  |
| jH                  | jJ                  | jJ                  gd      \  }}}| j                  j
                  |j                  dd      z  }t        j                  | j                  j                                }tM        | j                  d	      r$| j                  j                  j                         nd }|rgtO        |j2                  | j(                     jP                  |d
   |d
   ||d d df   |d d df   | j                   |d
   |d
      j%                  d      }nptS        ||||j                  dd      |j                  dd      | j                   j                         ||dd
      \  }}|||jU                  || j(                         | j                  |j                  dd            }|S )Nr   r&   T)
delta_biasdelta_softplusdimr   rZ   )r<   r"   ).r   )dt_softplus)r   return_last_state)+rB   	transposetrainingr-   r:   rJ   r8   r"   rC   rD   rN   rA   floatrF   rk   rH   rI   chunk	unsqueezehas_previous_stater   viewsizer)   squeezelayersconv_statesr<   r   
functionalpadr4   shapeupdate_conv_stater*   splitr7   r2   hasattrrR   recurrent_statesr,   update_recurrent_state)rT   r}   r~   r   projected_statescontextualized_statesgateis_decodingconv_weightsr   ssm_parameters	time_stepBCdiscrete_time_steprs   time_proj_biasscan_outputs	ssm_states                      rV   cuda_kernels_forwardzMambaMixer.cuda_kernels_forward   sC     <<6@@AF==\1$2 ""$($6$6  D""##$$.2mm""((*4::++-..<<,,224#%!t %$S #3"8"8"8"BM4) -0H0H0K K&d2f|7V7VW[WeWe7fK  ;;--224;;3E3E3J3J13Mt{{OaOaOfOfghOijL 4!))"- ''7CC KK$$OO! !. 7 7 ;+"$--"3"3%(=(=@S@STV@W(WYZ'[#K !22;O 0!<1A1Adoo! ) -0H0H0K K "[[)@)@A)FGN#kk!4!4d6I6I4K^K^ _egOIq! "&!4!4y7J7J1a7P!P4::++-..A:A$,,PV:WT\\..446]aN5 ''7HH!&)&v.adGadGFFL" $  )B-  +<!&KK1%KK1%FFLLN"#'&*+'i (\-E 77	4>>R %)MM,2H2HA2N$O!$$rW   c           	         |j                   \  }}}|j                  }| j                  |      j                  dd      }|j	                  dd      \  }	}
||	|j                  d      z  }	|M|j                  | j                        r2|j                  | j                     j                  j                         }n9t        j                  || j                  | j                  f|	j                  |      }|s|j                  | j                        st         j"                  j%                  |	| j&                  |	j                   d   z
  df      }|j)                  || j                         | j+                  | j-                  |	      dd |f         }	n|j)                  |	| j                        }|j/                  | j,                  j0                  j                        }t        j2                  || j,                  j0                  d d dd d f   z  d      }	| j4                  r|	| j,                  j6                  z  }	| j+                  |	      j/                  |      j                  d      }	n'| j+                  | j-                  |	      dd |f         }	||	|j                  d      z  }	| j9                  |	j                  dd            }t        j:                  || j<                  | j                  | j                  gd      \  }}}| j?                  |      }t         j"                  jA                  |      j                  dd      }t        jB                  | jD                  jG                                }t        jB                  |d d d d d d f   |d d d d d d d f   z        }|d d d d d d d f   |d d d d d d d f   jG                         z  }||	d d d d d d d f   jG                         z  }| jH                  r| jJ                  r|tM        |j                  dd      |j                  dd            }||j                  d      z  jO                  d      j                  dd      }||	| jP                  d d d d f   z  z   }|| j+                  |
      z  }n| jR                  rtT        tW        |	      r|d	 }|j                  jX                  d
v rdnd}tU        |||fd|      \  }}t        jZ                  |j]                  dddd      j/                  |      |j                  d            jO                  d      j]                  ddd      }|d d d d dd d f   }ng }t_        |      D ]}  }|d d d d |d d f   |z  |d d d d |d d f   z   }t        jZ                  |j/                  |      |d d |d d f   j                  d            }|ja                  |d d d d df           t        jb                  |d      }||	| jP                  d d d d f   z  z   }|| j+                  |
      z  }||je                  || j                         | jg                  |j                  dd            }|S )Nr   r&   r   r]   rZ   r   .r   c                 0    | \  }}|\  }}||z  ||z  |z   fS N )leftrighta_leftb_lefta_rightb_rights         rV   
combine_fnz+MambaMixer.slow_forward.<locals>.combine_fnP  s/    %)NFF',$GW"W,g.>.HIIrW   )cudaxpu	pointwisegeneric)r   combine_mode)4r   rY   rB   r   r   r   r   r   r   r   clonerF   zerosr5   r2   rK   r   r   r   r4   r   r=   r:   torJ   sumr8   r"   rC   r   r7   rD   softplusrk   rH   r   r>   r   r   r   rI   r?   r   r   rL   matmulpermuterangeappendstackr   rN   )rT   input_statesr~   r   
batch_sizeseq_len_rY   r   r}   r   r   
conv_stater   r   r   r   r   rs   
discrete_A
discrete_BdeltaB_uhsscan_outputr   r   all_hr   ir   s                                 rV   slow_forwardzMambaMixer.slow_forward  s   !-!3!3
GQ""<<5??1E.44QA4>t%)N,D,DQ,GGM#(G(G(W$++DNN;LLRRTIT33T5H5HI$++5I #224>>B]]..!**]-@-@-DDaH

 ..z4>>J $])CC'M)R S);;M4>>Z
']]4;;+=+=+D+DE
 %		*t{{7I7I!QPQ'7R*RXZ [%%!T[[%5%55M $ 7 : :5 A K KB O HHT[[%?XgX%NOM%)N,D,DQ,GGM ]%<%<Q%BC++T00$2E2EtGZGZ[ac
	1a "\\)4]]334FGQQRSUVW YYtzz'')**YYqq$!125G1aQU5VVW
'1a61dAq=9I9O9O9QQ
aAtm < B B DD ,2Fz++Aq183E3Ea3KLBB/88;EEaKK%tQ}8M(MMK%6K ((-=-IjYfNglx  mAJ
 /9.?.?.D.D.W{]f+JX8NTUdpq5#ll5==Aq!+D+G+G+NPQP[P[\^P_`hhiklttuvxy{|}!!QA+.	  "w >A *1aA: 6 BXaQRTUWXjEY YI"',,y||E/BAaAgJDXDXY[D\"]K ''Aq!G(<=> $kk,B?%a9N)NOK&$7K'33It~~N !%k.C.CAq.I J$$rW   c                    t        t        t        t        t        t
        f      }|rJd| j                  j                  j                  j                  v rt        |      s| j                  |||      S | j                  |||      S )Nr   )rx   rR   r,   r*   r)   r-   rC   rJ   rK   rL   r   r   r   )rT   r}   r~   r   kwargsr|   s         rV   forwardzMambaMixer.forwardn  sv     "%#%68HJ^`no"
 "f0B0B0I0I0N0N&NWaboWp,,]L.YY  nMMrW   )TNN)__name__
__module____qualname____doc__r   r6   boolr/   rF   no_gradrM   rS   Tensorr	   
LongTensorr   r   r   __classcell__rU   s   @rV   r   r   :   s    6({ 6(s 6(VZ 6(p U]]_. .*4 &*26	d%||d% dld% ((4/	d%N]%ut| ]%Z_ZjZjmqZq ]%F &*26	N dlN ((4/	NrW   r   c                   ,     e Zd Zd fd	Zd Zd Z xZS )MambaRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zL
        MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        N)r.   r/   r   rE   rF   onesrJ   variance_epsilon)rT   r0   epsrU   s      rV   r/   zMambaRMSNorm.__init__~  s1     	ll5::k#:; #rW   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr&   rZ   T)keepdim)	rY   r   rF   r`   powmeanrsqrtr   rJ   )rT   r}   input_dtypevariances       rV   r   zMambaRMSNorm.forward  sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::rW   c                 R    | j                   j                  d    d| j                   S )Nr   z, eps=)rJ   r   r   rT   s    rV   
extra_reprzMambaRMSNorm.extra_repr  s*    ++##A&'vd.C.C-DEErW   )gư>)r   r   r   r/   r   r   r   r   s   @rV   r   r   }  s    $;FrW   r   c                   T     e Zd Z fdZ	 	 ddedz  dej                  dz  fdZ xZS )
MambaBlockc                     t         |           || _        || _        |j                  | _        t        |j                  |j                        | _        t        ||d      | _
        y )Nr   F)r   r   )r.   r/   r   r   residual_in_fp32r   r0   layer_norm_epsilonnormr   mixer)rT   r   r   rU   s      rV   r/   zMambaBlock.__init__  sU    " & 7 7 !3!39R9RS	)V[\
rW   Nr~   r   c                    |}| j                  |j                  | j                   j                  j                              }| j                  r|j                  t
        j                        }| j                  |||      }||z   }|S )N)rY   r~   r   )r   r   rJ   rY   r   rF   r`   r   )rT   r}   r~   r   r   residuals         rV   r   zMambaBlock.forward  su     !		-"2"29I9I9O9O"2"PQ  {{5==1H

=|\j
k =0rW   r   )	r   r   r   r/   r	   rF   r   r   r   r   s   @rV   r   r     s9    ] &*26	 dl ((4/	rW   r   c                   Z    e Zd ZU eed<   dZddgZdZdZ e	j                         d        Zy)MambaPreTrainedModelr   backboner   r   Tc                 @   | j                   j                  }t        |t              r#|j	                          t        j                  |j                  j                  t        j                  d             |j                  j                  )t        j                  |j                  j                         t        j                  |j                  j                  t        j                  d             | j                   j                  rB|j                  j                  }|t        j                  | j                   j                        z  }t        |t         j"                        rNt        j$                  |j                  |       |j                   t        j                  |j                         yyt        |t&              r t        j(                  |j                         yt        |t         j*                        r"t        j$                  |j                  |       yy)zInitialize the weights.   )aN)std)r   initializer_range
isinstancer   rM   rc   kaiming_uniform_r:   rJ   rm   sqrtr"   zeros_rN   rescale_prenorm_residualnum_hidden_layersr   r@   normal_r   rf   	Embedding)rT   moduler   ps       rV   _init_weightsz"MambaPreTrainedModel._init_weights  sN    kk++fj) %%'!!&--"6"6$))A,G}}!!-FMM../!!&//"8"8DIIaLI{{33 OO**TYYt{{<<==fbii(LLC0{{&FKK( '-JJv}}%-LLC0 .rW   N)r   r   r   r   __annotations__base_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulrF   r   r
  r   rW   rV   r   r     s>    "%|4&*#LU]]_"1 "1rW   r   z,
    Class for the MAMBA model outputs.
    )custom_introc                   |    e Zd ZU dZdZej                  dz  ed<   dZe	dz  ed<   dZ
eej                     dz  ed<   y)MambaOutputa4  
    cache_params (`Cache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlast_hidden_stater~   r}   )r   r   r   r   r  rF   FloatTensorr  r~   r	   r}   tupler   rW   rV   r  r    sG     37u((4/6!%L%$,%59M5**+d29rW   r  zK
    Base class for causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   y)MambaCausalLMOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cache_params (`Cache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlosslogitsr~   r}   )r   r   r   r   r  rF   r  r  r  r~   r	   r}   r  r   rW   rV   r  r    s[    
 &*D%

d
")'+FE$+!%L%$,%59M5**+d29rW   r  c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 ddej                  dz  dej                  dz  de
dz  d	edz  d
edz  dedz  dej                  dz  deez  fd       Z xZS )
MambaModelc           	         t         |   |       t        j                  |j                  |j
                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        d| _        t        |j
                  |j                        | _        | j!                  | j"                         | j%                          y c c}w )N)r   Fr   )r.   r/   r   r  
vocab_sizer0   
embeddings
ModuleListr   r  r   r   gradient_checkpointingr   r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_init)rT   r   idxrU   s      rV   r/   zMambaModel.__init__  s     ,,v'8'8&:L:LMmmRWX^XpXpRq$r3Z#%F$rs&+#"6#5#56;T;TU//? %ss   &Cc                 f    |D ],  }d|v s|j                  |      ||j                  dd      <    y  y )Nz
embedding.zembeddings.)popreplace)rT   
state_dictprefixargsks        rV   r#  zMambaModel.load_hook  s;     	Aq EO^^TUEV
199\=AB	rW   c                     | j                   S r   r  r   s    rV   get_input_embeddingszMambaModel.get_input_embeddings  s    rW   c                     || _         y r   r.  rT   new_embeddingss     rV   set_input_embeddingszMambaModel.set_input_embeddings  s	    (rW   N	input_idsinputs_embedsr~   	use_cacheoutput_hidden_statesreturn_dictr   returnc                 ^   ||n| j                   j                  }||n#| j                  s| j                   j                  nd}||n| j                   j                  }|du |duz  rt        d      || j                  |      }| j                  r| j                  r|rd}|r|t        | j                         }|}	|rdnd}
| j                  D ]  } ||	||      }	|s|
|	fz   }
 | j                  |	      }	|r|
|	fz   }
|st        d |	||
fD              S t        |	|r||
      S d|
      S )	a  
        cache_params (`Cache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        NFz:You must specify exactly one of input_ids or inputs_embeds)r   r   r   c              3   &   K   | ]	  }||  y wr   r   ).0vs     rV   	<genexpr>z%MambaModel.forward.<locals>.<genexpr>W  s     fqXYXefs   )r  r~   r}   )r   r7  r   r6  r8  
ValueErrorr  r   r
   r   r!  r  r  )rT   r4  r5  r~   r6  r7  r8  r   r   r}   all_hidden_statesmixer_blocks               rV   r   zMambaModel.forward   se   ( %9$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++BYBY-t";<YZZ  OOI6M&&4==YI-'t{{;L%"6BD;; 	IK')-M $$58H$H!	I M2 1]4D Df]LBS$Tfff+)2+
 	
8<+
 	
rW   )NNNNNNN)r   r   r   r/   r#  r/  r3  r   rF   r   r	   r   r  r  r   r   r   s   @rV   r  r    s    
)  .215%)!%,0#'26<
##d*<
 ''$.<
 dl	<

 $;<
 #Tk<
 D[<
 ((4/<
 
	<
 <
rW   r  z
    The MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                   j    e Zd ZddiZ fdZd Zd Z	 	 	 	 	 ddedz  dej                  dz  d	e
dz  f fd
Ze	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  de
dz  de
dz  de
dz  deej                  z  deez  fd       Z xZS )MambaForCausalLMzlm_head.weightzbackbone.embeddings.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFr'   )
r.   r/   r  r   r   r@   r0   r  lm_headr$  )rT   r   rU   s     rV   r/   zMambaForCausalLM.__init__i  sF     "6*yy!3!3V5F5FUSrW   c                 6    | j                   j                         S r   )r   r/  r   s    rV   r/  z%MambaForCausalLM.get_input_embeddingsp  s    }}1133rW   c                 8    | j                   j                  |      S r   )r   r3  r1  s     rV   r3  z%MambaForCausalLM.set_input_embeddingss  s    }}11.AArW   Nr~   r   is_first_iterationc           	      F    t        	|   |f|||||d|}|r|sd |d<   |S )N)r5  r6  r~   r   rH  r   )r.   prepare_inputs_for_generation)
rT   r4  r5  r6  r~   r   rH  r   model_inputsrU   s
            rV   rJ  z.MambaForCausalLM.prepare_inputs_for_generationv  sN     w<
'%)1
 
 /-1L)*rW   r4  r5  labelsr7  r8  r6  logits_to_keepr9  c
           	         ||n| j                   j                  }| j                  |||||||      }|d   }t        |	t              rt        |	 d      n|	}| j                  |dd|ddf   j                  | j                  j                  j                              j                         }d}||j                  |j                        }|dddddf   j                         }|dddf   j                         }t               } ||j                  d|j                  d            |j                  d            }|s|f|dd z   }||f|z   S |S t!        |||j"                  |j$                        S )aN  
        cache_params (`Cache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        N)r~   r5  r7  r8  r6  r   r   .rZ   r   )r  r  r~   r}   )r   r8  r   r   r6   slicerE  r   rJ   rY   r   rK   rb   r   r   r   r  r~   r}   )rT   r4  r   r5  r~   rL  r7  r8  r6  rM  r   mamba_outputsr}   slice_indicesr  r  shift_logitsshift_labelsloss_fctoutputs                       rV   r   zMambaForCausalLM.forward  s   2 &1%<k$++BYBY%'!5#) & 
 &a(8B>SV8W~ot4]kmA}a,?@CCDLLDWDWD]D]^_eegYYv}}-F!#ssA+.99;L!#qr'?557L')HL--b,2C2CB2GH,J[J[\^J_`DYqr!22F)-)9TGf$EvE"&33'55	
 	
rW   )NNNNF)	NNNNNNNNr   )r   r   r   _tied_weights_keysr/   r/  r3  r	   rF   r   r   rJ  r   r  r6   r   r  r  r   r   r   s   @rV   rC  rC  `  sF    +,HI4B %)26*/
 dl ((4/ !4K2  .22626%)*.,0#'!%-.=
##d*=
 ((4/=
 ((4/	=

 dl=
   4'=
 #Tk=
 D[=
 $;=
 ell*=
 
$	$=
 =
rW   rC  )rC  r  r   )4r   rm   dataclassesr   rF   r   torch.nnr    r   rc   activationsr   cache_utilsr	   r
   
generationr   integrationsr   modeling_layersr   modeling_utilsr   utilsr   r   r   utils.import_utilsr   r   r   r   configuration_mambar   
get_loggerr   ry   (torch._higher_order_ops.associative_scanr   mambapy.pscanr   Moduler   r   r   r   r  r  r  rC  __all__r   rW   rV   <module>rh     sz     !   % & ! . ) , 9 - 
  - 
		H	%W%I #E@N @NF
F299 F(+ 4 *1? *1 *1Z 
:+ : : 
:+ : :& V
% V
 V
r g
+_ g
g
T ErW   