
    i                     r   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0m1Z1 ddl2m3Z3  ejh                  e5      Z6 G d ded      Z7 G d de*      Z8d2dZ9 G d d e&      Z: G d! d"e-      Z; G d# d$ejx                        Z= G d% d&e(      Z> G d' d(e)      Z? G d) d*e$      Z@e G d+ d,e             ZAe G d- d.eA             ZB G d/ d0e'      ZCg d1ZDy)3zPyTorch Bamba model.    )	TypedDictN)nn   )initialization)ACT2FN)CacheDynamicCache)lazy_load_kernel)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torchdynamo_compilinglogging)merge_with_config_defaults)resolve_internal_import)capture_outputs   )JambaAttentionDecoderLayer)LlamaAttentionLlamaForCausalLMLlamaMLPLlamaRMSNormLlamaRotaryEmbeddingrotate_half)MambaRMSNormGatedapply_mask_to_padding_statespad_tensor_by_sizereshape_into_chunkssegment_sum   )BambaConfigc                       e Zd ZU dZej
                  ed<   ej
                  ed<   eed<   eed<   ej                  ed<   y)BambaFlashAttentionKwargsaU  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    cu_seq_lens_q (`torch.LongTensor`):
        Gets cumulative sequence length for query state.
    cu_seq_lens_k (`torch.LongTensor`):
        Gets cumulative sequence length for key state.
    max_length_q (`int`):
        Maximum sequence length for query state.
    max_length_k (`int`):
        Maximum sequence length for key state.
    seq_idx (`torch.IntTensor`):
        Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor     x/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/bamba/modular_bamba.pyr'   r'   <   s7      ######__r7   r'   F)totalc                       e Zd Zy)BambaRotaryEmbeddingNr-   r.   r/   r6   r7   r8   r;   r;   T       r7   r;   c                 h   |j                  |      }|j                  |      }|j                  d   }| dd|f   | d|df   }}|dd|f   |d|df   }	}||z  t        |      |z  z   }
||z  t        |      |z  z   }t        j                  |
|gd      }
t        j                  ||	gd      }|
|fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Removes the interleaving of cos and sin from GLM

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .Ndim)	unsqueezeshaper   r1   cat)qkcossinunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds               r8   apply_rotary_pos_embrQ   Y   s    ( --
&C
--
&C 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{51C78Gs{{51C78G ii&)r2Gii&)r2GGr7   c                       e Zd Zy)BambaAttentionNr<   r6   r7   r8   rS   rS      r=   r7   rS   c                       e Zd Zy)BambaRMSNormGatedNr<   r6   r7   r8   rU   rU      r=   r7   rU   c            
       &    e Zd ZdZdedef fdZ	 	 	 ddej                  de	dz  dej                  dz  d	ej                  dz  fd
Z	 	 dde	dz  dej                  dz  fdZ	 	 	 dde	dz  dej                  dz  d	ej                  dz  fdZ xZS )
BambaMixeruP  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    The are a few differences between this and Mamba2Mixer:
    - The variable use_precomputed_states is slightly different due to the hybrid cache structure
    - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
    - Some extra variables that our layer doesn't need have been removed
    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
    config	layer_idxc           	         t         |           |j                  | _        |j                  | _        |j
                  | _        |j                  | _        t        |j                  | j                  z        | _        || _        |j                  | _        |j                  | _        t"        |j                     | _        |j&                  | _        |j*                  | _        |j.                  | _        |j2                  | _        |j6                  | _        |j:                  | _        |j<                  | _        |j>                  | _        | j                  d| j0                  z  | j                  z  z   | _         tC        jD                  | j@                  | j@                  |j                  | j                  | j@                  | j                  dz
        | _#        | j                  | j@                  z   | j                  z   }tC        jH                  | j                  || j(                        | _%        tC        jL                  tO        jP                  | j                              | _)        tO        jT                  d| j                  dz         }tC        jL                  tO        jV                  |            | _,        t[        | j                  | j,                        | _.        tC        jL                  tO        jP                  | j                              | _/        tC        jH                  | j                  | j                  | j(                        | _0        tc        d      }te        |dd       a3te        |dd       a4tc        d	      }tk        |d
      a6tk        |d      a7tk        |d      a8ts        tl        tn        tp        th        tf        f      a:tt        stv        jy                  d       y tv        jy                  d       y )Nr   r$   )in_channelsout_channelsbiaskernel_sizegroupspadding)r]   epszcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathz1ops.triton.ssd_combined.mamba_chunk_scan_combinedz8ops.triton.ssd_combined.mamba_split_conv1d_scan_combineda  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzDThe fast path for Bamba will be used when running the model on a GPU)=super__init__mamba_n_heads	num_headshidden_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizer4   mamba_expandintermediate_sizerY   mamba_conv_biasuse_conv_bias
hidden_act
activationr   actmamba_proj_biasuse_biasrms_norm_epslayer_norm_epsilonmamba_n_groupsn_groupsmamba_d_headhead_dimmamba_chunk_size
chunk_sizetime_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dLinearin_proj	Parameterr1   onesdt_biasarangelogA_logrU   normDout_projr
   getattrrc   rd   r   selective_state_updatemamba_chunk_scan_combined mamba_split_conv1d_scan_combinedallis_fast_path_availableloggerwarning_once)selfrX   rY   projection_sizeAcausal_conv1d	mamba_ssm	__class__s          r8   rg   zBambaMixer.__init__   s   --!--$22 & 3 3!$V%8%84;K;K%K!L"#33 ++&++,.."("5"5--++ 11%55#11#11..T]]1BTEXEX1XXii''--==))A-
 004==@4>>Qyy
 ||EJJt~~$>? LLDNNQ./\\%))A,/
%d&<&<$BYBYZ	ejj89		$"8"8$:J:JQUQ^Q^_ )9&}6LdS"=2DdK %[1	!8$^"
 %<$W%
! ,C$^,
(
 "%&)0 $"
 &>  fgr7   Nhidden_statescache_paramsattention_maskr,   c                    t        ||      }| j                  |      }|j                  \  }}}| j                  | j                  z  }	|d uxr" |j                  | j                        xr |dk(  }
|
r
|j                  d      j                  | j                  | j                  | j                  gd      \  }}}t        ||j                  | j                     j                  | j                  j                   j                  d      | j                  j"                  | j$                        }t'        j                  || j                  |	|	gd      \  }}}t'        j(                  | j*                  j-                                }|d d d df   d d d d d f   j/                  d| j0                  | j                        j3                  t&        j4                        }|d d d d d f   j/                  dd| j0                        }| j6                  d d d df   j/                  d| j0                        }| j8                  d d d df   j/                  d| j0                        }|j;                  || j                  |j                  d   | j                  z        }|j;                  || j                  |j                  d   | j                  z        }|j;                  || j                  | j0                        }t=        |j                  | j                     j>                  ||||||d |d
      }|j;                  || j                  | j0                  z        }| jA                  ||      }| jC                  |      d d d df   }|S t'        j(                  | j*                  j-                                }| jD                  dt-        d	      fk(  ri nd
| jD                  i}| jF                  r|tI        || j                  j                   j                  d      | j                  j"                  | j6                  |f| j8                  | jJ                  || j$                  | j@                  j                   | j@                  jL                  | jB                  j                   | jB                  j"                  | j0                  | j                  ddd|}|S |j                  | j                  | j                  | j                  gd      \  }}}|j|jO                  dd      }tP        jR                  jU                  || jV                  |j                  d   z
  df      }|jY                  || j                        }| j$                  dvrH| j[                  | j                  |jO                  dd            dd |f   jO                  dd            }nqt]        |jO                  dd      | j                  j                   j                  d      | j                  j"                  | j$                  |      jO                  dd      }t        ||      }t'        j                  || j                  |	|	gd      \  }}}t_        |j;                  ||d| j0                        |||j;                  ||| j                  d      |j;                  ||| j                  d      f| jJ                  | j8                  d |d| j6                  dd|\  }}|||ja                  || j                        }|j;                  ||d      }| jA                  ||      }| jC                  |      }|S )Nr$   r?   r@   .dtypeT)zr   dt_softplusg        infdt_limitF)r   r   r,   rt   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr   r   )siluswish)xweightr]   rt   r,   )r   r   r   r,   r   r   r   )1r    r   rC   r{   rl   has_previous_staterY   squeezesplitrp   r   ri   rc   layersconv_statesr   r   r]   rt   r1   expr   floatexpandr}   tofloat32r   r   viewr   recurrent_statesr   r   r   trainingr   r   variance_epsilon	transposer   
functionalpadrn   update_conv_stateru   rd   r   update_recurrent_state)r   r   r   r   r,   projected_states
batch_sizeseq_len_groups_time_state_sizeuse_precomputed_statesgatehidden_states_B_CdtBCr   r   r   hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedr   scan_output	ssm_states                             r8   cuda_kernels_forwardzBambaMixer.cuda_kernels_forward   s    5]NS<<6 "/!4!4
GQ!%1D1D!D $i)H)H)Xi]dhi]i 	
 "*:*B*B1*E*K*K''GR +L +'D#R
 !5!##DNN3??""**1-  ! #(++!'')?AWX#M1a 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az4==!''!*2MNAz4==!''!*2MNA%2%7%7
DNNTXTaTa%b"2##DNN3DD& M *..z4>>DMM;YZM IImT:M --.q$|<C| 
w 4::++-..A$($8$8S%,<O$ObV`bfbvbvUwO }}!56$KK&&..q1KK$$LL ff####'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(-#$ &%l 
A /?.D.D++T]]DNNKQS /E /+'  + 4E3N3NqRS3T0"$--"3"34..1M1S1STV1WWYZ[#K #/"@"@dnn"]K??*;;(,$5$?$?1$EFsHWH}U__`acde)% )9+55a;#{{1199!<![[--#'?? ')  i1o & %AARTb$c!&+kk%++-CE[\'#q! *C!&&z7BNFF:wrBFF:wrB*  $ff#(, LL $* &*&Y" (\-E , C CIt~~ ^I)..z7BG"iiT: mmK0
r7   c                    |j                   \  }}}|j                  }t        ||      }| j                  |      }|j	                  | j
                  | j                  | j                  gd      \  }	}
}|
j                  dd      }
|d uxr" |j                  | j                        xr |dk(  }|r|j                  |
| j                        }t        j                  || j                  j                  j!                  d      z  d      }
| j"                  r|
| j                  j$                  z   }
| j'                  |
      }
n|Xt(        j*                  j-                  |
| j.                  |
j                   d   z
  df      }|j                  || j                        }| j'                  | j                  |
      dd |f   j                  dd            }
t        |
|      }
t        j                  |
| j
                  | j0                  | j2                  z  | j0                  | j2                  z  gd      \  }}}t        j4                  | j6                  j9                                }|r|j:                  | j                     j<                  j>                  }|d d dd d f   d d d df   }|j                  dd      jA                  ||j                   d   | jB                        }| jD                  d   jA                  | jD                  j                   d   | jB                        }t        j(                  j*                  jG                  ||jI                  |j                        z         }t        jJ                  || jL                  d   | jL                  d         }|d   jA                  | j                  | jB                  | j2                        jI                  t        jN                  	      }t        j4                  |d   |z        jI                  |
      }|jQ                  || j0                  d      dd d d f   }|jA                  || j0                  | j                  | j0                  z  |j                   d         jS                         }|jQ                  |d|j                   d         }|d   |dd d d f   z  }|jQ                  |d| jB                        }||d   z  jI                  |
      }|j:                  | j                     j<                  |z  |z   }|jU                  || j                        }|jQ                  || j0                  d      dd d d f   }|jA                  || j0                  | j                  | j0                  z  |j                   d         jS                         }|jQ                  |d|j                   d         }|jI                  |j>                  |j                        }|jW                  || j                  z  | jB                  | j2                        }|jW                  || j                  z  | j2                  d      }t        jX                  ||      }|jW                  || j                  | jB                        }| jZ                  d   jA                  | jZ                  j                   d   | jB                        }|||z  z   jI                  |j                        }|jQ                  |d      d d d df   }nt(        j*                  jG                  || jD                  z         }t        jJ                  || jL                  d   | jL                  d         }|jQ                  ||d| jB                        j9                         }|jQ                  ||d| j2                        j9                         }|jQ                  ||d| j2                        j9                         }|j]                  | j                  | j0                  z  d| j                        }|j]                  | j                  | j0                  z  d| j                        }| j^                  || j^                  z  z
  | j^                  z  }| jZ                  d   ta        ||      z  }||d   z  }|jI                  |j                        |z  }||||fD cg c]  }tc        ||| j^                         c}\  }}}}|je                  dddd      }t        jf                  |d      }t        j4                  ti        |            } |d d d d d d d d d d d f   |d d d d d d d d d d d f   z  }!|!j                  d      }"|"d   | je                  ddddd      d   z  }#|#j                  d      }$|$d   |d d d d d f   z  j                  d      }%t        j4                  |d d d d d d dd f   |z
        }&||&je                  dddd      d   z  }'|'dd d d f   |d   z  j                  d      }(t        jj                  |(d d d df         })t        jl                  |)|(gd      }(t        j4                  ti        t(        j*                  j-                  |d d d d d d df   d                  }*|*j                  dd      }*|*d   |(d d d d d df   z  j                  d      }+|+d d d df   |+d d df   },}(t        j4                  |      }-|dd d d f   |(d d d d d df   z  }.|-je                  dddd      }/|.j                  d      |/d   z  }0|%|0z   }|jQ                  |d| j                  | jB                        }||z   }|dkD  r|d d d |d d d d f   }|jQ                  ||d      }|,||jU                  |,| j                        },| jo                  ||	      }1| jq                  |1jI                  |            }2|2S c c}w )Nr?   r@   r$   r   r   .).N).NNr   device)r   r   )rA   output_sizer      )r$   r   )9rC   r   r    r   r   rp   r   ri   r   r   rY   r   r1   sumr   r   r   rr   r]   ru   r   r   r   rn   r{   rl   r   r   r   r   r   r   r   r}   r   softplusr   clampr   r   reshape
contiguousr   r   bmmr   repeat_interleaver   r!   r"   permutecumsumr#   
zeros_likerD   r   r   )3r   input_statesr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   cache_devicer   dAdBdBx
ssm_statesssm_states_reshaped
C_reshapedyr   pad_size
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr   state_decay_outC_times_statesstate_decay_out_permutedY_offr   contextualized_statess3                                                      r8   torch_forwardzBambaMixer.torch_forward  s
    ".!3!3
GQ"" 4L.Q<<5&6&<&<''GR '= '
# .77!<!-T!9!~l>]>]^b^l^l>m!~ry}~r~ "&889JDNN[K %		dkk0088;;! !!$58H8H$H! $): ; ' mm//%(=(=@Q@W@WXZ@[([]^'_ +<<[$..Y $5F)GXgX)V)`)`abde)f g89JN[#kk##T]]T5H5H%H$--Z^ZmZmJmn
q! YYtzz'')**!'..t~~>OOVVL Aq!GQc\*Ba#**:rxx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!5!5a!8$:N:Nq:QRB/"))$..$--I\I\]``glgtgt`uA))ByMA-.22,2GB
 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PMi0044L4IC &,,T^^<MMPRRUXXJ%<<ZXJ 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A $ahhaggFJ",//*t~~2Mt}}^b^q^q"r
T^^ ;T=P=PRSTJ		-z:Az4>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''T\\(9:BR!5!5a!8$:N:Nq:QRB)11*gr4==Y__aM		*gr43F3FGMMOA		*gr43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'DOO*CCtVH	*-?x-XXJ *ByM9M](()B.A cpqrtuwxay%z\]&9!Xt&W%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCCJF !99XaArsl%;h%FGL,..q"b!<YGGGc4l+mI.FFKKPQKRF $..va!e}=OYY8a@F))K0A0A(1aQRTV;BWY_0`$abK%//15K%o61dC9PPUUZ[U\J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*gr2A $)A(??	4>>Z	ii4(
 !%knnU.C D$$A &{s   o1c                    t         rJd| j                  j                  j                  j                  v rt               s| j                  ||||      S |t        d      |j                  }|B|j                  d   dkD  r0|j                  d   dkD  r||d d d d d f   z  j                  |      }| j                  |||      S )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r$   r   )r   r   r   r   typer   r   NotImplementedErrorr   rC   r   r  )r   r   r   r   r,   kwargsr   s          r8   forwardzBambaMixer.forwardQ  s     "f0C0C0J0J0O0O&OXpXr,,]L.Zabb%n  ##%.*>*>q*AA*E.J^J^_`JadeJe*^Aq$J-GGKKERM!!-~NNr7   )NNN)NN)r-   r.   r/   r0   r%   r4   rg   r1   Tensorr   r5   r   r  r  __classcell__r   s   @r8   rW   rW      s    Zh{ Zhs Zh~ &*.2*._||_ dl_ t+	_
 4'_J &*.2	z% dlz% t+	z%@ &*.2*.O dlO t+	O
 4'Or7   rW   c                       e Zd Zy)BambaMLPNr<   r6   r7   r8   r  r  g  r=   r7   r  c                       e Zd Zy)BambaRMSNormNr<   r6   r7   r8   r  r  k  r=   r7   r  c                   J    e Zd Zddededef fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  d	e
dz  d
edz  deej                  ej                  f   dz  dee   deej                  eej                  ej                  f   dz  f   fdZ xZS )BambaDecoderLayerrX   rY   
layer_typec                     t         |   ||       | `d}|dk(  rt        nd } ||      | _        || _        |dk(  rt        ||      | _        y |dk(  rt        ||      | _        y t        d      )Nr$   mamba)rX   rY   	attentionzInvalid layer_type)
rf   rg   	self_attnr  feed_forwardr  rW   r  rS   
ValueError)r   rX   rY   r  num_expertsffn_layer_classr   s         r8   rg   zBambaDecoderLayer.__init__p  sv    +N&1Q&6(D+F3$ #6YGDJ;&+FI>DN122r7   Nr   r   position_idspast_key_values	use_cacheposition_embeddingsr  returnc           
      2   |}| j                  |      }| j                  dk(  r | j                  d|||d|}d }	n+| j                  dk(  r | j                  d||||||d|\  }}	||z   }|}| j	                  |      }| j                  |      }||z   }|	fS )Nr  )r   r   r   r  )r   r   r!  r"  r#  r$  r6   )input_layernormr  r  r  pre_ff_layernormr  )
r   r   r   r!  r"  r#  r$  r  residualself_attn_weightss
             r8   r  zBambaDecoderLayer.forward  s     !,,];??g%&DJJ +,- 	M !%__+/=t~~ 0+-) /#$70 0,M, !=0 --m<))-8 =0///r7   )r  )NNNFN)r-   r.   r/   r%   r4   strrg   r1   r  r2   r   booltupler   r'   FloatTensorr  r  r  s   @r8   r  r  o  s    3{ 3s 3 3( /304(,!&HL(0||(0 t+(0 &&-	(0
 (0 $;(0 #5<<#=>E(0 23(0 
u  %(9(95;L;L(L"MPT"TT	U(0r7   r  c                   z     e Zd ZU eed<   dZdZdgZdZdZ	dZ
dZeedZ ej                           fd       Z xZS )BambaPreTrainedModelrX   modelTr  r"  )r   
attentionsc           
      j   t         |   |       t        |t              rt	        j
                  |j                         t	        j                  |j                  t        j                  t        j                  d|j                  dz                      t	        j
                  |j                         y y )Nr$   )rf   _init_weights
isinstancerW   initones_r   copy_r   r1   r   r   ri   r   )r   moduler   s     r8   r4  z"BambaPreTrainedModel._init_weights  sq    f%fj)JJv~~&JJv||UYYu||Av?O?ORS?S/T%UVJJvxx  *r7   )r-   r.   r/   r%   r3   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr  rS   _can_record_outputsr1   no_gradr4  r  r  s   @r8   r0  r0    s^    &*#,-"3NL*$
 U]]_! !r7   r0  c                        e Zd Zdef fdZeee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	edz  d
ee   defd                     Zd Z xZS )
BambaModelrX   c           	      Z   t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        g }t        |j                        D ],  }|j                  t        |||j                  |                . t        j                  |      | _        |j                   | _        t#        |j                  |j$                        | _        t)        |      | _        d| _        | j/                          y )N)rY   r  ra   rX   F)rf   rg   pad_token_idpadding_idx
vocab_sizer   	Embeddingrj   embed_tokensrangenum_hidden_layersappendr  layers_block_type
ModuleListr   _attn_implementationr  rx   final_layernormr;   
rotary_embgradient_checkpointing	post_init)r   rX   decoder_layersir   s       r8   rg   zBambaModel.__init__  s     !.. ++LL):):F<N<NPTP`P`av//0 	rA!!"3FaTZTlTlmnTo"pq	rmmN3$*$?$?!+F,>,>FDWDWX.f=&+#r7   N	input_idsr   r!  r"  inputs_embedsr#  r  r%  c           
      ^   |d u |d uz  rt        d      || j                  |      }|}|r|t        | j                        }|=t	        j
                  |j                  d   |j                        j                  d      }t        | j                  ||||      }	| j                  ||      }
| j                  ||      }t        | j                        D ]7  \  }}| j                  j                  |   dk(  r|
n|	} ||f|||||d	|\  }}9 | j                  |      }t!        ||
      S )Nz:You must specify exactly one of input_ids or inputs_embedsrF  r$   r   r   )rX   rY  r   r"  r!  )r!  r  )r   r!  r"  r#  r$  )last_hidden_stater"  )r  rK  r	   rX   r1   r   rC   r   rB   r   _update_mamba_maskrS  	enumerater   rO  rR  r   )r   rX  r   r!  r"  rY  r#  r  r   causal_mask
mamba_maskr$  rW  decoder_layer
layer_maskattn_weightss                   r8   r  zBambaModel.forward  sZ    -t";<YZZ  --i8M%0*$++>O <<(;(;A(>}G[G[\ffghiL(;;')+%
 ,,^_M
"oom,oW )$++ 6 	A}'+{{'D'DQ'G7'RXcJ*7+)) /#$7+ +'M<	 ,,];&++
 	
r7   c                 f    |}||j                         s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        Nr$   )r   r1   r   )r   r   r"  r_  s       r8   r\  zBambaModel._update_mamba_mask  s;     $
'O,N,N,P&599^q5H+IJr7   )NNNNNN)r-   r.   r/   r%   rg   r   r   r   r1   r2   r  r   r.  r,  r   r'   r   r  r\  r  r  s   @r8   rD  rD    s    { &   .2.204(,26!%3
##d*3
 t+3
 &&-	3

 3
 ((4/3
 $;3
 233
 
!3
    3
jr7   rD  c                   0    e Zd Z fdZee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de	dz  dej                  dz  dej                  dz  d	edz  d
eej                  z  defd              Z	 	 	 	 	 	 d fd	Z xZS )BambaForCausalLMc                 f    t         |   |       |j                  | _        | j                          y )N)rf   rg   z_loss_coefficientrU  )r   rX   r   s     r8   rg   zBambaForCausalLM.__init__   s*     "(";"; 	r7   NrX  r   r!  r"  rY  labelsr#  logits_to_keepr%  c	           
      L    | j                   d
||||||d|	}
|
j                  }t        |t              rt	        | d      n|}| j                  |dd|ddf         }d}| | j                  d
||| j                  j                  d|	}| j                  dkD  r[|j                  d      j                  |j                        j                  d      j                         }|| j                  |z  z   }t        |||
j                   |
j"                  |
j$                  	      S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BambaForCausalLM

        >>> model = BambaForCausalLM.from_pretrained("...")
        >>> tokenizer = AutoTokenizer.from_pretrained("...")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)rX  r   r!  r"  rY  r#  N)logitsrh  rI  r   r?   r@   r   r   )lossrk  r"  r   r2  r6   )r1  r[  r5  r4   slicelm_headloss_functionrX   rI  rg  	logsumexpr   r   powmeanr   r"  r   r2  )r   rX  r   r!  r"  rY  rh  r#  ri  r  outputsr   slice_indicesrk  rl  z_losss                   r8   r  zBambaForCausalLM.forward'  s6   H ,64:: ,
)%+',
 ,
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD&&*))b)1444::4FJJ1MRRTd55>>%#33!//))
 	
r7   c           
      h    | j                   j                  |d<   t        
|   |f||||||d|}	|	S )Nri  )r"  r   rY  r!  r#  is_first_iteration)rX   num_logits_to_keeprf   prepare_inputs_for_generation)r   rX  r"  r   rY  r!  r#  rw  r  model_inputsr   s             r8   ry  z.BambaForCausalLM.prepare_inputs_for_generationh  sU     $(;;#A#A w<	
+)'%1	
 	
 r7   )NNNNNNNr   )NNNNTF)r-   r.   r/   rg   r   r   r1   r2   r  r   r.  r,  r4   r   r  ry  r  r  s   @r8   re  re    s      .2.204(,26*.!%-.=
##d*=
 t+=
 &&-	=

 =
 ((4/=
   4'=
 $;=
 ell*=
 
 =
  =
D   r7   re  )rD  re  r0  )r$   )Er0   typingr   r1   r    r   r6  activationsr   cache_utilsr   r	   integrations.hub_kernelsr
   masking_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.import_utilsr   utils.output_capturingr   jamba.modeling_jambar   llama.modeling_llamar   r   r   r   r   r   mamba2.modeling_mamba2r   r    r!   r"   r#   configuration_bambar%   
get_loggerr-   r   r'   r;   rQ   rS   rU   ModulerW   r  r  r  r0  rD  re  __all__r6   r7   r8   <module>r     s2  &     & ! . 8 / O - & X X 7 9 5 =   - 
		H	%	 0	/ 	
#L	^ 		) 	
\O \O~	x 		< 	:02 :0z !? ! !. W% W Wt`' `F Er7   