
    i                        d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddl	m
Z
mZ ddlmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z, ddl-m.Z.  e%j^                  e0      Z1 ed       G d dejd                               Z3d Z4 ed      dCd       Z5dejl                  de7dejl                  fdZ8	 dDdejd                  d ejl                  d!ejl                  d"ejl                  d#ejl                  dz  d$e9d%e9d&e e"   fd'Z: ee5       G d( d)ejd                               Z; G d* d+ejd                        Z< G d, d-ejd                        Z=e G d. d/ejd                               Z> G d0 d1ejd                        Z? G d2 d3e      Z@ G d4 d5e      ZA G d6 d7e      ZBe@eAd8ZCe# G d9 d:eB             ZD	 	 	 dEd;ejl                  eEejl                     z  dz  d<e7dz  d#ejl                  dz  dejl                  e7z  fd=ZFe# G d> d?eBe             ZG G d@ dAeeB      ZHg dBZIy)F    )CallableN)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)lazy_load_kerneluse_experts_implementationuse_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)resolve_internal_import)OutputRecordercapture_outputs   )JambaConfigRMSNormc                   h     e Zd Zddeddf fdZdej                  dej                  fdZd Z xZ	S )	JambaRMSNormepsreturnNc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z;
        JambaRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer%   	__class__s      y/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/jamba/modeling_jamba.pyr)   zJambaRMSNorm.__init__:   s1     	ll5::k#:; #    hidden_statesc                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )N   T)keepdim)	dtypetor+   float32powmeanrsqrtr.   r-   )r/   r4   input_dtypevariances       r2   forwardzJambaRMSNorm.forwardB   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r3   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler-   shaper.   )r/   s    r2   
extra_reprzJambaRMSNorm.extra_reprI   s*    ))*+6$2G2G1HIIr3   )gư>)
__name__
__module____qualname__floatr)   r+   TensorrA   rE   __classcell__r1   s   @r2   r$   r$   8   s7    $ $$ $;U\\ ;ell ;Jr3   r$   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr7   r6   dim)rD   r+   cat)xx1x2s      r2   rotate_halfrT   M   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r3   rotary_pos_embc                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezerT   )qkcossinunsqueeze_dimq_embedk_embeds          r2   apply_rotary_pos_embr_   T   sY    & --
&C
--
&C3w;q>C/0G3w;q>C/0GGr3   r4   n_repr&   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r    N)rD   expandreshape)r4   r`   batchnum_key_value_headsslenhead_dims         r2   	repeat_kvrh   n   so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr3   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
||
|z   }
t
        j                  j                  |
dt        j                        j                  |j                        }
t
        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr6   r   r7   rO   r9   )ptrainingr    )rh   num_key_value_groupsr+   matmul	transposer   
functionalsoftmaxr;   r:   r9   ro   rt   
contiguous)ri   rj   rk   rl   rm   rn   ro   rp   
key_statesvalue_statesattn_weightsattn_outputs               r2   eager_attention_forwardr   z   s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r3   c                        e Zd ZdZdedef fdZ	 	 ddej                  dej                  dz  de	dz  d	e
e   d
eej                  ej                  dz  f   f
dZ xZS )JambaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                    t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j                  | j                  z  |j
                  d      | _        y )Nrg   g      TFbias)r(   r)   r   r   getattrr0   num_attention_headsrg   re   ru   rn   attention_dropout	is_causalr   Linearq_projk_projv_projo_proj)r/   r   r   r1   s      r2   r)   zJambaAttention.__init__   s,   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii : :T]] JFL^L^ejkr3   Nr4   rm   past_key_valuesrp   r&   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| |j                  ||	| j                        \  }}	t        j                  | j                  j                  t              }
 |
| |||	|f| j                  sdn| j                  | j                   d|\  }} |j"                  g |d j%                         }| j'                  |      }||fS )Nr7   r    r6           )ro   rn   )rD   rg   r   viewrw   r   r   updater   r   get_interfacer   _attn_implementationr   rt   r   rn   rc   rz   r   )r/   r4   rm   r   rp   input_shapehidden_shapequery_statesr{   r|   attention_interfacer~   r}   s                r2   rA   zJambaAttention.forward   sq    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&'6'='=j,X\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r3   NN)rF   rG   rH   __doc__r!   intr)   r+   rJ   r   r   r   rC   rA   rK   rL   s   @r2   r   r      s    Gl{ ls l" /3(,	")||") t+") 	")
 +,") 
u||U\\D00	1")r3   r   c                        e Zd ZdZdef fdZ	 	 ddej                  dedz  dej                  dz  fdZ
ddedz  dej                  dz  fd	Z	 	 ddedz  dej                  dz  fd
Z xZS )JambaMambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    r   c           	         t         |           || _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  |j                  z  | _
        |j                  | _        |j                  | _        |j                  | _        t#        j$                  | j                  | j                  | j                  | j                  | j                  | j                  dz
        | _        |j(                  | _        t,        |j(                     | _        t#        j0                  | j                  | j                  dz  | j                         | _        t#        j0                  | j                  | j                  | j                  dz  z   d      | _        t#        j0                  | j                  | j                  d      | _        t9        j:                  d| j                  dz         d d d f   }|j=                  | j                  d      j?                         }t#        j@                  t9        jB                  |            | _"        t#        j@                  t9        jF                  | j                              | _$        t#        j0                  | j                  | j                  | j                         | _%        tM        | j                  |jN                        | _(        tM        | j                  |jN                        | _)        tM        | j                  |jN                        | _*        tW        d	      }tY        |d
d       a-tY        |dd       a.tW        d      }t_        |d      a0tY        |dd       a1tY        |dd       a2tg        t`        tb        t\        tZ        td        f      a4th        stj        jm                  d       y y )Nr    )in_channelsout_channelsr   kernel_sizegroupspaddingr6   r   FTr7   r%   zcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathselective_scan_fnmamba_inner_fna  The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d.)7r(   r)   r   r   r0   mamba_d_statessm_state_sizemamba_d_convconv_kernel_sizemamba_expandintermediate_sizemamba_dt_ranktime_step_rankmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr   Conv1dconv1d
hidden_act
activationr   actr   in_projx_projdt_projr+   arangerb   rz   r*   logA_logr,   Dout_projr$   rms_norm_epsdt_layernormb_layernormc_layernormr   r   r   r   r   selective_state_updater   r   allis_fast_path_availableloggerwarning_once)r/   r   r   Acausal_conv1d	mamba_ssmr1   s         r2   r)   zJambaMambaMixer.__init__   s   "!--$22 & 3 3!'!4!4v7I7I!I$22#33..ii..//##--))))A-
 !++&++, yy!1!143I3IA3MTXTaTabii 6 68K8KdNaNadeNe8elqryy!4!4d6L6LSWX LLD//!34T1W=HHT++R0;;=\\%))A,/
ejj)?)?@A		$"8"8$:J:JQUQ^Q^_()<)<&BUBUV'(;(;ATATU'(;(;ATATU )9&}6LdS"=2DdK %[1	!8$^"
 $I/BDI ,<dC "%#%68HJ^`no"
 &R &r3   Nr4   cache_paramsrm   c                 	   |j                   \  }}}|d uxr" |j                  | j                        xr |dk(  }| j                  |      j	                  dd      }|j                  dd      \  }}	|||j                  d      z  }| j                  j                  j                  | j                  j                  j                  d      | j                  j                  j                  d            }
|rot        |j                  d      |j                  | j                     j                  |
| j                  j                  | j                         }|j                  d      }n|Xt"        j$                  j'                  || j(                  |j                   d   z
  df      }|j+                  || j                         t-        ||
| j                  j                  | j                         }|||j                  d      z  }| j/                  |j	                  dd            }t1        j2                  || j4                  | j6                  | j6                  gd      \  }}}| j9                  |      }| j;                  |      }| j=                  |      }| j>                  j                  j@                  }t1        jB                         5  t1        jD                  | j>                  j                  j@                        | j>                  j                  _         d d d        | j?                  |      j	                  dd      }t1        jB                         5  || j>                  j                  _         d d d        t1        jF                  | jH                  jK                                }||jK                         nd }|rgtM        |j                  | j                     jN                  |d   |d   ||d d df   |d d df   | jP                  |	d   |d	
      j                  d      }nptS        ||||j	                  dd      |j	                  dd      | jP                  jK                         |	|dd

      \  }}|||jU                  || j                         | jW                  |j	                  dd            }|S # 1 sw Y   xY w# 1 sw Y   UxY w)Nr    r6   rN   r   r7   )r   ).r   T)dt_softplus)delta_softplusreturn_last_state),rD   has_previous_stater   r   rw   chunkrW   r   r-   r   sizer   squeezelayersconv_statesr   r   r   rx   padr   update_conv_stater   r   r+   splitr   r   r   r   r   r   datano_grad
zeros_likeexpr   rI   r   recurrent_statesr   r   update_recurrent_stater   )r/   r4   r   rm   
batch_sizeseq_len_use_precomputed_statesprojected_statesgateconv_weightsr   ssm_parameters	time_stepBCtime_proj_biasdiscrete_time_stepr   scan_outputs	ssm_statecontextualized_statess                         r2   cuda_kernels_forwardz$JambaMambaMixer.cuda_kernels_forward  s8    "/!4!4
GQ$i)H)H)Xi]dhi]i 	  <<6@@AF /44QA4>t%)N,D,DQ,GGM {{))..t{{/A/A/F/Fq/I4;;K]K]KbKbcdKef!0%%b)##DNN3??  M *33B7M' mm//@U@UXeXkXklnXo@oqr?st..{DNNK,]L$++JZJZgkgvgvwM%)N,D,DQ,GGM ]%<%<Q%BC++T00$2E2EtGZGZ[ac
	1a %%i0	QQ **//]]_ 	N%*%5%5dll6G6G6L6L%MDLL"	N!\\)4>>q!D]]_ 	4%3DLL"	4 YYtzz'')**3A3M--/SW!1##DNN3DDf%"6*!Q$!Q$V  im  '8"Aq!Aq!#"&'#L) $)A33It~~N !%l.D.DQ.J K$$S	N 	N	4 	4s   AR1R>1R;>Sc           	      
   |j                   \  }}}|j                  }| j                  |      j                  dd      }|j	                  dd      \  }	}
||	|j                  d      z  }	|M|j                  | j                        r2|j                  | j                     j                  j                         }n9t        j                  || j                  | j                  f|	j                  |      }|I|j                  | j                        r|dk(  r|j!                  |	| j                        }t        j"                  || j$                  j&                  d d dd d f   z  d      }	| j(                  r|	| j$                  j*                  z  }	| j-                  |	      j/                  |      j                  d      }	nt0        j2                  j5                  |	| j6                  |	j                   d   z
  df      }|j!                  || j                        }| j-                  | j%                  |	      dd |f         }	n'| j-                  | j%                  |	      dd |f         }	||	|j                  d      z  }	| j9                  |	j                  dd            }t        j:                  || j<                  | j                  | j                  gd      \  }}}| j?                  |      }| jA                  |      }| jC                  |      }| jE                  |      }t0        j2                  jG                  |      j                  dd      }t        jH                  | jJ                  jM                                }t        jH                  |d d d d d d f   |d d d d d d d f   z        }|d d d d d d d f   |d d d d d d d f   jM                         z  }||	d d d d d d d f   jM                         z  }g }tO        |      D ]}  }|d d d d |d d f   |z  |d d d d |d d f   z   }t        jP                  |j/                  |      |d d |d d f   j                  d            }|jS                  |d d d d df           t        jT                  |d      }||	| jV                  d d d d f   z  z   }|| j-                  |
      z  }||jY                  || j                         | j[                  |j                  dd            }|S )Nr    r6   rN   )devicer9   r   r7   .).rD   r9   r   rw   r   rW   r   r   r   r   cloner+   zerosr   r   r   r   sumr   r-   r   r   r   r:   r   rx   r   r   r   r   r   r   r   r   r   softplusr   r   rI   rangerv   appendstackr   r   r   )r/   input_statesr   rm   r   r   r   r9   r   r4   r   r   
conv_stater   r   r   r   r   r   
discrete_A
discrete_BdeltaB_ur   iscan_outputr   s                             r2   slow_forwardzJambaMambaMixer.slow_forward{  s   !-!3!3
GQ""<<5??1E.44QA4>t%)N,D,DQ,GGM#(G(G(W$++DNN;LLRRTIT33T5H5HI$++5I #..t~~>7a<);;M4>>Z
 %		*t{{7I7I!QPQ'7R*RXZ [%%!T[[%5%55M $ 7 : :5 A K KB O]]..!**]-@-@-DDaH
 *;;JW
 $])CC'M)R S HHT[[%?XgX%NOM%)N,D,DQ,GGM ]%<%<Q%BC++T00$2E2EtGZGZ[ac
	1a %%i0	QQ!\\)4]]334FGQQRSUVW YYtzz'')**YYqq$!125G1aQU5VVW
'1a61dAq=9I9O9O9QQ
aAtm < B B DDw 	6A"1aA:.:XaAqj=QQI,,y||E':AaAgJ<P<PQS<TUKAq!G 45	6 kk,B7!]TVVD!TM5J%JK"TXXd^3#//	4>>J !%k.C.CAq.I J$$r3   c                 V   | j                   j                  rXt        r,d| j                  j                  j
                  j                  vr&t        j                  d       d| j                   _        | j                   j                  r| j                  |||      S | j                  |||      S )NcudazFast Mamba kernels are not available. Make sure that they are installed and that the mamba module is on a CUDA device. Turning off the fast path `config.use_mamba_kernels=False` and falling back to the slow path.F)r   use_mamba_kernelsr   r   r-   r   typer   r   r   r  )r/   r4   r   rm   s       r2   rA   zJambaMambaMixer.forward  s     ;;((&&8J8J8Q8Q8V8V*VV
 -2DKK);;((,,]L.YY  nMMr3   r   )rF   rG   rH   r   r!   r)   r+   rJ   r   
LongTensorr   r  rA   rK   rL   s   @r2   r   r      s    A{ AL &*26	c%||c% dlc% ((4/	c%LJ%ut| J%\a\l\los\s J%` &*26	N dlN ((4/	Nr3   r   c                   $     e Zd Z fdZd Z xZS )JambaMLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        y NFr   )r(   r)   r   r0   r   r   r   	gate_projup_proj	down_projr   r   act_fnr/   r   r1   s     r2   r)   zJambaMLP.__init__  s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../r3   c                     | j                  | j                  | j                  |            | j                  |      z        }|S )N)r  r  r  r  )r/   rQ   r  s      r2   rA   zJambaMLP.forward  s6    NN4;;t~~a/@#ADLLQRO#ST	r3   )rF   rG   rH   r)   rA   rK   rL   s   @r2   r  r    s    0r3   r  c                        e Zd ZdZdef fdZdej                  dej                  dej                  dej                  fdZ xZ	S )	JambaExpertsz2Collection of expert weights stored as 3D tensors.r   c                    t         |           |j                  | _        |j                  | _        |j                  | _        t        j                  t        j                  | j                  d| j                  z  | j
                              | _        t        j                  t        j                  | j                  | j
                  | j                              | _        t        |j                     | _        y )Nr6   )r(   r)   num_local_expertsnum_expertsr0   
hidden_dimr   intermediate_dimr   r*   r+   emptygate_up_projr  r   r   r  r  s     r2   r)   zJambaExperts.__init__  s    !33 ,, & 8 8LLT5E5Eq4K`K`G`bfbqbq)rsekk$2B2BDOOUYUjUj&klV../r3   r4   top_k_indextop_k_weightsr&   c                 f   t        j                  |      }t        j                         5  t         j                  j                  j                  || j                        }|j                  ddd      }t        j                  |j                  d      d      j                         }d d d        D ]  }|d   }|| j                  k(  rt        j                  |         \  }}	||	   }
t        j                  j                  |
| j                  |         j                  dd      \  }}| j                  |      |z  }t        j                  j                  || j                   |         }|||	|d f   z  }|j#                  d|	|j%                  |j&                                |S # 1 sw Y   xY w)N)num_classesr6   r    r   )r7   rN   r7   )r+   r   r   r   rx   one_hotr  permutegreaterr   nonzerowherelinearr  r   r  r  
index_add_r:   r9   )r/   r4   r  r   final_hidden_statesexpert_mask
expert_hit
expert_idx	top_k_pos	token_idxcurrent_stater   upcurrent_hidden_statess                 r2   rA   zJambaExperts.forward  s    $..}=]]_ 	S((--55ktO_O_5`K%--aA6K{8'DaHPPRJ	S
 % 
	nJ#AJT---#(;;{:/F#G Iy))4M}}++M4;L;LZ;XY__`agi_jHD"$(KK$5$:!$&MM$8$89NPTP^P^_iPj$k!$9M)U^`dJd<e$e!**1i9N9Q9QReRkRk9lm
	n #"#	S 	Ss   A=F&&F0)
rF   rG   rH   r   r!   r)   r+   rJ   rA   rK   rL   s   @r2   r  r    sM    <0{ 0#||# \\# ||	#
 
#r3   r  c                   f     e Zd ZdZdef fdZd Zdej                  dej                  fdZ	 xZ
S )JambaSparseMoeBlocka  
    This implementation is
    strictly equivalent to standard MoE with full capacity (no
    dropped tokens). It's faster since it formulates MoE operations
    in terms of block-sparse operations to accommodate imbalanced
    assignments of tokens to experts, whereas standard MoE either
    (1) drop tokens at the cost of reduced performance or (2) set
    capacity factor to number of experts and thus waste computation
    and memory on padding.
    r   c                 ,   t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        |      | _        y r  )r(   r)   r0   r  r   ffn_dimr  num_experts_per_toktop_kr   r   routerr  expertsr  s     r2   r)   zJambaSparseMoeBlock.__init__!  sm     ,,//!--//
ii1A1AN#F+r3   c                     t         j                  j                  j                  |dt         j                        }t        j
                  || j                  d      \  }}||j                  |j                        fS )Nr7   rr   rN   )	r+   r   rx   ry   rI   topkr9  r:   r9   )r/   r4   router_logitsrouting_weightsr   r  s         r2   route_tokens_to_expertsz+JambaSparseMoeBlock.route_tokens_to_experts+  sb    ((--55mSXS^S^5_%*ZZQS%T"{M,,]-@-@AAAr3   r4   r&   c                     |j                   \  }}}|j                  d|      }| j                  |      }| j                  ||      \  }}| j	                  |||      }|j                  |||      }|S )Nr7   )rD   r   r:  r@  r;  rc   )r/   r4   r   sequence_lengthr  r>  r  r   s           r2   rA   zJambaSparseMoeBlock.forward0  sx    2?2E2E/
OZ%**2z:M2%)%A%A-Q^%_"]]KO%--j/:Vr3   )rF   rG   rH   r   r!   r)   r@  r+   rJ   rA   rK   rL   s   @r2   r5  r5    s5    	,{ ,B
U\\ ell r3   r5  c                        e Zd Zdedef fdZ	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
ee   dej                  fdZ xZS )JambaAttentionDecoderLayerr   r   c                 R   t         |           |j                  r|j                  |   nd}t        ||      | _        |dkD  rt
        nt        } ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr    r   )r(   r)   layers_num_expertsr   	self_attnr5  r  feed_forwardr$   r0   r   input_layernormpre_ff_layernormr/   r   r   r  ffn_layer_classr1   s        r2   r)   z#JambaAttentionDecoderLayer.__init__;  s    >D>W>Wf//	:]^'	:1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yr3   Nr4   rm   position_idsr   	use_cacherp   r&   c           	          |}| j                  |      } | j                  d|||||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r4   rm   rM  r   rN   )rI  rG  rJ  rH  )	r/   r4   rm   rM  r   rN  rp   residualr   s	            r2   rA   z"JambaAttentionDecoderLayer.forwardE  s     !,,];)4>> 
')%+
 
q !=0 --m<))-8 =0r3   )NNNF)rF   rG   rH   r!   r   r)   r+   rJ   r  r   boolr   r   FloatTensorrA   rK   rL   s   @r2   rD  rD  :  s    Z{ Zs Z /304(,!&|| t+ &&-	
  $; +, 
		r3   rD  c                        e Zd Zdedef fdZ	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
e   d
ej                  fdZ xZS )JambaMambaDecoderLayerr   r   c                 T   t         |           |j                  r|j                  |   nd}t        ||      | _        |dkD  rt
        nt        } ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr    )r   r   r   )r(   r)   rF  r   mambar5  r  rH  r$   r0   r   rI  rJ  rK  s        r2   r)   zJambaMambaDecoderLayer.__init__a  s    >D>W>Wf//	:]^$FiH
1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yr3   Nr4   rm   rM  r   rp   r&   c                     |}| j                  |      }| j                  |||      }||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r4   r   rm   )rI  rW  rJ  rH  )r/   r4   rm   rM  r   rp   rQ  s          r2   rA   zJambaMambaDecoderLayer.forwardj  sv     !,,];

'() # 

 !=0 --m<))-8 =0r3   )NNN)rF   rG   rH   r!   r   r)   r+   rJ   r  r   r   r   rS  rA   rK   rL   s   @r2   rU  rU  `  s    Z{ Zs Z /304(,|| t+ &&-	
  +, 
		r3   rU  c                        e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZeege eej"                  d      d	Z ej(                          fd
       Z xZS )JambaPreTrainedModelr   modelTrD  rU  r   r:  )
layer_name)r4   
attentionsr>  c                    t         |   |       t        |t              rt	        j
                  d|j                  dz         d d d f   }|j                  |j                  d      j                         }t        j                  |j                  t	        j                  |             t        j                  |j                         y t        |t               rmt        j"                  |j$                  d| j&                  j(                         t        j"                  |j*                  d| j&                  j(                         y y )Nr    r7   r   )r=   std)r(   _init_weights
isinstancer   r+   r   r   rb   r   rz   initcopy_r   r   ones_r   r  normal_r  r   initializer_ranger  )r/   ri   r   r1   s      r2   r`  z"JambaPreTrainedModel._init_weights  s    f%fo.Q 5 5 9:47CA1126AACAJJv||UYYq\2JJvxx -LL,,3DKK<Y<YZLL))9V9VW .r3   )rF   rG   rH   r!   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulrD  rU  r   r   r   r   _can_record_outputsr+   r   r`  rK   rL   s   @r2   rZ  rZ    sw    &*#57OP"3NL46LM$'		hG U]]_	X 	Xr3   rZ  )	attentionrW  c                        e Zd Zdef fdZeee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	edz  d
ee   defd                     Zd Z xZS )
JambaModelr   c                     t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        g }t        |j                        D ]1  }t        |j                  |      }|j                   |||             3 t        j                  |      | _        t!        |j                  |j"                        | _        d| _        | j)                          y )N)r   r   F)r(   r)   pad_token_idpadding_idx
vocab_sizer   	Embeddingr0   embed_tokensr   num_hidden_layersALL_DECODER_LAYER_TYPESlayers_block_typer   
ModuleListr   r$   r   final_layernormgradient_checkpointing	post_init)r/   r   decoder_layersr  layer_classr1   s        r2   r)   zJambaModel.__init__  s     !.. ++LL):):F<N<NPTP`P`av//0 	DA1&2J2J12MNK!!+f"BC	D mmN3+F,>,>FDWDWX&+#r3   N	input_idsrm   rM  r   inputs_embedsrN  rp   r&   c           	      2   |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|V||j	                         nd}t        j                  |j                  d   |j                        |z   }|j                  d      }t        | j                  ||||      }	| j                  ||      }
|}| j                  D ]$  }t        |t              r|
n|	} ||f||||d|}& | j                  |      }t!        ||      S )	Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   r    )r   )r   r  rm   r   rM  )rm   rM  r   rN  )last_hidden_stater   )
ValueErrorrx  r	   r   get_seq_lengthr+   r   rD   r   rW   r   _update_mamba_maskr   ra  rU  r}  r   )r/   r  rm   rM  r   r  rN  rp   past_seen_tokenscausal_mask
mamba_maskr4   decoder_layer
layer_masks                 r2   rA   zJambaModel.forward  sJ    -t";<YZZ  --i8M0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 ,,^_M
%![[ 
	M'1-AW'X^iJ))) /# M
	 ,,];%++
 	
r3   c                 f    |}||j                         s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        Nr    )r   r+   r   )r/   rm   r   r  s       r2   r  zJambaModel._update_mamba_mask  s;     $
'O,N,N,P&599^q5H+IJr3   )NNNNNN)rF   rG   rH   r!   r)   r   r   r   r+   r  rJ   r   rS  rR  r   r   r   rA   r  rK   rL   s   @r2   rr  rr    s    { $   .2.204(,26!%2
##d*2
 t+2
 &&-	2

 2
 ((4/2
 $;2
 +,2
 
 2
    2
hr3   rr  gate_logitsr  c                    | t        | t              syt        | t              rC| d   j                  }t        j                  | D cg c]  }|j                  |       c}d      }t        j                  j                  j                  d      }t        j                  ||d      \  }}	t        j                  j                  j                  |	|      }
|>t        j                  |
j                         d      }t        j                  |d      }n|j                  \  }}|j                  d   ||z  z  }|dddddddf   j                  |||||f      j                  d||      j                        }t        j                   |
j                         |z  d      t        j                   |d      z  }|ddddddf   j                  ||||f      j                  d|      j                  |      }t        j                   ||z  d      t        j                   |d      z  }t        j                   ||j#                  d      z        }||z  S c c}w )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   rN   r7   )ra  rC   r   r+   rP   r:   r   rx   ry   r=  r$  r=   rI   rD   rb   rc   r   rW   )r  r  r9  rm   compute_device
layer_gateconcatenated_gate_logitsr?  r   selected_expertsr,  tokens_per_expertrouter_prob_per_expertr   rB  ry  expert_attention_mask router_per_expert_attention_maskoverall_losss                      r2   load_balancing_loss_funcr    s9   : *[%"@+u%$Q..#(99^i-jPZjmmN.K-jpq#r hh))112JPR1SO**_eDA((%%--.>LK!JJ{'8'8':B "'O!C&4&:&:#
O4::1=*B^_ 4AtT12V&
OUKXYWR,R	 	 "IIk&7&7&9<Q&QWXY\a\e\e!q]
 
 4At+,V&
O[QRWR%R	 	) "'?=]+]cd!ehmhqhq,!i
 "
 99.1G1Q1QRS1TTUL+%%[ .ks   Ic                   T    e Zd ZddiZddiZddgdgfiZdef fdZee		 	 	 	 	 	 	 	 	 dd
e
j                  d	z  de
j                  d	z  de
j                  d	z  ded	z  de
j                  d	z  de
j                  d	z  ded	z  ded	z  dee
j                  z  dee   defd              Z xZS )JambaForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr4   logitsr   c                 N   t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        |j                  | _	        |j                  | _
        |j                  | _        | j                          y r  )r(   r)   rr  r[  rv  r   r   r0   r  router_aux_loss_coefr  r8  r  r  s     r2   r)   zJambaForCausalLM.__init__Q  s     '
 ++yy!3!3V5F5FUS$*$?$?!!--#)#=#=  	r3   Nr  rm   rM  r   r  labelsrN  output_router_logitslogits_to_keeprp   r&   c
                 j   ||n| j                   j                  } | j                  d|||||||d|
}|j                  }t	        |	t
              rt        |	 d      n|	}| j                  |dd|ddf         }d}| | j                  ||| j                  fi |
}d}|rYt        |j                  | j                  | j                  |      }|+|| j                  |j                  |j                         z  z  }t#        ||||j$                  |j&                  |j(                  |j                        S )aj  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, JambaForCausalLM

        >>> model = JambaForCausalLM.from_pretrained("ai21labs/Jamba-v0.1")
        >>> tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r  rm   rM  r   r  rN  r  )lossaux_lossr  r   r4   r]  r>  rP  )r   r  r[  r  ra  r   slicer  loss_functionrv  r  r>  r  r8  r  r:   r   r   r   r4   r]  )r/   r  rm   rM  r   r  r  rN  r  r  rp   outputsr4   slice_indicesr  r  r  s                    r2   rA   zJambaForCausalLM.forward]  sU   N %9$D $++JjJj 	
 +5$** 	+
)%+'!5	+
 	+
  118B>SV8W~ot4]kmA}a,?@A%4%%ffdooPPD/%%  ((	H !11HKK4LLL(#33!//))!//
 	
r3   )	NNNNNNNNr   )rF   rG   rH   _tied_weights_keys_tp_plan_pp_planr!   r)   r   r   r+   r  rJ   r   rS  rR  r   r   r   r   rA   rK   rL   s   @r2   r  r  K  s<   *,GH23H_-z:;H
{ 
  .2.204(,26*.!%,0-.P
##d*P
 t+P
 &&-	P

 P
 ((4/P
   4'P
 $;P
 #TkP
 ell*P
 +,P
 
#P
  P
r3   r  c                       e Zd Zy)JambaForSequenceClassificationN)rF   rG   rH   rP  r3   r2   r  r    s    r3   r  )r  r  rr  rZ  )r    )r   )Nr6   N)Jcollections.abcr   r+   r    r   rb  activationsr   cache_utilsr   r	   
generationr
   integrationsr   r   r   r   r   masking_utilsr   modeling_layersr   r   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.import_utilsr   utils.output_capturingr   r   configuration_jambar!   
get_loggerrF   r   Moduler$   rT   r_   rJ   r   rh   rI   r   r   r   r  r  r5  rD  rU  rZ  rz  rr  rC   r  r  r  __all__rP  r3   r2   <module>r     s  2 %   & ! . )  0 [ Q F & R R 7 9 E , 
		H	% Y'J299 J (J(( *+ ,2	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2 )*3)RYY 3) +3)lPNbii PNfryy   $#299 $# $#N"")) "J#!; #L7 BX? X8 )CMcd  U% U Ut #
*.	O&ell 33d:O&tO& LL4'	O&
 \\CO&d c
+_ c
 c
L	%EG[ 	 gr3   