
    iBr                        d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddl	m
Z
mZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0  ejb                  e2      Z3 G d de(      Z4 G d de'      Z5 G d dejl                        Z7 G d de+      Z8 G d  d!e-      Z9 G d" d#ejl                        Z: G d$ d%e      Z; G d& d'e      Z<e;e<d(Z= G d) d*e      Z>e G d+ d,e>             Z? G d- d.e.      Z@ G d/ d0ee>      ZAg d1ZBy)2    )CallableN)nn   )initialization)ACT2FN)CacheDynamicCache)lazy_load_kernel)create_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)merge_with_config_defaults)resolve_internal_import)OutputRecordercapture_outputs   )LlamaAttentionLlamaRMSNormeager_attention_forward)
MistralMLP)MixtralExpertsMixtralForCausalLM   )JambaConfigc                       e Zd Zy)JambaRMSNormN__name__
__module____qualname__     x/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/jamba/modular_jamba.pyr$   r$   .       r*   r$   c                        e Zd Zdedef fdZ	 	 ddej                  dej                  dz  dedz  de	e
   d	eej                  ej                  dz  f   f
d
Z xZS )JambaAttentionconfig	layer_idxc                    t         |   ||       t        j                  |j                  |j
                  | j                  z  d      | _        t        j                  |j                  |j                  | j                  z  d      | _	        t        j                  |j                  |j                  | j                  z  d      | _
        t        j                  |j
                  | j                  z  |j                  d      | _        y NFbias)super__init__r   Linearhidden_sizenum_attention_headshead_dimq_projnum_key_value_headsk_projv_projo_proj)selfr/   r0   	__class__s      r+   r6   zJambaAttention.__init__3   s    +ii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii : :T]] JFL^L^ejkr*   Nhidden_statesattention_maskpast_key_valueskwargsreturnc                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| |j                  ||	| j                        \  }}	t        j                  | j                  j                  t              }
 |
| |||	|f| j                  sdn| j                  | j                   d|\  }} |j"                  g |d j%                         }| j'                  |      }||fS )Nr!   r           )dropoutscaling)shaper:   r;   view	transposer=   r>   updater0   r   get_interfacer/   _attn_implementationr   trainingattention_dropoutrK   reshape
contiguousr?   )r@   rB   rC   rD   rE   input_shapehidden_shapequery_states
key_statesvalue_statesattention_interfaceattn_outputattn_weightss                r+   forwardzJambaAttention.forward:   sq    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&'6'='=j,X\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r*   NN)r&   r'   r(   r"   intr6   torchTensorr   r   r   tupler^   __classcell__rA   s   @r+   r.   r.   2   s    l{ ls l /3(,	")||") t+") 	")
 +,") 
u||U\\D00	1")r*   r.   c                        e Zd ZdZdef fdZ	 	 ddej                  dedz  dej                  dz  fdZ
ddedz  dej                  dz  fd	Z	 	 ddedz  dej                  dz  fd
Z xZS )JambaMambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    r/   c           	         t         |           || _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  |j                  z  | _
        |j                  | _        |j                  | _        |j                  | _        t#        j$                  | j                  | j                  | j                  | j                  | j                  | j                  dz
        | _        |j(                  | _        t,        |j(                     | _        t#        j0                  | j                  | j                  dz  | j                         | _        t#        j0                  | j                  | j                  | j                  dz  z   d      | _        t#        j0                  | j                  | j                  d      | _        t9        j:                  d| j                  dz         d d d f   }|j=                  | j                  d      j?                         }t#        j@                  t9        jB                  |            | _"        t#        j@                  t9        jF                  | j                              | _$        t#        j0                  | j                  | j                  | j                         | _%        tM        | j                  |jN                        | _(        tM        | j                  |jN                        | _)        tM        | j                  |jN                        | _*        tW        d	      }tY        |d
d       a-tY        |dd       a.tW        d      }t_        |d      a0tY        |dd       a1tY        |dd       a2tg        t`        tb        t\        tZ        td        f      a4th        stj        jm                  d       y y )Nr!   )in_channelsout_channelsr4   kernel_sizegroupspaddingr   r3   FTrH   epszcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathselective_scan_fnmamba_inner_fna  The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d.)7r5   r6   r/   r0   r8   mamba_d_statessm_state_sizemamba_d_convconv_kernel_sizemamba_expandintermediate_sizemamba_dt_ranktime_step_rankmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr   Conv1dconv1d
hidden_act
activationr   actr7   in_projx_projdt_projra   arangeexpandrU   	ParameterlogA_logonesDout_projr$   rms_norm_epsdt_layernormb_layernormc_layernormr
   getattrrp   rq   r   selective_state_updaters   rt   allis_fast_path_availableloggerwarning_once)r@   r/   r0   Acausal_conv1d	mamba_ssmrA   s         r+   r6   zJambaMambaMixer.__init__g   s   "!--$22 & 3 3!'!4!4v7I7I!I$22#33..ii..//##--))))A-
 !++&++, yy!1!143I3IA3MTXTaTabii 6 68K8KdNaNadeNe8elqryy!4!4d6L6LSWX LLD//!34T1W=HHT++R0;;=\\%))A,/
ejj)?)?@A		$"8"8$:J:JQUQ^Q^_()<)<&BUBUV'(;(;ATATU'(;(;ATATU )9&}6LdS"=2DdK %[1	!8$^"
 $I/BDI ,<dC "%#%68HJ^`no"
 &R &r*   NrB   cache_paramsrC   c                 	   |j                   \  }}}|d uxr" |j                  | j                        xr |dk(  }| j                  |      j	                  dd      }|j                  dd      \  }}	|||j                  d      z  }| j                  j                  j                  | j                  j                  j                  d      | j                  j                  j                  d            }
|rot        |j                  d      |j                  | j                     j                  |
| j                  j                  | j                         }|j                  d      }n|Xt"        j$                  j'                  || j(                  |j                   d   z
  df      }|j+                  || j                         t-        ||
| j                  j                  | j                         }|||j                  d      z  }| j/                  |j	                  dd            }t1        j2                  || j4                  | j6                  | j6                  gd      \  }}}| j9                  |      }| j;                  |      }| j=                  |      }| j>                  j                  j@                  }t1        jB                         5  t1        jD                  | j>                  j                  j@                        | j>                  j                  _         d d d        | j?                  |      j	                  dd      }t1        jB                         5  || j>                  j                  _         d d d        t1        jF                  | jH                  jK                                }||jK                         nd }|rgtM        |j                  | j                     jN                  |d   |d   ||d d df   |d d df   | jP                  |	d   |d	
      j                  d      }nptS        ||||j	                  dd      |j	                  dd      | jP                  jK                         |	|dd

      \  }}|||jU                  || j                         | jW                  |j	                  dd            }|S # 1 sw Y   xY w# 1 sw Y   UxY w)Nr!   r   dimr   rH   )r   ).r   T)dt_softplus)delta_softplusreturn_last_state),rL   has_previous_stater0   r   rN   chunk	unsqueezer   weightrM   sizerp   squeezelayersconv_statesr4   r   r   
functionalpadrx   update_conv_staterq   r   ra   splitr|   rv   r   r   r   r   datano_grad
zeros_likeexpr   floatr   recurrent_statesr   rs   update_recurrent_stater   )r@   rB   r   rC   
batch_sizeseq_len_use_precomputed_statesprojected_statesgateconv_weightsr   ssm_parameters	time_stepBCtime_proj_biasdiscrete_time_stepr   scan_outputs	ssm_statecontextualized_statess                         r+   cuda_kernels_forwardz$JambaMambaMixer.cuda_kernels_forward   s8    "/!4!4
GQ$i)H)H)Xi]dhi]i 	  <<6@@AF /44QA4>t%)N,D,DQ,GGM {{))..t{{/A/A/F/Fq/I4;;K]K]KbKbcdKef!0%%b)##DNN3??  M *33B7M' mm//@U@UXeXkXklnXo@oqr?st..{DNNK,]L$++JZJZgkgvgvwM%)N,D,DQ,GGM ]%<%<Q%BC++T00$2E2EtGZGZ[ac
	1a %%i0	QQ **//]]_ 	N%*%5%5dll6G6G6L6L%MDLL"	N!\\)4>>q!D]]_ 	4%3DLL"	4 YYtzz'')**3A3M--/SW!1##DNN3DDf%"6*!Q$!Q$V  im  '8"Aq!Aq!#"&'#L) $)A33It~~N !%l.D.DQ.J K$$S	N 	N	4 	4s   AR1R>1R;>Sc           	      
   |j                   \  }}}|j                  }| j                  |      j                  dd      }|j	                  dd      \  }	}
||	|j                  d      z  }	|M|j                  | j                        r2|j                  | j                     j                  j                         }n9t        j                  || j                  | j                  f|	j                  |      }|I|j                  | j                        r|dk(  r|j!                  |	| j                        }t        j"                  || j$                  j&                  d d dd d f   z  d      }	| j(                  r|	| j$                  j*                  z  }	| j-                  |	      j/                  |      j                  d      }	nt0        j2                  j5                  |	| j6                  |	j                   d   z
  df      }|j!                  || j                        }| j-                  | j%                  |	      dd |f         }	n'| j-                  | j%                  |	      dd |f         }	||	|j                  d      z  }	| j9                  |	j                  dd            }t        j:                  || j<                  | j                  | j                  gd      \  }}}| j?                  |      }| jA                  |      }| jC                  |      }| jE                  |      }t0        j2                  jG                  |      j                  dd      }t        jH                  | jJ                  jM                                }t        jH                  |d d d d d d f   |d d d d d d d f   z        }|d d d d d d d f   |d d d d d d d f   jM                         z  }||	d d d d d d d f   jM                         z  }g }tO        |      D ]}  }|d d d d |d d f   |z  |d d d d |d d f   z   }t        jP                  |j/                  |      |d d |d d f   j                  d            }|jS                  |d d d d df           t        jT                  |d      }||	| jV                  d d d d f   z  z   }|| j-                  |
      z  }||jY                  || j                         | j[                  |j                  dd            }|S )Nr!   r   r   )devicedtyper   rH   .).rL   r   r   rN   r   r   r   r0   r   r   clonera   zerosrz   rv   r   r   sumr   r   r~   r4   r   tor   r   r   rx   r   r   r|   r   r   r   r   softplusr   r   r   rangematmulappendstackr   r   r   )r@   input_statesr   rC   r   r   r   r   r   rB   r   r   
conv_stater   r   r   r   r   r   
discrete_A
discrete_BdeltaB_ur   iscan_outputr   s                             r+   slow_forwardzJambaMambaMixer.slow_forward  s   !-!3!3
GQ""<<5??1E.44QA4>t%)N,D,DQ,GGM#(G(G(W$++DNN;LLRRTIT33T5H5HI$++5I #..t~~>7a<);;M4>>Z
 %		*t{{7I7I!QPQ'7R*RXZ [%%!T[[%5%55M $ 7 : :5 A K KB O]]..!**]-@-@-DDaH
 *;;JW
 $])CC'M)R S HHT[[%?XgX%NOM%)N,D,DQ,GGM ]%<%<Q%BC++T00$2E2EtGZGZ[ac
	1a %%i0	QQ!\\)4]]334FGQQRSUVW YYtzz'')**YYqq$!125G1aQU5VVW
'1a61dAq=9I9O9O9QQ
aAtm < B B DDw 	6A"1aA:.:XaAqj=QQI,,y||E':AaAgJ<P<PQS<TUKAq!G 45	6 kk,B7!]TVVD!TM5J%JK"TXXd^3#//	4>>J !%k.C.CAq.I J$$r*   c                 V   | j                   j                  rXt        r,d| j                  j                  j
                  j                  vr&t        j                  d       d| j                   _        | j                   j                  r| j                  |||      S | j                  |||      S )NcudazFast Mamba kernels are not available. Make sure that they are installed and that the mamba module is on a CUDA device. Turning off the fast path `config.use_mamba_kernels=False` and falling back to the slow path.F)r/   use_mamba_kernelsr   r   r   r   typer   r   r   r   )r@   rB   r   rC   s       r+   r^   zJambaMambaMixer.forward]  s     ;;((&&8J8J8Q8Q8V8V*VV
 -2DKK);;((,,]L.YY  nMMr*   r_   )r&   r'   r(   __doc__r"   r6   ra   rb   r   
LongTensorr   r   r^   rd   re   s   @r+   rg   rg   _   s    A{ AL &*26	c%||c% dlc% ((4/	c%LJ%ut| J%\a\l\los\s J%` &*26	N dlN ((4/	Nr*   rg   c                       e Zd Zy)JambaMLPNr%   r)   r*   r+   r   r   r  r,   r*   r   c                       e Zd Zy)JambaExpertsNr%   r)   r*   r+   r   r   v  r,   r*   r   c                   f     e Zd ZdZdef fdZd Zdej                  dej                  fdZ	 xZ
S )JambaSparseMoeBlocka  
    This implementation is
    strictly equivalent to standard MoE with full capacity (no
    dropped tokens). It's faster since it formulates MoE operations
    in terms of block-sparse operations to accommodate imbalanced
    assignments of tokens to experts, whereas standard MoE either
    (1) drop tokens at the cost of reduced performance or (2) set
    capacity factor to number of experts and thus waste computation
    and memory on padding.
    r/   c                 ,   t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        |      | _        y r2   )r5   r6   r8   
hidden_dimrz   ffn_dimnum_expertsnum_experts_per_toktop_kr   r7   routerr   expertsr@   r/   rA   s     r+   r6   zJambaSparseMoeBlock.__init__  sm     ,,//!--//
ii1A1AN#F+r*   c                     t         j                  j                  j                  |dt         j                        }t        j
                  || j                  d      \  }}||j                  |j                        fS )NrH   )r   r   r   )	ra   r   r   softmaxr   topkr   r   r   )r@   rB   router_logitsrouting_weightstop_k_weightstop_k_indexs         r+   route_tokens_to_expertsz+JambaSparseMoeBlock.route_tokens_to_experts  sb    ((--55mSXS^S^5_%*ZZQS%T"{M,,]-@-@AAAr*   rB   rF   c                     |j                   \  }}}|j                  d|      }| j                  |      }| j                  ||      \  }}| j	                  |||      }|j                  |||      }|S )NrH   )rL   rM   r   r   r   rT   )r@   rB   r   sequence_lengthr   r   r   r   s           r+   r^   zJambaSparseMoeBlock.forward  sx    2?2E2E/
OZ%**2z:M2%)%A%A-Q^%_"]]KO%--j/:Vr*   )r&   r'   r(   r   r"   r6   r   ra   rb   r^   rd   re   s   @r+   r   r   z  s5    	,{ ,B
U\\ ell r*   r   c                        e Zd Zdedef fdZ	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
ee   dej                  fdZ xZS )JambaAttentionDecoderLayerr/   r0   c                 R   t         |           |j                  r|j                  |   nd}t        ||      | _        |dkD  rt
        nt        } ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr!   rn   )r5   r6   layers_num_expertsr.   	self_attnr   r   feed_forwardr$   r8   r   input_layernormpre_ff_layernormr@   r/   r0   r   ffn_layer_classrA   s        r+   r6   z#JambaAttentionDecoderLayer.__init__  s    >D>W>Wf//	:]^'	:1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yr*   NrB   rC   position_idsrD   	use_cacherE   rF   c           	          |}| j                  |      } | j                  d|||||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)rB   rC   r  rD   r  r)   )r  r   r  r  )	r@   rB   rC   r  rD   r  rE   residualr   s	            r+   r^   z"JambaAttentionDecoderLayer.forward  s     !,,];)4>> 
')%+
 
q !=0 --m<))-8 =0r*   )NNNF)r&   r'   r(   r"   r`   r6   ra   rb   r   r   boolr   r   FloatTensorr^   rd   re   s   @r+   r   r     s    Z{ Zs Z /304(,!&|| t+ &&-	
  $; +, 
		r*   r   c                        e Zd Zdedef fdZ	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
e   d
ej                  fdZ xZS )JambaMambaDecoderLayerr/   r0   c                 T   t         |           |j                  r|j                  |   nd}t        ||      | _        |dkD  rt
        nt        } ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr!   )r/   r0   rn   )r5   r6   r   rg   mambar   r   r  r$   r8   r   r  r  r  s        r+   r6   zJambaMambaDecoderLayer.__init__  s    >D>W>Wf//	:]^$FiH
1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yr*   NrB   rC   r  rD   rE   rF   c                     |}| j                  |      }| j                  |||      }||z   }|}| j                  |      }| j                  |      }||z   }|S )N)rB   r   rC   )r  r  r  r  )r@   rB   rC   r  rD   rE   r	  s          r+   r^   zJambaMambaDecoderLayer.forward  sv     !,,];

'() # 

 !=0 --m<))-8 =0r*   )NNN)r&   r'   r(   r"   r`   r6   ra   rb   r   r   r   r   r  r^   rd   re   s   @r+   r  r    s    Z{ Zs Z /304(,|| t+ &&-	
  +, 
		r*   r  )	attentionr  c                        e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZeege eej"                  d      d	Z ej(                          fd
       Z xZS )JambaPreTrainedModelr/   modelTr   r  rD   r   )
layer_name)rB   
attentionsr   c                    t         |   |       t        |t              rt	        j
                  d|j                  dz         d d d f   }|j                  |j                  d      j                         }t        j                  |j                  t	        j                  |             t        j                  |j                         y t        |t               rmt        j"                  |j$                  d| j&                  j(                         t        j"                  |j*                  d| j&                  j(                         y y )Nr!   rH   rI   )meanstd)r5   _init_weights
isinstancerg   ra   r   rv   r   rz   rU   initcopy_r   r   ones_r   r   normal_gate_up_projr/   initializer_range	down_proj)r@   moduler   rA   s      r+   r  z"JambaPreTrainedModel._init_weights  s    f%fo.Q 5 5 9:47CA1126AACAJJv||UYYq\2JJvxx -LL,,3DKK<Y<YZLL))9V9VW .r*   )r&   r'   r(   r"   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr   r  r.   r   r   r7   _can_record_outputsra   r   r  rd   re   s   @r+   r  r    sw    &*#57OP"3NL46LM$'		hG U]]_	X 	Xr*   r  c                        e Zd Zdef fdZeee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	edz  d
ee   defd                     Zd Z xZS )
JambaModelr/   c                     t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        g }t        |j                        D ]1  }t        |j                  |      }|j                   |||             3 t        j                  |      | _        t!        |j                  |j"                        | _        d| _        | j)                          y )N)r0   rn   F)r5   r6   pad_token_idpadding_idx
vocab_sizer   	Embeddingr8   embed_tokensr   num_hidden_layersALL_DECODER_LAYER_TYPESlayers_block_typer   
ModuleListr   r$   r   final_layernormgradient_checkpointing	post_init)r@   r/   decoder_layersr   layer_classrA   s        r+   r6   zJambaModel.__init__  s     !.. ++LL):):F<N<NPTP`P`av//0 	DA1&2J2J12MNK!!+f"BC	D mmN3+F,>,>FDWDWX&+#r*   N	input_idsrC   r  rD   inputs_embedsr  rE   rF   c           	      2   |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|V||j	                         nd}t        j                  |j                  d   |j                        |z   }|j                  d      }t        | j                  ||||      }	| j                  ||      }
|}| j                  D ]$  }t        |t              r|
n|	} ||f||||d|}& | j                  |      }t!        ||      S )	Nz:You must specify exactly one of input_ids or inputs_embeds)r/   r   r!   )r   )r/   r?  rC   rD   r  )rC   r  rD   r  )last_hidden_staterD   )
ValueErrorr4  r	   r/   get_seq_lengthra   r   rL   r   r   r   _update_mamba_maskr   r  r  r9  r   )r@   r>  rC   r  rD   r?  r  rE   past_seen_tokenscausal_mask
mamba_maskrB   decoder_layer
layer_masks                 r+   r^   zJambaModel.forward  sJ    -t";<YZZ  --i8M0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 ,,^_M
%![[ 
	M'1-AW'X^iJ))) /# M
	 ,,];%++
 	
r*   c                 f    |}||j                         s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        Nr!   )r   ra   r   )r@   rC   rD   rG  s       r+   rD  zJambaModel._update_mamba_maskP  s;     $
'O,N,N,P&599^q5H+IJr*   )NNNNNN)r&   r'   r(   r"   r6   r   r   r   ra   r   rb   r   r  r
  r   r   r   r^   rD  rd   re   s   @r+   r.  r.    s    { $   .2.204(,26!%2
##d*2
 t+2
 &&-	2

 2
 ((4/2
 $;2
 +,2
 
 2
    2
hr*   r.  c                   $    e Zd Zdef fdZ	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	ej                  dz  d
e
dz  de
dz  deej                  z  dee   def fdZ xZS )JambaForCausalLMr/   c                 F    t         |   |       |j                  | _        y )N)r5   r6   r   r   s     r+   r6   zJambaForCausalLM.__init___  s     !--r*   Nr>  rC   r  rD   r?  labelsr  output_router_logitslogits_to_keeprE   rF   c
           
      2    t        |   ||||||||	fi |
S )aj  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, JambaForCausalLM

        >>> model = JambaForCausalLM.from_pretrained("ai21labs/Jamba-v0.1")
        >>> tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)r5   r^   )r@   r>  rC   r  rD   r?  rN  r  rO  rP  rE   rA   s              r+   r^   zJambaForCausalLM.forwardc  s9    F w

 

 
	
r*   )	NNNNNNNNr   )r&   r'   r(   r"   r6   ra   r   rb   r   r  r
  r`   r   r   r   r^   rd   re   s   @r+   rL  rL  ^  s    .{ . .2.204(,26*.!%,0-.-
##d*-
 t+-
 &&-	-

 -
 ((4/-
   4'-
 $;-
 #Tk-
 ell*-
 +,-
 
#-
 -
r*   rL  c                       e Zd Zy)JambaForSequenceClassificationNr%   r)   r*   r+   rS  rS    r,   r*   rS  )rL  rS  r.  r  )Ccollections.abcr   ra   r    r   r  activationsr   cache_utilsr   r	   integrationsr
   masking_utilsr   modeling_layersr   r   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   utils.import_utilsr   utils.output_capturingr   r   llama.modeling_llamar   r   r   mistral.modeling_mistralr   mixtral.modeling_mixtralr   r    configuration_jambar"   
get_loggerr&   r   r$   r.   Modulerg   r   r   r   r   r  r6  r  r.  rL  rS  __all__r)   r*   r+   <module>ri     s3  & %   & ! . , / [ Q F & @ @ 7 9 E X X 1 I , 
		H	%	< 	*)^ *)ZPNbii PNf	z 		> 	"")) "J#!; #L7 B )CMcd X? X8 U% U Up2
) 2
j	%EG[ 	 gr*   