
    i                     r    d Z ddlZddlmZ ddlmZ ddlmZ  ed      e G d	 d
e                    Zd
gZ	y)zJamba model configuration    N)strict   )PreTrainedConfig)auto_docstringzai21labs/Jamba-v0.1)
checkpointc                   H    e Zd ZU dZdZdgZddiZdZee	d<   dZ
ee	d	<   d
Zee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZedz  e	d <   d!Zedz  e	d"<   d#Zeee   z  dz  e	d$<   d%Zee	d&<   d'Zeez  e	d(<   d#Zee	d)<   d*Z ee	d<   d#Z!ee	d+<   d!Z"ee	d,<   dZ#ee	d-<   d.Z$ee	d/<   dZ%ee	d0<   d*Z&ee	d1<   d.Z'ee	d2<   d#Z(ee	d3<   d4Z)eez  e	d5<   dZ*ee	d6<   dZ+ee	d7<    fd8Z,e-d9        Z.e-d:        Z/e-d;        Z0d< Z1 xZ2S )=JambaConfiga   
    expert_layer_period (`int`, *optional*, defaults to 2):
        Once in this many layers, we will have an expert layer
    expert_layer_offset (`int`, *optional*, defaults to 1):
        The first layer index that contains an expert mlp layer
    attn_layer_period (`int`, *optional*, defaults to 8):
        Once in this many layers, we will have a vanilla attention layer
    attn_layer_offset (`int`, *optional*, defaults to 4):
        The first layer index that contains a vanilla attention mlp layer
    use_mamba_kernels (`bool`, *optional*, defaults to `True`):
        Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and
        `causal-conv1d` are installed, and the mamba modules are running on a CUDA device. Raises ValueError if
        `True` and kernels are not available
    mamba_dt_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
        Rank of the mamba discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
    jambapast_key_valuesnum_local_expertsnum_expertsi   
vocab_sizeFtie_word_embeddingsi   hidden_sizei 8  intermediate_size    num_hidden_layersnum_attention_heads   num_key_value_headssilu
hidden_actg{Gz?initializer_rangegư>rms_norm_epsT	use_cacheoutput_router_logitsgMbP?router_aux_loss_coefr   Npad_token_id   bos_token_id   eos_token_idi   max_position_embeddingsg        attention_dropoutnum_experts_per_tok   expert_layer_periodexpert_layer_offsetattn_layer_period   attn_layer_offsetuse_mamba_kernelsmamba_d_statemamba_d_convmamba_expandautomamba_dt_rankmamba_conv_biasmamba_proj_biasc                     | j                   | j                  | _         | j                  dk(  r"t        j                  | j
                  dz        n| j                  | _        t        |   di | y )Nr0   r&    )r   r   r1   mathceilr   super__post_init__)selfkwargs	__class__s     ~/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/jamba/configuration_jamba.pyr9   zJambaConfig.__post_init__R   s`    ##+'+'?'?D$AEASASW]A]TYYt'7'7"'<=cgcucu''    c                     t        | j                        D cg c]"  }|| j                  z  | j                  k(  rdnd$ c}S c c}w )N	attentionmamba)ranger   r)   r+   r:   is     r=   layers_block_typezJambaConfig.layers_block_typeY   sM     4112
 t5559O9OOKU\\
 	
 
s   'Ac                 R    | j                   }|D cg c]  }|dk(  rdn| c}S c c}w )Nr@   full_attention)rE   )r:   layer_typesxs      r=   rH   zJambaConfig.layer_types`   s0     ,,EPQA$4 !;QQQs   $c                     t        | j                        D cg c],  }|| j                  z  | j                  k(  r| j                  nd. c}S c c}w )Nr   )rB   r   r'   r(   r   rC   s     r=   layers_num_expertszJambaConfig.layers_num_expertsf   sS     4112
 !"D$<$< <@X@X XD^__
 	
 
s   1Ac                     | j                   | j                  k\  r&t        d| j                    d| j                   d      | j                  | j                  k\  r&t        d| j                   d| j                   d      y)zOPart of `@strict`-powered validation. Validates the architecture of the config.zattention layer offset (z/) must be smaller than attention layer period ()zexpert layer offset (z,) must be smaller than expert layer period (N)r+   r)   
ValueErrorr(   r'   )r:   s    r=   validate_architecturez!JambaConfig.validate_architecturem   s    !!T%;%;;*4+A+A*BBqrv  sI  sI  rJ  JK  L  ##t'?'??'(@(@'AAmnr  oG  oG  nH  HI  J  @r>   )3__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapr   int__annotations__r   boolr   r   r   r   r   r   strr   floatr   r   r   r   r   r    r"   listr#   r$   r%   r   r'   r(   r)   r+   r,   r-   r.   r/   r1   r2   r3   r9   propertyrE   rH   rK   rO   __classcell__)r<   s   @r=   r	   r	      s   " J#4"5]M J %%K"s"s!!  J#u#L%It!&$&"'%' L#*  L#* +,L#S	/D(,#)S)%(us{(  K    ss"t"M3L#L#%M39% OT !OT!( 
 
 R R
 
 

r>   r	   )
rS   r6   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r	   __all__r5   r>   r=   <module>rc      sM       . 3 # 01]" ]  2]@ /r>   