
    ic                     r    d Z ddlZddlmZ ddlmZ ddlmZ  ed      e G d	 d
e                    Zd
gZ	y)zZamba model configuration    N)strict   )PreTrainedConfig)auto_docstringzZyphra/Zamba-7B-v1)
checkpointc                   H    e Zd ZU dZdZdgZdddZdZee	d<   d	Z
ee	d
<   dZee	d<   dZedz  e	d<   dZee	d<   dZee	d<   dZee	d<   dZedz  e	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   d	Zee	d <   d!Zee	d"<   d#Zedz  e	d$<   d!Zedz  e	d%<   dZeee   z  dz  e	d&<   d'Z ee	d(<   d)Z!eez  e	d*<   d+Z"ee	d,<   d-Z#ee	d.<   d	Z$ee	d/<   dZ%ee	d0<   d-Z&ee	d1<   dZ'ee	d2<   d3Z(eez  e	d4<   d5Z)ee	d6<   d7Z*ee	d8<   d9Z+ee	d:<   d	Z,ee	d;<   d<Z-ee	d=<    fd>Z.d? Z/d@ Z0 xZ1S )AZambaConfiga!  
    attention_hidden_size (`int`, *optional*):
        Dimension of the hidden representations of the inputs to the Attention layer.
    attention_head_dim (`int`, *optional*):
        Dimension of the attention head in the Transformer decoder.
    n_mamba_heads (`int`, *optional*, defaults to 2):
        Number of mamba heads for each mamba layer.
    hidden_mamba_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the mamba layer.
    num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
        Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an
        integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the
        logits of the last prompt token are needed for generation. For long sequences, the logits for the entire
        sequence may use a lot of memory so, setting `num_logits_to_keep=1` will reduce memory footprint
        significantly.
    attn_layer_period (`int`, *optional*, defaults to 6):
        Once in this many layers, we will have a shared attention layer
    attn_layer_offset (`int`, *optional*, defaults to 4):
        Offset of the shared attention layer
    use_mamba_kernels (`bool`, *optional*, defaults to `True`):
        Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and
        `causal-conv1d` are installed, and the mamba modules are running on a CUDA device. Raises ValueError if
        `True` and kernels are not available
    mamba_dt_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
        Rank of the mamba discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
    zambapast_key_valueslayers_block_typeattention_head_dim)layer_typeshead_dimi }  
vocab_sizeTtie_word_embeddingsi  hidden_sizeNattention_hidden_sizei :  intermediate_sizeL   num_hidden_layers   num_attention_headsnum_key_value_heads   n_mamba_headsgelu
hidden_actsiluhidden_mamba_actg{Gz?initializer_rangegh㈵>rms_norm_eps	use_cache   num_logits_to_keepr   pad_token_idbos_token_ideos_token_idi   max_position_embeddingsg        attention_dropout   attn_layer_period   attn_layer_offsetuse_mamba_kernelsmamba_d_statemamba_d_convmamba_expandautomamba_dt_rankgMbP?time_step_ming?time_step_maxg-C6?time_step_floormamba_conv_biasFmamba_proj_biasc                    | j                   xs d| j                  z  | _         | j                  xs d| j                  z  | j                  z  | _        | j                  dk(  r"t        j                  | j                  dz        n| j                  | _        | j                  | j                  | j                  | j                        | _        t        | 4  di | y )Nr   r2   r    )r   r   r   r   r3   mathceil_layers_block_typer   r+   r-   r   super__post_init__)selfkwargs	__class__s     ~/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/zamba/configuration_zamba.pyr?   zZambaConfig.__post_init__\   s    %)%?%?%W1tGWGWCW""&"9"9"mQAQAQ=QUYUmUm=mAEASASW]A]TYYt'7'7"'<=cgcucu!%!8!8""D$:$:D<R<R"
 	''    c                 l    | j                   | j                  z  | j                  z  dk7  rt        d      y)zOPart of `@strict`-powered validation. Validates the architecture of the config.r   z;`intermediate_size` should be divisible by `n_mamba_heads`.N)r1   r   r   
ValueError)r@   s    rC   validate_architecturez!ZambaConfig.validate_architecturee   s8     0 00D4F4FF!KZ[[ LrD   c                 f    g dt        |dz
        D cg c]  }||z  |k(  rdnd c}z   }|S c c}w )N)mambarI   hybridr   rJ   rI   )range)r@   r   r+   r-   ilayerss         rC   r=   zZambaConfig._layers_block_typej   sN    
 [``qtu`uZvwUV..2CCXPw	x
  xs   .)2__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapr   int__annotations__r   boolr   r   r   r   r   r   r   r   r   strr   r    floatr!   r"   r$   r%   r&   r'   listr(   r)   r+   r-   r.   r/   r0   r1   r3   r4   r5   r6   r7   r8   r?   rG   r=   __classcell__)rB   s   @rC   r	   r	      s   6 J#4"5$7EYZMJ $$K(,3:,"s"s!!%)d
)!!M3J"c"#u#L%It L#*  L#* +,L#S	/D(,#'S'%(us{(ss"t"M3L#L#%M39% M5 M5!OU! OT !OT!(\
rD   r	   )
rQ   r;   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r	   __all__r:   rD   rC   <module>r`      sM       . 3 # /0V" V  1Vr /rD   