
    i                     r    d dl mZ ddlmZ ddlmZ ddlmZ  ed      e G d d	e                    Zd	gZ	y
)    )strict   )PreTrainedConfig)RopeParameters)auto_docstringzZyphra/Zamba2-2.7B)
checkpointc                       e Zd ZU dZdZdddZdgZdZee	d<   d	Z
ee	d
<   dZee	d<   dZee	d<   dZee   dz  e	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee   eedf   z  dz  e	d<   d Zee	d!<   d"Zee	d#<   d$Zee	d%<   d&Zee	d'<   d&Zee	d(<   dZ edz  e	d)<   d*Z!ee	d+<   d,Z"ee	d-<   dZ#edz  e	d.<   d/Z$eez  e	d0<   dZ%ee	d1<   d&Z&ee	d2<   d3Z'ee	d4<   d&Z(ee	d5<   dZ)e*e+z  dz  e	d6<   d7Z,ee	d8<   d9Z-ee	d:<   d"Z.ee	d;<   dZ/ee	d<<   d=Z0edz  e	d><   dZ1edz  e	d?<   dZ2eee   z  dz  e	d@<   d&Z3ee	dA<   d"Z4ee	dB<    fdCZ5 xZ6S )DZamba2Configaj	  
    mamba_ngroups (`int`, *optional*, defaults to 1):
        Number of groups for the evolution matrices of mamba 2.
    n_mamba_heads (`int`, *optional*, defaults to 8):
        Number of heads for the evolution matrices of mamba 2.
    use_conv_bias (`bool`, *optional*, defaults to `True`):
        Whether or not to use bias in the convolution layer of the mixer block.
    chunk_size (`int`, *optional*, defaults to 256):
        Size of the chunks that will comprise the sequence.
    use_mem_eff_path (`bool`, *optional*, defaults to `False`):
        Whether or not to use the fused conv1d and scan in mamba2 layers.
    add_bias_linear (`bool`, *optional*, defaults to `False`):
        Flag indicating whether or not to use bias in various layers
    num_mem_blocks (`int`, *optional*, defaults to 1):
        Number of unshared transformer blocks.
    use_shared_attention_adapter (`bool`, *optional*, defaults to `False`):
        If True, unshared adapters (formally the same as LoRA but used in the base model) will be added to the q, k, v projectors in the shared attention layers.
    adapter_rank (`int`, *optional*, defaults to 128):
        Rank of the adapter in the shared MLP and shared attention layers.
    use_mem_rope (`bool`, *optional*, defaults to `False`):
        If True, includes RoPE in the shared attention layers.
    num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
        Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an
        integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the
        logits of the last prompt token are needed for generation. For long sequences, the logits for the entire
        sequence may use a lot of memory so, setting `num_logits_to_keep=1` will reduce memory footprint
        significantly.
    use_long_context (`bool`, *optional*, defaults to `False`):
        Activates the context-extended version of Zamba by modifying RoPE.

    Example:
    ```python
    >>> from transformers import Zamba2Model, Zamba2Config
    >>> # Initializing a Zamba2-2.7B style configuration
    >>> configuration = Zamba2Config()
    >>> # Initializing a model from the Zamba2-2.7B style configuration
    >>> model = Zamba2Model(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```zamba2layers_block_typeattention_head_dim)layer_typeshead_dimpast_key_valuesi }  
vocab_sizei   max_position_embeddingsi 
  hidden_size6   num_hidden_layersN@   mamba_d_state   mamba_d_conv   mamba_expand   mamba_ngroupsgMbP?time_step_ming?time_step_maxg-C6?time_step_floor.time_step_limit   n_mamba_headsTuse_conv_bias   
chunk_sizeFuse_mem_eff_pathadd_bias_linearintermediate_sizegelu
hidden_act    num_attention_headsnum_key_value_headsg        attention_dropoutnum_mem_blocksuse_shared_attention_adapter   adapter_rankuse_mem_roperope_parametersg{Gz?initializer_rangegh㈵>rms_norm_eps	use_cachenum_logits_to_keepr   pad_token_idbos_token_ideos_token_iduse_long_contexttie_word_embeddingsc                    | j                   xs d| j                  z  | _         d| j                  z  | _        d| j                  z  | j                  z  | _        t        | j                  | j                  z        | j                  z  | _        | j                  rd| _
        | j                  | j                  | _        | j                  | j                  z  | _        | j                  | _        | j                  3dgdgdz  dgz   dz  z   dgdz  z   dgz   dgdz  z   dgz   dgdz  z   | _        t        | j                        D cg c]  \  }}|dk(  s| c}}| _        t#        | H  d	i | y c c}}w )
Nr   r   i @  mamba   hybrid   r    )r)   r   attention_hidden_sizer-   r   intr   r#   mamba_headdimr=   r   r.   kv_channelsnum_query_groupsr   	enumeratehybrid_layer_idssuper__post_init__)selfkwargsindextype	__class__s       /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/zamba2/configuration_zamba2.pyrM   zZamba2Config.__post_init__n   s   !%!7!7!O1t?O?O;O%&)9)9%9""#d&6&6"6$:R:R"R !2!2T5E5E!EF$J\J\\  +0D(##+'+'?'?D$++t/G/GG $ 8 8 !!)	9q=H:-23)a-  * )a-	 
 * )a-  " ;DDDZDZ:[ p;5$_cgo_o p'' !qs   E.E.)7__name__
__module____qualname____doc__
model_typeattribute_mapkeys_to_ignore_at_inferencer   rF   __annotations__r   r   r   r   liststrr   r   r   r   r   floatr   r    r!   tupler#   r$   boolr&   r'   r(   r)   r+   r-   r.   r/   r0   r1   r3   r4   r5   r   dictr6   r7   r8   r9   r:   r;   r<   r=   r>   rM   __classcell__)rR   s   @rS   r
   r
      s   'R J$7EYZM#4"5J#'S'Ks*.tCy4'.M3L#L#M3 M5 M5!OU!>BOT%[5#44t;BM3M4J"d"!OT!$(sTz(J!!&*t*%(us{(NC). $.L#L$48O^d*T18#u#L%It L#*  L#* +,L#S	/D(,"d" $$( (    r
   N)
huggingface_hub.dataclassesr   configuration_utilsr   modeling_rope_utilsr   utilsr   r
   __all__rD   rc   rS   <module>ri      sK   " / 3 1 # /0n(# n(  1n(b 
rc   