
    i                         d Z ddlmZ ddlmZ ddlmZ ddlmZ e edd	       G d
 de                    Z	e edd	       G d de                    Z
 ed      e G d de                    ZdgZy)zDBRX model configuration    )strict   )PreTrainedConfig)RopeParameters)auto_docstringz4This config is used to instantiate attention layers.z$transformers-community/dbrx-instruct)custom_intro
checkpointc                   R    e Zd ZU dZdZdZeez  ed<   dZ	eez  dz  ed<   dZ
eed<   y)	DbrxAttentionConfigaz  
    attn_pdrop (`float`, *optional*, defaults to 0.0):
        The dropout probability for the attention layers.
    clip_qkv (`float`, *optional*):
        If set, clip the queries, keys, and values in the attention layer to this value.
    kv_n_heads (`int`, *optional*, defaults to 1):
        For grouped_query_attention only, allow user to specify number of kv heads.
    attn_config        
attn_pdropNclip_qkv   
kv_n_heads)__name__
__module____qualname____doc__base_config_keyr   floatint__annotations__r   r        |/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/dbrx/configuration_dbrx.pyr   r      s9     $O!J!#'HcEkD 'Jr   r   z6This config is used to instantiate feedforward layers.c                        e Zd ZU dZdZdZeed<   dZe	dz  ed<   dZ
eed<   d	Zeed
<   dZeed<   dZedz  ed<   dZeed<   dZedz  ed<    fdZ xZS )DbrxFFNConfiga  
    ffn_act_fn (`dict`, *optional*, defaults to `None`):
        A dict specifying activation function for the FFN.
        The dict should have a key 'name' with the value being the name of the activation function along with
        any additional keyword arguments. If `None`, then set to `{"name": "silu"}`.
    ffn_hidden_size (`int`, *optional*, defaults to 3584):
        The hidden size of the feedforward network.
    moe_num_experts (`int`, *optional*, defaults to 4):
        The number of experts in the mixture of experts layer.
    moe_top_k (`int`, *optional*, defaults to 1):
        The number of experts to use in the mixture of experts layer.
    moe_jitter_eps (`float`, *optional*, defaults to `None`):
        If not `None`, the jitter epsilon for the mixture of experts layer.
    moe_loss_weight (`float`, *optional*, defaults to 0.01):
        The loss weight for the mixture of experts layer.
    moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0):
        The normalization factor for the expert weights.
    
ffn_configi   hidden_sizeN
ffn_act_fni   ffn_hidden_size   moe_num_expertsr   	moe_top_kmoe_jitter_epsg{Gz?moe_loss_weightg      ?moe_normalize_expert_weightsc                     | j                   	ddi| _         dD ]  }||v s|j                  |        t        |      dk7  rt        d|      t	        |   di | y )Nnamesilu)
model_typeattn_implementationexperts_implementationtransformers_version_commit_hashtorch_dtypedtyper   zFound unknown kwargs=r   )r!   poplen
ValueErrorsuper__post_init__)selfkwargsk	__class__s      r   r7   zDbrxFFNConfig.__post_init__Q   so    ??"%v.DO
 
	A F{

1
	 v;!5fY788''r   )r   r   r   r   r   r    r   r   r!   dictr"   r$   r%   r&   r   r'   r(   r7   __classcell__r;   s   @r   r   r   -   sv    & #OK"Jt"OSOSIs#'NEDL'!OU!14 %$,4( (r   r   )r	   c                       e Zd ZU dZdZeedZdddddZd	Z	e
d
z  ed<   dZe
d
z  ed<   dZe
d
z  ed<   d	Ze
d
z  ed<   dZe
ed<   dZed
z  ed<   dZed
z  ed<   d
Zeez  d
z  ed<   d
Zeez  d
z  ed<   dZeed<   dZeed<   dZed
z  ed<   d
Zeez  d
z  ed<   d
Ze
d
z  ed<   d
Ze
d
z  ed<   d
Ze
ee
   z  d
z  ed<   dZ eed<    fdZ!d  Z" xZ#S )!
DbrxConfiga  
    max_seq_len (`int`, *optional*, defaults to 2048):
        The maximum sequence length of the model.
    attn_config (`dict`, *optional*):
        A dictionary used to configure the model's attention module.
    ffn_config (`dict`, *optional*):
        A dictionary used to configure the model's FFN module.

    Example:
    ```python
    >>> from transformers import DbrxConfig, DbrxModel

    >>> # Initializing a Dbrx configuration
    >>> configuration = DbrxConfig(n_layers=2, d_model=256, n_heads=8, vocab_size=128)

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = DbrxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    dbrx)r   r   n_headsd_modeln_layersmax_seq_len)num_attention_headsr    num_hidden_layersmax_position_embeddingsi   N      i }  
vocab_sizer   resid_pdrop	emb_pdropr   r   T	use_cacheg{Gz?initializer_rangeFoutput_router_logitsrope_parameterspad_token_idbos_token_ideos_token_idtie_word_embeddingsc                    | j                   t               | _         n4t        | j                   t              rt        di | j                   | _         | j                  t               | _        n4t        | j                  t              rt        di | j                  | _        | j                   j                  | _        t        | $  di | y )Nr   )
r   r   
isinstancer<   r   r   r   num_key_value_headsr6   r7   )r8   r9   r;   s     r   r7   zDbrxConfig.__post_init__   s    #24D(($/2FT5E5EFD??"+oDO.+>doo>DO#'#3#3#>#> ''r   c                 2    | j                   rt        d      y)zOPart of `@strict`-powered validation. Validates the architecture of the config.z5tie_word_embeddings is not supported for DBRX models.N)rU   r5   )r8   s    r   validate_architecturez DbrxConfig.validate_architecture   s    ##TUU $r   )$r   r   r   r   r,   r   r   sub_configsattribute_maprC   r   r   rB   rD   rE   rK   rL   r   rM   r   r<   r   rN   boolrO   rP   rQ   r   rR   rS   rT   listrU   r7   rZ   r=   r>   s   @r   r@   r@   f   s6   . J"5]SK( '#0	M GS4ZGS4ZHcDj"Kt"J #K#!Iut|!59K$t+d29.2J$t+2It#u#(-$+-48O^d*T18#L#*##L#*#+/L#S	/D(/ %%(Vr   r@   N)r   huggingface_hub.dataclassesr   configuration_utilsr   modeling_rope_utilsr   utilsr   r   r   r@   __all__r   r   r   <module>rd      s     . 3 1 # G5* 	 
" I51($ 1(	 
1(h ABDV! DV  CDVN .r   