
    i+                         d Z ddlmZ ddlmZ ddlmZ ddlmZ  ed      e G d	 d
e                    Z	 ed      e G d de                    Z
dgZy)zMpt configuration    )Literal)strict   )PreTrainedConfig)auto_docstringzmosaicml/mpt-7b)
checkpointc                       e Zd ZU dZdZdZed   ed<   dZe	ed<   dZ
eed	<   d
Zed
z  ed<   d
Zed
z  ed<   dZeed<   dZeed<   dZeed<   dZeed<   dZe	ed<   y
)MptAttentionConfigaD  
    attn_type (`str`, *optional*, defaults to `"multihead_attention"`):
        type of attention to use. Options: `"multihead_attention"`, `"multiquery_attention"`.
    attn_pdrop (`float`, *optional*, defaults to `0.0`):
        The dropout probability for the attention layers.
    attn_impl (`str`, *optional*, defaults to `"torch"`):
        The attention implementation to use. One of `"torch"`, `"flash"`, or `"triton"`.
    clip_qkv (`float`, *optional*):
        If not `None`, clip the queries, keys, and values in the attention layer to this value.
    softmax_scale (`float`, *optional*):
        If not `None`, scale the softmax in the attention layer by this value. If `None`, will default to
        `1/sqrt(hidden_size)`.
    prefix_lm (`bool`, *optional*, defaults to `False`):
        Whether the model should operate as a Prefix LM. This requires passing an extra `prefix_mask` argument
        which indicates which tokens belong to the prefix. Tokens in the prefix can attend to one another
        bi-directionally. Tokens outside the prefix use causal attention.
    qk_ln (`bool`, *optional*, defaults to `False`):
        Whether to apply layer normalization to the queries and keys in the attention layer.
    attn_uses_sequence_id (`bool`, *optional*, defaults to `False`):
        Whether to restrict attention to tokens that have the same token_type_ids. When the model is in `train`
        mode, this requires passing an extra *token_type_ids* argument which indicates which sub-sequence each
        token belongs to. Defaults to `False` meaning any provided *token_type_ids* will be ignored.
    alibi (`bool`, *optional*, defaults to `True`):
        Whether or not to use the alibi bias instead of positional embedding.
    alibi_bias_max (`int`, *optional*, defaults to 8):
        The maximum value of the alibi bias.
    attn_configmultihead_attention)r   multiquery_attention	attn_typer   
attn_pdroptorch	attn_implNclip_qkvsoftmax_scaleF	prefix_lmqk_lnattn_uses_sequence_idTalibi   alibi_bias_max)__name__
__module____qualname____doc__base_config_keyr   r   __annotations__r   intr   strr   floatr   r   boolr   r   r   r        z/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/mpt/configuration_mpt.pyr
   r
      s    8 $OH]IwDE]JIs!Hedl!"&M54<&ItE4"'4'E4NCr%   r
   c                       e Zd ZU dZdZdeiZddddZdZe	e
d<   d	Ze	e
d<   d
Ze	e
d<   dZe	e
d<   dZe	e
d<   dZe	e
d<   dZee	z  e
d<   dZee
d<   dZee	z  e
d<   dZee
d<   dZeez  dz  e
d<   dZee
d<   dZeez  dz  e
d<   dZee
d<   dZee
d<   dZee
d<   d Zee
d!<   d"Zee
d#<   dZ ee
d$<   dZ!e	dz  e
d%<   dZ"e	dz  e
d&<   dZ#e	e$e	   z  dz  e
d'<    fd(Z% xZ&S ))	MptConfigaZ  
    expansion_ratio (`int`, *optional*, defaults to 4):
        The ratio of the up/down scale in the MLP.
    max_seq_len (`int`, *optional*, defaults to 2048):
        The maximum sequence length of the model.
    layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
        The epsilon to use in the layer normalization layers.
    learned_pos_emb (`bool`, *optional*, defaults to `True`):
        Whether to use learned positional embeddings.
    attn_config (`dict`, *optional*):
        A dictionary used to configure the model's attention module.
    init_device (`str`, *optional*, defaults to `"cpu"`):
        The device to use for parameter initialization. Defined for backward compatibility
    logit_scale (`float`, *optional*):
        If not None, scale the logits by this value.
    no_bias (`bool`, *optional*, defaults to `True`):
        Whether to use bias in all linear layers.
    embedding_fraction (`float`, *optional*, defaults to 1.0):
        The fraction to scale the gradients of the embedding layer by.
    norm_type (`str`, *optional*, defaults to `"low_precision_layernorm"`):
        Type of layer norm to use. All MPT models uses the same layer norm implementation. Defined for backward
        compatibility.

    Example:

    ```python
    >>> from transformers import MptConfig, MptModel

    >>> # Initializing a Mpt configuration
    >>> configuration = MptConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = MptModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    mptr   n_headsd_modeln_layers)num_attention_headshidden_sizenum_hidden_layersi            expansion_ratiomax_seq_leni  
vocab_sizeg        resid_pdropgh㈵>layer_norm_epsilon	emb_pdropTlearned_pos_embNcpuinit_devicelogit_scaleno_biasg      ?embedding_fractionlow_precision_layernorm	norm_typeF	use_cacheg{Gz?initializer_rangetie_word_embeddingspad_token_idbos_token_ideos_token_idc                     | j                   t               | _         n4t        | j                   t              rt        di | j                   | _         t	        |   di | y )Nr$   )r   r
   
isinstancedictsuper__post_init__)selfkwargs	__class__s     r&   rK   zMptConfig.__post_init__   sP    #13D(($/1ED4D4DED''r%   )'r   r   r   r   
model_typer
   sub_configsattribute_mapr+   r    r   r*   r,   r3   r4   r5   r6   r"   r7   r8   r9   r#   r   rI   r;   r!   r<   r=   r>   r@   rA   rB   rC   rD   rE   rF   listrK   __classcell__)rN   s   @r&   r(   r(   E   sH   %N J "45K( 'M GSGSHcOSKJ"K" $$ Ius{  OT 48K**T18K&*Kt#*GT ##.Is.It#u# $$#L#*##L#*#+/L#S	/D(/( (r%   r(   N)r   typingr   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r
   r(   __all__r$   r%   r&   <module>rY      sx      . 3 # ,-() (  .(V ,-L(  L(  .L(^ -r%   