
    i                     r    d dl mZ d dlmZ ddlmZ ddlmZ  ed      e G d d	e                    Zd	gZ	y
)    )Literal)strict   )PreTrainedConfig)auto_docstringzanswerdotai/ModernBERT-base)
checkpointc                       e Zd ZU dZdZdgZdddZdZee	d<   d	Z
ee	d
<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZedz  e	d<   d Zeee   z  dz  e	d!<   d"Zedz  e	d#<   d"Zedz  e	d$<   d Zedz  e	d%<   dZee	d&<   d'Zeez  e	d(<   dZee   dz  e	d)<   dZ e!e"d*   e!f   dz  e	d+<   d,Z#ee	d-<   d'Z$eez  e	d.<   dZ%ee	d/<   d'Z&eez  e	d0<   d1Z'ee	d2<   d3Z(e"d4   e	d5<   d'Z)eez  e	d6<   dZ*ee	d7<   dZ+ee	d8<   dZ,ee	d9<   dZ-ee	d:<   d;Z.ee	d<<   d1Z/ee	d=<    fd>Z0d? Z1 fd@Z2e3dA        Z4e4jj                  dB        Z4 xZ6S )CModernBertConfiga+  
    initializer_cutoff_factor (`float`, *optional*, defaults to 2.0):
        The cutoff factor for the truncated_normal_initializer for initializing all weight matrices.
    norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the rms normalization layers.
    norm_bias (`bool`, *optional*, defaults to `False`):
        Whether to use bias in the normalization layers.
    local_attention (`int`, *optional*, defaults to 128):
        The window size for local attention.
    mlp_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the MLP layers.
    decoder_bias (`bool`, *optional*, defaults to `True`):
        Whether to use bias in the decoder layers.
    classifier_pooling (`str`, *optional*, defaults to `"cls"`):
        The pooling method for the classifier. Should be either `"cls"` or `"mean"`. In local attention layers, the
        CLS token doesn't attend to all tokens on long sequences.
    classifier_bias (`bool`, *optional*, defaults to `False`):
        Whether to use bias in the classifier.
    classifier_activation (`str`, *optional*, defaults to `"gelu"`):
        The activation function for the classifier.
    deterministic_flash_attn (`bool`, *optional*, defaults to `False`):
        Whether to use deterministic flash attention. If `False`, inference will be faster but not deterministic.
    sparse_prediction (`bool`, *optional*, defaults to `False`):
        Whether to use sparse prediction for the masked language model instead of returning the full dense logits.
    sparse_pred_ignore_index (`int`, *optional*, defaults to -100):
        The index to ignore for the sparse prediction.

    Examples:

    ```python
    >>> from transformers import ModernBertModel, ModernBertConfig

    >>> # Initializing a ModernBert style configuration
    >>> configuration = ModernBertConfig()

    >>> # Initializing a model from the modernbert-base style configuration
    >>> model = ModernBertModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
modernbertpast_key_valuesg     Ag     @)globallocali  
vocab_sizei   hidden_sizei  intermediate_size   num_hidden_layers   num_attention_headsgeluhidden_activationi    max_position_embeddingsg{Gz?initializer_rangeg       @initializer_cutoff_factorgh㈵>norm_epsF	norm_biasik  Npad_token_idij  eos_token_idii  bos_token_idcls_token_idsep_token_idattention_biasg        attention_dropoutlayer_types)full_attentionsliding_attentionrope_parameters   local_attentionembedding_dropoutmlp_biasmlp_dropoutTdecoder_biascls)r.   meanclassifier_poolingclassifier_dropoutclassifier_biasclassifier_activationdeterministic_flash_attnsparse_predictionisparse_pred_ignore_indextie_word_embeddingsc                     |j                  dd      }| j                  8t        | j                        D cg c]  }t	        ||z        rdnd c}| _        t        |   di | y c c}w )Nglobal_attn_every_n_layersr   r&   r%    )getr$   ranger   boolsuper__post_init__)selfkwargsr9   i	__class__s       /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/modernbert/configuration_modernbert.pyr?   zModernBertConfig.__post_init__q   st    %+ZZ0La%P"# t556  (,A0J,J'K#Qaa D
 	'' s   A(c                    |j                  dd       }ddiddid}| j                  | j                  n|| _        |<| j                  d   j                  |       | j                  d   j                  |       | j                  j                  d      ddi| j                  d<   | j                  d   j	                  d|j                  d| j
                  d	                | j                  j                  d      ddi| j                  d<   | j                  d   j	                  d|j                  d
| j
                  d                | j                          |S )Nrope_scaling	rope_typedefault)r&   r%   r%   r&   
rope_thetaglobal_rope_thetar   local_rope_thetar   )popr'   updater;   
setdefaultdefault_thetastandardize_rope_params)r@   rA   rF   default_rope_paramss       rD   convert_rope_params_to_dictz,ModernBertConfig.convert_rope_params_to_dict|   sc   zz.$7
 #.y!9*I6
 8<7K7K7Wt33]p#  !1299,G  !45<<\J ##$45=6A95MD  !12-.99&**%8$:L:LX:VW	
 ##$78@9Di8PD  !4501<<&**%79K9KG9TU	

 	$$&    c                 H    t         |          }|j                  dd        |S )Nreference_compile)r>   to_dictrL   )r@   outputrC   s     rD   rV   zModernBertConfig.to_dict   s#    "

&-rS   c                      | j                   dz  S )zKHalf-window size: `local_attention` is the total window, so we divide by 2.   r)   )r@   s    rD   sliding_windowzModernBertConfig.sliding_window   s     ##q((rS   c                     |dz  | _         y)z<Set sliding_window by updating local_attention to 2 * value.rY   NrZ   )r@   values     rD   r[   zModernBertConfig.sliding_window   s      %qyrS   )7__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencerO   r   int__annotations__r   r   r   r   r   strr   r   floatr   r   r   r=   r   r   listr   r    r!   r"   r#   r$   r'   dictr   r)   r*   r+   r,   r-   r0   r1   r2   r3   r4   r5   r6   r7   r?   rR   rV   propertyr[   setter__classcell__)rC   s   @rD   r
   r
      s   (T J#4"5(8<MJK!s!s!!#s##'S'#u#'*u*HeIt$L#*$+0L#S	/D(0$L#*$$L#*$$L#*$ ND %(us{($(KcT!(Y]OT'"GH$NORVV]OS%(us{(Hd"K"L$16.6&))!OT!!'3'%*d*#t#$(c( $$	(<
 ) ) ) )rS   r
   N)
typingr   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r
   __all__r:   rS   rD   <module>rr      sK   ,  . 3 # 89G)' G)  :G)T 
rS   