
    id                     x    d Z ddlmZ ddlmZ ddlmZ ddlmZ e edd	       G d
 de                    Z	dgZ
y)zAFMoE model configuration    )strict   )PreTrainedConfig)RopeParameters)auto_docstringz
    AFMoE is an Adaptive Feedforward MoE (Mixture of Experts) model with token-choice routing, shared experts, and a
    hybrid attention mechanism combining sliding window and full attention patterns.
    zarcee-ai/Trinity-Mini)custom_intro
checkpointc                   v    e Zd ZU dZdZdgZdgdgfddgdgfdgdgfdZd	Zee	d
<   dZ
ee	d<   dZee	d<   dZee	d<   dZee	d<   dZedz  e	d<   dZee	d<   dZedz  e	d<   dZedz  e	d<   dZee	d<   dZee	d<   dZee	d <   d!Zee	d"<   d#Zee	d$<   d%Zee	d&<   dZeez  dz  e	d'<   d(Zedz  e	d)<   d*Zedz  e	d+<   d,Z edz  e	d-<   d.Z!edz  e	d/<   d%Z"ee	d0<   d1Z#edz  e	d2<   d3Z$edz  e	d4<   dZ%e&e   dz  e	d5<   d6Z'eez  dz  e	d7<   d%Z(edz  e	d8<   dZ)ee&e   z  dz  e	d9<   dZ*edz  e	d:<   dZ+edz  e	d;<   d%Z,ee	d<<    fd=Z- xZ.S )>AfmoeConfiga%  
    global_attn_every_n_layers (`int`, *optional*, defaults to 4):
        The frequency of full attention layers. Every Nth layer will use full attention, while others use sliding
        window attention.
    mup_enabled (`bool`, *optional*, defaults to `False`):
        Whether to enable muP (Maximal Update Parametrization) input scaling. When enabled, input embeddings
        are scaled by `sqrt(hidden_size)`.

    Example:
    ```python
    >>> from transformers import AfmoeModel, AfmoeConfig

    >>> # Initializing an AFMoE configuration
    >>> configuration = AfmoeConfig()

    >>> # Initializing a model from the afmoe-small-sft-v1 style configuration
    >>> model = AfmoeModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    afmoepast_key_values	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi  
vocab_sizei   hidden_sizei   intermediate_sizei  moe_intermediate_size    num_hidden_layers   Nnum_dense_layers   num_attention_headsnum_key_value_heads   head_dimsilu
hidden_acti @  max_position_embeddingsg{Gz?initializer_rangegh㈵>rms_norm_epsT	use_cacheFtie_word_embeddingsrope_parameters@   num_experts   num_experts_per_tok   num_shared_expertsg      ?route_scaleoutput_router_logits   global_attn_every_n_layersi   sliding_windowlayer_typesg        attention_dropoutmup_enabledeos_token_idpad_token_idbos_token_idattention_biasc                 
   | j                   Et        | j                        D cg c]!  }t        |dz   | j                  z        rdnd# c}| _         | j
                  | j                  | _        t        |    di | y c c}w )Nr   sliding_attentionfull_attention )	r5   ranger   boolr3   r   r   super__post_init__)selfkwargsi	__class__s      ~/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/afmoe/configuration_afmoe.pyrC   zAfmoeConfig.__post_init__`   s    # t556  (,QUd6U6U,U'V#\ll D
 ##+'+'?'?D$'' s   &B )/__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencebase_model_pp_planr   int__annotations__r   r   r   r   r   r   r   r!   r#   strr$   r%   floatr&   r'   rA   r(   r)   r   dictr+   r-   r/   r0   r1   r3   r4   r5   listr6   r7   r8   r9   r:   r;   rC   __classcell__)rG   s   @rH   r   r      s   . J#4"5 &(9:#%568IJ!"_$56 JK!s!!%3%s#$cDj$!!&*t*HcDjJ#(S(#u#L%It %%48O^d*T18 Kt &'t'%&d
& #K#!&$&-.d
.!%NC$J%$(KcT!(,/us{T)/$K$+/L#S	/D(/#L#*##L#*# ND 
( 
(    r   N)rL   huggingface_hub.dataclassesr   configuration_utilsr   modeling_rope_utilsr   utilsr   r   __all__r?   rW   rH   <module>r]      sV      . 3 1 #  'K(" K( K(\ /rW   