
    i                     @   d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ d	d
lmZmZmZmZ ddlmZ  ej&                  e      Z G d ded      Z G d dej.                        Z G d de      Z G d de      Z G d de      Z G d de      Zg dZy)    )	TypedDictN)nn   )ACT2FN)Cache)Unpack)logging   )GraniteMoeDecoderLayerGraniteMoeForCausalLMGraniteMoeModelGraniteMoePreTrainedModel   )GraniteMoeSharedConfigc                       e Zd ZU dZej
                  ed<   ej
                  ed<   eed<   eed<   ej                  ed<   y)GraniteFlashAttentionKwargsaT  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    cu_seq_lens_q (`torch.LongTensor`):
        Gets cumulative sequence length for query state.
    cu_seq_lens_k (`torch.LongTensor`):
        Gets cumulative sequence length for key state.
    max_length_q (`int`):
        Maximum sequence length for query state.
    max_length_k (`int`):
        Maximum sequence length for key state.
    seq_idx (`torch.IntTensor):
        Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/modular_granitemoeshared.pyr   r   $   s7      ######__r"   r   F)totalc                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )GraniteMoeSharedMLPz~
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
    configc                 `   t         |           |j                  | _        |j                  | _        t
        |j                     | _        t        j                  | j                  | j                  dz  d      | _
        t        j                  | j                  | j                  d      | _        y )Nr
   F)bias)super__init__hidden_size
input_sizeshared_intermediate_sizer   
hidden_act
activationr   Linearinput_linearoutput_linearselfr'   	__class__s     r#   r+   zGraniteMoeSharedMLP.__init__E   s     ,,!:: !2!23IIdoot7G7G!7KRWXYYt'7'7uUr"   hidden_statesreturnc                     | j                  |      }|j                  dd      }| j                  |d         |d   z  }| j                  |      }|S )Nr
   )dimr   r   )r2   chunkr0   r3   )r5   r7   chunked_hidden_statess      r#   forwardzGraniteMoeSharedMLP.forwardN   s^    ))-8 - 3 3A2 3 >(=a(@ADYZ[D\\**=9r"   )
r   r   r   r   r   r+   r   Tensorr>   __classcell__r6   s   @r#   r&   r&   <   s2    V5 VU\\ ell r"   r&   c                   P    e Zd Zdedef fdZ	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
e
dz  deej                  ej                  f   dz  dee   deej                  eej                  ej                  f   dz  f   fdZ xZS )GraniteMoeSharedDecoderLayerr'   	layer_idxc                 t    t         |   ||       |j                  dk(  rd | _        y t        |      | _        y )Nr   )r*   r+   r.   r&   
shared_mlpr5   r'   rD   r6   s      r#   r+   z%GraniteMoeSharedDecoderLayer.__init__W   s3    +"("A"AQ"F$L_`fLgr"   Nr7   attention_maskposition_idspast_key_valuesoutput_attentions	use_cacheposition_embeddingskwargsr8   c                 <   |}	| j                  |      } | j                  d|||||||d|\  }}
|	|| j                  z  z   }|}	| j                  |      }| j	                  |      }| j
                  |}n|| j                  |      z   }|	|| j                  z  z   }|S )N)r7   rH   rI   rJ   rK   rL   rM   r!   )input_layernorm	self_attnresidual_multiplierpost_attention_layernormblock_sparse_moerF   )r5   r7   rH   rI   rJ   rK   rL   rM   rN   residual_moe_hidden_statess               r#   r>   z$GraniteMoeSharedDecoderLayer.forward[   s     !,,]; *4>> 	
')%+/ 3	
 	
q !=43K3K#KK 55mD 11-@??"-M-0NNM =43K3K#KKr"   )NNNFFN)r   r   r   r   r   r+   r   r?   r   r   booltupler   r   FloatTensorr>   r@   rA   s   @r#   rC   rC   V   s    h5 h# h /304(,).!&HL%||% t+% &&-	%
 %  $;% $;% #5<<#=>E% 45% 
u  %(9(95;L;L(L"MPT"TT	U%r"   rC   c                       e Zd ZU eed<   dgZy)GraniteMoeSharedPreTrainedModelr'   rC   N)r   r   r   r   r   _no_split_modulesr!   r"   r#   r\   r\      s    ""78r"   r\   c                   $     e Zd Zdef fdZ xZS )GraniteMoeSharedModelr'   c           	          t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        y c c}w N)r*   r+   r   
ModuleListrangenum_hidden_layersrC   layersrG   s      r#   r+   zGraniteMoeSharedModel.__init__   sE     mmNSTZTlTlNmn)&)<n
ns   A)r   r   r   r   r+   r@   rA   s   @r#   r_   r_      s    
5 
 
r"   r_   c                   ,     e Zd ZddiZdef fdZ xZS )GraniteMoeSharedForCausalLMzlm_head.weightzmodel.embed_tokens.weightr'   c                 d    t         |   |       t        |      | _        | j	                          y ra   )r*   r+   r_   model	post_initr4   s     r#   r+   z$GraniteMoeSharedForCausalLM.__init__   s&     *62
r"   )r   r   r   _tied_weights_keysr   r+   r@   rA   s   @r#   rg   rg      s!    *,GH5  r"   rg   )rg   r_   r\   )typingr   r   r   activationsr   cache_utilsr   processing_utilsr   utilsr	   granitemoe.modeling_granitemoer   r   r   r   configuration_granitemoesharedr   
get_loggerr   loggerr   Moduler&   rC   r\   r_   rg   __all__r!   r"   r#   <module>rw      s       !   &   C 
		H	%)5 0")) 4*#9 *Z9&? 9

O 
"7  fr"   