
    i1                        d dl mZ d dlZd dlmZ ddlmZ ddlmZm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z.  ej^                  e0      Z1 G d de$      Z2 G d de      Z3 G d de      Z4 G d de'      Z5 G d de!      Z6 G d d e)      Z7 G d! d"e%      Z8 G d# d$e*      Z9 G d% d&e(      Z: G d' d(e&      Z;g d)Z<y)*    )CallableN)nn   )initialization)CacheDynamicCache)create_causal_mask)BaseModelOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging)merge_with_config_defaults)capture_outputs   )BambaConfig)
BambaMixerBambaRMSNormGated)Gemma2RotaryEmbedding)
GraniteFlashAttentionKwargsGraniteMoeSharedAttentionGraniteMoeSharedDecoderLayerGraniteMoeSharedForCausalLMGraniteMoeSharedMLPGraniteMoeSharedModelGraniteMoeSharedMoEGraniteMoeSharedPreTrainedModelapply_rotary_pos_embeager_attention_forward   )GraniteMoeHybridConfigc                        e Zd Zdedef fdZ	 	 ddej                  dej                  dz  dedz  de	ej                  ej                  f   dz  d	e
e   d
e	ej                  ej                  f   fdZ xZS )GraniteMoeHybridAttentionconfig	layer_idxc                 &    t         |   ||       y Nsuper__init__selfr&   r'   	__class__s      /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.pyr,   z"GraniteMoeHybridAttention.__init__3   s    +    Nhidden_statesattention_maskpast_key_valuesposition_embeddingskwargsreturnc                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
||\  }}t        ||	||      \  }}	| |j                  |	|
| j                        \  }	}
t        j                  | j                  j                  t              } || ||	|
|f| j                  sdn| j                   | j"                  d|\  }} |j$                  g |d j'                         }| j)                  |      }||fS )Nr"   r   g        )dropoutscaling)shapehead_dimq_projview	transposek_projv_projr    updater'   r   get_interfacer&   _attn_implementationr!   trainingattention_dropoutr;   reshape
contiguouso_proj)r.   r2   r3   r4   r5   r6   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                   r0   forwardz!GraniteMoeHybridAttention.forward6   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST**HC';L*VY[^'_$L*&'6'='=j,X\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r1   )NN)__name__
__module____qualname__r#   intr,   torchTensorr   tupler   r   rU   __classcell__r/   s   @r0   r%   r%   2   s    ,5 ,# , )-HL')||') t+') 	')
 #5<<#=>E') +,') 
u||U\\)	*')r1   r%   c                   (     e Zd Zdedef fdZ xZS )GraniteMoeHybridMambaLayerr&   r'   c                 8    t         |   t        |      |       y r)   )r+   r,   r   r-   s      r0   r,   z#GraniteMoeHybridMambaLayer.__init__a   s    V,i8r1   )rV   rW   rX   r#   rY   r,   r]   r^   s   @r0   r`   r`   `   s    95 9# 9 9r1   r`   c                         e Zd Zd fd	Z xZS )GraniteMoeHybridRMSNormGatedc                 &    t         |   ||       y r)   r*   )r.   hidden_sizeepsr/   s      r0   r,   z%GraniteMoeHybridRMSNormGated.__init__f   s    c*r1   )gư>)rV   rW   rX   r,   r]   r^   s   @r0   rc   rc   e   s    + +r1   rc   c                   $     e Zd Zdef fdZ xZS )GraniteMoeHybridMLPr&   c                 $    t         |   |       y r)   r*   r.   r&   r/   s     r0   r,   zGraniteMoeHybridMLP.__init__k   s     r1   )rV   rW   rX   r#   r,   r]   r^   s   @r0   rh   rh   j   s    !5 ! !r1   rh   c                       e Zd Zy)GraniteMoeHybridRotaryEmbeddingNrV   rW   rX    r1   r0   rl   rl   o       r1   rl   c                       e Zd Zy)GraniteMoeHybridMoENrm   rn   r1   r0   rq   rq   s   ro   r1   rq   c                   .    e Zd Zdedef fdZe	 	 	 	 ddej                  dej                  dz  de	dz  de
dz  d	eej                  ej                  f   dz  d
ee   deej                  eej                  ej                  f   dz  f   fd       Z xZS )GraniteMoeHybridDecoderLayerr&   r'   c                 `   t         |   ||       t        |      | _        d | _        d | _        |j                  |   dk(  rt        ||      | _        nt        ||      | _        |j                  |   | _	        |j                  dkD  rt        |      nd | _        t        |dd      dkD  | _        y )Nmambar   num_local_experts)r+   r,   rh   
shared_mlp	self_attnru   layers_block_typer`   r%   
layer_typerv   rq   block_sparse_moegetattrhas_expertsr-   s      r0   r,   z%GraniteMoeHybridDecoderLayer.__init__x   s    +-f5
##I.'93FIFDJ6vyIDN 229= @F?W?WZ[?[ 3F ;ae #6+>BQFr1   Nr2   r3   r4   	use_cacher5   r6   r7   c           	         |}| j                  |      }| j                   | j                  d|||d|}n | j                  d|||||d|\  }}||| j                  z  z   }|}| j	                  |      }| j
                  r&| j                  |      }	|	| j                  |      z   }n| j                  |      }||| j                  z  z   }|S )N)r2   cache_paramsr3   )r2   r3   r4   r~   r5   rn   )input_layernormru   rx   residual_multiplierpost_attention_layernormr}   r{   rw   )
r.   r2   r3   r4   r~   r5   r6   residual_moe_hidden_statess
             r0   rU   z$GraniteMoeHybridDecoderLayer.forward   s    !,,];::!&DJJ +,- 	M  .t~~  +- /#$7   M1 !=43K3K#KK 55mD $ 5 5m D-0NNM OOM:M =43K3K#KKr1   )NNFN)rV   rW   rX   r#   rY   r,   r   rZ   r[   r   boolr\   r   r   FloatTensorrU   r]   r^   s   @r0   rs   rs   w   s    G5 G# G&  /3(,!&HL(||( t+( 	(
 $;( #5<<#=>E( 45( 
u  %(9(95;L;L(L"MPT"TT	U( (r1   rs   c                   \     e Zd ZU eed<   dgZdZ ej                          fd       Z	 xZ
S )GraniteMoeHybridPreTrainedModelr&   rs   Tc           
         t         |   |       t        |t              rt	        j
                  |j                         t	        j                  |j                  t        j                  t        j                  d|j                  dz                      t	        j
                  |j                         y t        |t              r t	        j
                  |j                         y y )Nr"   )r+   _init_weights
isinstancer`   initones_dt_biascopy_A_logrZ   logarange	num_headsDrc   weight)r.   moduler/   s     r0   r   z-GraniteMoeHybridPreTrainedModel._init_weights   s    f%f89JJv~~&JJv||UYYu||Av?O?ORS?S/T%UVJJvxx  <=JJv}}% >r1   )rV   rW   rX   r#   __annotations___no_split_modules_is_statefulrZ   no_gradr   r]   r^   s   @r0   r   r      s1    ""78LU]]_& &r1   r   c                        e Zd Zdef fdZeee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	edz  d
ee   deez  fd                     Zd Z xZS )GraniteMoeHybridModelr&   c           	      (   t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        |j                  | _        |j                  dk(  rt        |      | _        y d | _        y c c}w )Nrope)r+   r,   r   
ModuleListrangenum_hidden_layersrs   layersembedding_multiplierposition_embedding_typerl   
rotary_embr-   s      r0   r,   zGraniteMoeHybridModel.__init__   sz     mmNSTZTlTlNmn)&)<n
 %+$?$?!EKEcEcgmEm9&Asw os   BN	input_idsr3   position_idsr4   inputs_embedsr~   r6   r7   c           	         |d u |d uz  rt        d      || j                  |      }|| j                  z  }|r|t        | j                        }|V||j                         nd}t        j                  |j                  d   |j                        |z   }|j                  d      }t        | j                  |||      }	| j                  ||      }
|}d }| j                  | j                  ||      }t        | j                        D ]3  \  }}| j                  j                   |   dk(  r|
n|	} ||f||||d|}5 | j#                  |      }t%        ||	      S )
Nz:You must specify exactly one of input_ids or inputs_embeds)r&   r   r"   )device)r&   r   r3   r4   ru   )r3   r4   r~   r5   )last_hidden_stater4   )
ValueErrorembed_tokensr   r   r&   get_seq_lengthrZ   r   r<   r   	unsqueezer	   _update_mamba_maskr   	enumerater   ry   normr   )r.   r   r3   r   r4   r   r~   r6   past_seen_tokenscausal_mask
mamba_maskr2   r5   idecoder_layer
layer_masks                   r0   rU   zGraniteMoeHybridModel.forward   s    -t";<YZZ  --i8M%(A(AA0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+	
 ,,^_M
 &"??&"&//-"N )$++ 6 	A}'+{{'D'DQ'G7'RXcJ)) /#$7 M		 		-0%++
 	
r1   c                 f    |}||j                         s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        Nr"   )has_previous_staterZ   all)r.   r3   r4   r   s       r0   r   z(GraniteMoeHybridModel._update_mamba_mask  s;     $
'O,N,N,P&599^q5H+IJr1   )NNNNNN)rV   rW   rX   r#   r,   r   r   r   rZ   
LongTensorr[   r   r   r   r   r   r\   r
   rU   r   r]   r^   s   @r0   r   r      s    x5 x  .2.204(,26!%9
##d*9
 t+9
 &&-	9

 9
 ((4/9
 $;9
 459
 
(	(9
    9
vr1   r   c                   6     e Zd ZddiZdef fdZ fdZ xZS )GraniteMoeHybridForCausalLMzlm_head.weightzmodel.embed_tokens.weightr&   c                 d    t         |   |       t        |      | _        | j	                          y r)   )r+   r,   r   model	post_initrj   s     r0   r,   z$GraniteMoeHybridForCausalLM.__init__  s&     *62
r1   c                 "    t        |   di |S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GraniteMoeHybridForCausalLM

        >>> model = GraniteMoeHybridForCausalLM.from_pretrained("ibm-granite/granite-4.0-h-tiny")
        >>> tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-4.0-h-tiny")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```rn   )r+   rU   )r.   super_kwargsr/   s     r0   rU   z#GraniteMoeHybridForCausalLM.forward%  s    . w...r1   )rV   rW   rX   _tied_weights_keysr#   r,   rU   r]   r^   s   @r0   r   r     s&    *,GH5 / /r1   r   )r   r   r   )=collections.abcr   rZ   r    r   r   cache_utilsr   r   masking_utilsr	   modeling_outputsr
   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   bamba.configuration_bambar   bamba.modeling_bambar   r   gemma2.modeling_gemma2r   *granitemoeshared.modeling_granitemoesharedr   r   r   r   r   r   r   r   r    r!   configuration_granitemoehybridr#   
get_loggerrV   loggerr%   r`   rc   rh   rl   rq   rs   r   r   r   __all__rn   r1   r0   <module>r      s    %   & . / O 5 & @ @ 7 5 3 @ :   C 
		H	%+) 9 +)\9 9
+#4 +
!- !
	&; 		- 	=#? =@&&E & R1 Rj /"=  /F fr1   