
    iT                        d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ	 ddl
mZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z,  G d de&      Z- G d dej\                        Z/ G d dej\                        Z0 G d dej\                        Z1 G d  d!ej\                        Z2 G d" d#ej\                        Z3 G d$ d%ej\                        Z4 G d& d'e      Z5 G d( d)e      Z6e G d* d+e6             Z7 G d, d-e6e      Z8g d.Z9y)/z"Modular components for DBRX model.    )Callable)AnyN)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )LlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)load_balancing_loss_func   )
DbrxConfigc                       e Zd Zy)DbrxRotaryEmbeddingN)__name__
__module____qualname__     v/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/dbrx/modular_dbrx.pyr    r    -   s    r%   r    c                        e Zd ZdZ	 ddedz  f fdZ	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
ej                  ej                  f   f
d
Z xZS )DbrxAttentionzYModular DBRX attention component that can be reused across different model architectures.N	layer_idxc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        |j                  | _	        || _
        |j                  }|j                  | _        |j                  | _        |j                  | _        | j                  | j                   z  | _        | j                  dz  | _        |j&                  | _        d| _        t+        j,                  | j                  | j                  d| j                   z  | j                  z  z   d      | _        t+        j,                  | j                  | j                  d      | _        y )Ng      Tr   Fbias)super__init__configd_modelhidden_sizen_heads	num_headshead_dimmax_seq_lenmax_position_embeddingsr)   attn_config
attn_pdropattention_dropoutclip_qkv
kv_n_headsnum_key_value_headsnum_key_value_groupsscaling
rope_theta	is_causalr   LinearWqkvout_proj)selfr/   r)   kwargsr7   	__class__s        r&   r.   zDbrxAttention.__init__4   s*    	!>>((DNN:'-'9'9$"((!,!7!7#,,#.#9#9 $(NNd6N6N$N!}}d*%00IId..T5M5M1MPTP]P]1]]di
	 		$"2"2D4D4D5Qr%   hidden_statesattention_maskposition_embeddingspast_key_valuesreturnc                    |j                   d d }g |d| j                  }| j                  |      }| j                  | j                   nd }	|j	                  |	| j                        }|j                  | j                  | j                  | j                  z  | j                  | j                  z  gd      \  }
}}|
j                  |      j                  dd      }
|j                  |      j                  dd      }|j                  |      j                  dd      }|\  }}t        |
|||      \  }
}| |j                  ||| j                        \  }}t        j                  | j                  j                   t"              } || |
|||f| j$                  sdn| j&                  | j(                  d|\  }} |j*                  g |d j-                         }| j/                  |      }||fS )N)minmaxr   dimr           )dropoutr>   )shaper4   rB   r:   clampsplitr1   r<   view	transposer   updater)   r   get_interfacer/   _attn_implementationr   trainingr9   r>   reshape
contiguousrC   )rD   rG   rH   rI   rJ   rE   input_shapehidden_shape
qkv_statesmin_valquery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                     r&   forwardzDbrxAttention.forwardP   s    $))#2.88b8$--8YY}-
$(MM$=4==.4%%'t}}%E
1;1A1A  ((4==8((4==8
  2B 2
.j, $((6@@AF__\2<<QB
#((6@@AF&S#7jRUWZ#[ j&'6'='=j,X\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHmmK0L((r%   NNNN)r!   r"   r#   __doc__intr.   torchTensor
LongTensorr	   tuplerk   __classcell__rF   s   @r&   r(   r(   1   s    c
 !%R :R> /37;(,3)||3) t+3) #--4	3)
 3) 
u||U\\)	*3)r%   r(   c            
            e Zd Z fdZdej
                  dej
                  dej
                  dej
                  dej
                  f
dZ xZS )DbrxExpertGLUc                    t         |           |j                  | _        |j                  | _        |j                  | _        t        j                  t        j                  | j                  | j                  z  | j                              | _	        t        j                  t        j                  | j                  | j                  z  | j                              | _
        t        j                  t        j                  | j                  | j                  z  | j                              | _        |j                  j                  dd      }t        |   | _        y )Nnamesilu)r-   r.   r1   ffn_hidden_sizemoe_num_expertsr   	Parameterrp   emptyw1v1w2
ffn_act_fngetr   activation_fn)rD   r/   act_fn_namerF   s      r&   r.   zDbrxExpertGLU.__init__   s    !--%55%55,,u{{4+?+?$BVBV+VX\XhXhij,,u{{4+?+?$BVBV+VX\XhXhij,,u{{4+?+?$BVBV+VX\XhXhij''++FF;#K0r%   x	expert_w1	expert_v1	expert_w2rK   c                     |j                  |      }|j                  |      }| j                  |      }||z  }|j                  |j                               }|S rl   )matmulr   t)	rD   r   r   r   r   	gate_projup_projintermediate_states	down_projs	            r&   rk   zDbrxExpertGLU.forward   sW     HHY'	((9%&&y1	''1'..y{{}=	r%   r!   r"   r#   r.   rp   rq   rk   rt   ru   s   @r&   rw   rw      sK    1*/,,CH<<\a\h\h	r%   rw   c                        e Zd Z fdZdej
                  dej
                  dej
                  dej
                  fdZ xZS )DbrxExpertsc                     t         |           t        |      | _        |j                  | _        |j
                  | _        |j                  | _        y rl   )r-   r.   rw   mlpr1   r{   r|   num_expertsrD   r/   rF   s     r&   r.   zDbrxExperts.__init__   sD     (!--%55!11r%   rG   top_k_indextop_k_weightsrK   c                    |j                   d   }|j                  d| j                        }t        j                  ||j
                  |j                        }t        j                         5  t        j                  j                  j                  || j                        }|j                  ddd      }t        j                  |j                  d      d      j                         }d d d        d| j                  | j                   f}D ]  }	|	d   }	t        j                         5  t        j"                  |	         \  }
}d d d        | j$                  j&                  j)                  |      |	   }| j$                  j*                  j)                  |      |	   }| j$                  j,                  j)                  |      |	   }| j%                  |   |||      }|j)                  d| j                        ||
d f   z  }|j/                  d||       
 |j)                  |d| j                        }|S # 1 sw Y   OxY w# 1 sw Y   xY w)	Nr   rM   )dtypedevice)num_classesr   r   )rM   rP   )rT   r]   r{   rp   
zeros_liker   r   no_gradr   
functionalone_hotr   permutegreatersumnonzeror1   wherer   r   rW   r   r   
index_add_)rD   rG   r   r   
batch_sizenext_statesexpert_mask
expert_hitsplit_expert_shape
expert_idxidx	token_idxr   r   r   statess                   r&   rk   zDbrxExperts.forward   s    #((+
%--b$2F2FG&&}M<O<OXeXlXlm]]_ 	S((--55ktO_O_5`K%--aA6K{8'DaHPPRJ	S
 !$"6"68H8HI$ 		9J#AJ F!&[-D!EYF!!"45jAB!!"45jAB!!"45jABXXmI6BCF[[T%9%9:]9VY[_K_=``F""1i8		9 "&&z2t7K7KL%	S 	SF Fs   ,A=H6)I6I I	r   ru   s   @r&   r   r      sC    2|| \\ ||	
 
r%   r   c                        e Zd Z fdZdej
                  deej
                  ej
                  ej                  f   fdZ xZ	S )
DbrxRouterc                     t         |           |j                  | _        |j                  | _        t        j                  | j                  |j                  d      | _        y NFr+   )	r-   r.   r{   r1   moe_jitter_epsr   rA   r|   layerr   s     r&   r.   zDbrxRouter.__init__   sJ    !11$33YYt//1G1GeT
r%   rG   rK   c                    | j                   rN| j                  B|t        j                  |      j	                  d| j                  z
  d| j                  z         z  }|j                  d|j                  d         }| j                  |      }|S )Ng      ?rM   )r\   r   rp   
empty_likeuniform_rW   rT   r   )rD   rG   router_logitss      r&   rk   zDbrxRouter.forward   s    ==T00<U--m<EEd)))31D1D+D M &**2}/B/B2/FG

=1r%   )
r!   r"   r#   r.   rp   rq   rs   rr   rk   rt   ru   s   @r&   r   r      s;    UU\\ eELL%,,X]XhXh<h6i r%   r   c                   ~     e Zd ZdZ fdZd Zdej                  deej                  ej                  f   fdZ	 xZ
S )DbrxFFNz0Modular DBRX MLP/FFN component with MoE support.c                     t         |           t        |j                        | _        t        |j                        | _        |j                  j                  | _        |j                  j                  | _	        y rl   )
r-   r.   r   
ffn_configrouterr   expertsmoe_normalize_expert_weights	moe_top_ktop_k)rD   r/   rE   rF   s      r&   r.   zDbrxFFN.__init__   sY     !2!23"6#4#45,2,=,=,Z,Z)&&00
r%   c                 $   t         j                  j                  j                  |d|j                        }t        j
                  || j                  d      \  }}| j                  &|t        j                  || j                  dd      z  }||fS )Nr   )rQ   r   rM   rP   T)prQ   keepdim)	rp   r   r   softmaxr   topkr   r   norm)rD   r   router_top_valuerouter_indicess       r&   route_tokens_to_expertszDbrxFFN.route_tokens_to_experts   s    ++33MqP]PcPc3d+0::mTZZUW+X(.,,8/%** D$E$E2W[3    //r%   rG   rK   c                 v    | j                  |      }| j                  |      \  }}| j                  |||      }|S rl   )r   r   r   )rD   rG   r   r   r   outputs         r&   rk   zDbrxFFN.forward   s<    M2%)%A%A-%P"{m[-Hr%   )r!   r"   r#   rn   r.   r   rp   rq   rs   rk   rt   ru   s   @r&   r   r      s9    :10U\\ eELL%,,<V6W r%   r   c                        e Zd Zddededz  f fdZ	 	 ddej                  dej                  dej                  dz  de	dz  d	e
d
eej                  ej                  f   fdZ xZS )DbrxNormAttentionNormNr/   r)   c                    t         |           || _        |j                  | _        t	        j
                  |j                  d      | _        t        ||      | _	        t	        j
                  |j                  d      | _
        y )NFr+   r/   r)   )r-   r.   r)   resid_pdropr   	LayerNormr0   norm_1r(   attnnorm_2rD   r/   r)   rF   s      r&   r.   zDbrxNormAttentionNorm.__init__   sc    "!--ll6>>>!
	 ll6>>>r%   rG   rI   rH   rJ   rE   rK   c                 d   |}| j                  |      j                  |j                        } | j                  d||||d|\  }}t        j
                  j                  || j                  | j                        }||z   }|}| j                  |      j                  |j                        }||fS N)rG   rH   rI   rJ   )r   r\   r$   )
r   tor   r   r   r   rS   r   r\   r   )rD   rG   rI   rH   rJ   rE   residual_states_s           r&   rk   zDbrxNormAttentionNorm.forward   s     (M255m6I6IJ$499 
') 3+	

 
q --mt?O?OZ^ZgZg-h%7'M255m6I6IJ--r%   rl   )NN)r!   r"   r#   r   ro   r.   rp   rq   rr   r	   r   rs   rk   rt   ru   s   @r&   r   r      s    	?z 	?cDj 	? /3(,.||. #--. t+	.
 . . 
u||U\\)	*.r%   r   c                        e Zd Zdedef fdZ	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
f
d
Z xZS )	DbrxBlockr/   r)   c                     t         |           |j                  | _        |j                  | _        || _        t        ||      | _        t        |      | _	        y )Nr   r/   )
r-   r.   r0   r1   r   r)   r   norm_attn_normr   ffnr   s      r&   r.   zDbrxBlock.__init__  sP    !>>!--"3
 &)r%   NrG   rH   rI   rJ   rE   c                      | j                   d||||d|\  }}| j                  |      }t        j                  j	                  || j
                  | j                        }||z   }|S r   )r   r   r   r   rS   r   r\   )rD   rG   rH   rI   rJ   rE   resid_statess          r&   rk   zDbrxBlock.forward&  s~     ':d&9&9 '
') 3+	'

 '
#m /--mt?O?OZ^ZgZg-h$}4r%   rm   )r!   r"   r#   r   ro   r.   rp   rq   rr   r	   r   rk   rt   ru   s   @r&   r   r     sr    	*z 	*c 	* /37;(,|| t+ #--4	
  r%   r   c                        e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZ ej$                         dej(                  f fd	       Z xZS )
DbrxPreTrainedModelr/   transformerTr   rJ   F)rG   
attentionsmodulec                 >   t         |   |       | j                  j                  }t	        |t
              rgt        j                  |j                  d|       t        j                  |j                  d|       t        j                  |j                  d|       y y )NrR   )meanstd)r-   _init_weightsr/   initializer_range
isinstancerw   initnormal_r   r   r   )rD   r   r   rF   s      r&   r   z!DbrxPreTrainedModel._init_weightsL  sj    f%kk++fm,LL#6LL#6LL#6 -r%   )r!   r"   r#   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flex_attn_supports_attention_backend_supports_flash_attn_supports_sdpa_can_compile_fullgraphr   r(   _can_record_outputsrp   r   r   Moduler   rt   ru   s   @r&   r   r   <  sx    %&*#$#4"5"&N""#
 U]]_7BII 7 7r%   r   c                   6    e Zd ZdZdef fdZdej                  fdZdej                  fdZ	e
ee	 	 	 	 	 	 dd	ej                  dz  d
ej                  dz  dej                  dz  dedz  dej"                  dz  dedz  dee   defd                     Z xZS )	DbrxModela  Transformer decoder consisting of *config.num_hidden_layers*. Each layer is a [`DbrxBlock`] layer.

    Args:
        config ([`DbrxConfig`]): Model configuration class with all parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
    r/   c           	      ,   t         |   |       |j                  | _        |j                  | _        |j
                  | _        t        |      | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j"                  |j                  d      | _        d| _        | j)                          y c c}w r   )r-   r.   pad_token_idpadding_idx
vocab_size	emb_pdropr    
rotary_embr   	Embeddingr0   wte
ModuleListrangen_layersr   blocksr   norm_fgradient_checkpointing	post_initr   s      r&   r.   zDbrxModel.__init__`  s     !.. ++))-f5<< 1 16>>4CSCSTmmSXY_YhYhSi$jiYvy%A$jkll6>>>&+# 	 %ks   4DrK   c                     | j                   S rl   r	  rD   s    r&   get_input_embeddingszDbrxModel.get_input_embeddingsn  s    xxr%   valuec                     || _         y rl   r  rD   r  s     r&   set_input_embeddingszDbrxModel.set_input_embeddingsq  s	    r%   N	input_idsrH   position_idsrJ   inputs_embeds	use_cacherE   c           
      :   |d u |d uz  rt        d      |r|t        | j                        }|| j                  |      }|V||j	                         nd}t        j                  |j                  d   |j                        |z   }|j                  d      }t        | j                  ||||      }	|}
| j                  |
|      }| j                  d | j                  j                   D ]  } ||
f||	|||d|}
 | j                  |
      }
t        |
|      S )	Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )r   )r/   r  rH   rJ   r  )rI   rH   r  rJ   r  )last_hidden_staterJ   )
ValueErrorr
   r/   r	  get_seq_lengthrp   arangerT   r   	unsqueezer   r  r  num_hidden_layersr  r   )rD   r  rH   r  rJ   r  r  rE   past_seen_tokenscausal_maskrG   rI   decoder_layers                r&   rk   zDbrxModel.forwardt  sJ    -t";<YZZ0*$++>O  HHY/MCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 & #oom\J![[)H4;;+H+HI 		M)$7*) /# M		 M2%++
 	
r%   )NNNNNN)r!   r"   r#   rn   r   r.   r   r  r  r  r   r   r   rp   rr   rq   r	   FloatTensorboolr   r   r   rk   rt   ru   s   @r&   r  r  V  s    z bll ",,    .2.204(,26!%5
##d*5
 t+5
 &&-	5

 5
 ((4/5
 $;5
 +,5
 
 5
    5
r%   r  c                       e Zd ZddiZddiZddgdgfiZdef fdZd	ej                  fd
Z
dej                  fdZd	ej                  fdZdej                  fdZdefdZd	efdZee	 	 	 	 	 	 	 	 	 ddej*                  dz  dej,                  dz  dej*                  dz  dedz  dej0                  dz  dej*                  dz  dedz  dedz  deej,                  z  dee   d	efd              Z xZS ) DbrxForCausalLMzlm_head.weightztransformer.wte.weightlm_headcolwise_gather_outputrG   logitsr/   c                    t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        |j                  j                  | _        |j                  j                  | _        |j                  j                  | _        | j!                          y r   )r-   r.   r  r   r  r   rA   r1   r+  r   moe_loss_weightrouter_aux_loss_coefr|   r   r   num_experts_per_tokr  r   s     r&   r.   zDbrxForCausalLM.__init__  s     $V, ++yy!3!3V5F5FUS$*$5$5$E$E!!,,<<#)#4#4#>#> r%   rK   c                 6    | j                   j                         S rl   )r   r  r  s    r&   r  z$DbrxForCausalLM.get_input_embeddings  s    4466r%   r  c                 :    | j                   j                  |       y rl   )r   r  r  s     r&   r  z$DbrxForCausalLM.set_input_embeddings  s    --e4r%   c                     | j                   S rl   r+  r  s    r&   get_output_embeddingsz%DbrxForCausalLM.get_output_embeddings  s    ||r%   new_embeddingsc                     || _         y rl   r5  )rD   r7  s     r&   set_output_embeddingsz%DbrxForCausalLM.set_output_embeddings  s	    %r%   decoderc                     || _         y rl   r   )rD   r:  s     r&   set_decoderzDbrxForCausalLM.set_decoder  s
    "r%   c                     | j                   S rl   r<  r  s    r&   get_decoderzDbrxForCausalLM.get_decoder  s    r%   Nr  rH   r  rJ   r  labelsr  output_router_logitslogits_to_keeprE   c
                 j   ||n| j                   j                  } | j                  d|||||||d|
}|j                  }t	        |	t
              rt        |	 d      n|	}| j                  |dd|ddf         }d}| | j                  ||| j                  fi |
}d}|rYt        |j                  | j                  | j                  |      }|+|| j                  |j                  |j                         z  z  }t#        ||||j$                  |j&                  |j(                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >> from transformers import AutoTokenizer, DbrxForCausalLM

        >> model = DbrxForCausalLM.from_pretrained("transformers-community/dbrx-instruct")
        >> tokenizer = AutoTokenizer.from_pretrained("transformers-community/dbrx-instruct")

        >> prompt = "Hey, are you conscious? Can you talk to me?"
        >> inputs = tokenizer(prompt, return_tensors="pt")

        >> # Generate
        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```
        N)r  rH   r  rJ   r  r  rA  )lossaux_lossr-  rJ   rG   r   r   r$   )r/   rA  r   r  r   ro   slicer+  loss_functionr  r   r   r   r1  r0  r   r   r   rJ   rG   r   )rD   r  rH   r  rJ   r  r@  r  rA  rB  rE   outputsrG   slice_indicesr-  rD  rE  s                    r&   rk   zDbrxForCausalLM.forward  sW   N %9$D $++JjJj 	
 +;$*:*: 	+
)%+'!5	+
 	+
  118B>SV8W~ot4]kmA}a,?@A%4%%ffdooPPD/%%  ((	H !11HKK4LLL(#33!//))!//
 	
r%   )	NNNNNNNNr   ) r!   r"   r#   _tied_weights_keys_tp_plan_pp_planr   r.   r   r  r  r  rA   r6  r9  r  r=  r?  r   r   rp   rr   rq   r	   r'  r(  ro   r   r   r   rk   rt   ru   s   @r&   r*  r*    s   *,DE23H_-z:;Hz 7bll 75",, 5ryy &BII &#9 # Y    .2.204(,26*.!%,0-.P
##d*P
 t+P
 &&-	P

 P
 ((4/P
   4'P
 $;P
 #TkP
 ell*P
 +,P
 
#P
  P
r%   r*  )r*  r  r   ):rn   collections.abcr   typingr   rp   r    r   r   activationsr   cache_utilsr	   r
   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   llama.modeling_llamar   r   r   mixtral.modeling_mixtralr   configuration_dbrxr   r    r   r(   rw   r   r   r   r   r   r   r  r*  __all__r$   r%   r&   <module>r_     s   ) $    & ! . ) / R F & I I 7 5 
 @ *	. 	R)BII R)jBII 2$")) $N "bii 6%.BII %.P* D7/ 74 U
# U
 U
ps
)? s
l Br%   