
    i*                     P   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZmZ ddlmZ  ej8                  e      Z G d de      Z G d de      Z  G d de      Z! G d de      Z" G d de      Z# G d de      Z$g dZ%y)zPyTorch BitNet model.    )CallableN   )Cache)FlashAttentionKwargs)CausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)logging   )GemmaMLP)LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaRMSNormapply_rotary_pos_embeager_attention_forward   )BitNetConfigc                       e Zd Zy)BitNetRMSNormN__name__
__module____qualname__     z/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/bitnet/modular_bitnet.pyr   r   )       r   r   c                   *     e Zd Zdef fdZd Z xZS )	BitNetMLPconfigc                 p    t         |   |       t        |j                  |j                        | _        y N)eps)super__init__r   intermediate_sizerms_norm_epsffn_sub_norm)selfr"   	__class__s     r   r'   zBitNetMLP.__init__.   s+     )&*B*BH[H[\r   c           	          | j                  | j                  | j                  | j                  |            | j	                  |      z              }|S N)	down_projr*   act_fn	gate_projup_proj)r+   xr/   s      r   forwardzBitNetMLP.forward2   sF    NN4#4#4T[[PQAR5SVZVbVbcdVe5e#fg	r   )r   r   r   r   r'   r4   __classcell__r,   s   @r   r!   r!   -   s    ]| ]r   r!   c                        e Zd Zdedef fdZ	 ddej                  deej                  ej                  f   dej                  dz  de	dz  d	e
e   d
eej                  ej                  dz  f   fdZ xZS )BitNetAttentionr"   	layer_idxc                 r    t         |   ||       t        |j                  |j                        | _        y r$   )r&   r'   r   hidden_sizer)   attn_sub_norm)r+   r"   r9   r,   s      r   r'   zBitNetAttention.__init__8   s-    +*6+=+=6CVCVWr   Nhidden_statesposition_embeddingsattention_maskpast_key_valueskwargsreturnc                 ,   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
|\  }}t        ||	||      \  }}	| |j                  |	|
| j                        \  }	}
t        j                  | j                  j                  t              } || ||	|
|f| j                  sdn| j                   | j"                  d|\  }} |j$                  g |d j'                         }| j)                  |      }| j+                  |      }||fS )Nr   r   g        )dropoutscaling)shapehead_dimq_projview	transposek_projv_projr   updater9   r   get_interfacer"   _attn_implementationr   trainingattention_dropoutrF   reshape
contiguousr<   o_proj)r+   r=   r>   r?   r@   rA   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                   r   r4   zBitNetAttention.forward<   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j&'6'='=j,X\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFH((5kk+.L((r   r.   )r   r   r   r   intr'   torchTensortupler   r	   r   r4   r5   r6   s   @r   r8   r8   7   s    X| X X )-')||') #5<<#=>') t+	')
 ') -.') 
u||U\\D00	1')r   r8   c                       e Zd Zy)BitNetDecoderLayerNr   r   r   r   re   re   f   r   r   re   c                       e Zd Zy)BitNetModelNr   r   r   r   rg   rg   j   r   r   rg   c                   4     e Zd ZddiZdZdZdef fdZ xZS )BitNetForCausalLMzlm_head.weightzmodel.embed_tokens.weightNrB   c                 "    t        |   di |S )a$  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BitNetForCausalLM

        >>> model = BitNetForCausalLM.from_pretrained("microsoft/bitnet-b1.58-2B-4T")
        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/bitnet-b1.58-2B-4T")

        >>> prompt = f'<|begin_of_text|>User: Hey, are you conscious? Can you talk to me?<|eot_id|>Assistant: '
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=100)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "User: Hey, are you conscious? Can you talk to me?Assistant: No, I'm not conscious. I'm an artificial intelligence designed to assist with information and tasks. How can I help you today?"
        ```r   )r&   r4   )r+   super_kwargsr,   s     r   r4   zBitNetForCausalLM.forwards   s    4 w...r   )	r   r   r   _tied_weights_keys_tp_plan_pp_planr   r4   r5   r6   s   @r   ri   ri   n   s-    *,GHHH/ 
 / /r   ri   )ri   rg   BitNetPreTrainedModel)&__doc__collections.abcr   ra   cache_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr	   utilsr
   gemma.modeling_gemmar   llama.modeling_llamar   r   r   r   r   r   r   configuration_bitnetr   
get_loggerr   loggerr   r!   r8   re   rg   ri   __all__r   r   r   <module>r~      s     $    B 6 5 &  +   / 
		H	%	L 	 ,)n ,)^	* 		* 	/( /Dr   