
    i%                        d dl mZ d dlZd dlmZ d dlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZmZmZmZmZmZmZmZm Z  ddl!m"Z"  ejF                  e$      Z% ed      e G d de                    Z& G d de"      Z' G d de      Z( G d de      Z) G d de      Z* G d de      Z+ G d de      Z, G d  d!e      Z- G d" d#e      Z. G d$ d%e      Z/g d&Z0y)'    )CallableN)strict)nn   )ACT2CLS)Cache)PreTrainedConfig)RopeParameters)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging   )
LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaForTokenClassification
LlamaModelLlamaPreTrainedModelLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)NemotronMLPz!swiss-ai/Apertus-8B-Instruct-2509)
checkpointc            	           e Zd ZU dZdZdgZdZdddddddddZd	gd
gfddgdgfdgdgfdZdZ	e
ed<   dZe
ed<   dZe
ed<   dZe
ed<   dZe
ed<   dZe
dz  ed<   dZeed<   dZe
ed<   dZeed<   dZeed <   d!Zeed"<   d#Ze
dz  ed$<   d%Ze
dz  ed&<   d'Ze
ee
   z  dz  ed(<   d)Zeed*<   dZee z  dz  ed+<   d)Z!eed,<   d-Z"ee
z  ed.<    fd/Z# xZ$S )0ApertusConfigaz  
    ```python
    >>> from transformers import ApertusModel, ApertusConfig

    >>> # Initializing a Apertus-8B style configuration
    >>> configuration = ApertusConfig()

    >>> # Initializing a model from the Apertus-8B style configuration
    >>> model = ApertusModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```apertuspast_key_values    `fAcolwisereplicated_with_grad_allreducerowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.q_normzlayers.*.self_attn.k_normzlayers.*.self_attn.o_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi   
vocab_sizei   hidden_sizei 8  intermediate_size    num_hidden_layersnum_attention_headsNnum_key_value_headsxielu
hidden_acti   max_position_embeddingsg{Gz?initializer_rangegh㈵>rms_norm_epsT	use_cacher   pad_token_id   bos_token_idr   eos_token_idFtie_word_embeddingsrope_parametersattention_bias        attention_dropoutc                     | j                   | j                  | _         | j                  ddddddd| _        t        |   di | y )	Nllama3r!   g       @i    g      ?g      @)	rope_type
rope_thetafactor original_max_position_embeddingslow_freq_factorhigh_freq_factor )r2   r1   r>   super__post_init__)selfkwargs	__class__s     |/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/apertus/modular_apertus.pyrL   zApertusConfig.__post_init__e   sW    ##+'+'?'?D$'%(48#&$'$D  	''    )%__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencedefault_thetabase_model_tp_planbase_model_pp_planr,   int__annotations__r-   r.   r0   r1   r2   r4   strr5   r6   floatr7   r8   boolr9   r;   r<   listr=   r>   r
   dictr?   rA   rL   __classcell__rO   s   @rP   r   r   .   sb    J#4"5M%.%.%.%E%E%. )"+	 &(9:#%568IJ!"_$56 JK"s"s!!&*t*J#(S(#u#L%It L#*  L#* +,L#S	/D(, %%48O^d*T18 ND %(us{(( (rQ   r   c                        e Zd Z fdZ xZS )
ApertusMLPc                 D   t         |   |       t        j                  | j                  | j
                  d      | _        t        j                  | j
                  | j                  d      | _        |j                  dk(  rt        d   |j                        | _        y y )NF)biasr3   )dtype)rK   __init__r   Linearr-   r.   up_proj	down_projr4   r   rh   act_fn)rM   configrO   s     rP   ri   zApertusMLP.__init__v   sz     yy!1!143I3IPUV4#9#94;K;KRWX'!'*>DK (rQ   )rR   rS   rT   ri   rb   rc   s   @rP   re   re   u   s    ? ?rQ   re   c                       e Zd Zy)ApertusRMSNormNrR   rS   rT   rJ   rQ   rP   rp   rp   ~       rQ   rp   c                       e Zd Zy)ApertusRotaryEmbeddingNrq   rJ   rQ   rP   rt   rt      rr   rQ   rt   c                        e Zd Zddededz  f fdZ	 ddej                  deej                  ej                  f   dej                  dz  de	dz  d	e
e   d
eej                  ej                  f   fdZ xZS )ApertusAttentionNrn   	layer_idxc                     t         |   ||       t        | j                  |j                        | _        t        | j                  |j                        | _        y N)rK   ri   rp   head_dimr7   q_normk_normrM   rn   rw   rO   s      rP   ri   zApertusAttention.__init__   sB    +$T]]F4G4GH$T]]F4G4GHrQ   r'   position_embeddingsr(   r    rN   returnc                 N   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      }| j                  |	      }	|\  }}t        ||	||      \  }}	| |j                  |	|
| j                        \  }	}
t        j                  | j                  j                  t               } || ||	|
|f| j"                  sdn| j$                  | j&                  d|\  }} |j(                  g |d j+                         }| j-                  |      }||fS )Nr:   r   r@   )dropoutscaling)shaperz   q_projview	transposek_projv_projr{   r|   r   updaterw   r   get_interfacern   _attn_implementationr   trainingrA   r   reshape
contiguouso_proj)rM   r'   r~   r(   r    rN   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                   rP   forwardzApertusAttention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&S#7jRUWZ#[ j&'6'='=j,X\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((rQ   ry   )rR   rS   rT   r   r[   ri   torchTensortupler   r   r   r   rb   rc   s   @rP   rv   rv      s    I} It I )-()||() #5<<#=>() t+	()
 () +,() 
u||U\\)	*()rQ   rv   c                       e Zd Zdedef fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
eej                  ej                  f   dz  dee   dej                  fdZ xZS )ApertusDecoderLayerrn   rw   c                     t         |   ||       t        |j                  |j                        | _        t        |j                  |j                        | _        | `| `y )N)eps)	rK   ri   rp   r-   r7   attention_layernormfeedforward_layernorminput_layernormpost_attention_layernormr}   s      rP   ri   zApertusDecoderLayer.__init__   sT    +#1&2D2D&J]J]#^ %3F4F4FFL_L_%`" )rQ   Nr'   r(   position_idsr    r8   r~   rN   r   c           
          |}| j                  |      } | j                  d||||||d|\  }}	||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r'   r(   r   r    r8   r~   rJ   )r   	self_attnr   mlp)
rM   r'   r(   r   r    r8   r~   rN   residual_s
             rP   r   zApertusDecoderLayer.forward   s     !00?)4>> 
')%+ 3
 
q !=0 !22=A/ =0rQ   )NNNFN)rR   rS   rT   r   r[   ri   r   r   
LongTensorr   r_   r   r   r   r   rb   rc   s   @rP   r   r      s    *} * * /304(,!&HL|| t+ &&-	
  $; #5<<#=>E +, 
rQ   r   c                       e Zd Zy)ApertusPreTrainedModelNrq   rJ   rQ   rP   r   r      rr   rQ   r   c                       e Zd Zy)ApertusModelNrq   rJ   rQ   rP   r   r      rr   rQ   r   c                        e Zd Z fdZ xZS )ApertusForCausalLMc                 "    t        |   di |S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ApertusForCausalLM

        >>> model = ApertusForCausalLM.from_pretrained("swiss-ai/Apertus-8B-Instruct-2509")
        >>> tokenizer = AutoTokenizer.from_pretrained("swiss-ai/Apertus-8B-Instruct-2509")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```rJ   )rK   r   )rM   super_kwargsrO   s     rP   r   zApertusForCausalLM.forward   s    . w...rQ   )rR   rS   rT   r   rb   rc   s   @rP   r   r      s    / /rQ   r   c                       e Zd Zy)ApertusForTokenClassificationNrq   rJ   rQ   rP   r   r     rr   rQ   r   )r   r   r   r   r   )1collections.abcr   r   huggingface_hub.dataclassesr   r   activationsr   cache_utilsr   configuration_utilsr	   modeling_rope_utilsr
   modeling_utilsr   processing_utilsr   utilsr   r   r   llama.modeling_llamar   r   r   r   r   r   r   r   r   r   nemotron.modeling_nemotronr   
get_loggerrR   loggerr   re   rp   rt   rv   r   r   r   r   r   __all__rJ   rQ   rP   <module>r      s    %  .  "   3 1 5 & @ @   5 
		H	% >?B($ B(  @B(J? ?	\ 		1 	.)~ .)b%+ %P	1 		: 	/) /6	$? 	rQ   