
    iR#                        d dl mZ d dlZd dlmZ d dlmZ ddlmZm	Z	 ddl
mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZm Z m!Z!m"Z"m#Z#  ed      e G d de                    Z$ G d de!      Z% G d de      Z& G d de      Z' G d de      Z( G d de       Z) G d de      Z* G d  d!e      Z+g d"Z,y)#    )CallableN)strict   )CacheDynamicCache)create_causal_mask!create_sliding_window_causal_mask)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)auto_docstring)TransformersKwargs   )Gemma2RotaryEmbedding)Olmo2Config)Olmo2AttentionOlmo2DecoderLayerOlmo2ForCausalLM
Olmo2ModelOlmo2PreTrainedModelOlmo2RMSNormapply_rotary_pos_embeager_attention_forwardzallenai/Olmo-3-7B-Instruct)
checkpointc                        e Zd ZU dZdZdgZddddddddZd	gd
gfddgdgfdgdgfdZdZe	dz  e
d<   dZee   dz  e
d<    fdZ xZS )Olmo3Configa  
    Example:

    ```python
    >>> from transformers import Olmo3Model, Olmo3Config

    >>> # Initializing a Olmo3 7B style configuration
    >>> configuration = Olmo3Config()

    >>> # Initializing a model from the Olmo3 7B style configuration
    >>> model = Olmo3Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    olmo3past_key_valuescolwise_gather_outputrowwise_split_inputcolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi   Nsliding_windowlayer_typesc                     | j                   | j                  | _         | j                  5t        | j                        D cg c]  }|dz   dz  dk7  rdnd c}| _        t        |   di | y c c}w )N      r   sliding_attentionfull_attention )num_key_value_headsnum_attention_headsr+   rangenum_hidden_layerssuper__post_init__)selfkwargsi	__class__s      x/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/olmo3/modular_olmo3.pyr7   zOlmo3Config.__post_init__R   sz    ##+'+'?'?D$#W\]a]s]sWt RSA{a'7#=MM D 	''	 s   A0)__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr*   int__annotations__r+   liststrr7   __classcell__r;   s   @r<   r   r   *   s    " J#4"5%<%<%<%:"+ )"+ &(9:#%568IJ!"_$56 "&NC$J%$(KcT!(	( 	(    r   c                       e Zd Zy)Olmo3RMSNormNr=   r>   r?   r1   rK   r<   rM   rM   ^       rK   rM   c                        e Zd Zdedef fdZ	 ddej                  deej                  ej                  f   dej                  dz  de	dz  d	e
e   d
eej                  ej                  dz  f   fdZ xZS )Olmo3Attentionconfig	layer_idxc                     t         |   ||       |j                  |   | _        | j                  dk(  r|j                  | _        y d | _        y )N)rS   r/   )r6   __init__r+   attention_typer*   r8   rR   rS   r;   s      r<   rU   zOlmo3Attention.__init__e   sL    95$00;7;7J7JNa7af33gkrK   Nr%   position_embeddingsr&   r   r9   returnc                 h   |j                   d d }g |d| j                  }| j                  | j                  |            }| j	                  | j                  |            }	| j                  |      }
|j                  |      j                  dd      }|	j                  |      j                  dd      }	|
j                  |      j                  dd      }
|\  }}t        ||	||      \  }}	| |j                  |	|
| j                        \  }	}
t        j                  | j                  j                  t               } || ||	|
|f| j"                  sdn| j$                  | j&                  | j(                  d|\  }} |j*                  g |d j-                         }| j/                  |      }||fS )Nr-   r   g        )dropoutscalingr*   )shapehead_dimq_normq_projk_normk_projv_projview	transposer   updaterS   r   get_interfacerR   _attn_implementationr   trainingattention_dropoutr]   r*   reshape
contiguouso_proj)r8   r%   rX   r&   r   r9   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                   r<   forwardzOlmo3Attention.forwardj   s    $))#2.88b8$--8{{4;;}#=>[[]!;<
{{=1#((6@@AF__\2<<QB
#((6@@AF&S#7jRUWZ#[ j&'6'='=j,X\XfXf'g$J(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HLL..
%
 
%
!\ *k));;;;FFHkk+.L((rK   )N)r=   r>   r?   r   rE   rU   torchTensortupler   r   r   ry   rI   rJ   s   @r<   rQ   rQ   d   s    l{ ls l )-+)||+) #5<<#=>+) t+	+)
 +) +,+) 
u||U\\D00	1+)rK   rQ   c                       e Zd Zy)Olmo3DecoderLayerNrN   r1   rK   r<   r~   r~      rO   rK   r~   c                       e Zd Zy)Olmo3RotaryEmbeddingNrN   r1   rK   r<   r   r      rO   rK   r   c                       e Zd Zy)Olmo3PreTrainedModelNrN   r1   rK   r<   r   r      rO   rK   r   c                        e Zd Zdef fdZ	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	e
dz  d
ee   defdZ xZS )
Olmo3ModelrR   c           	      &   t         |   |       t        |j                  |j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |      | _        y c c}w )N)epsrR   )r6   rU   rM   hidden_sizerms_norm_epsr)   nn
ModuleListr4   r5   r~   r(   r   
rotary_embrW   s      r<   rU   zOlmo3Model.__init__   so      !3!39L9LM	mmCHIaIaCbcivy1c
 /f= ds   BNr#   r&   position_idsr   r$   	use_cacher9   rY   c           	         |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|V||j	                         nd}t        j                  |j                  d   |j                        |z   }|j                  d      }t        |x}	t              s)| j                  ||||d}
t        d
i |
t        d
i |
d}	|}| j                  ||      }t        | j                   d | j                  j"                         D ]-  \  }} ||f|	| j                  j$                  |      |||d|}/ | j'                  |      }t)        ||	      S )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r-   )device)rR   r$   r&   r   r   )r0   r/   )r&   r   r   rX   )last_hidden_stater   r1   )
ValueErrorr'   r   rR   get_seq_lengthrz   aranger^   r   	unsqueeze
isinstancedictr   r	   r   	enumerater(   r5   r+   r)   r
   )r8   r#   r&   r   r   r$   r   r9   past_seen_tokenscausal_mask_mappingmask_kwargsr%   rX   r:   decoder_layers                  r<   ry   zOlmo3Model.forward   s    -t";<YZZ *.*;*;I*FM0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-F ++!."0#2 ,K #5"C{"C%F%U%U#
 &"oom\J )$++6U8U8U*V W 	A})24;;3J3J13MN) /$7 M	 		-0&++
 	
rK   )NNNNNN)r=   r>   r?   r   rU   rz   
LongTensorr{   r   FloatTensorboolr   r   r
   ry   rI   rJ   s   @r<   r   r      s    >{ > .2.204(,26!%9
##d*9
 t+9
 &&-	9

 9
 ((4/9
 $;9
 +,9
 
!9
rK   r   c                       e Zd Zy)Olmo3ForCausalLMNrN   r1   rK   r<   r   r      rO   rK   r   )r   r   r   r   )-collections.abcr   rz   torch.nnr   huggingface_hub.dataclassesr   cache_utilsr   r   masking_utilsr   r	   modeling_outputsr
   modeling_utilsr   processing_utilsr   utilsr   utils.genericr   gemma2.modeling_gemma2r   olmo2.configuration_olmo2r   olmo2.modeling_olmo2r   r   r   r   r   r   r   r   r   rM   rQ   r~   r   r   r   r   __all__r1   rK   r<   <module>r      s    %   . . R 7 5 & # / : 3	 	 	 78/(+ /(  9/(d	< 	1)^ 1)h	) 		0 		/ 	B
 B
J	' 	rK   