
    i!                        d dl mZ d dlZd dlmZ d dlmZ d dlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZ d
dlmZmZmZ d
dlmZ d
dlmZmZmZmZmZmZ  ej>                  e       Z! ed      e G d de                    Z" G d de      Z# G d de      Z$d Z% G d de      Z& G d de      Z' G d de      Z( G d de      Z) G d d e      Z*g d!Z+y)"    )CallableN)strict)TransformersKwargs   )Cache)ALL_ATTENTION_FUNCTIONS)Unpack)auto_docstringlogging   )LlamaPreTrainedModelLlamaRMSNormeager_attention_forward)
OlmoConfig)OlmoAttentionOlmoDecoderLayerOlmoForCausalLM	OlmoModelOlmoRotaryEmbeddingapply_rotary_pos_embzallenai/Olmo2-7B-1124-hf)
checkpointc                   l    e Zd ZU dZdZddddddddZdgd	gfd
dgd
gfd
gd
gfdZdZee	d<    e
       Zy)Olmo2Configa  
    Example:

    ```python
    >>> from transformers import Olmo2Model, Olmo2Config

    >>> # Initializing a Olmo2 7B style configuration
    >>> configuration = Olmo2Config()

    >>> # Initializing a model from the Olmo2 7B style configuration
    >>> model = Olmo2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    olmo2colwise_gather_outputrowwise_split_inputcolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormgh㈵>rms_norm_epsN)__name__
__module____qualname____doc__
model_typebase_model_tp_planbase_model_pp_planr&   float__annotations__AttributeErrorclip_qkv     x/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/olmo2/modular_olmo2.pyr   r   /   su    " J%<%<%<%:"+ )"+ &(9:#%568IJ!"_$56 L%Hr3   r   c                       e Zd Zd Zy)Olmo2RMSNormc                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |z  j                  |      S )Nr   T)keepdim)	dtypetotorchfloat32powmeanrsqrtvariance_epsilonweight)selfr!   input_dtypevariances       r4   forwardzOlmo2RMSNorm.forwardZ   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UUm+//<<r3   N)r'   r(   r)   rF   r2   r3   r4   r6   r6   Y   s    =r3   r6   c                       e Zd Zy)Olmo2RotaryEmbeddingNr'   r(   r)   r2   r3   r4   rH   rH   b       r3   rH   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr8   r   )dim)shaper<   cat)xx1x2s      r4   rotate_halfrR   f   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r3   c                        e Zd Zddededz  f fdZ	 ddej                  deej                  ej                  f   dej                  dz  de	dz  d	e
e   d
eej                  ej                  dz  f   fdZ xZS )Olmo2AttentionNconfig	layer_idxc                     t         |   ||       t        |j                  | j                  z  |j
                        | _        t        |j                  | j                  z  |j
                        | _        y )NrV   )	super__init__r6   num_attention_headshead_dimr&   q_normnum_key_value_headsk_normrC   rU   rV   	__class__s      r4   rZ   zOlmo2Attention.__init__q   s[    95"6#=#=#MvObObc"6#=#=#MvObObcr3   r!   position_embeddingsr"   past_key_valueskwargsreturnc                 R   |j                   d d }g |d| j                  }| j                  | j                  |            }| j	                  | j                  |            }	| j                  |      }
|j                  |      j                  dd      }|	j                  |      j                  dd      }	|
j                  |      j                  dd      }
|\  }}t        ||	||      \  }}	| |j                  |	|
| j                        \  }	}
t        j                  | j                  j                  t               } || ||	|
|f| j"                  sdn| j$                  | j&                  d|\  }} |j(                  g |d j+                         }| j-                  |      }||fS )Nr8      r   g        )dropoutscaling)rM   r\   r]   q_projr_   k_projv_projview	transposer   updaterV   r   get_interfacerU   _attn_implementationr   trainingattention_dropoutri   reshape
contiguouso_proj)rC   r!   rb   r"   rc   rd   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                   r4   rF   zOlmo2Attention.forwardv   s    $))#2.88b8$--8{{4;;}#=>[[]!;<
{{=1#((6@@AF__\2<<QB
#((6@@AF&S#7jRUWZ#[ j&'6'='=j,X\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r3   )N)r'   r(   r)   r   intrZ   r<   Tensortupler   r	   r   rF   __classcell__ra   s   @r4   rT   rT   p   s    d{ dsTz d )-*)||*) #5<<#=>*) t+	*)
 *) +,*) 
u||U\\D00	1*)r3   rT   c                       e Zd Zdedef fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
eej                  ej                  f   dz  dee   dej                  fdZ xZS )Olmo2DecoderLayerrU   rV   c                     t         |   ||       t        |j                  |j                        | _        t        |j                  |j                        | _        t        ||      | _        | `	y )NrX   eps)rU   rV   )
rY   rZ   r6   hidden_sizer&   post_attention_layernormpost_feedforward_layernormrT   	self_attninput_layernormr`   s      r4   rZ   zOlmo2DecoderLayer.__init__   s_    95(4V5G5GVM`M`(a%*6v7I7IvObOb*c''vK r3   Nr!   r"   position_idsrc   	use_cacherb   rd   re   c           
          |} | j                   d||||||d|\  }}	| j                  |      }||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r!   r"   r   rc   r   rb   r2   )r   r   mlpr   )
rC   r!   r"   r   rc   r   rb   rd   residual_s
             r4   rF   zOlmo2DecoderLayer.forward   s     !)4>> 
')%+ 3
 
q 55mD =0 !/77F =0r3   )NNNFN)r'   r(   r)   r   r   rZ   r<   r   
LongTensorr   boolr   r	   r   rF   r   r   s   @r4   r   r      s    !{ !s ! /304(,!&HL|| t+ &&-	
  $; #5<<#=>E +, 
r3   r   c                       e Zd Zy)Olmo2PreTrainedModelNrI   r2   r3   r4   r   r      rJ   r3   r   c                   $     e Zd Zdef fdZ xZS )
Olmo2ModelrU   c           	         t         |   |       t        |j                  |j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        y c c}w )Nr   )rY   rZ   r6   r   r&   r%   nn
ModuleListrangenum_hidden_layersr   r$   r`   s      r4   rZ   zOlmo2Model.__init__   s^      !3!39L9LM	mmCHIaIaCbcivy1c
cs   A=)r'   r(   r)   r   rZ   r   r   s   @r4   r   r      s    
{ 
 
r3   r   c                       e Zd Zy)Olmo2ForCausalLMNrI   r2   r3   r4   r   r      rJ   r3   r   )r   r   r   r   ),collections.abcr   r<   torch.nnr   huggingface_hub.dataclassesr   transformers.utils.genericr   cache_utilsr   modeling_utilsr   processing_utilsr	   utilsr
   r   llama.modeling_llamar   r   r   olmo.configuration_olmor   olmo.modeling_olmor   r   r   r   r   r   
get_loggerr'   loggerr   r6   rH   rR   rT   r   r   r   r   __all__r2   r3   r4   <module>r      s   ( %   . 9   5 & , ^ ^ 0  
		H	% 56# * #   7# P=< =	. 	(0)] 0)l$( $N	/ 	
 
	 	r3   