
    i"                     B   d dl Z d dl mZ ddlmZmZ ddlmZ ddlmZm	Z	 ddl
mZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ  ej:                  e      Z G d de      Z  G d de      Z! G d de      Z" G d de      Z# G d de      Z$g dZ%y)    N)nn   )CacheDynamicCache)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)capture_outputs   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaPreTrainedModel   )GraniteConfigc                   4     e Zd ZdZddededz  f fdZ xZS )GraniteAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                 H    t         |   ||       |j                  | _        y N)super__init__attention_multiplierscalingselfr   r   	__class__s      |/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/granite/modular_granite.pyr    zGraniteAttention.__init__*   s    +22    r   )__name__
__module____qualname____doc__r   intr    __classcell__r%   s   @r&   r   r   '   s"    G3} 3t 3 3r'   r   c                       e Zd Zdedef fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
eej                  ej                  f   dz  dee   dej                  fdZ xZS )GraniteDecoderLayerr   r   c                 l    t         |   ||       |j                  | _        t        ||      | _        y )N)r   r   )r   r    residual_multiplierr   	self_attnr#   s      r&   r    zGraniteDecoderLayer.__init__0   s.    +#)#=#= )9Mr'   Nhidden_statesattention_maskposition_idspast_key_values	use_cacheposition_embeddingskwargsreturnc           
          |}| j                  |      } | j                  d||||||d|\  }}	||| j                  z  z   }|}| j                  |      }| j	                  |      }||| j                  z  z   }|S )af  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r4   r5   r6   r7   r8   r9    )input_layernormr3   r2   post_attention_layernormmlp)
r$   r4   r5   r6   r7   r8   r9   r:   residual_s
             r&   forwardzGraniteDecoderLayer.forward5   s    < !,,];)4>> 
')%+ 3
 
q !=43K3K#KK 55mD/ =43K3K#KKr'   )NNNFN)r(   r)   r*   r   r,   r    torchTensor
LongTensorr   booltupler
   r   rC   r-   r.   s   @r&   r0   r0   /   s    N} N N /304(,!&HL2||2 t+2 &&-	2
 2 $;2 #5<<#=>E2 +,2 
2r'   r0   c                       e Zd ZeedZy)GranitePreTrainedModel)r4   
attentionsN)r(   r)   r*   r0   r   _can_record_outputsr=   r'   r&   rJ   rJ   j   s    ,&r'   rJ   c                        e Zd Zdef fdZeee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	edz  d
ee   defd                     Z xZS )GraniteModelr   c           	          t         |   |       |j                  | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        y c c}w r   )	r   r    embedding_multiplierr   
ModuleListrangenum_hidden_layersr0   layersr#   s      r&   r    zGraniteModel.__init__r   sR     $*$?$?!mmEJ6KcKcEde	 3e
es   A(N	input_idsr5   r6   r7   inputs_embedsr8   r:   r;   c           
      Z   |d u |d uz  rt        d      || j                  |      }|| j                  z  }|r|t        | j                        }|V||j                         nd}t        j                  |j                  d   |j                        |z   }|j                  d      }t        | j                  ||||      }	|}
| j                  |
|      }| j                  d | j                  j                   D ]  } ||
f|	||||d|}
 | j                  |
      }
t!        |
|	      S )
Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   r   )device)r   rV   r5   r7   r6   )r6   )r5   r6   r7   r8   r9   )last_hidden_stater7   )
ValueErrorembed_tokensrP   r   r   get_seq_lengthrD   arangeshaperX   	unsqueezer   
rotary_embrT   rS   normr   )r$   rU   r5   r6   r7   rV   r8   r:   past_seen_tokenscausal_maskr4   r9   decoder_layers                r&   rC   zGraniteModel.forwardy   s\    -t";<YZZ  --i8M%(A(AA0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 &"oom,oW![[)H4;;+H+HI 		M)*) /#$7 M		 		-0&++
 	
r'   )NNNNNN)r(   r)   r*   r   r    r   r   r   rD   rF   rE   r   FloatTensorrG   r
   r   r   rC   r-   r.   s   @r&   rN   rN   q   s    
} 
   .2.204(,26!%5
##d*5
 t+5
 &&-	5

 5
 ((4/5
 $;5
 +,5
 
!5
    5
r'   rN   c                      e Zd Zee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  dej                  dz  de
dz  d	eej                  z  d
ee   defd              Zy)GraniteForCausalLMNrU   r5   r6   r7   rV   labelsr8   logits_to_keepr:   r;   c	           
          | j                   d||||||d|	}
|
j                  }t        |t              rt	        | d       n|}| j                  |d d |d d f         }|| j                  j                  z  }d }|* | j                  d||| j                  j                  d|	}t        |||
j                  |
j                  |
j                        S )N)rU   r5   r6   r7   rV   r8   )logitsrh   
vocab_size)lossrk   r7   r4   rK   r=   )modelrY   
isinstancer,   slicelm_headr   logits_scalingloss_functionrl   r	   r7   r4   rK   )r$   rU   r5   r6   r7   rV   rh   r8   ri   r:   outputsr4   slice_indicesrk   rm   s                  r&   rC   zGraniteForCausalLM.forward   s     ,64:: ,
)%+',
 ,
  118B>SV8W~ot4]kmA}a,?@A$++444%4%%pVFt{{OeOepiopD%#33!//))
 	
r'   )NNNNNNNr   )r(   r)   r*   r   r   rD   rF   rE   r   re   rG   r,   r
   r   r	   rC   r=   r'   r&   rg   rg      s     .2.204(,26*.!%-.%
##d*%
 t+%
 &&-	%

 %
 ((4/%
   4'%
 $;%
 ell*%
 +,%
 
 %
  %
r'   rg   )rg   rN   rJ   )&rD   r   cache_utilsr   r   masking_utilsr   modeling_outputsr   r	   processing_utilsr
   utilsr   r   r   r   utils.genericr   utils.output_capturingr   llama.modeling_llamar   r   r   r   r   configuration_graniter   
get_loggerr(   loggerr   r0   rJ   rN   rg   __all__r=   r'   r&   <module>r      s       . / O & R R 7 5  1 
		H	%3~ 38+ 8v1 @
: @
F(
) (
V Kr'   