
    i-c                        d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z%  ejL                  e'      Z( G d dejR                        Z* G d dejV                        Z, G d dejV                        Z- G d de      Z.e G d de             Z/e G d de/             Z0 ed        G d! d"e/e             Z1g d#Z2y)$zPyTorch XGLM model.    N)nn   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions)PreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)merge_with_config_defaults)OutputRecordercapture_outputs   )
XGLMConfigc            
       `     e Zd ZdZd
dededededz  f fdZdej                  f fd	Z	 xZ
S )XGLMScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    num_embeddingsembedding_dimpadding_idxembed_scaleNc                 6    t         |   |||       || _        y N)super__init__r   )selfr   r   r   r   	__class__s        w/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/xglm/modeling_xglm.pyr#   z XGLMScaledWordEmbedding.__init__-   s    D&    	input_idsc                 <    t         |   |      | j                  z  S r!   )r"   forwardr   )r$   r(   r%   s     r&   r*   zXGLMScaledWordEmbedding.forward1   s    wy)D,<,<<<r'   )      ?)__name__
__module____qualname____doc__intfloatr#   torchTensorr*   __classcell__r%   s   @r&   r   r   (   sE    's '3 'S '_dgk_k '= = =r'   r   c            	            e Zd ZdZddedededz  f fdZddedededz  fdZeddedededz  fd	       Z e	j                         dd
e	j                  dz  defd       Z xZS )!XGLMSinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.Nnum_positionsr   r   c                     t         |           d| _        || _        || _        || _        | j                  || j                  z   ||       y )N   )r"   r#   offsetr8   r   r   make_weights)r$   r8   r   r   r%   s       r&   r#   z*XGLMSinusoidalPositionalEmbedding.__init__8   sH    **&-$++5}kRr'   r   c                     | j                  |||      }t        | d      r;|j                  | j                  j                  | j                  j
                        }| j                  d|d       y )NweightsdtypedeviceF)
persistent)get_embeddinghasattrtor>   r@   rA   register_buffer)r$   r   r   r   emb_weightss        r&   r<   z.XGLMSinusoidalPositionalEmbedding.make_weights@   s[    ((T4#%..t||/A/A$,,J]J].^KYFr'   c                    |dz  }t        j                  d      |dz
  z  }t        j                  t        j                  |t        j
                        j                         | z        }t        j                  | t        j
                        j                         j                  d      |j                  d      z  }t        j                  t        j                  |      t        j                  |      gd      j                  | d      }|dz  dk(  r-t        j                  |t        j                  | d      gd      }|	d||ddf<   |j                  t        j                               S )	z
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        r:   i'  r   )r@   r   dimN)mathlogr2   exparangeint64r1   	unsqueezecatsincosviewzerosrE   get_default_dtype)r   r   r   half_dimembs        r&   rC   z/XGLMSinusoidalPositionalEmbedding.get_embeddingH   s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r'   position_idspast_key_values_lengthc                    |j                         \  }}|| j                  z   }d|z   |z   }|| j                  j                  d      kD  r'| j                  || j                  | j
                         | j                  j                  d|j                  d            j                  ||| j                  j                  d         j                         S )Nr:   r   rK   )
sizer;   r>   r<   r   r   index_selectrU   shapedetach)r$   rZ   r[   bszseq_lenmax_poss         r&   r*   z)XGLMSinusoidalPositionalEmbedding.forward]   s    #((*W#dkk1g+ 66T\\&&q))gt'9'94;K;KL||((L,=,=b,ABGGWVZVbVbVhVhikVlmttvvr'   r!   )Nr   )r,   r-   r.   r/   r0   r#   r<   staticmethodrC   r2   no_gradr3   r*   r4   r5   s   @r&   r7   r7   5   s    NSc S# SCRVJ SG3 Gs GQTW[Q[ G 1c 1# 1CRVJ 1 1( U]]_wELL4$7 wX[ w wr'   r7   c                   4    e Zd ZdZ	 	 	 	 ddedededz  dedz  dedz  dedz  f fd	Z	 	 	 dd
ej                  dej                  dz  de
dz  dej                  dz  dee   deej                  ej                  dz  eej                     dz  f   fdZ xZS )XGLMAttentionz=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsdropout
is_decoderbias	layer_idxc                    t         |           || _        || _        || _        ||z  | _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _        || _	        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rl   )r"   r#   rh   ri   rj   head_dim
ValueErrorscalingrk   rm   r   Lineark_projv_projq_projout_proj)r$   rh   ri   rj   rk   rl   rm   r%   s          r&   r#   zXGLMAttention.__init__l   s     	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"ii	94@ii	94@ii	94@		)YTBr'   hidden_stateskey_value_statespast_key_valuesattention_maskkwargsreturnc                    |du}|j                         \  }}}	|r|j                  d   n|}
| j                  |      | j                  z  }d}|St	        |t
              rA|j                  j                  | j                        }|r|j                  }n|j                  }n|}|r|n|}|rK|I|rGj                  | j                     j                  }|j                  | j                     j                  }n| j                  |      }| j                  |      }|j!                  ||
d| j"                        j%                  dd      }|j!                  ||
d| j"                        j%                  dd      }|Kj'                  ||| j                        \  }}|r)t	        |t
              rd|j                  | j                  <   || j(                  z  d| j"                  f}|j!                  ||| j(                  | j"                        j%                  dd      } |j*                  | } |j*                  | } |j*                  | }|j                  d      }
t-        j.                  ||j%                  dd            }|j                         || j(                  z  ||
fk7  r/t1        d|| j(                  z  ||
f d|j                                ||j                         |d||
fk7  r#t1        d	|d||
f d|j                                |j!                  || j(                  ||
      |z   }t-        j2                  |t-        j4                  t-        j6                  |j8                        j:                  |j<                  
            }|j!                  || j(                  z  ||
      }|j8                  t,        j>                  k(  rNt@        jB                  jE                  |dt,        jF                        jI                  t,        j>                        }n!t@        jB                  jE                  |d      }|j!                  || j(                  ||
      }|j!                  || j(                  z  ||
      }t@        jB                  jK                  || jJ                  | jL                        }t-        j.                  ||      }|j                         || j(                  z  || j"                  fk7  r7t1        d|| j(                  || j"                  f d|j                                |j!                  || j(                  || j"                        }|j%                  dd      }|j+                  ||| jN                        }| jQ                  |      }||fS )z#Input shape: Batch x Time x ChannelNr   FrK   r:   Tz$Attention weights should be of size z	, but is z!Attention mask should be of size )rA   )rJ   r@   rI   ptrainingz `attn_output` should be of size ))r]   r_   rv   rr   
isinstancer	   
is_updatedgetrm   cross_attention_cacheself_attention_cachelayerskeysvaluesrt   ru   rU   rp   	transposeupdateri   reshaper2   bmmrq   maxtensorfinfor@   minrA   float16r   
functionalsoftmaxfloat32rE   rj   r   rh   rw   )r$   rx   ry   rz   r{   r|   is_cross_attentionra   tgt_len_src_lenquery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_states
proj_shapeattn_weightsattn_weights_reshaped
attn_probsattn_outputs                         r&   r*   zXGLMAttention.forward   s    .T9',,.Wa/A"((+w {{=1DLL@
&/+>?,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML^4J;;~6L#gr4==ISSTUWXYJ',,S'2t}}MWWXY[\]L*+?+F+FzS_aeaoao+p(
L%*_FY*ZAEO..t~~>DNN*B>
#((gt~~t}}U__`acde+|++Z8'Z''4
+|++Z8//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(* 
 %""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S$..'7SVddL 99ell5;;|7I7I+J+N+NWcWjWjkL (,,S4>>-A7GTL .==002U]]0[^^_d_l_lmL==0020FL !- 1 1#t~~wPW X,11#2FQXY]]**<4<<RVR_R_*`
ii
L9#"6!OO2CRVR_R_3`2a b$$&') 
 "&&sDNNGT]]S!++Aq1 "))#wGmmK0111r'   )g        FTN)NNN)r,   r-   r.   r/   r0   r1   boolr#   r2   r3   r   r   r   tupler*   r4   r5   s   @r&   rg   rg   i   s   G !$"' !%CC C 	C
 4KC TkC $;C@ 15(,.2l2||l2  ,,-l2 	l2
 t+l2 +,l2 
u||U\\D0%2E2LL	Ml2r'   rg   c                        e Zd Zddef fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  dedz  d	edz  d
e	e
   dej                  fdZ xZS )XGLMDecoderLayerNconfigc                 0   t         |           |j                  | _        t	        | j                  |j
                  |j                  d|      | _        |j                  | _        t        |j                     | _        |j                  | _        |j                  rWt	        | j                  |j
                  |j                  d|      | _        t        j                   | j                        | _        t        j                   | j                        | _        t        j&                  | j                  |j(                        | _        t        j&                  |j(                  | j                        | _        t        j                   | j                        | _        y )NT)rh   ri   rj   rk   rm   )r"   r#   d_modelrh   rg   attention_headsattention_dropout	self_attnrj   r   activation_functionactivation_fnactivation_dropoutadd_cross_attentionencoder_attnr   	LayerNormencoder_attn_layer_normself_attn_layer_normrs   ffn_dimfc1fc2final_layer_norm)r$   r   rm   r%   s      r&   r#   zXGLMDecoderLayer.__init__   s   &nn,,,,
 ~~#F$>$>?"(";";%% -.. 0000#!D ,.<<+GD($&LL$@!99T^^V^^<99V^^T^^< "T^^ <r'   rx   r{   encoder_hidden_statesencoder_attention_maskrz   	use_cacher|   r}   c                    |}| j                  |      } | j                  |f||d|\  }}	t        j                  j	                  || j                  | j
                        }||z   }|h|}| j                  |      } | j                  |f|||d|\  }}	t        j                  j	                  || j                  | j
                        }||z   }|}| j                  |      }| j                  | j                  |            }t        j                  j	                  || j                  | j
                        }| j                  |      }t        j                  j	                  || j                  | j
                        }||z   }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            past_key_values (`Cache`): cached past key and value projection states
        )rz   r{   r   )ry   r{   rz   )r   r   r   r   rj   r   r   r   r   r   r   r   r   )
r$   rx   r{   r   r   rz   r   r|   residualr   s
             r&   r*   zXGLMDecoderLayer.forward  s   * !11-@ *4>>
+)
 	
q --mt||VZVcVc-d =0 !,$H 88GM0t00 !65 /	 
  M1 MM11-4<<Z^ZgZg1hM$}4M !--m<**488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0r'   r!   )NNNNT)r,   r-   r.   r   r#   r2   r3   r   r   r   r   r*   r4   r5   s   @r&   r   r      s    =z =D /3596:(,!%:||: t+:  %||d2	:
 !&t 3: : $;: +,: 
:r'   r   c                   8     e Zd ZU eed<   dZdZdgZ fdZ xZ	S )XGLMPreTrainedModelr   modelTr   c                    t         |   |       t        |t              r_|j	                  |j
                  |j                  z   |j                  |j                        }t        j                  |j                  |       y y r!   )r"   _init_weightsr   r7   rC   r8   r;   r   r   initcopy_r>   )r$   modulerG   r%   s      r&   r   z!XGLMPreTrainedModel._init_weights\  sg    f%f?@ ..$$v}}4f6J6JFL^L^K JJv~~{3	 Ar'   )
r,   r-   r.   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modulesr   r4   r5   s   @r&   r   r   U  s(    &*#+,4 4r'   r   c                       e Zd Ze eedd       eedd      dZdef fdZe	e
e	 	 	 	 	 	 	 	 dd	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  dedz  dee   deej                     ez  fd                     Z xZS )	XGLMModelr   r   )index
layer_namer   )rx   
attentionscross_attentionsr   c           	         t         |   |       |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  rt        j                  |j                        nd}t        |j                  |j                  | j
                  |      | _        t        |j                  |j                  |j                        | _        t#        j$                  t'        |j(                        D cg c]  }t+        ||       c}      | _        t#        j.                  |j                        | _        d| _        | j5                          y c c}w )Nr+   )r   )rm   F)r"   r#   rj   	layerdroppad_token_idr   max_position_embeddingsmax_target_positionsscale_embeddingrL   sqrtr   r   
vocab_sizeembed_tokensr7   embed_positionsr   
ModuleListrange
num_layersr   r   r   
layer_normgradient_checkpointing	post_init)r$   r   r   ir%   s       r&   r#   zXGLMModel.__init__m  s    ~~))!..$*$B$B!393I3Idii/s3v~~t/?/?[
  A**NN 

 mmTYZ`ZkZkTl$mq%5f%J$mn,,v~~6&+# %ns   
E&Nr(   r{   rZ   r   r   rz   inputs_embedsr   r|   r}   c	                 D   |du |duz  rt        d      || j                  |      }|rd|b|| j                  j                  r4t	        t        | j                        t        | j                              nt        | j                        }||j                         nd}
t        | j                  |||      }|`t        j                  |
|j                  d   |
z   t        j                  ||j                  n|j                        }|j                  d      }||t        | j                  |||      }|| j                  ||
      j!                  |j                        z   }t"        j$                  j'                  |t)        | j&                        | j*                  	      }t-        | j.                        D ]E  \  }}| j*                  r%t        j0                  g       }|| j2                  k  r7 ||||f|||d
|	}G | j5                  |      }t7        ||      S )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   )r   r   r{   rz   r   r?   )r   r   r{   r   r   )r   rz   r   )last_hidden_staterz   )rq   r   r   is_encoder_decoderr	   r   get_seq_lengthr   r2   rO   r_   longrA   rQ   r   r   rE   r   r   rj   r1   r   	enumerater   randr   r   r   )r$   r(   r{   rZ   r   r   rz   r   r   r|   r[   rx   idxdecoder_layerdropout_probabilitys                  r&   r*   zXGLMModel.forward  s0   8 -t";<YZZ  --i8M 0 )48V8V $L$DlZ^ZeZeFfg!5  FUE`!?!?!Afg+;;')+	
  <<&##A&)??jj+4+@y''mFZFZ	L (11!4L !,1G1S%>{{+5&;	&" &(<(<\Ka(b(e(e  )
 
 --muT\\?R]a]j]j-k"+DKK"8 	C}}&+jjn#&7)% (> /# M	" 68++
 	
r'   )NNNNNNNN)r,   r-   r.   r   r   rg   _can_record_outputsr   r#   r   r   r   r2   r3   r   r   r   r   r   r   r*   r4   r5   s   @r&   r   r   e  s3    *$]!T*=n]z 0   *..2,0596:(,-1!%]
<<$&]
 t+]
 llT)	]

  %||d2]
 !&t 3]
 ]
 ||d*]
 $;]
 +,]
 
u||	H	H]
    ]
r'   r   z
    The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                       e Zd ZdZddiZ fdZeee	 	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  dedz  de	j                  dz  de	j                  dz  dedz  dee	j                  z  dee   dee	j                     ez  fd                     Z xZS )XGLMForCausalLMr   zlm_head.weightzmodel.embed_tokens.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFro   )
r"   r#   r   r   r   rs   hidden_sizer   lm_headr   )r$   r   r%   s     r&   r#   zXGLMForCausalLM.__init__  sH     v&
yy!3!3V5F5FUS 	r'   Nr(   r{   rZ   r   r   rz   r   labelsr   logits_to_keepr|   r}   c                     | j                   d||||||||	d|}|j                  }t        |
t              rt	        |
 d      n|
}| j                  |dd|ddf         }d}|=| j                  ||| j                  j                  | j                  j                        }t        |||j                  |j                  |j                  |j                        S )ai  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        )r(   r{   rZ   r   r   rz   r   r   N)r   r   )losslogitsrz   rx   r   r    )r   r   r   r0   slicer   loss_functionr   r   r   r   rz   rx   r   r   )r$   r(   r{   rZ   r   r   rz   r   r   r   r   r|   outputsrx   slice_indicesr   r   s                    r&   r*   zXGLMForCausalLM.forward  s    F >HTZZ 
>
)%"7#9+'
>
 
>
  118B>SV8W~ot4]kmA}a,?@A%%;;11![[55	 & D 1#33!//))$55
 	
r'   )
NNNNNNNNNr   )r,   r-   r.   r   _tied_weights_keysr#   r   r   r   r2   r3   r   r   r0   r   r   r   r   r*   r4   r5   s   @r&   r   r     sI     *,GH   *..2,0596:(,-1&*!%-.A
<<$&A
 t+A
 llT)	A

  %||d2A
 !&t 3A
 A
 ||d*A
 t#A
 $;A
 ell*A
 +,A
 
u||	@	@A
    A
r'   r   )r   r   r   )3r/   rL   r2   r    r   r   activationsr   cache_utilsr   r   r	   
generationr
   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   r   configuration_xglmr   
get_loggerr,   logger	Embeddingr   Moduler7   rg   r   r   r   r   __all__r   r'   r&   <module>r     s	       & ! C C ) J 9 l - & @ @ 7 E * 
		H	%
=bll 
=1w		 1whL2BII L2^Z1 Zz 4/ 4 4 
# 
 
D P
)? P
P
f Br'   