
    ih                        d dl Z d dlmZ d dlZd dlmZ d dlmZmZmZ ddl	m
Z ddlmZ ddlmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZmZmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4  e,jj                  e6      Z7 G d dejp                        Z9e) G d de$             Z: G d dejp                        Z;	 	 dBdejx                  dejz                  dejz                  dejz                  dejz                  dz  d e>dz  d!e>d"e&e(   fd#Z? G d$ d%ejx                        Z@ G d& d'e      ZA G d( d)e:      ZB G d* d+e      ZC G d, d-e:      ZDd.ejz                  d/eEfd0ZFe) G d1 d2e:             ZG e)d34       G d5 d6e:e             ZH G d7 d8ejx                        ZI e)d94       G d: d;e:             ZJ G d< d=e:      ZK e)d>4       G d? d@e:e             ZLg dAZMy)C    N)Callable)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSeq2SeqSequenceClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilingloggingtorch_compilable_check)merge_with_config_defaults)OutputRecordercapture_outputs   )PLBartConfigc            
       `     e Zd ZdZd
dededededz  f fdZdej                  f fd	Z	 xZ
S )PLBartScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    num_embeddingsembedding_dimpadding_idxembed_scaleNc                 6    t         |   |||       || _        y N)super__init__r,   )selfr)   r*   r+   r,   	__class__s        {/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/plbart/modeling_plbart.pyr0   z"PLBartScaledWordEmbedding.__init__B   s    D&    	input_idsc                 <    t         |   |      | j                  z  S r.   )r/   forwardr,   )r1   r5   r2   s     r3   r7   z!PLBartScaledWordEmbedding.forwardF   s    wy)D,<,<<<r4   )      ?__name__
__module____qualname____doc__intfloatr0   torchTensorr7   __classcell__r2   s   @r3   r(   r(   =   sE    's '3 'S '_dgk_k '= = =r4   r(   c                   F     e Zd ZU eed<   dZdZddgZdZdZ	dZ
 fdZ xZS )PLBartPreTrainedModelconfigmodelTPLBartDecoderLayerPLBartEncoderLayerc                     t         |   |       t        |t              r t	        j
                  |j                         y y r.   )r/   _init_weights
isinstancePLBartForConditionalGenerationinitzeros_final_logits_bias)r1   moduler2   s     r3   rK   z#PLBartPreTrainedModel._init_weightsT   s2    f%f<=KK001 >r4   )r:   r;   r<   r&   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attnrK   rB   rC   s   @r3   rE   rE   J   s<    &*#-/CDN2 2r4   rE   c                   v     e Zd ZdZdedef fdZ	 d
dej                  dedej                  dz  f fd	Z xZ	S ) PLBartLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    r)   r*   c                 N    d| _         t        | 	  || j                   z   |       y )N   )offsetr/   r0   )r1   r)   r*   r2   s      r3   r0   z)PLBartLearnedPositionalEmbedding.__init___   s$     $++5}Er4   Nr5   past_key_values_lengthposition_idsc                 $   |a|j                   dd \  }}t        j                  |||z   t        j                  | j                  j
                        j                  |d      }n|j                  d      }t        | %  || j                  z         S )z3`input_ids' shape is expected to be [bsz x seqlen].Nr\   )dtypedevicer   )shaper@   arangelongweightrb   expand	unsqueezer/   r7   r]   )r1   r5   r^   r_   bszseq_lenr2   s         r3   r7   z(PLBartLearnedPositionalEmbedding.forwarde   s    
 $??2A.LC <<&(>(HPUPZPZcgcncncucufS"o  (11!4Lw|dkk9::r4   )r   N)
r:   r;   r<   r=   r>   r0   r@   rA   r7   rB   rC   s   @r3   rZ   rZ   Z   sW    Fs F3 F mq;;?B;V[VbVbeiVi; ;r4   rZ   rQ   querykeyvalueattention_maskscalingdropoutkwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nrc         r\   r   dimptrainingr%   )
sizer@   matmul	transposer   
functionalsoftmaxrq   ry   
contiguous)
rQ   rl   rm   rn   ro   rp   rq   rr   attn_weightsattn_outputs
             r3   eager_attention_forwardr   u   s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r4   c                       e Zd ZdZ	 	 	 	 	 	 ddedededededed	edz  d
edz  f fdZ	 	 	 dde	j                  de	j                  dz  dedz  de	j                  dz  dee   dee	j                  e	j                  dz  f   fdZ xZS )PLBartAttentionz=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsrq   
is_decoderbias	is_causalrF   	layer_idxc	                    t         	|           || _        || _        || _        ||z  | _        || _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _	        || _
        || _        |9| j                  r-t        j                  d| j                  j                   d       t!        j"                  |||      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).rt   zInstantiating a decoder z without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.r   )r/   r0   r   r   rq   head_dimrF   
ValueErrorrp   r   r   r   loggerwarning_oncer2   r:   r   Lineark_projv_projq_projout_proj)
r1   r   r   rq   r   r   r   rF   r   r2   s
            r3   r0   zPLBartAttention.__init__   s$    	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$""*4>>+B+B*C D, , ii	94@ii	94@ii	94@		)YTBr4   hidden_stateskey_value_statespast_key_valuesro   rr   returnc                    |du}|j                   dd }g |d| j                  }| j                  |      j                  |      j	                  dd      }	d}
|St        |t              rA|j                  j                  | j                        }
|r|j                  }n|j                  }n|}|r|n|}|rK|I|
rGj                  | j                     j                  }|j                  | j                     j                  }n| j                  |      }| j!                  |      }g |j                   dd d| j                  }|j                  |      j	                  dd      }|j                  |      j	                  dd      }|Kj#                  ||| j                        \  }}|r)t        |t              rd|j                  | j                  <   t%        j&                  | j(                  j*                  t,              } || |	|||f| j.                  sdn| j0                  | j2                  d|\  }} |j4                  g |d j7                         }| j9                  |      }||fS )	z#Input shape: Batch x Time x ChannelNrc   r%   r\   FT        )rq   rp   )rd   r   r   viewr|   rL   r   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   updater   get_interfacerF   _attn_implementationr   ry   rq   rp   reshaper   r   )r1   r   r   r   ro   rr   is_cross_attentioninput_shapehidden_shapequery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_stateskv_shapeattention_interfacer   r   s                      r3   r7   zPLBartAttention.forward   sd    .T9 $))#2.88b8$--8 {{=166|DNNqRST
&/+>?,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML^4J;;~6LF--cr2FBFFH#2<<QBJ',,X6@@AFL*+?+F+FzS_aeaoao+p(
L%*_FY*ZAEO..t~~>(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ *k));;;;FFHmmK0L((r4   )r   FTFNNNNN)r:   r;   r<   r=   r>   r?   boolr&   r0   r@   rA   r   r   r   tupler7   rB   rC   s   @r3   r   r      s   G  &* $%C%C %C 	%C
 %C %C %C t#%C :%CT 15(,.2H)||H)  ,,-H) 	H)
 t+H) -.H) 
u||U\\D00	1H)r4   r   c                        e Zd Zd
dededz  f fdZdej                  dej                  dee	   dej                  fd	Z xZS )rI   NrF   r   c                 j   t         |           |j                  | _        t	        | j                  |j
                  |j                  ||      | _        t        j                  | j                        | _
        |j                  | _        t        |j                     | _        |j                  | _        t        j                   | j                  |j"                        | _        t        j                   |j"                  | j                        | _        t        j                  | j                        | _        y )N)r   r   rq   rF   r   )r/   r0   d_modelr   r   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normrq   r
   activation_functionactivation_fnactivation_dropoutr   encoder_ffn_dimfc1fc2final_layer_normr1   rF   r   r2   s      r3   r0   zPLBartEncoderLayer.__init__  s    (nn44,,
 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r4   r   ro   rr   r   c                 F   |} | j                   |fd|i|\  }}t        j                  j                  || j                  | j                        }||z   }| j                  |      }|}| j                  | j                  |            }t        j                  j                  || j                  | j                        }| j                  |      }t        j                  j                  || j                  | j                        }||z   }| j                  |      }|j                  t        j                  k(  rht        j                  |      j                         sEt        j                   |j                        j"                  dz
  }t        j$                  || |      }|S )Nro   rw   i  )minmax)r   r   r}   rq   ry   r   r   r   r   r   r   ra   r@   float16isfiniteallfinfor   clamp)r1   r   ro   rr   residual_clamp_values          r3   r7   zPLBartEncoderLayer.forward  sh    !)4>>
)
 
q
 --mt||VZVcVc-d =011-@ **488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0--m<%--/}8U8Y8Y8[++m&9&9:>>EK!KKK<[YMr4   r.   )r:   r;   r<   r&   r>   r0   r@   FloatTensorr   r   rA   r7   rB   rC   s   @r3   rI   rI     s[    =| =d
 =&(( )) +,	
 
r4   rI   c                        e Zd ZdZeedZdef fdZe	e
e	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	ee   d
ef
d                     Z xZS )PLBartEncoderz
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`PLBartEncoderLayer`].

    Args:
        config: PLBartConfig
        embed_tokens (nn.Embedding): output embedding
    )r   
attentionsrF   c           	         t         |   |       |j                  | _        |j                  | _        |j
                  }|j                  | _        |j                  | _	        |j                  rt        j                  |      nd}t        |j                  || j                  |      | _        t!        |j                  |      | _        t%        j&                  t)        |j*                        D cg c]  }t-        ||       c}      | _        t%        j0                  |      | _        d| _        | j7                          y c c}w Nr8   r,   )r   F)r/   r0   rq   encoder_layerdrop	layerdropr   pad_token_idr+   max_position_embeddingsmax_source_positionsscale_embeddingmathsqrtr(   
vocab_sizeembed_tokensrZ   embed_positionsr   
ModuleListrangeencoder_layersrI   r   r   layernorm_embeddinggradient_checkpointing	post_init)r1   rF   r   r,   ir2   s        r3   r0   zPLBartEncoder.__init__H  s     ~~11NN	!..$*$B$B!.4.D.Ddii	*#5y$*:*:
  @** 
 mmV[\b\q\qVr$sQR%7!%L$st#%<<	#: &+# %ts   -D?Nr5   ro   inputs_embedsrr   r   c                 X   |d u |d uz  rt        d      || j                  |      }| j                  |d d d d df         }|j                  |j                        }||z   }| j                  |      }t        j                  j                  || j                  | j                        }t        | j                  ||      }t        | j                        D ]F  \  }}d}	| j                  r&t        j                  g       }
|
| j                   k  rd}	|	r= |||fi |}H t#        |      S )Nz:You must specify exactly one of input_ids or inputs_embedsrc   rw   )rF   r   ro   FT)last_hidden_state)r   r   r   torb   r   r   r}   rq   ry   r   rF   	enumerater   r@   randr   r   )r1   r5   ro   r   rr   	embed_posr   idxencoder_layerto_dropdropout_probabilitys              r3   r7   zPLBartEncoder.forwardb  s7    -t";<YZZ  --i8M((q!Rx)@A	LL!5!56	%	100?--mt||VZVcVc-d2;;')

 #,DKK"8 	CG}}&+jjn#&7"G -!"! !	 +
 	
r4   r   )r:   r;   r<   r=   rI   r   _can_record_outputsr&   r0   r"   r$   r   r@   
LongTensorrA   r   r   r   r   r7   rB   rC   s   @r3   r   r   9  s     ,%
| 4   .2.226	*
##d**
 t+*
 ((4/	*

 +,*
 
*
    *
r4   r   c                        e Zd Zddededz  f fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  d	edz  d
e	dz  de
e   dej                  fdZ xZS )rH   NrF   r   c           	         t         |           |j                  | _        t	        | j                  |j
                  |j                  dd||      | _        |j                  | _        t        |j                     | _        |j                  | _        t        j                  | j                        | _        t	        | j                  |j
                  |j                  d||      | _        t        j                  | j                        | _        t        j$                  | j                  |j&                        | _        t        j$                  |j&                  | j                        | _        t        j                  | j                        | _        y )NT)r   r   rq   r   r   rF   r   )rq   r   rF   r   )r/   r0   r   r   r   decoder_attention_headsr   r   rq   r
   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normr   decoder_ffn_dimr   r   r   r   s      r3   r0   zPLBartDecoderLayer.__init__  s    (nn44,,
 ~~#F$>$>?"(";";$&LL$@!+NN**,,
 (*||DNN'C$99T^^V-C-CD99V33T^^D "T^^ <r4   r   ro   encoder_hidden_statesencoder_attention_maskr   	use_cacherr   r   c                    |} | j                   |f||d|\  }}	t        j                  j                  || j                  | j                        }||z   }| j                  |      }|h|} | j                  |f|||d|\  }}	t        j                  j                  || j                  | j                        }||z   }| j                  |      }|}| j                  | j                  |            }t        j                  j                  || j                  | j                        }| j                  |      }t        j                  j                  || j                  | j                        }||z   }| j                  |      }|S )N)r   ro   rw   )r   ro   r   )r   r   r}   rq   ry   r   r   r   r   r   r   r   r   )
r1   r   ro   r   r   r   r   rr   r   r   s
             r3   r7   zPLBartDecoderLayer.forward  s    ! *4>>
+)
 	
q --mt||VZVcVc-d =011-@ !,$H0t00 !65 /	 
  M1 MM11-4<<Z^ZgZg1hM$}4M 88GM !**488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0--m<r4   r.   )NNNNT)r:   r;   r<   r&   r>   r0   r@   rA   r   r   r   r   r7   rB   rC   s   @r3   rH   rH     s    =| =d
 =D /3596:(,!%/||/ t+/  %||d2	/
 !&t 3/ / $;/ +,/ 
/r4   rH   c                   F    e Zd ZdZe eedd       eedd      dZdef fdZ	e
ee	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                   d	z  dej                  d	z  ded	z  dej                   d	z  ded	z  dee   defd                     Z xZS )PLBartDecoderz
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`PLBartDecoderLayer`]

    Args:
        config: PLBartConfig
        embed_tokens (nn.Embedding): output embedding
    r%   r   )index
layer_namer   )r   r   cross_attentionsrF   c           	         t         |   |       |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  rt        j                  |j                        nd}t        |j                  |j                  | j                  |      | _        t!        |j                  |j                        | _        t%        j&                  t)        |j*                        D cg c]  }t-        ||       c}      | _        t%        j0                  |j                        | _        d| _        | j7                          y c c}w r   )r/   r0   rq   decoder_layerdropr   r   r+   r   max_target_positionsr   r   r   r   r(   r   r   rZ   r   r   r   r   decoder_layersrH   r   r   r   r   r   )r1   rF   r,   r   r2   s       r3   r0   zPLBartDecoder.__init__  s    ~~11!..$*$B$B!393I3Idii/s5v~~t/?/?[
  @**NN 
 mmV[\b\q\qVr$sQR%7!%L$st#%<<#? &+# %ts   ?ENr5   ro   r   r   r   r   r   rr   r   c                    |d u |d uz  rt        d      || j                  |      }|rd|b|| j                  j                  r4t	        t        | j                        t        | j                              nt        | j                        }|j                         d d \  }	}
||j                         nd}t        j                  |
|j                        |z   }|1t               s'||
z   }t        j                  |	||j                        }t        |t              r|j                  n|}t        | j                  |||      }t!        | j                  |||      }| j#                  t$        ||      }|j'                  |j                        }||z   }| j)                  |      }t*        j,                  j/                  || j.                  | j0                  	      }t3        | j4                        D ]E  \  }}| j0                  r%t        j6                  g       }|| j8                  k  r7 ||||f|||d
|}G t;        ||      S )NzJYou must specify exactly one of decoder_input_ids or decoder_inputs_embeds)rF   rc   r   rb   )rF   r   ro   r   )rF   r   ro   r   )r_   rw   )r   r   r   )r   r   )r   r   rF   is_encoder_decoderr   r   rz   get_seq_lengthr@   re   rb   r   onesrL   r   r   r   r   inputr   r   r   r}   rq   ry   r   r   r   r   r   )r1   r5   ro   r   r   r   r   r   rr   
batch_size
seq_lengthr^   r_   mask_seq_lengthself_attn_cache	positionsr   r   decoder_layerr   s                       r3   r7   zPLBartDecoder.forward  sP    -t";<ijj  --i8M 0 )48V8V $L$DlZ^ZeZeFfg!5  "/!3!3!5cr!:
JETE`!?!?!Afg||J}7K7KLOee!*B*D4zAO"ZZ
OML`L`aN /+>? 00  	 ,;;')+	
 ";;;'1"7	"
 ((0FUa(b	LL!5!56	%	100?--mt||VZVcVc-d"+DKK"8 	C}}&+jjn#&7)% (> /# M	" 9++
 	
r4   )NNNNNNN)r:   r;   r<   r=   rH   r#   r   r   r&   r0   r"   r$   r   r@   r   rA   r   r   r   r   r   r   r7   rB   rC   s   @r3   r  r    s    ,$_A+V*?!P^_| 0   .2.2:>:>(,26!%R
##d*R
 t+R
  %0047	R

 !& 0 04 7R
 R
 ((4/R
 $;R
 +,R
 
3R
    R
r4   r  r5   r   c                 f   | j                         }|t        d      |j                  |dk(  |       |j                  |      j	                  d      dz
  j                  d      }|j                  d|      j                         }|ddddf   j                         |ddddf<   ||dddf<   |S )z
    Shift input ids one token to the right, and wrap the last non pad token (the <LID> token) Note that PLBart does not
    have a single `decoder_start_token_id` in contrast to other Bart-like models.
    Nz1self.model.config.pad_token_id has to be defined.ir%   ru   rc   r   )cloner   masked_fill_nesumri   gathersqueeze)r5   r   prev_output_tokensindex_of_eosdecoder_start_tokenss        r3   shift_tokens_rightr  c  s    
 #*LMM##$6$$>M&)),7;;;BQFQQRTUL-44QEMMO 21crc6 : @ @ Bq!"u3q!tr4   c                       e Zd ZdddZdef fdZd Zd Zee	e
	 	 	 	 	 	 	 	 	 ddej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  deej                     dz  dedz  dej                  dz  dej                  dz  dedz  dee   deej                     ez  fd                     Z xZS )PLBartModelzshared.weight)zencoder.embed_tokens.weightzdecoder.embed_tokens.weightrF   c                 J   t         |   |       |j                  |j                  }}|j                  rt        j                  |j                        nd}t        ||j                  ||      | _	        t        |      | _        t        |      | _        | j                          y )Nr8   r   )r/   r0   r   r   r   r   r   r   r(   sharedr   encoderr  decoderr   )r1   rF   r+   r   r,   r2   s        r3   r0   zPLBartModel.__init__~  sz     "("5"5v7H7HZ393I3Idii/s/
FNNKepq$V,$V,r4   c                     | j                   S r.   )r#  r1   s    r3   get_input_embeddingsz PLBartModel.get_input_embeddings  s    {{r4   c                 ~    || _         | j                   | j                  _        | j                   | j                  _        y r.   )r#  r$  r   r%  r1   rn   s     r3   set_input_embeddingsz PLBartModel.set_input_embeddings  s)    $(KK!$(KK!r4   Nr5   ro   decoder_input_idsdecoder_attention_maskencoder_outputsr   r   decoder_inputs_embedsr   rr   r   c
                 
   |"| t        || j                  j                        }| | j                  d	|||d|
}nGt	        |t
              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      } | j                  d	|||d   ||||	d|
}t        |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                        S )
a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (:
            obj:*torch.LongTensor* of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior:
            generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default.
        N)r5   ro   r   r   r%   r\   )r   r   r   r5   ro   r   r   r   r   r   )r   r   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater   encoder_attentions )r  rF   r   r$  rL   r   lenr%  r   r   r   r   r   r  )r1   r5   ro   r,  r-  r.  r   r   r/  r   rr   decoder_outputss               r3   r7   zPLBartModel.forward  s3   P $)>)F 29dkk>V>V W"/;t|| 0#-+0 	0O O_=-"1!"4474H14Loa0RV14_1E1I?1-tO '$,, 	
'1"1!"4#1+/	
 	
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r4   	NNNNNNNNN)r:   r;   r<   _tied_weights_keysr&   r0   r(  r+  r"   r$   r   r@   r   rA   listr   r   r   r   r   r   r   r7   rB   rC   s   @r3   r!  r!  w  sP    (7'6

| 
0
   .226596::>(,26:>!%J
##d*J
 ((4/J
 !++d2	J

 !&t 3J
 e//047J
 J
 ((4/J
  %0047J
 $;J
 +,J
 
u||	1	1J
    J
r4   r!  zv
    The PLBART Model with a language modeling head. Can be used for code-to-text, text-to-code and code-to-code.
    )custom_introc                       e Zd ZdZdgZddiZdef fdZ	 dded	edz  d
e	de
j                  f fdZdeddfdZeee	 	 	 	 	 	 	 	 	 	 ddej$                  dz  dej$                  dz  dej$                  dz  dej&                  dz  deej*                     dz  dedz  dej*                  dz  dej*                  dz  dej&                  dz  de	dz  dee   deej&                     ez  fd                     Zdej&                  fdZ xZS )rM   rG   rP   lm_head.weightzmodel.shared.weightrF   c                 x   t         |   |       t        |      | _        | j	                  dt        j                  d| j                  j                  j                  f             t        j                  |j                  | j                  j                  j                  d      | _        | j                          y )NrP   r%   Fr   )r/   r0   r!  rG   register_bufferr@   zerosr#  r)   r   r   r   lm_headr   r1   rF   r2   s     r3   r0   z'PLBartForConditionalGeneration.__init__  s~      (
0%++q$**BSBSBbBb>c2deyy1B1B1Q1QX]^r4   Nnew_num_tokenspad_to_multiple_ofmean_resizingr   c                 z    t         |   |||      }| j                  |j                  j                  d          |S )Nr   )r/   resize_token_embeddings_resize_final_logits_biasrg   rd   )r1   rD  rE  rF  new_embeddingsr2   s        r3   rH  z6PLBartForConditionalGeneration.resize_token_embeddings  s?     8I[]jk&&~'<'<'B'B1'EFr4   c                 6   | j                   j                  d   }||k  r| j                   d d d |f   }nSt        j                  d||z
  f| j                   j                        }t        j
                  | j                   |gd      }| j                  d|       y )Nrc   r%   r
  ru   rP   )rP   rd   r@   rA  rb   catr@  )r1   rD  old_num_tokensnew_bias
extra_biass        r3   rI  z8PLBartForConditionalGeneration._resize_final_logits_bias  s    //55b9^+--a..@AHa.)H%IRVRhRhRoRopJyy$"8"8*!E1MH0(;r4   r5   ro   r,  r-  r.  r   r   r/  labelsr   rr   c                 T   |	$|"| t        |	| j                  j                        } | j                  |f||||||||
d|}| j	                  |j
                        }|| j                  j                  |j                        z   }d}|	Ft               } ||j                  d| j                  j                        |	j                  d            }t        |||j                  |j                  |j                  |j                   |j"                  |j$                  |j&                  	      S )a
  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (:
            obj:*torch.LongTensor* of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior:
            generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example Mask-filling:

        ```python
        >>> from transformers import AutoTokenizer, PLBartForConditionalGeneration

        >>> model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")

        >>> # en_XX is the language symbol id <LID> for English
        >>> TXT = "<s> Is 0 the <mask> Fibonacci number ? </s> en_XX"
        >>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="pt").input_ids

        >>> logits = model(input_ids).logits
        >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
        >>> probs = logits[0, masked_index].softmax(dim=0)
        >>> values, predictions = probs.topk(5)

        >>> tokenizer.decode(predictions).split()
        ['first', 'same', 'highest', 'result', 'number']
        ```
        N)ro   r,  r.  r-  r   r   r/  r   rc   	losslogitsr   r2  r3  r  r4  r   r5  )r  rF   r   rG   rB  r   rP   r   rb   r   r   r   r   r   r2  r3  r  r4  r   r5  )r1   r5   ro   r,  r-  r.  r   r   r/  rP  r   rr   outputs	lm_logitsmasked_lm_lossloss_fcts                   r3   r7   z&PLBartForConditionalGeneration.forward  s2   @  (-B-J$6vt{{?W?W$X!&0djj'
)/+#9+'"7'
 '
 LL!:!:;	 6 6 9 9):J:J KK	')H%innR9O9O&PRXR]R]^`RabN#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r4   c                 B    t        || j                  j                        S r.   )r  rF   r   )r1   rP  s     r3   %prepare_decoder_input_ids_from_labelszDPLBartForConditionalGeneration.prepare_decoder_input_ids_from_labelsj  s    !&$++*B*BCCr4   )NT)
NNNNNNNNNN)r:   r;   r<   rS   _keys_to_ignore_on_load_missingr:  r&   r0   r>   r   r   	EmbeddingrH  rI  r"   r$   r   r@   r   rA   r;  r   r   r   r   r   r   r7   rZ  rB   rC   s   @r3   rM   rM     s     ':&;#/|  ae!7:TzY]	< < <   .226596::>(,26:>&*!%_
##d*_
 ((4/_
 !++d2	_

 !&t 3_
 e//047_
 _
 ((4/_
  %0047_
 t#_
 $;_
 +,_
 
u||		._
    _
BDELL Dr4   rM   c                   l     e Zd ZdZdedededef fdZdej                  dej                  fd	Z	 xZ
S )
PLBartClassificationHeadz-Head for sentence-level classification tasks.	input_dim	inner_dimnum_classespooler_dropoutc                     t         |           t        j                  ||      | _        t        j
                  |      | _        t        j                  ||      | _        y )N)rx   )r/   r0   r   r   denseDropoutrq   r   )r1   r_  r`  ra  rb  r2   s        r3   r0   z!PLBartClassificationHead.__init__q  sD     	YYy)4
zzN3		)[9r4   r   r   c                     | j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S r.   )rq   rd  r@   tanhr   )r1   r   s     r3   r7   z PLBartClassificationHead.forward}  sN    ]3

=1

=1]3m4r4   r9   rC   s   @r3   r^  r^  n  sL    7
:
: 
: 	
:
 
:U\\ ell r4   r^  z
    PLBart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g.
    for GLUE tasks.
    c                   h    e Zd Zdef fdZee	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  de
ej                     dz  d	ej                  dz  d
ej                  dz  dej                  dz  dedz  dee   deez  fd              Z xZS )PLBartForSequenceClassificationrF   c                     t        |   |fi | t        |      | _        t	        |j
                  |j
                  |j                  |j                        | _        | j                          y r.   )
r/   r0   r!  rG   r^  r   
num_labelsclassifier_dropoutclassification_headr   )r1   rF   rr   r2   s      r3   r0   z(PLBartForSequenceClassification.__init__  sZ    *6* (
#;NNNN%%	$
  	r4   Nr5   ro   r,  r-  r.  r   r/  rP  r   rr   r   c
                    |d}	|$|"t        d| j                  j                          | j                  |f|||||||	d|
}|d   }|j	                  | j
                  j                        j                  |j                        }t        t        j                  |j                  d            j                         dk(  d       ||ddf   j                  |j                  d      d|j                  d            dddddf   }| j!                  |      }d}||j                  |j                        }| j
                  j"                  | j
                  j$                  dk(  rd	| j
                  _        nv| j
                  j$                  dkD  rL|j&                  t        j(                  k(  s|j&                  t        j*                  k(  rd
| j
                  _        nd| j
                  _        | j
                  j"                  d	k(  rSt-               }| j
                  j$                  dk(  r& ||j/                         |j/                               }n |||      }n| j
                  j"                  d
k(  rGt1               } ||j                  d| j
                  j$                        |j                  d            }n,| j
                  j"                  dk(  rt3               } |||      }t5        |||j6                  |j8                  |j:                  |j<                  |j>                  |j@                  |jB                  	      S )a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (:
            obj:*torch.LongTensor* of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior:
            generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NFz8Passing input embeddings is currently not supported for )ro   r,  r-  r.  r   r/  r   r   r%   z7All examples must have the same number of <eos> tokens.rc   
regressionsingle_label_classificationmulti_label_classificationrR  )"NotImplementedErrorr2   r:   rG   eqrF   eos_token_idr   rb   r!   r@   unique_consecutiver  numelr   rz   rm  problem_typerk  ra   rf   r>   r   r  r   r   r   r   r2  r3  r  r4  r   r5  )r1   r5   ro   r,  r-  r.  r   r/  rP  r   rr   rU  r   eos_masksentence_representationrT  rS  rX  s                     r3   r7   z'PLBartForSequenceClassification.forward  s   P I!:%J4>>KbKbJcd  '1djj
'
)/#9+'"7
'
 
'
  
<< 8 89<<]=Q=QR$$X\\!_5;;=BE	
 #0!"<"A"A-BTBTUVBWY[]j]o]opr]s"tr1H#
 ))*ABYYv}}-F{{''/;;))Q./;DKK,[[++a/V\\UZZ5OSYS_S_chclclSl/LDKK,/KDKK,{{''<7"9;;))Q.#FNN$4fnn6FGD#FF3D))-JJ+-B0F0F GUWY))-II,./.#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r4   r9  )r:   r;   r<   r&   r0   r   r   r@   r   rA   r;  r   r   r   r   r   r   r7   rB   rC   s   @r3   ri  ri    s,   |   .2.259:>:>26:>*.!%h
##d*h
 t+h
 !++d2	h

 !& 0 04 7h
 e//047h
 ((4/h
  %0047h
   4'h
 $;h
 +,h
 
0	0h
  h
r4   ri  c                   (     e Zd ZdZ fdZd Z xZS )PLBartDecoderWrapperz
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    c                 d    t         |   |       t        |      | _        | j	                          y r.   )r/   r0   r  r%  r   rC  s     r3   r0   zPLBartDecoderWrapper.__init__  s&     $V,r4   c                 &     | j                   |i |S r.   )r%  )r1   argsrr   s      r3   r7   zPLBartDecoderWrapper.forward  s    t||T,V,,r4   )r:   r;   r<   r=   r0   r7   rB   rC   s   @r3   r{  r{    s    

-r4   r{  zw
    PLBART decoder with a language modeling head on top (linear layer with weights tied to the input embeddings).
    c                   \    e Zd ZddiZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  dedz  de	j                  dz  de	j                  dz  dedz  dee	j                  z  dee   deez  fd              Z xZS )PLBartForCausalLMr>  z!model.decoder.embed_tokens.weightc                     d|_         d|_        t        |   |       t	        |      | _        t        j                  |j                  |j                  d      | _
        | j                          y )NTFr   )r   r  r/   r0   r{  rG   r   r   hidden_sizer   rB  r   rC  s     r3   r0   zPLBartForCausalLM.__init__   sX     $)! )&1
yy!3!3V5F5FUS 	r4   c                 B    | j                   j                  j                  S r.   rG   r%  r   r'  s    r3   r(  z&PLBartForCausalLM.get_input_embeddings+  s    zz!!...r4   c                 :    || j                   j                  _        y r.   r  r*  s     r3   r+  z&PLBartForCausalLM.set_input_embeddings.  s    */

'r4   Nr5   ro   r   r   r   r   rP  r   logits_to_keeprr   r   c
                     | j                   j                  d|||||||d|
}|d   }t        |	t              rt	        |	 d      n|	}| j                  |dd|ddf         }d}|a|j                  |j                        }t               } ||j                  d| j                  j                        |j                  d            }t        |||j                  |j                  |j                  |j                         S )aF  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, PLBartForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")
        >>> model = PLBartForCausalLM.from_pretrained("uclanlp/plbart-base")
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
        >>> list(logits.shape) == expected_shape
        True
        ```r1  r   Nrc   )rS  rT  r   r   r   r  r6  )rG   r%  rL   r>   slicerB  r   rb   r   r   rF   r   r   r   r   r   r  )r1   r5   ro   r   r   r   r   rP  r   r  rr   rU  r   slice_indicesrT  rS  rX  s                    r3   r7   zPLBartForCausalLM.forward1  s   L >PTZZ=O=O 	>
)"7#9+'	>
 	>
  
8B>SV8W~ot4]kmA}a,?@AYYv}}-F')HFKKDKK,B,BCV[[QS_UD0#33!//))$55
 	
r4   )	NNNNNNNNr   )r:   r;   r<   r:  r0   r(  r+  r   r   r@   r   rA   r   r   r   r>   r   r   r   r   r7   rB   rC   s   @r3   r  r    s1    	=	/0  .2.2:>;?(,26*.!%-.A
##d*A
 t+A
  %0047	A

 !& 1 1D 8A
 A
 ((4/A
   4'A
 $;A
 ell*A
 +,A
 
2	2A
  A
r4   r  )r  rM   ri  r!  rE   )Nr   )Nr   collections.abcr   r@   r   torch.nnr   r   r    r	   rN   activationsr
   cache_utilsr   r   r   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r    r!   utils.genericr"   utils.output_capturingr#   r$   configuration_plbartr&   
get_loggerr:   r   r\  r(   rE   rZ   ModulerA   r?   r   r   rI   r   rH   r  r>   r  r!  rM   r^  ri  r{  r  __all__r6  r4   r3   <module>r     sV  *  $   A A & ! C C ) J B 9  G &  8 E . 
		H	%
= 
= 2O 2 2;r|| ;B !%II%<<% 
% <<	%
 LL4'% T\% % '(%8r)bii r)j03 0fV
) V
rO3 Od|
) |
~%,, c ( g
' g
 g
T 
DD%:O DD
DDNryy 0 x
&; x
x
v-0 - 
Y
- Y

Y
xr4   