
    i                     \   d Z ddlZddlZddlmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7  e/jp                  e9      Z:dejv                  de<de<fdZ= G d dej|                        Z? G d dej|                        Z@	 	 dJdej                  dejv                  d ejv                  d!ejv                  d"ejv                  dz  d#eBdz  d$eBd%e)e+   fd&ZC G d' d(ej                        ZD G d) d*e      ZE G d+ d,e      ZF G d- d.ej                        ZGe, G d/ d0e'             ZH G d1 d2eH      ZI G d3 d4eH      ZJ G d5 d6eH      ZK G d7 d8eH      ZLe, G d9 d:eH             ZM e,d;<       G d= d>eHe             ZN e,d?<       G d@ dAeH             ZOe, G dB dCeH             ZP G dD dEeH      ZQ e,dF<       G dG dHeHe             ZRg dIZSy)KzPyTorch BART model.    N)Callable)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilingloggingtorch_compilable_check)merge_with_config_defaults)OutputRecordercapture_outputs   )
BartConfig	input_idspad_token_iddecoder_start_token_idc                     | j                  | j                        }| ddddf   j                         |ddddf<   ||dddf<   |t        d      |j	                  |dk(  |       |S )z1
    Shift input ids one token to the right.
    Nr&   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r(   r)   r*   shifted_input_idss       w/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/bart/modeling_bart.pyshift_tokens_rightr4   :   s}     "++IOO<(CRC0668ae4adLMM""#4#<lK    c                   v     e Zd ZdZdedef fdZ	 d
dej                  dedej                  dz  f fd	Z xZ	S )BartLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                 N    d| _         t        | 	  || j                   z   |       y N   )offsetsuper__init__)selfr8   r9   	__class__s      r3   r?   z'BartLearnedPositionalEmbedding.__init__O   s$     $++5}Er5   Nr(   past_key_values_lengthposition_idsc                 $   |a|j                   dd \  }}t        j                  |||z   t        j                  | j                  j
                        j                  |d      }n|j                  d      }t        | %  || j                  z         S )z3`input_ids' shape is expected to be [bsz x seqlen].Nr<   )dtypedevicer,   r   )r.   torcharangelongweightrF   expand	unsqueezer>   forwardr=   )r@   r(   rB   rC   bszseq_lenrA   s         r3   rM   z&BartLearnedPositionalEmbedding.forwardU   s    
 $??2A.LC <<&(>(HPUPZPZcgcncncucufS"o  (11!4Lw|dkk9::r5   )r   N)
__name__
__module____qualname____doc__intr?   rG   TensorrM   __classcell__rA   s   @r3   r7   r7   J   sW    Fs F3 F mq;;?B;V[VbVbeiVi; ;r5   r7   c            
       `     e Zd ZdZd
dededededz  f fdZdej                  f fd	Z	 xZ
S )BartScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    r8   r9   padding_idxembed_scaleNc                 6    t         |   |||       || _        y N)r>   r?   r[   )r@   r8   r9   rZ   r[   rA   s        r3   r?   z BartScaledWordEmbedding.__init__j   s    D&r5   r(   c                 <    t         |   |      | j                  z  S r]   )r>   rM   r[   )r@   r(   rA   s     r3   rM   zBartScaledWordEmbedding.forwardn   s    wy)D,<,<<<r5   )      ?rP   rQ   rR   rS   rT   floatr?   rG   rU   rM   rV   rW   s   @r3   rY   rY   e   sE    's '3 'S '_dgk_k '= = =r5   rY   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr,         r<   r   dimptrainingr&   )
sizerG   matmul	transposer   
functionalsoftmaxrh   rp   
contiguous)
rb   rc   rd   re   rf   rg   rh   ri   attn_weightsattn_outputs
             r3   eager_attention_forwardry   s   s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r5   c                       e Zd ZdZ	 	 	 	 	 	 ddedededededed	edz  d
edz  f fdZ	 	 	 dde	j                  de	j                  dz  dedz  de	j                  dz  dee   dee	j                  e	j                  dz  f   fdZ xZS )BartAttentionz=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsrh   
is_decoderbias	is_causalconfig	layer_idxc	                    t         	|           || _        || _        || _        ||z  | _        || _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _	        || _
        || _        |9| j                  r-t        j                  d| j                  j                   d       t!        j"                  |||      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).rk   zInstantiating a decoder z without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.r   )r>   r?   r|   r}   rh   head_dimr   r0   rg   r~   r   r   loggerwarning_oncerA   rP   r   Lineark_projv_projq_projout_proj)
r@   r|   r}   rh   r~   r   r   r   r   rA   s
            r3   r?   zBartAttention.__init__   s$    	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$""*4>>+B+B*C D, , ii	94@ii	94@ii	94@		)YTBr5   hidden_stateskey_value_statespast_key_valuesrf   ri   returnc                    |du}|j                   dd }g |d| j                  }| j                  |      j                  |      j	                  dd      }	d}
|St        |t              rA|j                  j                  | j                        }
|r|j                  }n|j                  }n|}|r|n|}|rK|I|
rGj                  | j                     j                  }|j                  | j                     j                  }n| j                  |      }| j!                  |      }g |j                   dd d| j                  }|j                  |      j	                  dd      }|j                  |      j	                  dd      }|Kj#                  ||| j                        \  }}|r)t        |t              rd|j                  | j                  <   t%        j&                  | j(                  j*                  t,              } || |	|||f| j.                  sdn| j0                  | j2                  d|\  }} |j4                  g |d j7                         }| j9                  |      }||fS )	z#Input shape: Batch x Time x ChannelNr,   r&   r<   FT        )rh   rg   )r.   r   r   viewrs   
isinstancer   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   updater   get_interfacer   _attn_implementationry   rp   rh   rg   reshaperv   r   )r@   r   r   r   rf   ri   is_cross_attentioninput_shapehidden_shapequery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_stateskv_shapeattention_interfacerx   rw   s                      r3   rM   zBartAttention.forward   sd    .T9 $))#2.88b8$--8 {{=166|DNNqRST
&/+>?,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML^4J;;~6LF--cr2FBFFH#2<<QBJ',,X6@@AFL*+?+F+FzS_aeaoao+p(
L%*_FY*ZAEO..t~~>(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ *k));;;;FFHmmK0L((r5   )r   FTFNNNNN)rP   rQ   rR   rS   rT   ra   boolr'   r?   rG   rU   r   r   r   tuplerM   rV   rW   s   @r3   r{   r{      s   G  $( $%C%C %C 	%C
 %C %C %C T!%C :%CT 15(,.2H)||H)  ,,-H) 	H)
 t+H) -.H) 
u||U\\D00	1H)r5   r{   c                        e Zd Zd
dededz  f fdZdej                  dej                  dee	   dej                  fd	Z xZS )BartEncoderLayerNr   r   c                 j   t         |           |j                  | _        t	        | j                  |j
                  |j                  ||      | _        t        j                  | j                        | _
        |j                  | _        t        |j                     | _        |j                  | _        t        j                   | j                  |j"                        | _        t        j                   |j"                  | j                        | _        t        j                  | j                        | _        y )N)r|   r}   rh   r   r   )r>   r?   d_modelr|   r{   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normrh   r
   activation_functionactivation_fnactivation_dropoutr   encoder_ffn_dimfc1fc2final_layer_normr@   r   r   rA   s      r3   r?   zBartEncoderLayer.__init__  s    &nn44,,
 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r5   r   rf   ri   r   c                 F   |} | j                   |fd|i|\  }}t        j                  j                  || j                  | j                        }||z   }| j                  |      }|}| j                  | j                  |            }t        j                  j                  || j                  | j                        }| j                  |      }t        j                  j                  || j                  | j                        }||z   }| j                  |      }|j                  t        j                  k(  rht        j                  |      j                         sEt        j                   |j                        j"                  dz
  }t        j$                  || |      }|S )Nrf   rn   i  )minmax)r   r   rt   rh   rp   r   r   r   r   r   r   rE   rG   float16isfiniteallfinfor   clamp)r@   r   rf   ri   residual_clamp_values          r3   rM   zBartEncoderLayer.forward  sh    !)4>>
)
 
q
 --mt||VZVcVc-d =011-@ **488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0--m<%--/}8U8Y8Y8[++m&9&9:>>EK!KKK<[YMr5   r]   )rP   rQ   rR   r'   rT   r?   rG   FloatTensorr   r   rU   rM   rV   rW   s   @r3   r   r     s[    =z =cDj =&(( )) +,	
 
r5   r   c                        e Zd Zddededz  f fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  d	edz  d
e	dz  de
e   dej                  fdZ xZS )BartDecoderLayerNr   r   c           	         t         |           |j                  | _        t	        | j                  |j
                  |j                  dd||      | _        |j                  | _        t        |j                     | _        |j                  | _        t        j                  | j                        | _        t	        | j                  |j
                  |j                  d||      | _        t        j                  | j                        | _        t        j$                  | j                  |j&                        | _        t        j$                  |j&                  | j                        | _        t        j                  | j                        | _        y )NT)r|   r}   rh   r~   r   r   r   )rh   r~   r   r   )r>   r?   r   r|   r{   decoder_attention_headsr   r   rh   r
   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normr   decoder_ffn_dimr   r   r   r   s      r3   r?   zBartDecoderLayer.__init__8  s    &nn44,,
 ~~#F$>$>?"(";";$&LL$@!)NN**,,
 (*||DNN'C$99T^^V-C-CD99V33T^^D "T^^ <r5   r   rf   encoder_hidden_statesencoder_attention_maskr   	use_cacheri   r   c                    |} | j                   |f||d|\  }}	t        j                  j                  || j                  | j                        }||z   }| j                  |      }|h|} | j                  |f|||d|\  }}	t        j                  j                  || j                  | j                        }||z   }| j                  |      }|}| j                  | j                  |            }t        j                  j                  || j                  | j                        }| j                  |      }t        j                  j                  || j                  | j                        }||z   }| j                  |      }|S )N)r   rf   rn   )r   rf   r   )r   r   rt   rh   rp   r   r   r   r   r   r   r   r   )
r@   r   rf   r   r   r   r   ri   r   r   s
             r3   rM   zBartDecoderLayer.forwardW  s    ! *4>>
+)
 	
q --mt||VZVcVc-d =011-@ !,$H0t00 !65 /	 
  M1 MM11-4<<Z^ZgZg1hM$}4M 88GM !**488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0--m<r5   r]   )NNNNT)rP   rQ   rR   r'   rT   r?   rG   rU   r   r   r   r   rM   rV   rW   s   @r3   r   r   7  s    =z =cDj =D /3596:(,!%/||/ t+/  %||d2	/
 !&t 3/ / $;/ +,/ 
/r5   r   c                   l     e Zd ZdZdedededef fdZdej                  dej                  fd	Z	 xZ
S )
BartClassificationHeadz-Head for sentence-level classification tasks.	input_dim	inner_dimnum_classespooler_dropoutc                     t         |           t        j                  ||      | _        t        j
                  |      | _        t        j                  ||      | _        y )N)ro   )r>   r?   r   r   denseDropoutrh   r   )r@   r   r   r   r   rA   s        r3   r?   zBartClassificationHead.__init__  sD     	YYy)4
zzN3		)[9r5   r   r   c                     | j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S r]   )rh   r   rG   tanhr   )r@   r   s     r3   rM   zBartClassificationHead.forward  sN    ]3

=1

=1]3m4r5   r`   rW   s   @r3   r   r     sL    7
:
: 
: 	
:
 
:U\\ ell r5   r   c                   f     e Zd ZU eed<   dZdZddgZddgZdZ	dZ
dZdZdZ fd	Zed
        Z xZS )BartPreTrainedModelr   modelTzencoder.versionzdecoder.versionr   r   r   c                     t         |   |       t        |t              r t	        j
                  |j                         y y r]   )r>   _init_weightsr   BartForConditionalGenerationinitzeros_final_logits_bias)r@   rb   rA   s     r3   r   z!BartPreTrainedModel._init_weights  s2    f%f:;KK001 <r5   c                     | j                   j                  }t        j                  g ddddd|gg| j                        }|j                  |      |d}|S )N)r      
      r<   r         r<   rF   )rf   r(   )r   r)   rG   tensorrF   ne)r@   	pad_tokenr(   dummy_inputss       r3   r   z BartPreTrainedModel.dummy_inputs  sW    KK,,	LL"2Q2q)4L!MVZVaVab	'll95"
 r5   )rP   rQ   rR   r'   __annotations__base_model_prefixsupports_gradient_checkpointing"_keys_to_ignore_on_load_unexpected_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphr   propertyr   rV   rW   s   @r3   r   r     sc    &*#*;=N)O&,.AB"3N!2
  r5   r   c                       e Zd Zd Zy)PretrainedBartModelc                 8    t        j                  dt               y Nz_The class `PretrainedBartModel` has been depreciated, please use `BartPreTrainedModel` instead.warningswarnFutureWarningr@   s    r3   __init_subclass__z%PretrainedBartModel.__init_subclass__      m	
r5   NrP   rQ   rR   r   r5   r3   r   r         
r5   r   c                       e Zd Zd Zy)BartPretrainedModelc                 8    t        j                  dt               y r  r  r  s    r3   r  z%BartPretrainedModel.__init_subclass__  r	  r5   Nr
  r  r5   r3   r  r    r  r5   r  c                        e Zd ZdZeedZdef fdZe	e
e	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	ee   d
ef
d                     Z xZS )BartEncoderz
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`BartEncoderLayer`].

    Args:
        config: BartConfig
        embed_tokens (nn.Embedding): output embedding
    )r   
attentionsr   c           	         t         |   |       |j                  | _        |j                  | _        |j
                  }|j                  | _        |j                  | _	        |j                  rt        j                  |      nd}t        |j                  || j                  |      | _        t!        |j                  |      | _        t%        j&                  t)        |j*                        D cg c]  }t-        ||       c}      | _        t%        j0                  |      | _        d| _        | j7                          y c c}w Nr_   r[   )r   F)r>   r?   rh   encoder_layerdrop	layerdropr   r)   rZ   max_position_embeddingsmax_source_positionsscale_embeddingmathsqrtrY   
vocab_sizeembed_tokensr7   embed_positionsr   
ModuleListrangeencoder_layersr   r   r   layernorm_embeddinggradient_checkpointing	post_init)r@   r   r|   r[   irA   s        r3   r?   zBartEncoder.__init__  s     ~~11NN	!..$*$B$B!.4.D.Ddii	*#3y$*:*:
  >** 
 mmTYZ`ZoZoTp$qq%5f%J$qr#%<<	#: &+# %rs   -D?Nr(   rf   inputs_embedsri   r   c                 X   |d u |d uz  rt        d      || j                  |      }| j                  |d d d d df         }|j                  |j                        }||z   }| j                  |      }t        j                  j                  || j                  | j                        }t        | j                  ||      }t        | j                        D ]F  \  }}d}	| j                  r&t        j                  g       }
|
| j                   k  rd}	|	r= |||fi |}H t#        |      S )Nz:You must specify exactly one of input_ids or inputs_embedsr,   rn   )r   r'  rf   FT)last_hidden_state)r0   r  r  torF   r#  r   rt   rh   rp   r   r   	enumerater   rG   randr  r   )r@   r(   rf   r'  ri   	embed_posr   idxencoder_layerto_dropdropout_probabilitys              r3   rM   zBartEncoder.forward  s7    -t";<YZZ  --i8M((q!Rx)@A	LL!5!56	%	100?--mt||VZVcVc-d2;;')

 #,DKK"8 	CG}}&+jjn#&7"G -!"! !	 +
 	
r5   r   )rP   rQ   rR   rS   r   r{   _can_record_outputsr'   r?   r#   r%   r   rG   
LongTensorrU   r   r   r   r   rM   rV   rW   s   @r3   r  r    s     *#
z 4   .2.226	*
##d**
 t+*
 ((4/	*

 +,*
 
*
    *
r5   r  c                   F    e Zd ZdZe eedd       eedd      dZdef fdZ	e
ee	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                   d	z  dej                  d	z  ded	z  dej                   d	z  ded	z  dee   defd                     Z xZS )BartDecoderz
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BartDecoderLayer`]

    Args:
        config: BartConfig
        embed_tokens (nn.Embedding): output embedding
    r&   r   )index
layer_namer   )r   r  cross_attentionsr   c           	         t         |   |       |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  rt        j                  |j                        nd}t        |j                  |j                  | j                  |      | _        t!        |j                  |j                        | _        t%        j&                  t)        |j*                        D cg c]  }t-        ||       c}      | _        t%        j0                  |j                        | _        d| _        | j7                          y c c}w r  )r>   r?   rh   decoder_layerdropr  r)   rZ   r  max_target_positionsr  r  r  r   rY   r  r  r7   r  r   r   r!  decoder_layersr   r   r   r#  r$  r%  )r@   r   r[   r&  rA   s       r3   r?   zBartDecoder.__init__7  s    ~~11!..$*$B$B!393I3Idii/s3v~~t/?/?[
  >**NN 
 mmTYZ`ZoZoTp$qq%5f%J$qr#%<<#? &+# %rs   ?ENr(   rf   r   r   r   r'  r   ri   r   c                    |d u |d uz  rt        d      || j                  |      }|rd|b|| j                  j                  r4t	        t        | j                        t        | j                              nt        | j                        }|j                         d d \  }	}
||j                         nd}t        j                  |
|j                        |z   }|1t               s'||
z   }t        j                  |	||j                        }t        |t              r|j                  n|}t        | j                  |||      }t!        | j                  |||      }| j#                  t$        ||      }|j'                  |j                        }||z   }| j)                  |      }t*        j,                  j/                  || j.                  | j0                  	      }t3        | j4                        D ]E  \  }}| j0                  r%t        j6                  g       }|| j8                  k  r7 ||||f|||d
|}G t;        ||      S )NzJYou must specify exactly one of decoder_input_ids or decoder_inputs_embeds)r   r,   r   r   )r   r'  rf   r   )r   r'  rf   r   )rC   rn   )r   r   r   )r)  r   )r0   r  r   is_encoder_decoderr   r   rq   get_seq_lengthrG   rH   rF   r    onesr   r   r   r   r  inputr*  r#  r   rt   rh   rp   r+  r   r,  r  r   )r@   r(   rf   r   r   r   r'  r   ri   
batch_size
seq_lengthrB   rC   mask_seq_lengthself_attn_cache	positionsr   r.  decoder_layerr1  s                       r3   rM   zBartDecoder.forwardO  sP    -t";<ijj  --i8M 0 )48V8V $L$DlZ^ZeZeFfg!5  "/!3!3!5cr!:
JETE`!?!?!Afg||J}7K7KLOee!*B*D4zAO"ZZ
OML`L`aN /+>? 00  	 ,;;')+	
 ";;;'1"7	"
 ((0FUa(b	LL!5!56	%	100?--mt||VZVcVc-d"+DKK"8 	C}}&+jjn#&7)% (> /# M	" 9++
 	
r5   )NNNNNNN)rP   rQ   rR   rS   r   r$   r{   r2  r'   r?   r#   r%   r   rG   r3  rU   r   r   r   r   r   r   rM   rV   rW   s   @r3   r5  r5  (  s    *$]!T*=n]z 0   .2.2:>:>(,26!%R
##d*R
 t+R
  %0047	R

 !& 0 04 7R
 R
 ((4/R
 $;R
 +,R
 
3R
    R
r5   r5  c                   j    e Zd ZdddZdef fdZd Zd Zee		 	 	 	 	 	 	 	 	 dde
j                  dz  d	e
j                  dz  d
e
j                  dz  de
j                  dz  dee
j                     dz  dedz  de
j                  dz  de
j                  dz  dedz  dee   deez  fd              Z xZS )	BartModelzshared.weight)zdecoder.embed_tokens.weightzencoder.embed_tokens.weightr   c                 J   t         |   |       |j                  |j                  }}|j                  rt        j                  |j                        nd}t        ||j                  ||      | _	        t        |      | _        t        |      | _        | j                          y )Nr_   r  )r>   r?   r)   r  r  r  r  r   rY   sharedr  encoderr5  decoderr%  )r@   r   rZ   r  r[   rA   s        r3   r?   zBartModel.__init__  s|     "("5"5v7H7HZ393I3Idii/s-j&..+cno"6*"6* 	r5   c                     | j                   S r]   )rK  r  s    r3   get_input_embeddingszBartModel.get_input_embeddings  s    {{r5   c                 ~    || _         | j                   | j                  _        | j                   | j                  _        y r]   )rK  rL  r  rM  r@   re   s     r3   set_input_embeddingszBartModel.set_input_embeddings  s)    $(KK!$(KK!r5   Nr(   rf   decoder_input_idsdecoder_attention_maskencoder_outputsr   r'  decoder_inputs_embedsr   ri   r   c
                 N   |D|B|t        d      t        || j                  j                  | j                  j                        }| | j
                  d
|||d|
}nGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      } | j                  d
|||d   ||||	d|
}t        |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                  	      S )  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        NzIf no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)r(   rf   r'  r   r&   r<   )r)  r   r  r(   rf   r   r   r   r'  r   )r)  r   decoder_hidden_statesdecoder_attentionsr8  encoder_last_hidden_stater   encoder_attentionsr  )r0   r4   r   r)   r*   rL  r   r   lenrM  r   r)  r   r   r  r8  )r@   r(   rf   rS  rT  rU  r   r'  rV  r   ri   decoder_outputss               r3   rM   zBartModel.forward  sd   P $)>)F  U  !34;;33T[[5W5W! "/;t|| 0#-+0 	0O O_=-"1!"4474H14Loa0RV14_1E1I?1-tO FRT\\ 	F
'1"1!"4#1+/	F
 	F
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r5   	NNNNNNNNN)rP   rQ   rR   _tied_weights_keysr'   r?   rO  rR  r   r   rG   r3  rU   listr   r   r   r   r   r   r   rM   rV   rW   s   @r3   rI  rI    s?    (7'6
z 0
  .2.259:>:>(,26:>!%T
##d*T
 t+T
 !++d2	T

 !& 0 04 7T
 e//047T
 T
 ((4/T
  %0047T
 $;T
 +,T
 
#	#T
  T
r5   rI  zV
    The BART Model with a language modeling head. Can be used for summarization.
    )custom_introc                       e Zd ZdZddiZdgZdef fdZ	 dded	edz  d
e	de
j                  f fdZdeddfdZee	 	 	 	 	 	 	 	 	 	 ddej"                  dz  dej$                  dz  dej"                  dz  dej"                  dz  deej(                     dz  dedz  dej(                  dz  dej(                  dz  dej"                  dz  de	dz  dee   deez  fd              Zdej$                  fdZ xZS )r   r   lm_head.weightzmodel.shared.weightr   r   c                 x   t         |   |       t        |      | _        | j	                  dt        j                  d| j                  j                  j                  f             t        j                  |j                  | j                  j                  j                  d      | _        | j                          y )Nr   r&   Fr   )r>   r?   rI  r   register_bufferrG   zerosrK  r8   r   r   r   lm_headr%  r@   r   rA   s     r3   r?   z%BartForConditionalGeneration.__init__(  s     v&
0%++q$**BSBSBbBb>c2deyy1B1B1Q1QX]^ 	r5   Nnew_num_tokenspad_to_multiple_ofmean_resizingr   c                 z    t         |   |||      }| j                  |j                  j                  d          |S )Nr   )r>   resize_token_embeddings_resize_final_logits_biasrJ   r.   )r@   rk  rl  rm  new_embeddingsrA   s        r3   ro  z4BartForConditionalGeneration.resize_token_embeddings1  s?     8I[]jk&&~'<'<'B'B1'EFr5   c                 6   | j                   j                  d   }||k  r| j                   d d d |f   }nSt        j                  d||z
  f| j                   j                        }t        j
                  | j                   |gd      }| j                  d|       y )Nr,   r&   r   rl   r   )r   r.   rG   rh  rF   catrg  )r@   rk  old_num_tokensnew_bias
extra_biass        r3   rp  z6BartForConditionalGeneration._resize_final_logits_bias8  s    //55b9^+--a..@AHa.)H%IRVRhRhRoRopJyy$"8"8*!E1MH0(;r5   r(   rf   rS  rT  rU  r   r'  rV  labelsr   ri   c                    |	R|
rt         j                  d       d}
|7|5t        |	| j                  j                  | j                  j
                        } | j                  |f||||||||
d|}| j                  |d         }|| j                  j                  |j                        z   }d}|	a|	j                  |j                        }	t               } ||j                  d| j                  j                        |	j                  d            }t        |||j                  |j                   |j"                  |j$                  |j&                  |j(                  |j*                  	      S )a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example summarization:

        ```python
        >>> from transformers import AutoTokenizer, BartForConditionalGeneration

        >>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

        >>> ARTICLE_TO_SUMMARIZE = (
        ...     "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
        ...     "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
        ...     "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
        ... )
        >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt")

        >>> # Generate Summary
        >>> summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=20)
        >>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        'PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions'
        ```

        Mask filling example:

        ```python
        >>> from transformers import AutoTokenizer, BartForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
        >>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

        >>> TXT = "My friends are <mask> but they eat too many carbs."
        >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
        >>> logits = model(input_ids).logits

        >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
        >>> probs = logits[0, masked_index].softmax(dim=0)
        >>> values, predictions = probs.topk(5)

        >>> tokenizer.decode(predictions).split()
        ['not', 'good', 'healthy', 'great', 'very']
        ```
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)rf   rS  rU  rT  r   r'  rV  r   r   r,   	losslogitsr   rZ  r[  r8  r\  r   r]  )r   warningr4   r   r)   r*   r   ri  r   r*  rF   r   r   r  r   r   rZ  r[  r8  r\  r   r]  )r@   r(   rf   rS  rT  rU  r   r'  rV  rw  r   ri   outputs	lm_logitsmasked_lm_lossloss_fcts                   r3   rM   z$BartForConditionalGeneration.forwardA  sj   h klI (-B-J$6DKK44dkk6X6X%! '1djj'
)/+#9+'"7'
 '
 LL,	 6 6 9 9):J:J KK	YYy//0F')H%innR9O9O&PRXR]R]^`RabN#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r5   c                 l    t        || j                  j                  | j                  j                        S r]   )r4   r   r)   r*   )r@   rw  s     r3   %prepare_decoder_input_ids_from_labelszBBartForConditionalGeneration.prepare_decoder_input_ids_from_labels  s%    !&$++*B*BDKKDfDfggr5   )NT
NNNNNNNNNN)rP   rQ   rR   r   ra  _keys_to_ignore_on_load_missingr'   r?   rT   r   r   	Embeddingro  rp  r   r   rG   r3  rU   rb  r   r   r   r   r   r   rM   r  rV   rW   s   @r3   r   r     s     / (;&;#z  ae!7:TzY]	< < <  .2.259:>:>(,26:>*.!%{
##d*{
 t+{
 !++d2	{

 !& 0 04 7{
 e//047{
 {
 ((4/{
  %0047{
   4'{
 $;{
 +,{
 
	 {
  {
zhELL hr5   r   z
    Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c                   h    e Zd Zdef fdZee	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  de
ej                     dz  d	ej                  dz  d
ej                  dz  dej                  dz  dedz  dee   deez  fd              Z xZS )BartForSequenceClassificationr   c                     t        |   |fi | t        |      | _        t	        |j
                  |j
                  |j                  |j                        | _        | j                          y r]   )
r>   r?   rI  r   r   r   
num_labelsclassifier_dropoutclassification_headr%  )r@   r   ri   rA   s      r3   r?   z&BartForSequenceClassification.__init__  sZ    *6*v&
#9NNNN%%	$
  	r5   Nr(   rf   rS  rT  rU  r'  rV  rw  r   ri   r   c
                    |d}	|$|"t        d| j                  j                          | j                  |f|||||||	d|
}|d   }|j	                  | j
                  j                        j                  |j                        }t        t        j                  |j                  d            j                         dk(  d       ||ddf   j                  |j                  d      d|j                  d            dddddf   }| j!                  |      }d}||j                  |j                        }| j
                  j"                  | j
                  j$                  dk(  rd	| j
                  _        nv| j
                  j$                  dkD  rL|j&                  t        j(                  k(  s|j&                  t        j*                  k(  rd
| j
                  _        nd| j
                  _        | j
                  j"                  d	k(  rSt-               }| j
                  j$                  dk(  r& ||j/                         |j/                               }n |||      }n| j
                  j"                  d
k(  rGt1               } ||j                  d| j
                  j$                        |j                  d            }n,| j
                  j"                  dk(  rt3               } |||      }t5        |||j6                  |j8                  |j:                  |j<                  |j>                  |j@                  |jB                  	      S )a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NFz8Passing input embeddings is currently not supported for rf   rS  rT  rU  r'  rV  r   r   r&   z7All examples must have the same number of <eos> tokens.r,   
regressionsingle_label_classificationmulti_label_classificationry  )"NotImplementedErrorrA   rP   r   eqr   eos_token_idr*  rF   r"   rG   unique_consecutivesumnumelr   rq   r  problem_typer  rE   rI   rT   r   squeezer   r   r   r   rZ  r[  r8  r\  r   r]  )r@   r(   rf   rS  rT  rU  r'  rV  rw  r   ri   r}  r   eos_masksentence_representationr{  rz  r  s                     r3   rM   z%BartForSequenceClassification.forward  s   R I!:%J4>>KbKbJcd  '1djj
'
)/#9+'"7
'
 
'
  
<< 8 89<<]=Q=QR$$X\\!_5;;=BE	
 #0!"<"A"A-BTBTUVBWY[]j]o]opr]s"tr1H#
 ))*ABYYv}}-F{{''/;;))Q./;DKK,[[++a/V\\UZZ5OSYS_S_chclclSl/LDKK,/KDKK,{{''<7"9;;))Q.#FNN$4fnn6FGD#FF3D))-JJ+-B0F0F GUWY))-II,./.#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r5   r`  )rP   rQ   rR   r'   r?   r   r   rG   r3  rU   rb  r   r   r   r   r   r   rM   rV   rW   s   @r3   r  r    s,   z   .2.259:>:>26:>*.!%i
##d*i
 t+i
 !++d2	i

 !& 0 04 7i
 e//047i
 ((4/i
  %0047i
   4'i
 $;i
 +,i
 
0	0i
  i
r5   r  c                       e Zd Z fdZee	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  de	ej                     dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dedz  dee   deez  fd              Z xZS )BartForQuestionAnsweringc                     t         |   |       d|_        |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r;   )
r>   r?   r  rI  r   r   r   hidden_size
qa_outputsr%  rj  s     r3   r?   z!BartForQuestionAnswering.__init__H  s[      ++v&
))F$6$68I8IJ 	r5   Nr(   rf   rS  rT  rU  start_positionsend_positionsr'  rV  r   ri   r   c                 D   ||d}
 | j                   |f||||||	|
d|}|d   }| j                  |      }|j                  dd      \  }}|j                  d      j	                         }|j                  d      j	                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   d	z  }t        ||||j                  |j                  |j                  |j                  |j                  |j                  |j                   

      S )rX  NFr  r   r&   r,   rl   )ignore_indexr<   )
rz  start_logits
end_logitsr   rZ  r[  r8  r\  r   r]  )r   r  splitr  rv   r^  rq   r   r   r   r   rZ  r[  r8  r\  r   r]  )r@   r(   rf   rS  rT  rU  r  r  r'  rV  r   ri   r}  sequence_outputr{  r  r  
total_lossignored_indexr  
start_lossend_losss                         r3   rM   z BartForQuestionAnswering.forwardT  s   N &=+DI&0djj
'
)/#9+'"7
'
 
'
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J2%!#33")"?"?&99$55&-&G&G")"?"?&99
 	
r5   r  )rP   rQ   rR   r?   r   r   rG   rU   r3  rb  r   r   r   r   r   r   rM   rV   rW   s   @r3   r  r  F  s<   
  *..259:>:>371526:>!%W
<<$&W
 t+W
 !++d2	W

 !& 0 04 7W
 e//047W
 ))D0W
 ''$.W
 ((4/W
  %0047W
 $;W
 +,W
 
4	4W
  W
r5   r  c                   (     e Zd ZdZ fdZd Z xZS )BartDecoderWrapperz
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    c                 d    t         |   |       t        |      | _        | j	                          y r]   )r>   r?   r5  rM  r%  rj  s     r3   r?   zBartDecoderWrapper.__init__  s&     "6*r5   c                 &     | j                   |i |S r]   )rM  )r@   argsri   s      r3   rM   zBartDecoderWrapper.forward  s    t||T,V,,r5   )rP   rQ   rR   rS   r?   rM   rV   rW   s   @r3   r  r    s    

-r5   r  zu
    BART decoder with a language modeling head on top (linear layer with weights tied to the input embeddings).
    c                   \    e Zd ZddiZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  dedz  de	j                  dz  de	j                  dz  dedz  dee	j                  z  dee   deez  fd              Z xZS )BartForCausalLMre  z!model.decoder.embed_tokens.weightc                     d|_         d|_        t        |   |       t	        |      | _        t        j                  |j                  |j                  d      | _
        | j                          y )NTFr   )r~   r>  r>   r?   r  r   r   r   r  r  ri  r%  rj  s     r3   r?   zBartForCausalLM.__init__  sX     $)! '/
yy!3!3V5F5FUS 	r5   c                 B    | j                   j                  j                  S r]   r   rM  r  r  s    r3   rO  z$BartForCausalLM.get_input_embeddings  s    zz!!...r5   c                 :    || j                   j                  _        y r]   r  rQ  s     r3   rR  z$BartForCausalLM.set_input_embeddings  s    */

'r5   Nr(   rf   r   r   r   r'  rw  r   logits_to_keepri   r   c
                     | j                   j                  d|||||||d|
}|d   }t        |	t              rt	        |	 d      n|	}| j                  |dd|ddf         }d}|a|j                  |j                        }t               } ||j                  d| j                  j                        |j                  d            }t        |||j                  |j                  |j                  |j                         S )a@  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BartForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
        >>> model = BartForCausalLM.from_pretrained("facebook/bart-base")
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
        >>> list(logits.shape) == expected_shape
        True
        ```rY  r   Nr,   )rz  r{  r   r   r  r8  r  )r   rM  r   rT   sliceri  r*  rF   r   r   r   r  r   r   r   r  r8  )r@   r(   rf   r   r   r   r'  rw  r   r  ri   r}  r   slice_indicesr{  rz  r  s                    r3   rM   zBartForCausalLM.forward  s   L >PTZZ=O=O 	>
)"7#9+'	>
 	>
  
8B>SV8W~ot4]kmA}a,?@AYYv}}-F')HFKKDKK,B,BCV[[QS_UD0#33!//))$55
 	
r5   )	NNNNNNNNr   )rP   rQ   rR   ra  r?   rO  rR  r   r   rG   r3  rU   r   r   r   rT   r   r   r   r   rM   rV   rW   s   @r3   r  r    s1    	=	/0  .2.2:>;?(,26*.!%-.A
##d*A
 t+A
  %0047	A

 !& 1 1D 8A
 A
 ((4/A
   4'A
 $;A
 ell*A
 +,A
 
2	2A
  A
r5   r  )r  r   r  r  rI  r   r  r   )Nr   )TrS   r  r  collections.abcr   rG   r   torch.nnr   r   r    r	   r   activationsr
   cache_utilsr   r   r   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r    r!   r"   utils.genericr#   utils.output_capturingr$   r%   configuration_bartr'   
get_loggerrP   r   rU   rT   r4   r  r7   rY   Modulera   ry   r{   r   r   r   r   r   r  r  r5  rI  r   r  r  r  r  __all__r  r5   r3   <module>r     s      $   A A & ! C C ) J B 9   G &  8 E * 
		H	%%,, c [^  ;R\\ ;6
=bll 
=( !%II%<<% 
% <<	%
 LL4'% T\% % '(%8r)BII r)j01 0fO1 OdRYY 0 /  :
- 

- 
V
% V
r|
% |
~ q
# q
 q
h 
`h#6 `h
`hF y
$7 y
y
x f
2 f
 f
R-, - 
Y
)? Y

Y
x	r5   