
    ib                     b   d Z ddlZddlmZ ddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0m1Z1 ddl2m3Z3  e,jh                  e5      Z6dejn                  de8de8fdZ9 G d dejt                        Z;	 	 d<dejx                  dejn                  dejn                  dejn                  d ejn                  dz  d!e=dz  d"e=d#e&e(   fd$Z> G d% d&ejx                        Z? G d' d(e      Z@ G d) d*e      ZAe) G d+ d,e$             ZB G d- d.eB      ZC G d/ d0eB      ZDe) G d1 d2eB             ZE e)d34       G d5 d6eBe             ZF G d7 d8eB      ZG G d9 d:eBe      ZHg d;ZIy)=z=PyTorch MarianMTModel model, ported from the Marian C++ repo.    N)Callable)nn)CrossEntropyLoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilinglogging)merge_with_config_defaults)OutputRecordercapture_outputs   )MarianConfig	input_idspad_token_iddecoder_start_token_idc                     | j                  | j                        }| ddddf   j                         |ddddf<   ||dddf<   |t        d      |j	                  |dk(  |       |S )z1
    Shift input ids one token to the right.
    Nr!   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r#   r$   r%   shifted_input_idss       {/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/marian/modeling_marian.pyshift_tokens_rightr/   8   s}     "++IOO<(CRC0668ae4adLMM""#4#<lK    c            
            e Zd ZdZddedededz  ddf fdZd Z ej                         	 dd	ej                  d
edej                  dz  dej                  f fd       Z xZS )#MarianSinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.Nnum_positionsembedding_dimpadding_idxreturnc                 *    t         |   ||d       y )NT)_freeze)super__init__)selfr3   r4   r5   	__class__s       r.   r:   z,MarianSinusoidalPositionalEmbedding.__init__K   s    tDr0   c                    | j                   j                  \  }}t        j                  t	        |      D cg c];  }t	        |      D cg c]$  }|t        j
                  dd|dz  z  |z        z  & c}= c}}      }t        j                  ||| j                   j                  d      }|dz  dk(  r|dz  n|dz  dz   }t        j                  t        j                  |dddddf               |ddd|f<   t        j                  t        j                  |dddddf               |dd|df<   |S c c}w c c}}w )z
        Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in
        the 2nd half of the vector. [dim // 2:]
        i'     F)dtyperequires_gradr   r!   N)weightr)   nparrayrangepowertorchemptyr?   FloatTensorsincos)r;   n_posdimposjposition_encoutsentinels           r.   create_weightz1MarianSinusoidalPositionalEmbedding.create_weightN   s   
 [[&&
sxxX]^cXdeQTsLAcBHHUAaL3$677Le
 kk%DKK,=,=US"Qw!|3!8#(a"..rvvl1add76K/LMAqzM!--bff\!QTT'5J.KLAxyL
 Mes   D>
	)D92D>
9D>
input_ids_shapepast_key_values_lengthposition_idsc                     |F|dd \  }}t        j                  |||z   t         j                  | j                  j                        }t
        |   |      S )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr>   )r?   device)rF   arangelongrA   rW   r9   forward)r;   rS   rT   rU   bszseq_lenr<   s         r.   rZ   z+MarianSinusoidalPositionalEmbedding.forward]   s]    
 *2A.LC <<&(>(HPUPZPZcgcncncucuL w|,,r0   N)r   N)__name__
__module____qualname____doc__intr:   rR   rF   no_gradSizeTensorrZ   __classcell__r<   s   @r.   r2   r2   H   s    NEc E# ECRVJ Ebf E U]]_pt	-$zz	-CF	-Z_ZfZfimZm	-		- 	-r0   r2   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr'         r>   r   rL   ptrainingr!   )
sizerF   matmul	transposer   
functionalsoftmaxrn   ru   
contiguous)
rh   ri   rj   rk   rl   rm   rn   ro   attn_weightsattn_outputs
             r.   eager_attention_forwardr~   k   s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r0   c                       e Zd ZdZ	 	 	 	 	 	 ddedededededed	edz  d
edz  f fdZ	 	 	 dde	j                  de	j                  dz  dedz  de	j                  dz  dee   dee	j                  e	j                  dz  f   fdZ xZS )MarianAttentionz=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsrn   
is_decoderbias	is_causalconfig	layer_idxc	                    t         	|           || _        || _        || _        ||z  | _        || _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _	        || _
        || _        |9| j                  r-t        j                  d| j                  j                   d       t!        j"                  |||      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).rq   zInstantiating a decoder z without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.r   )r9   r:   r   r   rn   head_dimr   r+   rm   r   r   r   loggerwarning_oncer<   r^   r   Lineark_projv_projq_projout_proj)
r;   r   r   rn   r   r   r   r   r   r<   s
            r.   r:   zMarianAttention.__init__   s$    	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$""*4>>+B+B*C D, , ii	94@ii	94@ii	94@		)YTBr0   hidden_stateskey_value_statespast_key_valuesrl   ro   r6   c                    |du}|j                   dd }g |d| j                  }| j                  |      j                  |      j	                  dd      }	d}
|St        |t              rA|j                  j                  | j                        }
|r|j                  }n|j                  }n|}|r|n|}|rK|I|
rGj                  | j                     j                  }|j                  | j                     j                  }n| j                  |      }| j!                  |      }g |j                   dd d| j                  }|j                  |      j	                  dd      }|j                  |      j	                  dd      }|Kj#                  ||| j                        \  }}|r)t        |t              rd|j                  | j                  <   t%        j&                  | j(                  j*                  t,              } || |	|||f| j.                  sdn| j0                  | j2                  d|\  }} |j4                  g |d j7                         }| j9                  |      }||fS )	z#Input shape: Batch x Time x ChannelNr'   r!   r>   FT        )rn   rm   )r)   r   r   viewrx   
isinstancer   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   updater   get_interfacer   _attn_implementationr~   ru   rn   rm   reshaper{   r   )r;   r   r   r   rl   ro   is_cross_attentioninput_shapehidden_shapequery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_stateskv_shapeattention_interfacer}   r|   s                      r.   rZ   zMarianAttention.forward   sd    .T9 $))#2.88b8$--8 {{=166|DNNqRST
&/+>?,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML^4J;;~6LF--cr2FBFFH#2<<QBJ',,X6@@AFL*+?+F+FzS_aeaoao+p(
L%*_FY*ZAEO..t~~>(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ *k));;;;FFHmmK0L((r0   )r   FTFNNNNN)r^   r_   r`   ra   rb   floatboolr"   r:   rF   re   r	   r   r   tuplerZ   rf   rg   s   @r.   r   r      s   G  &* $%C%C %C 	%C
 %C %C %C t#%C :%CT 15(,.2H)||H)  ,,-H) 	H)
 t+H) -.H) 
u||U\\D00	1H)r0   r   c                        e Zd Zd
dededz  f fdZdej                  dej                  dee	   dej                  fd	Z xZS )MarianEncoderLayerNr   r   c                 j   t         |           |j                  | _        t	        | j                  |j
                  |j                  ||      | _        t        j                  | j                        | _
        |j                  | _        t        |j                     | _        |j                  | _        t        j                   | j                  |j"                        | _        t        j                   |j"                  | j                        | _        t        j                  | j                        | _        y )N)r   r   rn   r   r   )r9   r:   d_modelr   r   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normrn   r   activation_functionactivation_fnactivation_dropoutr   encoder_ffn_dimfc1fc2final_layer_normr;   r   r   r<   s      r.   r:   zMarianEncoderLayer.__init__   s    (nn44,,
 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r0   r   rl   ro   r6   c                 F   |} | j                   |fd|i|\  }}t        j                  j                  || j                  | j                        }||z   }| j                  |      }|}| j                  | j                  |            }t        j                  j                  || j                  | j                        }| j                  |      }t        j                  j                  || j                  | j                        }||z   }| j                  |      }|j                  t        j                  k(  rht        j                  |      j                         sEt        j                   |j                        j"                  dz
  }t        j$                  || |      }|S )Nrl   rs   i  )minmax)r   r   ry   rn   ru   r   r   r   r   r   r   r?   rF   float16isfiniteallfinfor   clamp)r;   r   rl   ro   residual_clamp_values          r.   rZ   zMarianEncoderLayer.forward  sh    !)4>>
)
 
q
 --mt||VZVcVc-d =011-@ **488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0--m<%--/}8U8Y8Y8[++m&9&9:>>EK!KKK<[YMr0   r]   )r^   r_   r`   r"   rb   r:   rF   rH   r   r   re   rZ   rf   rg   s   @r.   r   r      s[    =| =d
 =&(( )) +,	
 
r0   r   c                        e Zd Zddededz  f fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  d	edz  d
e	dz  de
e   dej                  fdZ xZS )MarianDecoderLayerNr   r   c           	         t         |           |j                  | _        t	        | j                  |j
                  |j                  dd||      | _        |j                  | _        t        |j                     | _        |j                  | _        t        j                  | j                        | _        t	        | j                  |j
                  |j                  d||      | _        t        j                  | j                        | _        t        j$                  | j                  |j&                        | _        t        j$                  |j&                  | j                        | _        t        j                  | j                        | _        y )NT)r   r   rn   r   r   r   r   )rn   r   r   r   )r9   r:   r   r   r   decoder_attention_headsr   r   rn   r   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normr   decoder_ffn_dimr   r   r   r   s      r.   r:   zMarianDecoderLayer.__init__3  s    (nn44,,
 ~~#F$>$>?"(";";$&LL$@!+NN**,,
 (*||DNN'C$99T^^V-C-CD99V33T^^D "T^^ <r0   r   rl   encoder_hidden_statesencoder_attention_maskr   	use_cachero   r6   c                    |} | j                   |f||d|\  }}	t        j                  j                  || j                  | j                        }||z   }| j                  |      }|h|} | j                  |f|||d|\  }}	t        j                  j                  || j                  | j                        }||z   }| j                  |      }|}| j                  | j                  |            }t        j                  j                  || j                  | j                        }| j                  |      }t        j                  j                  || j                  | j                        }||z   }| j                  |      }|S )N)r   rl   rs   )r   rl   r   )r   r   ry   rn   ru   r   r   r   r   r   r   r   r   )
r;   r   rl   r   r   r   r   ro   r   r   s
             r.   rZ   zMarianDecoderLayer.forwardR  s    ! *4>>
+)
 	
q --mt||VZVcVc-d =011-@ !,$H0t00 !65 /	 
  M1 MM11-4<<Z^ZgZg1hM$}4M 88GM !**488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0--m<r0   r]   )NNNNT)r^   r_   r`   r"   rb   r:   rF   re   r	   r   r   r   rZ   rf   rg   s   @r.   r   r   2  s    =| =d
 =D /3596:(,!%/||/ t+/  %||d2	/
 !&t 3/ / $;/ +,/ 
/r0   r   c                   z     e Zd ZU eed<   dZdZdZdZdZ	dZ
 ej                          fd       Zed        Z xZS )MarianPreTrainedModelr   modelTc                    t         |   |       t        |t              r/t	        j
                  |j                  |j                                y t        |t              r t	        j                  |j                         y y r]   )r9   _init_weightsr   r2   initcopy_rA   rR   MarianMTModelzeros_final_logits_bias)r;   rh   r<   s     r.   r   z#MarianPreTrainedModel._init_weights  sW    f%fABJJv}}f&:&:&<=.KK001 /r0   c                     | j                   j                  }t        j                  g ddddd|gg| j                        }|j                  |      ||d}|S )N)r      
      r>   r         r>   rW   )rl   r#   decoder_input_ids)r   r$   rF   tensorrW   ne)r;   	pad_tokenr#   dummy_inputss       r.   r   z"MarianPreTrainedModel.dummy_inputs  sZ    KK,,	LL"2Q2q)4L!MVZVaVab	'll95"!*

 r0   )r^   r_   r`   r"   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphrF   rc   r   propertyr   rf   rg   s   @r.   r   r     sZ    &*#N!U]]_2 2  r0   r   c                        e Zd ZdZeedZdef fdZe	e
e	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	ee   d
ef
d                     Z xZS )MarianEncoderz
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`MarianEncoderLayer`].

    Args:
        config: MarianConfig
        embed_tokens (nn.Embedding): output embedding
    )r   
attentionsr   c                    t         |   |       |j                  | _        |j                  | _        |j
                  }|j                  | _        |j                  | _	        |j                  rt        j                  |      nd| _        t        j                  |j                   || j                        | _        t%        |j                  || j                        | _        t        j(                  t+        |j,                        D cg c]  }t/        |       c}      | _        d| _        | j5                          y c c}w )N      ?F)r9   r:   rn   encoder_layerdrop	layerdropr   r$   r5   max_position_embeddingsmax_source_positionsscale_embeddingmathsqrtembed_scaler   	Embedding
vocab_sizeembed_tokensr2   embed_positions
ModuleListrD   encoder_layersr   r   gradient_checkpointing	post_init)r;   r   r   r   r<   s       r.   r:   zMarianEncoder.__init__  s     ~~11NN	!..$*$B$B!393I3I499Y/sLL):):ItGWGWXB**It7G7G 
 mmvOdOdIe$fA%7%?$fg&+#	 %gs   D;Nr#   rl   inputs_embedsro   r6   c                     |d u |d uz  rt        d      || j                  |      | j                  z  }| j                  |j                  d d       }||z   }t
        j                  j                  || j                  | j                        }t        | j                  ||      }t        | j                        D ]F  \  }}d}	| j                  r&t        j                  g       }
|
| j                  k  rd}	|	r= |||fi |}H t!        |      S )Nz:You must specify exactly one of input_ids or inputs_embedsr'   rs   )r   r  rl   FT)last_hidden_state)r+   r  r  r  r)   r   ry   rn   ru   r   r   	enumerater   rF   randr   r   )r;   r#   rl   r  ro   	embed_posr   idxencoder_layerto_dropdropout_probabilitys              r.   rZ   zMarianEncoder.forward  s     -t";<YZZ  --i84;K;KKM(()<)<Sb)AB	%	1--mt||VZVcVc-d2;;')
 #,DKK"8 	CG}}&+jjn#&7"G -!"! !	 +
 	
r0   r   )r^   r_   r`   ra   r   r   _can_record_outputsr"   r:   r   r    r   rF   
LongTensorrH   r   r   r   rZ   rf   rg   s   @r.   r   r     s     ,%
| ,   .22626	(
##d*(
 ((4/(
 ((4/	(

 +,(
 
(
    (
r0   r   c                   F    e Zd ZdZe eedd       eedd      dZdef fdZ	e
ee	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                   d	z  dej                  d	z  ded	z  dej                   d	z  ded	z  dee   defd                     Z xZS )MarianDecoderz
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MarianDecoderLayer`]

    Args:
        config: MarianConfig
        embed_tokens (nn.Embedding): output embedding
    r!   r   )index
layer_namer   )r   r   cross_attentionsr   c           	         t         |   |       |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  rt        j                  |j                        nd| _        t        j                  |j                   |j                  | j                        | _        t%        |j                  |j                  | j                        | _        t        j(                  t+        |j,                        D cg c]  }t/        ||       c}      | _        d| _        | j5                          y c c}w )Nr   )r   F)r9   r:   rn   decoder_layerdropr   r$   r5   r   max_target_positionsr   r  r  r   r  r   r  decoder_vocab_sizer  r2   r  r  rD   decoder_layersr   r   r
  r  )r;   r   ir<   s      r.   r:   zMarianDecoder.__init__  s     ~~11!..$*$B$B!8>8N8N499V^^4TWLL)B)BFNNTXTdTdeB**FNND<L<L 
 mmV[\b\q\qVr$sQR%7!%L$st&+#	 %ts   ENr#   rl   r   r   r   r  r   ro   r6   c                 R   |d u |d uz  rt        d      || j                  |      }|| j                  z  }|rd|b|| j                  j                  r4t        t        | j                        t        | j                              nt        | j                        }|j                         d d \  }	}
||j                         nd}t        j                  |
|j                        |z   }|1t               s'||
z   }t        j                  |	||j                        }t        |t
              r|j                  n|}t!        | j                  |||      }t#        | j                  |||      }| j%                  |	|
f||      }||z   }t&        j(                  j+                  || j*                  | j,                  	      }t/        | j0                        D ]E  \  }}| j,                  r%t        j2                  g       }|| j4                  k  r7 ||||f|||d
|}G t7        ||      S )NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time)r   r'   r   r   )r   r  rl   r   )r   r  rl   r   )rU   rs   )r   r   r   )r  r   )r+   r  r  r   is_encoder_decoderr   r
   rv   get_seq_lengthrF   rX   rW   r   onesr   r   r   r   r  r   ry   rn   ru   r  r   r  r   r   )r;   r#   rl   r   r   r   r  r   ro   
batch_size
seq_lengthrT   rU   mask_seq_lengthself_attn_cachecausal_maskr   r  decoder_layerr  s                       r.   rZ   zMarianDecoder.forward  sI    -t";<stt  --i8M &(8(88 0 )48V8V $L$DlZ^ZeZeFfg!5  "/!3!3!5cr!:
JETE`!?!?!Afg||J}7K7KLOee!*B*D4zAO"ZZ
OML`L`aN /+>? 00  	 );;')+	
 ";;;'1"7	"
 ++$&<< , 
 &4--mt||VZVcVc-d"+DKK"8 	C}}&+jjn#&7)% (> /# M	  9++
 	
r0   )NNNNNNN)r^   r_   r`   ra   r   r   r   r  r"   r:   r   r    r   rF   r  re   rH   r	   r   r   r   r   rZ   rf   rg   s   @r.   r  r    s    ,$_A+V*?!P^_| &   .2.2:>:>(,26!%R
##d*R
 t+R
  %0047	R

 !& 0 04 7R
 R
 ((4/R
 $;R
 +,R
 
3R
    R
r0   r  c                       e Zd ZddgZdef fdZd Zd Zd Zd Z	d	e
d
ej                  fdZee	 	 	 	 	 	 	 	 	 ddej"                  dz  dej$                  dz  dej"                  dz  dej$                  dz  deej$                     ez  dz  dedz  dej,                  dz  dej,                  dz  dedz  dee   d
efd              Z xZS )MarianModel$model.encoder.embed_positions.weight$model.decoder.embed_positions.weightr   c                 P   t         |   |       |j                  |j                  }}| j                  j
                  r1t        j                  ||j                  |      | _	        ddd| _
        nd | _
        t        |      | _        t        |      | _        | j                          y )Nzshared.weight)zdecoder.embed_tokens.weightzencoder.embed_tokens.weight)r9   r:   r$   r  r    share_encoder_decoder_embeddingsr   r  r   shared_tied_weights_keysr   encoderr  decoderr  )r;   r   r5   r  r<   s       r.   r:   zMarianModel.__init__w  s     "("5"5v7H7HZ ;;77,,z6>>;ODK/>/>'D#
 '+D#$V,$V, 	r0   c                 >    | j                         j                         S r]   )get_encoderget_input_embeddingsr;   s    r.   r9  z MarianModel.get_input_embeddings  s    !6688r0   c                     | j                   j                  r>|| _        | j                  | j                  _        | j                  | j
                  _        y || j                  _        y r]   )r   r2  r3  r5  r  r6  r;   rk   s     r.   set_input_embeddingsz MarianModel.set_input_embeddings  sB    ;;77DK(,DLL%(,DLL%(-DLL%r0   c                     | j                   j                  rt        d      | j                         j	                         S )Nz`get_decoder_input_embeddings` should not be called if `config.share_encoder_decoder_embeddings` is `True`. Please use `get_input_embeddings` instead.)r   r2  r+   get_decoderr9  r:  s    r.   get_decoder_input_embeddingsz(MarianModel.get_decoder_input_embeddings  s<    ;;77H  !6688r0   c                 h    | j                   j                  rt        d      || j                  _        y )Na   `config.share_encoder_decoder_embeddings` is set to `True` meaning the decoder input embeddings are shared with the encoder. In order to set the decoder input embeddings, you should simply set the encoder input embeddings by calling `set_input_embeddings` with the appropriate embeddings.)r   r2  r+   r6  r  r<  s     r.   set_decoder_input_embeddingsz(MarianModel.set_decoder_input_embeddings  s0    ;;77r 
 %*!r0   new_num_tokensr6   c                    | j                   j                  rt        d      | j                         }| j	                  ||      }| j                  |       | j                         }||S || j                   _        | j                          |S Nz`resize_decoder_token_embeddings` should not be called if `config.share_encoder_decoder_embeddings` is `True`. Please use `resize_token_embeddings` instead.)r   r2  r+   r@  _get_resized_embeddingsrB  r   tie_weights)r;   rC  old_embeddingsnew_embeddingsmodel_embedss        r.   resize_decoder_token_embeddingsz+MarianModel.resize_decoder_token_embeddings  s    ;;77K 
 ::<55nnU)).988:! *8& 	r0   Nr#   rl   r   decoder_attention_maskencoder_outputsr   r  decoder_inputs_embedsr   ro   c
                    | | j                   d	|||d|
}nGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      } | j                  d	|||d   ||||	d|
}t        |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                        S )
a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Marian uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MarianModel

        >>> tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
        >>> model = MarianModel.from_pretrained("Helsinki-NLP/opus-mt-en-de")

        >>> inputs = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt")
        >>> decoder_inputs = tokenizer(
        ...     "<pad> Studien haben gezeigt dass es hilfreich ist einen Hund zu besitzen",
        ...     return_tensors="pt",
        ...     add_special_tokens=False,
        ... )
        >>> outputs = model(input_ids=inputs.input_ids, decoder_input_ids=decoder_inputs.input_ids)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 26, 512]
        ```N)r#   rl   r  r   r!   r>   )r  r   r   r#   rl   r   r   r   r  r   )r  r   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater   encoder_attentions )r5  r   r   lenr6  r   r  r   r   r   r  )r;   r#   rl   r   rL  rM  r   r  rN  r   ro   decoder_outputss               r.   rZ   zMarianModel.forward  s   h "*dll #-+ 	O O_=-"1!"4474H14Loa0RV14_1E1I?1-tO '$,, 	
'1"1!"4#1+/	
 	
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r0   )	NNNNNNNNN)r^   r_   r`   _keys_to_ignore_on_load_missingr"   r:   r9  r=  r@  rB  rb   r   r  rK  r   r   rF   r  re   r   r   r	   rH   r   r   r   r   rZ   rf   rg   s   @r.   r.  r.  p  s[    	/.'#
| *9.9*c bll 0  .2.2596:HL(,26:>!%V
##d*V
 t+V
 !++d2	V

 !&t 3V
 u||,>EV
 V
 ((4/V
  %0047V
 $;V
 +,V
 
V
  V
r0   r.  zX
    The Marian Model with a language modeling head. Can be used for summarization.
    )custom_introc                   F    e Zd ZdZg dZddgZddiZdef fdZ	 d!d
e	de	d	z  de
dej                  f fdZd"d
e	dej                  fdZd Zd
e	dd	fdZdej                  fdZee	 	 	 	 	 	 	 	 	 	 d#dej*                  d	z  dej,                  d	z  dej*                  d	z  dej,                  d	z  deej,                     ez  d	z  ded	z  dej4                  d	z  dej4                  d	z  dej*                  d	z  de
d	z  dee   defd              Zdej,                  fd Z xZ S )$r   r   )r   r/  r0  r/  r0  lm_head.weight!model.decoder.embed_tokens.weightr   c                    t         |   |       t        |      | _        | j                  j
                  rdddd| _        |j
                  r|j                  n|j                  }| j                  dt        j                  d|f             t        j                  |j                  |d      | _        | j!                          y )Nzmodel.shared.weight)r[  r\  z!model.encoder.embed_tokens.weightr   r!   Fr   )r9   r:   r.  r   r   r2  r4  r  r   register_bufferrF   zerosr   r   r   lm_headr  )r;   r   target_vocab_sizer<   s      r.   r:   zMarianMTModel.__init__+  s      (
;;77"75J5J'D# 281X1XF--^d^w^w0%++qBS>T2UVyy1BO 	r0   NrC  pad_to_multiple_ofmean_resizingr6   c                 x    t         |   |||      }| j                  j                  r| j	                  |       |S r]   )r9   resize_token_embeddingsr   r2  _resize_final_logits_bias)r;   rC  rb  rc  rI  r<   s        r.   re  z%MarianMTModel.resize_token_embeddings<  s;     8I[]jk;;77**>:r0   c                    | j                         }| j                  |||      }| j                  |       |j                  j                  d   }| j
                  j                  r|| j
                  _        | j
                  j                  rY| j                         I| j
                  j                  s3| j                         }| j                  ||      }| j                  |       | j                         S )Nr   )r9  rF  r=  rA   r)   r   r2  r   get_output_embeddingstie_word_embeddings_get_resized_lm_headset_output_embeddings)r;   rC  rb  argsrH  rI  old_lm_headnew_lm_heads           r.   _resize_token_embeddingsz&MarianMTModel._resize_token_embeddingsE  s    22455nnVhi!!.1'..44Q7;;77-;DKK* KK88**,8KK33446K33KPK&&{3((**r0   c                 (   | j                   j                  rt        d      | j                  j	                         }| j                  ||      }| j                  j                  |       | j                         I| j                   j                  s3| j                         }| j                  ||      }| j                  |       | j                  j	                         }||S || j                   _        | j                          | j                  |       |S rE  )r   r2  r+   r   r@  rF  rB  rh  ri  rj  rk  r   rG  rf  )r;   rC  rH  rI  rm  rn  rJ  s          r.   rK  z-MarianMTModel.resize_decoder_token_embeddings[  s    ;;77K 
 @@B55nnU

//? %%'3DKK<[<[446K33KPK&&{3zz>>@! *8& 	&&~6r0   c                 6   | j                   j                  d   }||k  r| j                   d d d |f   }nSt        j                  d||z
  f| j                   j                        }t        j
                  | j                   |gd      }| j                  d|       y )Nr'   r!   r   rr   r   )r   r)   rF   r_  rW   catr^  )r;   rC  old_num_tokensnew_bias
extra_biass        r.   rf  z'MarianMTModel._resize_final_logits_bias{  s    //55b9^+--a..@AHa.)H%IRVRhRhRoRopJyy$"8"8*!E1MH0(;r0   rI  c                     || _         y r]   )r`  )r;   rI  s     r.   rk  z#MarianMTModel.set_output_embeddings  s	    %r0   r#   rl   r   rL  rM  r   r  rN  labelsr   ro   c                 l   |	R|
rt         j                  d       d}
|7|5t        |	| j                  j                  | j                  j
                        } | j                  |f||||||||
d|}| j                  |d         | j                  z   }d}|	Ft               } ||j                  d| j                  j                        |	j                  d            }t        |||j                  |j                  |j                  |j                   |j"                  |j$                  |j&                  	      S )u  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Marian uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MarianMTModel

        >>> src = "fr"  # source language
        >>> trg = "en"  # target language

        >>> model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"
        >>> model = MarianMTModel.from_pretrained(model_name)
        >>> tokenizer = AutoTokenizer.from_pretrained(model_name)

        >>> sample_text = "où est l'arrêt de bus ?"
        >>> batch = tokenizer([sample_text], return_tensors="pt")

        >>> generated_ids = model.generate(**batch)
        >>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        "Where's the bus stop?"
        ```
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)rl   r   rM  rL  r   r  rN  r   r   r'   )	losslogitsr   rQ  rR  r  rS  r   rT  )r   warningr/   r   r$   r%   r   r`  r   r   r   r   r   r   rQ  rR  r  rS  r   rT  )r;   r#   rl   r   rL  rM  r   r  rN  rw  r   ro   outputs	lm_logitsmasked_lm_lossloss_fcts                   r.   rZ   zMarianMTModel.forward  s@   p klI (-B-J$6DKK44dkk6X6X%! '1djj'
)/+#9+'"7'
 '
 LL,t/E/EE	')H%innR9W9W&XZ`ZeZefhZijN#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r0   c                 l    t        || j                  j                  | j                  j                        S r]   )r/   r   r$   r%   )r;   rw  s     r.   %prepare_decoder_input_ids_from_labelsz3MarianMTModel.prepare_decoder_input_ids_from_labels  s%    !&$++*B*BDKKDfDfggr0   )NTr]   )
NNNNNNNNNN)!r^   r_   r`   r   rX  _keys_to_ignore_on_saver4  r"   r:   rb   r   r   r  re  ro  rK  rf  rk  r   r   rF   r  re   r   r   r	   rH   r   r   r   rZ   r  rf   rg   s   @r.   r   r     s     '#
  FGmn*,OP| $ ae!7:TzY]	+s +_a_k_k +,@< < <&BLL &  .2.2596:HL(,26:>*.!%]
##d*]
 t+]
 !++d2	]

 !&t 3]
 u||,>E]
 ]
 ((4/]
  %0047]
   4']
 $;]
 +,]
 
]
  ]
~hELL hr0   r   c                   (     e Zd ZdZ fdZd Z xZS )MarianDecoderWrapperz
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    c                 d    t         |   |       t        |      | _        | j	                          y r]   )r9   r:   r  r6  r  r;   r   r<   s     r.   r:   zMarianDecoderWrapper.__init__  s&     $V,r0   c                 &     | j                   |i |S r]   )r6  )r;   rl  ro   s      r.   rZ   zMarianDecoderWrapper.forward  s    t||T,V,,r0   )r^   r_   r`   ra   r:   rZ   rf   rg   s   @r.   r  r    s    

-r0   r  c                   \    e Zd ZddiZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  dedz  de	j                  dz  de	j                  dz  dedz  dee	j                  z  dee   deez  fd              Z xZS )MarianForCausalLMr[  r\  c                     d|_         d|_        t        |   |       t	        |      | _        t        j                  |j                  |j                  d      | _
        | j                          y )NTFr   )r   r$  r9   r:   r  r   r   r   hidden_sizer  r`  r  r  s     r.   r:   zMarianForCausalLM.__init__  sX     $)! )&1
yy!3!3V5F5FUS 	r0   c                 B    | j                   j                  j                  S r]   r   r6  r  r:  s    r.   r9  z&MarianForCausalLM.get_input_embeddings  s    zz!!...r0   c                 :    || j                   j                  _        y r]   r  r<  s     r.   r=  z&MarianForCausalLM.set_input_embeddings  s    */

'r0   Nr#   rl   r   r   r   r  rw  r   logits_to_keepro   r6   c
                     | j                   j                  d|||||||d|
}|d   }t        |	t              rt	        |	 d      n|	}| j                  |dd|ddf         }d}|a|j                  |j                        }t               } ||j                  d| j                  j                        |j                  d            }t        |||j                  |j                  |j                  |j                         S )aT  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MarianForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en")
        >>> model = MarianForCausalLM.from_pretrained("Helsinki-NLP/opus-mt-fr-en")
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
        >>> list(logits.shape) == expected_shape
        True
        ```rP  r   Nr'   )ry  rz  r   r   r   r  rU  )r   r6  r   rb   slicer`  torW   r   r   r   r  r   r   r   r   r  )r;   r#   rl   r   r   r   r  rw  r   r  ro   r|  r   slice_indicesrz  ry  r  s                    r.   rZ   zMarianForCausalLM.forward  s   L >PTZZ=O=O 	>
)"7#9+'	>
 	>
  
8B>SV8W~ot4]kmA}a,?@AYYv}}-F')HFKKDKK,B,BCV[[QS_UD0#33!//))$55
 	
r0   )	NNNNNNNNr   )r^   r_   r`   r4  r:   r9  r=  r   r   rF   r  re   rH   r	   r   rb   r   r   r   r   rZ   rf   rg   s   @r.   r  r    s/   =	/0  .2.2:>;?(,26*.!%-.A
##d*A
 t+A
  %0047	A

 !& 1 1D 8A
 A
 ((4/A
   4'A
 $;A
 ell*A
 +,A
 
2	2A
  A
r0   r  )r  r.  r   r   )Nr   )Jra   r  collections.abcr   numpyrB   rF   r   torch.nnr    r   r   activationsr   cache_utilsr	   r
   r   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   utils.output_capturingr   r    configuration_marianr"   
get_loggerr^   r   re   rb   r/   r  r2   Moduler   r~   r   r   r   r   r   r  r.  r   r  r  __all__rU  r0   r.   <module>r     s   D  $    % & ! C C ) J B 9  G &  8 E . 
		H	%%,, c [^  -",, -R !%II%<<% 
% <<	%
 LL4'% T\% % '(%:r)bii r)l03 0hO3 Od O  <P
) P
fw
) w
t h
' h
 h
V 
Hh)? Hh
HhX-0 - Y
- Y
x Yr0   