
    i                        d Z ddlZddlmZ ddlZddlmZ ddlmZ ddlm	Z
 ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1  e*jd                  e3      Z4dejj                  de6de6fdZ7 G d dejp                        Z9 G d dejp                        Z:	 	 d>dejv                  dejj                  d ejj                  d!ejj                  d"ejj                  dz  d#e<dz  d$e<d%e$e&   fd&Z= G d' d(ejv                        Z> G d) d*e      Z? G d+ d,e      Z@e' G d- d.e"             ZA G d/ d0eA      ZB G d1 d2eA      ZCe' G d3 d4eA             ZD e'd56       G d7 d8eAe             ZE G d9 d:eA      ZF G d; d<eAe      ZGg d=ZHy)?zPyTorch Blenderbot model.    N)Callable)nn)CrossEntropyLoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilinglogging)merge_with_config_defaults)OutputRecordercapture_outputs   )BlenderbotConfig	input_idspad_token_iddecoder_start_token_idc                     | j                  | j                        }| ddddf   j                         |ddddf<   ||dddf<   |t        d      |j	                  |dk(  |       |S )z1
    Shift input ids one token to the right.
    Nr!   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r#   r$   r%   shifted_input_idss       /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/blenderbot/modeling_blenderbot.pyshift_tokens_rightr/   1   s}     "++IOO<(CRC0668ae4adLMM""#4#<lK    c                   v     e Zd ZdZdedef fdZ	 d
dej                  dedej                  dz  f fd	Z	 xZ
S )$BlenderbotLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                 &    t         |   ||       y N)super__init__)selfr3   r4   	__class__s      r.   r8   z-BlenderbotLearnedPositionalEmbedding.__init__F   s    7r0   Ninput_ids_shapepast_key_values_lengthposition_idsc                     |F|dd \  }}t        j                  |||z   t         j                  | j                  j                        }t
        |   |      S )z3`input_ids_shape` is expected to be [bsz x seqlen].N   )dtypedevice)torcharangelongweightrA   r7   forward)r9   r;   r<   r=   bszseq_lenr:   s         r.   rF   z,BlenderbotLearnedPositionalEmbedding.forwardI   s]     *2A.LC <<&(>(HPUPZPZcgcncncucuL w|,,r0   )r   N)__name__
__module____qualname____doc__intr8   rB   SizeTensorrF   __classcell__r:   s   @r.   r2   r2   A   sT    8s 83 8 qu	-$zz	-CF	-Z_ZfZfimZm	- 	-r0   r2   c            
       `     e Zd ZdZd
dededededz  f fdZdej                  f fd	Z	 xZ
S )BlenderbotScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    r3   r4   padding_idxembed_scaleNc                 6    t         |   |||       || _        y r6   )r7   r8   rU   )r9   r3   r4   rT   rU   r:   s        r.   r8   z&BlenderbotScaledWordEmbedding.__init__[   s    D&r0   r#   c                 <    t         |   |      | j                  z  S r6   )r7   rF   rU   )r9   r#   r:   s     r.   rF   z%BlenderbotScaledWordEmbedding.forward_   s    wy)D,<,<<<r0   )      ?)rI   rJ   rK   rL   rM   floatr8   rB   rO   rF   rP   rQ   s   @r.   rS   rS   V   sE    's '3 'S '_dgk_k '= = =r0   rS   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr'         r?   r   dimptrainingr!   )
sizerB   matmul	transposer   
functionalsoftmaxr`   rh   
contiguous)
rZ   r[   r\   r]   r^   r_   r`   ra   attn_weightsattn_outputs
             r.   eager_attention_forwardrq   d   s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r0   c                       e Zd ZdZ	 	 	 	 	 	 ddedededededed	edz  d
edz  f fdZ	 	 	 dde	j                  de	j                  dz  dedz  de	j                  dz  dee   dee	j                  e	j                  dz  f   fdZ xZS )BlenderbotAttentionz=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsr`   
is_decoderbias	is_causalconfig	layer_idxc	                    t         	|           || _        || _        || _        ||z  | _        || _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _	        || _
        || _        |9| j                  r-t        j                  d| j                  j                   d       t!        j"                  |||      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).rc   zInstantiating a decoder z without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.rw   )r7   r8   rt   ru   r`   head_dimry   r+   r_   rv   rx   rz   loggerwarning_oncer:   rI   r   Lineark_projv_projq_projout_proj)
r9   rt   ru   r`   rv   rw   rx   ry   rz   r:   s
            r.   r8   zBlenderbotAttention.__init__   s$    	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$""*4>>+B+B*C D, , ii	94@ii	94@ii	94@		)YTBr0   hidden_stateskey_value_statespast_key_valuesr^   ra   returnc                    |du}|j                   dd }g |d| j                  }| j                  |      j                  |      j	                  dd      }	d}
|St        |t              rA|j                  j                  | j                        }
|r|j                  }n|j                  }n|}|r|n|}|rK|I|
rGj                  | j                     j                  }|j                  | j                     j                  }n| j                  |      }| j!                  |      }g |j                   dd d| j                  }|j                  |      j	                  dd      }|j                  |      j	                  dd      }|Kj#                  ||| j                        \  }}|r)t        |t              rd|j                  | j                  <   t%        j&                  | j(                  j*                  t,              } || |	|||f| j.                  sdn| j0                  | j2                  d|\  }} |j4                  g |d j7                         }| j9                  |      }||fS )	z#Input shape: Batch x Time x ChannelNr'   r!   r?   FT        )r`   r_   )r)   r}   r   viewrk   
isinstancer   
is_updatedgetrz   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   updater   get_interfacery   _attn_implementationrq   rh   r`   r_   reshapern   r   )r9   r   r   r   r^   ra   is_cross_attentioninput_shapehidden_shapequery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_stateskv_shapeattention_interfacerp   ro   s                      r.   rF   zBlenderbotAttention.forward   sd    .T9 $))#2.88b8$--8 {{=166|DNNqRST
&/+>?,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML^4J;;~6LF--cr2FBFFH#2<<QBJ',,X6@@AFL*+?+F+FzS_aeaoao+p(
L%*_FY*ZAEO..t~~>(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ *k));;;;FFHmmK0L((r0   )r   FTFNNNNN)rI   rJ   rK   rL   rM   rY   boolr"   r8   rB   rO   r	   r   r   tuplerF   rP   rQ   s   @r.   rs   rs      s   G  *. $%C%C %C 	%C
 %C %C %C !4'%C :%CT 15(,.2H)||H)  ,,-H) 	H)
 t+H) -.H) 
u||U\\D00	1H)r0   rs   c                   ~     e Zd Zdef fdZdej                  dej                  dee   dej                  fdZ	 xZ
S )BlenderbotEncoderLayerry   c                 h   t         |           |j                  | _        t	        | j                  |j
                  |j                  |      | _        t        j                  | j                        | _
        |j                  | _        t        |j                     | _        |j                  | _        t        j                   | j                  |j"                        | _        t        j                   |j"                  | j                        | _        t        j                  | j                        | _        y )N)rt   ru   r`   ry   )r7   r8   d_modelrt   rs   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normr`   r   activation_functionactivation_fnactivation_dropoutr   encoder_ffn_dimfc1fc2final_layer_normr9   ry   r:   s     r.   r8   zBlenderbotEncoderLayer.__init__   s    ,nn44,,	
 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r0   r   r^   ra   r   c                     |}| j                  |      } | j                  d||d|\  }}t        j                  j	                  || j                  | j
                        }||z   }|}| j                  |      }| j                  | j                  |            }t        j                  j	                  || j                  | j
                        }| j                  |      }t        j                  j	                  || j                  | j
                        }||z   }|j                  t        j                  k(  rEt        j                  |j                        j                  dz
  }t        j                   || |      }|S )a>  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        )r   r^   rf   i  )minmax )r   r   r   rl   r`   rh   r   r   r   r   r   r@   rB   float16finfor   clamp)r9   r   r^   ra   residual_clamp_values          r.   rF   zBlenderbotEncoderLayer.forward
  sT    !11-@)4>> 
')
 
q
 --mt||VZVcVc-d =0 --m<**488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0%--/++m&9&9:>>EK!KKK<[YMr0   )rI   rJ   rK   r"   r8   rB   rO   r   r   rF   rP   rQ   s   @r.   r   r      sM    =/ =$"||" " +,	"
 
"r0   r   c                        e Zd Zddededz  f fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  d	edz  d
e	dz  de
e   dej                  fdZ xZS )BlenderbotDecoderLayerNry   rz   c           	         t         |           |j                  | _        t	        | j                  |j
                  |j                  dd||      | _        |j                  | _        t        |j                     | _        |j                  | _        t        j                  | j                        | _        t	        | j                  |j
                  |j                  d||      | _        t        j                  | j                        | _        t        j$                  | j                  |j&                        | _        t        j$                  |j&                  | j                        | _        t        j                  | j                        | _        y )NT)rt   ru   r`   rv   rx   ry   rz   )r`   rv   ry   rz   )r7   r8   r   rt   rs   decoder_attention_headsr   r   r`   r   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normr   decoder_ffn_dimr   r   r   )r9   ry   rz   r:   s      r.   r8   zBlenderbotDecoderLayer.__init__1  s    ,nn44,,
 ~~#F$>$>?"(";";$&LL$@!/NN**,,
 (*||DNN'C$99T^^V-C-CD99V33T^^D "T^^ <r0   r   r^   encoder_hidden_statesencoder_attention_maskr   	use_cachera   r   c                    |}| j                  |      } | j                  d|||d|\  }}	t        j                  j	                  || j                  | j
                        }||z   }|h|}| j                  |      } | j                  d||||d|\  }}	t        j                  j	                  || j                  | j
                        }||z   }|}| j                  |      }| j                  | j                  |            }t        j                  j	                  || j                  | j
                        }| j                  |      }t        j                  j	                  || j                  | j
                        }||z   }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            past_key_values (`Cache`): cached past key and value projection states
        )r   r   r^   rf   )r   r   r^   r   r   )r   r   r   rl   r`   rh   r   r   r   r   r   r   r   )
r9   r   r^   r   r   r   r   ra   r   r   s
             r.   rF   zBlenderbotDecoderLayer.forwardP  s   * !11-@ *4>> 
'+)
 	
q --mt||VZVcVc-d =0 !,$H 88GM0t00  +!65 /	 
  M1 MM11-4<<Z^ZgZg1hM$}4M !--m<**488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0r0   r6   )NNNNT)rI   rJ   rK   r"   rM   r8   rB   rO   r	   r   r   r   rF   rP   rQ   s   @r.   r   r   0  s    =/ =C$J =D /3596:(,!%:||: t+:  %||d2	:
 !&t 3: : $;: +,: 
:r0   r   c                   R     e Zd ZU eed<   dZdZdZdZdZ	dZ
 fdZed        Z xZS )BlenderbotPreTrainedModelry   modelTc                     t         |   |       t        |t              r t	        j
                  |j                         y y r6   )r7   _init_weightsr   "BlenderbotForConditionalGenerationinitzeros_final_logits_bias)r9   rZ   r:   s     r.   r   z'BlenderbotPreTrainedModel._init_weights  s3    f%f@AKK001 Br0   c                     | j                   j                  }t        j                  g ddddd|gg| j                        }|j                  |      ||d}|S )N)r      
      r?   r         r?   rA   )r^   r#   decoder_input_ids)ry   r$   rB   tensorrA   ne)r9   	pad_tokenr#   dummy_inputss       r.   r   z&BlenderbotPreTrainedModel.dummy_inputs  sZ    KK,,	LL"2Q2q)4L!MVZVaVab	'll95"!*

 r0   )rI   rJ   rK   r"   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphr   propertyr   rP   rQ   s   @r.   r   r     sE    &*#N!2
  r0   r   c                        e Zd ZdZeedZdef fdZe	e
e	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	ee   d
ef
d                     Z xZS )BlenderbotEncoderz
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`BlenderbotEncoderLayer`].

    Args:
        config: BlenderbotConfig
        embed_tokens (nn.Embedding): output embedding
    )r   
attentionsry   c                    t         |   |       |j                  | _        |j                  | _        |j
                  }|j                  | _        |j                  | _	        |j                  rt        j                  |      nd}t        |j                  || j                  |      | _        t!        |j                  |      | _        t%        j&                  t)        |j*                        D cg c]  }t-        |       c}      | _        t%        j0                  |j
                        | _        d| _        | j7                          y c c}w )NrX   rU   F)r7   r8   r`   encoder_layerdrop	layerdropr   r$   rT   max_position_embeddingsmax_source_positionsscale_embeddingmathsqrtrS   
vocab_sizeembed_tokensr2   embed_positionsr   
ModuleListrangeencoder_layersr   r   r   
layer_normgradient_checkpointing	post_init)r9   ry   rt   rU   r   r:   s        r.   r8   zBlenderbotEncoder.__init__  s     ~~11NN	!..$*$B$B!.4.D.Ddii	*#9y$*:*:
  D** 
 mmUSYShShMi$j%;F%C$jk,,v~~6&+# %ks   -ENr#   r^   inputs_embedsra   r   c                 4   |d u |d uz  rt        d      || j                  |      }|j                         d d }| j                  |      }||z   }t        j
                  j                  || j                  | j                        }t        | j                  ||      }t        | j                        D ]F  \  }}	d}
| j                  r&t        j                  g       }|| j                  k  rd}
|
r= |	||fi |}H | j                  |      }t!        |      S )Nz:You must specify exactly one of input_ids or inputs_embedsr'   rf   )ry   r   r^   FT)last_hidden_state)r+   r   ri   r   r   rl   r`   rh   r   ry   	enumerater   rB   randr   r   r   )r9   r#   r^   r   ra   r   	embed_posr   idxencoder_layerto_dropdropout_probabilitys               r.   rF   zBlenderbotEncoder.forward  s+    -t";<YZZ  --i8M#((*3B/((5	%	1--mt||VZVcVc-d2;;')

 #,DKK"8 	CG}}&+jjn#&7"G -!"! !	  6+
 	
r0   r   )rI   rJ   rK   rL   r   rs   _can_record_outputsr"   r8   r   r    r   rB   
LongTensorrO   FloatTensorr   r   r   rF   rP   rQ   s   @r.   r   r     s     0)
/ 4   .2.226	,
##d*,
 t+,
 ((4/	,

 +,,
 
,
    ,
r0   r   c                   F    e Zd ZdZe eedd       eedd      dZdef fdZ	e
ee	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                   d	z  dej                  d	z  ded	z  dej                   d	z  ded	z  dee   defd                     Z xZS )BlenderbotDecoderz
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BlenderbotDecoderLayer`]

    Args:
        config: BlenderbotConfig
        embed_tokens (nn.Embedding): output embedding
    r!   r   )index
layer_namer   )r   r   cross_attentionsry   c           	         t         |   |       |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  rt        j                  |j                        nd}t        |j                  |j                  | j                  |      | _        t!        |j                  |j                        | _        t%        j&                  t)        |j*                        D cg c]  }t-        ||       c}      | _        t%        j0                  |j                        | _        d| _        | j7                          y c c}w )NrX   r   )rz   F)r7   r8   r`   decoder_layerdropr   r$   rT   r   max_target_positionsr   r   r   r   rS   r   r   r2   r   r   r   r   decoder_layersr   r   r   r   r   r   )r9   ry   rU   ir:   s       r.   r8   zBlenderbotDecoder.__init__  s    ~~11!..$*$B$B!393I3Idii/s9v~~t/?/?[
  D**NN 
 mmBGH]H]B^_Q#Fa8_
 ,,v~~6&+# `s   ?ENr#   r^   r   r   r   r   r   ra   r   c                    |d u |d uz  rt        d      || j                  |      }|rd|b|| j                  j                  r4t	        t        | j                        t        | j                              nt        | j                        }|j                         d d \  }	}
||j                         nd}t        j                  |
|j                        |z   }|1t               s'||
z   }t        j                  |	||j                        }t        |t              r|j                  n|}t        | j                  |||      }t!        | j                  |||      }| j#                  |	|
f||      }||z   }t$        j&                  j)                  || j(                  | j*                  	      }t-        | j.                        D ]\  \  }}| j*                  r%t        j0                  g       }|| j2                  k  r7 ||||f|||d
|}t        |t4              r|d   n|}^ | j7                  |      }t9        ||      S )NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time)ry   r'   r   r   )ry   r   r^   r   )ry   r   r^   r   )r=   rf   )r   r   r   )r   r   )r+   r   ry   is_encoder_decoderr   r
   ri   get_seq_lengthrB   rC   rA   r   onesr   r   r   r   r   r   rl   r`   rh   r   r   r  r   r   r   r   )r9   r#   r^   r   r   r   r   r   ra   
batch_size
seq_lengthr<   r=   mask_seq_lengthself_attn_cachecausal_maskr   r  decoder_layerr  layer_outputss                        r.   rF   zBlenderbotDecoder.forward+  s_    -t";<stt  --i8M 0 )48V8V $L$DlZ^ZeZeFfg!5  "/!3!3!5cr!:
JETE`!?!?!Afg||J}7K7KLOee!*B*D4zAO"ZZ
OML`L`aN /+>? 00  	 );;')+	
 ";;;'1"7	"
 ++$&<< , 
 &4--mt||VZVcVc-d"+DKK"8 	dC}}&+jjn#&7)% (> /# M 1;=%0PM!,VcM!	d& 68++
 	
r0   )NNNNNNN)rI   rJ   rK   rL   r   r   rs   r  r"   r8   r   r    r   rB   r  rO   r	  r	   r   r   r   r   rF   rP   rQ   s   @r.   r  r    s    0$%8kZ*+>aTbc/ 2   .2.2:>:>(,26!%U
##d*U
 t+U
  %0047	U

 !& 0 04 7U
 U
 ((4/U
 $;U
 +,U
 
3U
    U
r0   r  c                   J    e Zd ZdddZdef fdZd Zd Zee		 	 	 	 	 	 	 	 	 dde
j                  dz  d	e
j                  dz  d
e
j                  dz  de
j                  dz  dedz  dedz  de
j                  dz  de
j                  dz  dedz  dee   defd              Z xZS )BlenderbotModelzshared.weight)zencoder.embed_tokens.weightzdecoder.embed_tokens.weightry   c                 J   t         |   |       |j                  |j                  }}|j                  rt        j                  |j                        nd}t        ||j                  ||      | _	        t        |      | _        t        |      | _        | j                          y )NrX   r   )r7   r8   r$   r   r   r   r   r   rS   sharedr   encoderr  decoderr   )r9   ry   rT   r   rU   r:   s        r.   r8   zBlenderbotModel.__init__  s}     "("5"5v7H7HZ393I3Idii/s3JP[itu(0(0 	r0   c                     | j                   S r6   )r"  r9   s    r.   get_input_embeddingsz$BlenderbotModel.get_input_embeddings  s    {{r0   c                 ~    || _         | j                   | j                  _        | j                   | j                  _        y r6   )r"  r#  r   r$  r9   r]   s     r.   set_input_embeddingsz$BlenderbotModel.set_input_embeddings  s)    $(KK!$(KK!r0   Nr#   r^   r   decoder_attention_maskencoder_outputsr   r   decoder_inputs_embedsr   ra   r   c
                    | | j                   d	|||d|
}nGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      } | j                  d	|||d   ||||	d|
}t        |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                        S )
a|  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Blenderbot uses the `bos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BlenderbotModel

        >>> model = BlenderbotModel.from_pretrained("facebook/blenderbot-400M-distill")
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")

        >>> inputs = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt")
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=inputs.input_ids, decoder_input_ids=decoder_input_ids)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 6, 1280]
        ```N)r#   r^   r   r   r!   r?   )r   r   r   r#   r^   r   r   r   r   r   )r   r   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater   encoder_attentionsr   )r#  r   r   lenr$  r   r   r   r   r   r  )r9   r#   r^   r   r+  r,  r   r   r-  r   ra   decoder_outputss               r.   rF   zBlenderbotModel.forward  s   ` "/;t|| 0#-+0 	0O O_=-"1!"4474H14Loa0RV14_1E1I?1-tO FRT\\ 	F
'1"1!"4#1+/	F
 	F
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r0   )	NNNNNNNNN)rI   rJ   rK   _tied_weights_keysr"   r8   r'  r*  r   r   rB   r  rO   r   r	   r	  r   r   r   r   rF   rP   rQ   s   @r.   r   r     s/    (7'6

/ 
0
  .2.259:>26(,26:>!%P
##d*P
 t+P
 !++d2	P

 !& 0 04 7P
 )4/P
 P
 ((4/P
  %0047P
 $;P
 +,P
 
P
  P
r0   r   z\
    The Blenderbot Model with a language modeling head. Can be used for summarization.
    )custom_introc                       e Zd ZdZdgZddiZdef fdZ	 dded	edz  d
e	de
j                  f fdZdeddfdZee	 	 	 	 	 	 	 	 	 	 ddej"                  dz  dej$                  dz  dej"                  dz  dej"                  dz  dedz  dedz  dej*                  dz  dej*                  dz  dej"                  dz  de	dz  dee   defd              Z xZS )r   r   r   lm_head.weightzmodel.shared.weightry   c                 x   t         |   |       t        |      | _        | j	                  dt        j                  d| j                  j                  j                  f             t        j                  |j                  | j                  j                  j                  d      | _        | j                          y )Nr   r!   Fr|   )r7   r8   r   r   register_bufferrB   zerosr"  r3   r   r   r   lm_headr   r   s     r.   r8   z+BlenderbotForConditionalGeneration.__init__  s     $V,
0%++q$**BSBSBbBb>c2deyy1B1B1Q1QX]^ 	r0   Nnew_num_tokenspad_to_multiple_ofmean_resizingr   c                 z    t         |   |||      }| j                  |j                  j                  d          |S )Nr   )r7   resize_token_embeddings_resize_final_logits_biasrE   r)   )r9   r>  r?  r@  new_embeddingsr:   s        r.   rB  z:BlenderbotForConditionalGeneration.resize_token_embeddings  s?     8I[]jk&&~'<'<'B'B1'EFr0   c                 6   | j                   j                  d   }||k  r| j                   d d d |f   }nSt        j                  d||z
  f| j                   j                        }t        j
                  | j                   |gd      }| j                  d|       y )Nr'   r!   r   rd   r   )r   r)   rB   r<  rA   catr;  )r9   r>  old_num_tokensnew_bias
extra_biass        r.   rC  z<BlenderbotForConditionalGeneration._resize_final_logits_bias  s    //55b9^+--a..@AHa.)H%IRVRhRhRoRopJyy$"8"8*!E1MH0(;r0   r#   r^   r   r+  r,  r   r   r-  labelsr   ra   c                    |	R|
rt         j                  d       d}
|7|5t        |	| j                  j                  | j                  j
                        } | j                  |f||||||||
d|}| j                  |d         }|| j                  j                  |j                        z   }d}|	a|	j                  |j                        }	t               } ||j                  d| j                  j                        |	j                  d            }t        |||j                  |j                   |j"                  |j$                  |j&                  |j(                  |j*                  	      S )a4  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Blenderbot uses the `bos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example conversation:

        ```python
        >>> from transformers import AutoTokenizer, BlenderbotForConditionalGeneration

        >>> mname = "facebook/blenderbot-400M-distill"
        >>> model = BlenderbotForConditionalGeneration.from_pretrained(mname)
        >>> tokenizer = AutoTokenizer.from_pretrained(mname)
        >>> UTTERANCE = "My friends are cool but they eat too many carbs."
        >>> print("Human: ", UTTERANCE)
        Human:  My friends are cool but they eat too many carbs.

        >>> inputs = tokenizer([UTTERANCE], return_tensors="pt")
        >>> reply_ids = model.generate(**inputs)
        >>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0])
        Bot: That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?

        >>> REPLY = "I'm not sure"
        >>> print("Human: ", REPLY)
        Human: I'm not sure

        >>> NEXT_UTTERANCE = (
        ...     "My friends are cool but they eat too many carbs.</s> <s>That's unfortunate. "
        ...     "Are they trying to lose weight or are they just trying to be healthier?</s> "
        ...     "<s> I'm not sure."
        ... )
        >>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors="pt")
        >>> next_reply_ids = model.generate(**inputs)
        >>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0])
        Bot:   I see. Well, it's good that they're trying to change their eating habits.
        ```
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)r^   r   r,  r+  r   r   r-  r   r   r'   )	losslogitsr   r0  r1  r  r2  r   r3  )r~   warningr/   ry   r$   r%   r   r=  r   torA   r   r   r   r   r   r0  r1  r  r2  r   r3  )r9   r#   r^   r   r+  r,  r   r   r-  rJ  r   ra   outputs	lm_logitsmasked_lm_lossloss_fcts                   r.   rF   z*BlenderbotForConditionalGeneration.forward  sj   H klI (-B-J$6DKK44dkk6X6X%! '1djj'
)/+#9+'"7'
 '
 LL,	 6 6 9 9):J:J KK	YYy//0F')H%innR9O9O&PRXR]R]^`RabN#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r0   )NT)
NNNNNNNNNN)rI   rJ   rK   r   _keys_to_ignore_on_load_missingr6  r"   r8   rM   r   r   	EmbeddingrB  rC  r   r   rB   r  rO   r   r	   r	  r   r   r   rF   rP   rQ   s   @r.   r   r     s     ':&;#//  ae!7:TzY]	< < <  .2.259:>26(,26:>*.!%k
##d*k
 t+k
 !++d2	k

 !& 0 04 7k
 )4/k
 k
 ((4/k
  %0047k
   4'k
 $;k
 +,k
 
k
  k
r0   r   c                   (     e Zd ZdZ fdZd Z xZS )BlenderbotDecoderWrapperz
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    c                 d    t         |   |       t        |      | _        | j	                          y r6   )r7   r8   r  r$  r   r   s     r.   r8   z!BlenderbotDecoderWrapper.__init__  s&     (0r0   c                 &     | j                   |i |S r6   )r$  )r9   argsra   s      r.   rF   z BlenderbotDecoderWrapper.forward  s    t||T,V,,r0   )rI   rJ   rK   rL   r8   rF   rP   rQ   s   @r.   rW  rW    s    

-r0   rW  c                   \    e Zd ZddiZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  dedz  de	j                  dz  de	j                  dz  dedz  dee	j                  z  dee   deez  fd              Z xZS )BlenderbotForCausalLMr9  z!model.decoder.embed_tokens.weightc                     d|_         d|_        t        |   |       t	        |      | _        t        j                  |j                  |j                  d      | _
        | j                          y )NTFr|   )rv   r  r7   r8   rW  r   r   r   hidden_sizer   r=  r   r   s     r.   r8   zBlenderbotForCausalLM.__init__  sX     $)! -f5
yy!3!3V5F5FUS 	r0   c                 B    | j                   j                  j                  S r6   r   r$  r   r&  s    r.   r'  z*BlenderbotForCausalLM.get_input_embeddings  s    zz!!...r0   c                 :    || j                   j                  _        y r6   r`  r)  s     r.   r*  z*BlenderbotForCausalLM.set_input_embeddings  s    */

'r0   Nr#   r^   r   r   r   r   rJ  r   logits_to_keepra   r   c
                     | j                   j                  d|||||||d|
}|d   }t        |	t              rt	        |	 d      n|	}| j                  |dd|ddf         }d}|a|j                  |j                        }t               } ||j                  d| j                  j                        |j                  d            }t        |||j                  |j                  |j                  |j                         S )ah  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BlenderbotForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
        >>> model = BlenderbotForCausalLM.from_pretrained("facebook/blenderbot-400M-distill")
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
        >>> list(logits.shape) == expected_shape
        True
        ```r/  r   Nr'   )rL  rM  r   r   r   r  r   )r   r$  r   rM   slicer=  rO  rA   r   r   ry   r   r   r   r   r   r  )r9   r#   r^   r   r   r   r   rJ  r   rb  ra   rP  r   slice_indicesrM  rL  rS  s                    r.   rF   zBlenderbotForCausalLM.forward  s   L >PTZZ=O=O 	>
)"7#9+'	>
 	>
  
8B>SV8W~ot4]kmA}a,?@AYYv}}-F')HFKKDKK,B,BCV[[QS_UD0#33!//))$55
 	
r0   )	NNNNNNNNr   )rI   rJ   rK   r6  r8   r'  r*  r   r   rB   r  rO   r	  r	   r   rM   r   r   r   r   rF   rP   rQ   s   @r.   r\  r\    s/   =	/0  .2.2:>;?(,26*.!%-.A
##d*A
 t+A
  %0047	A

 !& 1 1D 8A
 A
 ((4/A
   4'A
 $;A
 ell*A
 +,A
 
2	2A
  A
r0   r\  )r\  r   r   r   )Nr   )IrL   r   collections.abcr   rB   r   torch.nnr    r   r   activationsr   cache_utilsr	   r
   r   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   utils.output_capturingr   r    configuration_blenderbotr"   
get_loggerrI   r~   rO   rM   r/   rU  r2   rS   ModulerY   rq   rs   r   r   r   r   r  r   r   rW  r\  __all__r   r0   r.   <module>ry     s       $   % & ! C C ) J B 9  G & l l 7 E 6 
		H	%%,, c [^  -2<< -*
=BLL 
=( !%II%<<% 
% <<	%
 LL4'% T\% % '(%:r)")) r)l57 5rZ7 Zz   4X
1 X
v@
1 @
F l
/ l
 l
^ 
M
)BO M

M
b-8 - Y
5 Y
xr0   