
    ipu                        d Z ddlmZ ddlZddlmZ ddlmZmZmZ ddl	m
Z
 ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z*  e$jV                  e,      Z- G d dej\                        Z/	 d2dej`                  dejb                  dejb                  dejb                  dejb                  dz  de2de2fdZ3 G d d ej`                        Z4 G d! d"e      Z5e" G d# d$e             Z6 G d% d&e6      Z7e" G d' d(e6             Z8 G d) d*e6e      Z9 e"d+,       G d- d.e6             Z:e" G d/ d0e6             Z;g d1Z<y)3zPyTorch OPT model.    )CallableN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPastQuestionAnsweringModelOutput SequenceClassifierOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)capture_outputs   )	OPTConfigc                   x     e Zd ZdZdedef fdZ	 	 d
dej                  dedej                  dz  f fd	Z xZ	S )OPTLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                 N    d| _         t        | 	  || j                   z   |       y N   )offsetsuper__init__)selfr!   r"   	__class__s      u/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/opt/modeling_opt.pyr(   z&OPTLearnedPositionalEmbedding.__init__2   s$     $++5}E    Nattention_maskpast_key_values_lengthposition_idsc                     |8t        j                  |d      }||z  dz
  j                         }|dd|df   }t        |   || j
                  z         S )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr   dim)torchcumsumlongr'   forwardr&   )r)   r-   r.   r/   r*   s       r+   r6   z%OPTLearnedPositionalEmbedding.forward8   s^      <<A>L(>9A=CCEL'+A+B(BCLw|dkk9::r,   )r   N)
__name__
__module____qualname____doc__intr(   r3   
LongTensorr6   __classcell__r*   s   @r+   r    r    -   s]    Fs F3 F '(04	;((; !$; &&-	; ;r,   r    modulequerykeyvaluer-   scalingdropoutc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )N)r2   dtypeptrainingr   r%   )r3   matmul	transposer   
functionalsoftmaxfloat32torH   rD   rK   
contiguous)
r?   r@   rA   rB   r-   rC   rD   kwargsattn_weightsattn_outputs
             r+   eager_attention_forwardrV   J   s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r,   c                        e Zd ZdZ	 ddededz  f fdZ	 	 	 ddej                  de	dz  dej                  dz  d	e
d
eej                  ej                  dz  e	dz  f   f
dZ xZS )OPTAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _	        || _
        |-t        j                  d| j                  j                   d       | j                  | j                  z  | _        d| _        | j                  | j                  z  | j                  k7  r&t#        d| j                   d| j                   d      | j                  dz  | _        t'        j(                  | j                  | j                  | j                        | _        t'        j(                  | j                  | j                  | j                        | _        t'        j(                  | j                  | j                  | j                        | _        t'        j(                  | j                  | j                  | j                        | _        y )	NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩bias)r'   r(   rY   hidden_size	embed_dimnum_attention_heads	num_headsattention_dropoutrD   enable_biasrZ   loggerwarning_oncer*   r7   head_dim	is_causal
ValueErrorrC   r   Lineark_projv_projq_projout_proj)r)   rY   rZ   rS   r*   s       r+   r(   zOPTAttention.__init__d   s    	++33//!--" !8!8 9 :, , $..8MMDNN*t~~=MdnnM]$T^^$4B8  }}d*iiTEUEUViiTEUEUViiTEUEUV		$..$..tGWGWXr,   hidden_statespast_key_valuesr-   output_attentionsreturnc                 \   |j                         \  }}}| j                  |      | j                  z  }	|	j                  |d| j                  | j
                        j                  dd      }	| j                  |      }
| j                  |      }|
j                  |d| j                  | j
                        j                  dd      }
|j                  |d| j                  | j
                        j                  dd      }| |j                  |
|| j                        \  }
}t        j                  | j                  j                  t              } || |	|
||f| j                   sdn| j"                  dd|\  }}|j%                  ||d      j'                         }| j)                  |      }||fS )z#Input shape: Batch x Time x ChannelrF   r   r%           g      ?)rD   rC   )sizerl   rC   viewra   rf   rM   rj   rk   updaterZ   r   get_interfacerY   _attn_implementationrV   rK   rD   reshaperR   rm   )r)   rn   ro   r-   rp   rS   bsztgt_len_query_states
key_statesvalue_statesattention_interfacerU   rT   s                  r+   r6   zOPTAttention.forward   s    (,,.Wa {{=1DLL@#((b$..$--PZZ[\^_`[[/
{{=1__S"dnndmmLVVWXZ[\
#((b$..$--PZZ[\^_`&'6'='=j,X\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$,,	%
 	%
!\ "))#w;FFHmmK0L((r,   N)NNF)r7   r8   r9   r:   r   r;   r(   r3   Tensorr
   booltupler6   r=   r>   s   @r+   rX   rX   a   s    G
 !%!Y!Y :!YL )-.2"'.)||.) .) t+	.)
  .) 
u||U\\D0%$,>	?.)r,   rX   c                        e Zd Zddededz  f fdZ	 	 	 	 ddej                  dej                  dz  dedz  de	dz  d	ej                  dz  d
ee   dej                  fdZ xZS )OPTDecoderLayerNrY   rZ   c                    t         |           |j                  | _        t	        ||      | _        |j                  | _        |j                  | _        t        |j                     | _
        t        j                  | j                  |j                        | _        t        j                  | j                  |j                   |j"                        | _        t        j                  |j                   | j                  |j"                        | _        t        j                  | j                  |j                        | _        y )N)rY   rZ   elementwise_affiner\   )r'   r(   r^   r_   rX   	self_attndo_layer_norm_beforerD   r	   activation_functionactivation_fnr   	LayerNormlayer_norm_elementwise_affineself_attn_layer_normri   ffn_dimrc   fc1fc2final_layer_norm)r)   rY   rZ   r*   s      r+   r(   zOPTDecoderLayer.__init__   s    ++%VyI$*$?$?!~~#F$>$>?$&LLNNv/S/S%
! 99T^^V^^&BTBTU99V^^T^^&BTBTU "T^^PVPtPt ur,   rn   r-   ro   	use_cacher/   rS   rq   c                    |}| j                   r| j                  |      } | j                  d||||d|\  }}t        j                  j                  || j
                  | j                        }||z   }| j                   s| j                  |      }|j                  }	|j                  d|j                  d            }|}| j                   r| j                  |      }| j                  |      }| j                  |      }| j                  |      }t        j                  j                  || j
                  | j                        }||z   j                  |	      }| j                   s| j                  |      }|S )N)rn   ro   r/   r-   rI   rF    )r   r   r   r   rN   rD   rK   shapery   rt   r   r   r   r   ru   )
r)   rn   r-   ro   r   r/   rS   residualr|   hidden_states_shapes
             r+   r6   zOPTDecoderLayer.forward   sw    ! $$ 55mDM *4>> 
'+%)	

 
q --mt||VZVcVc-d =0 (( 55mDM ,11%--b-2D2DR2HI  $$ 11-@M/**=9/--mt||VZVcVc-d!M1778KL (( 11-@Mr,   r   )NNFN)r7   r8   r9   r   r;   r(   r3   r   r
   r   r<   r   r   r6   r=   r>   s   @r+   r   r      s    vy vS4Z v( /3(,!&043||3 t+3 	3
 $;3 &&-3 -.3 
3r,   r   c                   D    e Zd ZU eed<   dZdZdgZdZdZ	dZ
dZdZeedZy)OPTPreTrainedModelrY   modelTr   )rn   
attentionsN)r7   r8   r9   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphr   rX   _can_record_outputsr   r,   r+   r   r      sH    &*#*+"&N!("r,   r   c                        e Zd ZdZdef fdZeee	 	 	 	 	 	 dde	j                  dz  de	j                  dz  dedz  de	j                  dz  d	edz  d
e	j                  dz  dee   defd                     Z xZS )
OPTDecoderz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]

    Args:
        config: OPTConfig
    rY   c           	      8   t         |   |       |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        j                  |j                  |j                  | j
                        | _        t        |j                  |j                        | _        |j                  |j                  k7  r2t        j                   |j                  |j                  d      | _        nd | _        |j                  |j                  k7  r2t        j                   |j                  |j                  d      | _        nd | _        |j&                  r=|j(                  s1t        j*                  |j                  |j,                        | _        nd | _        t        j0                  t3        |j4                        D cg c]  }t7        ||       c}      | _        d| _        | j=                          y c c}w )NFr\   r   )rZ   )r'   r(   rD   	layerdroppad_token_idpadding_idxmax_position_embeddingsmax_target_positions
vocab_sizer   	Embeddingword_embed_proj_dimembed_tokensr    r^   embed_positionsri   project_out
project_inr   _remove_final_layer_normr   r   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing	post_init)r)   rY   ir*   s      r+   r(   zOPTDecoder.__init__  s    ~~))!..$*$B$B! ++LL):):F<V<VX\XhXhi<V=[=[]c]o]op%%););;!yy););V=W=W^cdD#D%%););; ii(B(BFDVDV]bcDO"DO
 &&v/N/N$&LL""v7[7[%D! %)D!mmSXY_YqYqSr$sa_Vq%I$st&+#	 %ts   HN	input_idsr-   ro   inputs_embedsr   r/   rS   rq   c           	         |d u |d uz  rt        d      ||j                  d|j                  d         }|| j                  |      }|r|t	        | j
                        }||j                         nd}|A||j                  d   z   }	t        j                  |j                  d   |	|j                        }|8t        j                  |d      }||z  dz
  j                         }|d d |d f   }t        | j
                  |||      }
| j                  |||	      }| j                  | j                  |      }||j                  |j                        z   }t!        | j"                        D ]D  \  }}| j$                  r%t        j&                  g       }|| j(                  k  r7 ||f|
|||d
|}F | j*                  | j+                  |      }| j,                  | j-                  |      }t/        ||      S )Nz:You must specify exactly one of input_ids or inputs_embedsrF   )rY   r   r   devicer1   )rY   r   r-   ro   )r/   )r-   r/   ro   r   )last_hidden_statero   )rh   ru   r   r   r   rY   get_seq_lengthr3   onesr   r4   r5   r   r   r   rQ   	enumerater   rK   randr   r   r   r   )r)   r   r-   ro   r   r   r/   rS   past_seen_tokens
seq_lengthcausal_mask
pos_embedsrn   idxdecoder_layerdropout_probabilitys                   r+   r6   zOPTDecoder.forward>  s$    -t";<YZZ !r9??2+>?I  --i8M0*$++>O?N?Z?99;`a!)M,?,?,BBJ"ZZ(;(;A(>
S`SgSghN  <<A>L(>9A=CCEL'+;+<(<=L(;;')+	
 )).:JYe)f
??& OOM:M%
m6J6J(KK #,DKK"8 	C}}&+jjn#&7)*) /# M	    , 11-@M' ,,];M&++
 	
r,   NNNNNN)r7   r8   r9   r:   r   r(   r   r   r   r3   r<   r   r
   FloatTensorr   r   r   r   r6   r=   r>   s   @r+   r   r     s    #y #J   .2.2(,26!%04K
##d*K
 t+K
 	K

 ((4/K
 $;K
 &&-K
 +,K
 
!K
    K
r,   r   c                        e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 dde	j                  dz  de	j                  dz  dedz  d	e	j                  dz  d
edz  de	j                  dz  dee   defd              Z xZS )OPTModelrY   c                 d    t         |   |       t        |      | _        | j	                          y r   )r'   r(   r   decoderr   r)   rY   r*   s     r+   r(   zOPTModel.__init__  s&     !&)r,   c                 .    | j                   j                  S r   r   r   r)   s    r+   get_input_embeddingszOPTModel.get_input_embeddings  s    ||(((r,   c                 &    || j                   _        y r   r   r)   rB   s     r+   set_input_embeddingszOPTModel.set_input_embeddings  s    $)!r,   Nr   r-   ro   r   r   r/   rS   rq   c           
           | j                   d||||||d|}t        |j                  |j                  |j                  |j
                        S )Nr   r-   r/   ro   r   r   )r   ro   rn   r   r   )r   r   r   ro   rn   r   )	r)   r   r-   ro   r   r   r/   rS   decoder_outputss	            r+   r6   zOPTModel.forward  si     4@4<< 4
)%+'4
 4
 '-??+;;)77&11	
 	
r,   r   )r7   r8   r9   r   r(   r   r   r   r   r3   r<   r   r
   r   r   r   r   r   r6   r=   r>   s   @r+   r   r     s    y )*  .2.2(,26!%04
##d*
 t+
 	

 ((4/
 $;
 &&-
 +,
 
!
  
r,   r   c                   <    e Zd ZddiZ fdZd Zd Zee	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  d	edz  d
e	j                  dz  de	j                  dz  dedz  de	j                  dz  dee	j                  z  dee   deez  fd              Z xZS )OPTForCausalLMzlm_head.weightz!model.decoder.embed_tokens.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y NFr\   )
r'   r(   r   r   r   ri   r   r   lm_headr   r   s     r+   r(   zOPTForCausalLM.__init__  sK     f%
 yy!;!;V=N=NUZ[ 	r,   c                 B    | j                   j                  j                  S r   r   r   r   r   s    r+   r   z#OPTForCausalLM.get_input_embeddings      zz!!...r,   c                 :    || j                   j                  _        y r   r   r   s     r+   r   z#OPTForCausalLM.set_input_embeddings      */

'r,   Nr   r-   ro   r   labelsr   r/   logits_to_keeprS   rq   c	           
          | j                   j                  d||||||d|	}
|
j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         j                         }d}|* | j                  d||| j                  j                  d|	}t        |||
j                  |
j                  |
j                        S )an  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForCausalLM

        >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious. I'm just a little bit of a weirdo."
        ```r   N)logitsr   r   lossr   ro   rn   r   r   )r   r   r   
isinstancer;   slicer   rR   loss_functionrY   r   r   ro   rn   r   )r)   r   r-   ro   r   r   r   r/   r   rS   outputsrn   slice_indicesr   r   s                  r+   r6   zOPTForCausalLM.forward  s    J ,>4::+=+= ,
)%+',
 ,
  118B>SV8W~ot4]kmA}a,?@ALLN%4%%pVFt{{OeOepiopD%#33!//))
 	
r,   )NNNNNNNr   )r7   r8   r9   _tied_weights_keysr(   r   r   r   r   r3   r<   r   r
   r   r   r;   r   r   r   r   r6   r=   r>   s   @r+   r   r     s   *,OP/0  .2.2(,26*.!%04-.<
##d*<
 t+<
 	<

 ((4/<
   4'<
 $;<
 &&-<
 ell*<
 +,<
 
'	'<
  <
r,   r   a  
    The OPT Model transformer with a sequence classification head on top (linear layer).

    [`OPTForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                       e Zd Zdef fdZee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  de
dz  dej                  dz  dej                  dz  d	edz  d
ej                  dz  dee   deez  fd              Zd Zd Z xZS )OPTForSequenceClassificationrY   c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  | j                  d      | _        | j                          y r   )
r'   r(   
num_labelsr   r   r   ri   r   scorer   r   s     r+   r(   z%OPTForSequenceClassification.__init__  sT      ++f%
YYv994??QVW
 	r,   Nr   r-   ro   r   r   r   r/   rS   rq   c           	          | j                   |f|||||d|}	|	j                  }
| j                  |
      }||j                  dd \  }}n|j                  dd \  }}| j                  j
                  |dk7  rt        d      | j                  j
                  d}n||| j                  j
                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                  j                    d       |t        j                  ||j                  	      |f   }d}|| j                  j"                  | j$                  dk(  rd
| j                  _        nl| j$                  dkD  rL|j&                  t        j(                  k(  s|j&                  t        j*                  k(  rd| j                  _        nd| j                  _        | j                  j"                  d
k(  rIt-               }| j$                  dk(  r& ||j/                         |j/                               }n |||      }n| j                  j"                  dk(  r=t1               } ||j3                  d| j$                        |j3                  d            }n,| j                  j"                  dk(  rt5               } |||      }t7        |||	j8                  |	j:                  |	j<                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        ro   r-   r/   r   r   Nr%   r   z=Cannot handle batch sizes > 1 if no padding token is defined.rF   )r   rH   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classificationr   )r   r   r   r   rY   r   rh   rQ   r   r3   int32arangeargmaxrd   re   r*   r7   problem_typer   rH   r5   r;   r   squeezer   ru   r   r   ro   rn   r   )r)   r   r-   ro   r   r   r   r/   rS   transformer_outputsrn   r   
batch_sizesequence_lengthlast_non_pad_tokennon_pad_masktoken_indicespooled_logitsr   loss_fcts                       r+   r6   z$OPTForSequenceClassification.forward'  s   & 8Btzz8
+)%'8
 8
 ,==M* *3//"1*='J*7*=*=bq*A'J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6/ /??-;;*55
 	
r,   c                 B    | j                   j                  j                  S r   r   r   s    r+   r   z1OPTForSequenceClassification.get_input_embeddings|  r   r,   c                 :    || j                   j                  _        y r   r   r   s     r+   r   z1OPTForSequenceClassification.set_input_embeddings  r   r,   )NNNNNNN)r7   r8   r9   r   r(   r   r   r3   r<   r   r
   r   r   r   r   r   r6   r   r   r=   r>   s   @r+   r   r     s    y   .237(,26*.!%04Q
##d*Q
 ))D0Q
 	Q

 ((4/Q
   4'Q
 $;Q
 &&-Q
 +,Q
 
1	1Q
  Q
f/0r,   r   c                   :    e Zd Zdef fdZee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  de
dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dej                  dz  dee   deez  fd              Zd Zd Z xZS )OPTForQuestionAnsweringrY   c                     t         |   |       t        |      | _        t	        j
                  |j                  d      | _        | j                          y r$   )	r'   r(   r   r   r   ri   r   
qa_outputsr   r   s     r+   r(   z OPTForQuestionAnswering.__init__  s@     f%
))F$>$>B 	r,   Nr   r-   ro   r   start_positionsend_positionsr   r/   rS   rq   c	           	      8    | j                   |f|||||d|	}
|
j                  }| j                  |      }|j                  dd      \  }}|j	                  d      j                         }|j	                  d      j                         }d}||t        |j                               dkD  r|j	                  d      }t        |j                               dkD  r|j	                  d      }|j                  d      }|j                  d|      j                  |j                        }|j                  d|      j                  |j                        }t        |      } |||      } |||      }||z   dz  }t        ||||
j                  |
j                  	      S )
a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForQuestionAnswering
        >>> import torch

        >>> torch.manual_seed(4)  # doctest: +IGNORE_RESULT
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> # note: we are loading a OPTForQuestionAnswering from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> model = OPTForQuestionAnswering.from_pretrained("facebook/opt-350m")

        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

        >>> inputs = tokenizer(question, text, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> answer_offset = len(tokenizer(question)[0])

        >>> predict_answer_tokens = inputs.input_ids[
        ...     0, answer_offset + answer_start_index : answer_offset + answer_end_index + 1
        ... ]
        >>> predicted = tokenizer.decode(predict_answer_tokens)
        >>> predicted
        ' a nice puppet'
        ```r   r   rF   r1   Nr   )ignore_indexr%   )r   start_logits
end_logitsrn   r   )r   r   r  splitr  rR   lenrt   clamprQ   r   r   r   rn   r   )r)   r   r-   ro   r   r  r  r   r/   rS   r  rn   r   r  r  
total_lossignored_indexr  
start_lossend_losss                       r+   r6   zOPTForQuestionAnswering.forward  s   \ 8Btzz8
+)%'8
 8
 ,==/#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EHHWO)//=ADDV]]SM']CH!,@J
M:H$x/14J+%!-;;*55
 	
r,   c                 B    | j                   j                  j                  S r   r   r   s    r+   r   z,OPTForQuestionAnswering.get_input_embeddings  r   r,   c                 :    || j                   j                  _        y r   r   r   s     r+   r   z,OPTForQuestionAnswering.set_input_embeddings  r   r,   )NNNNNNNN)r7   r8   r9   r   r(   r   r   r3   r<   r   r
   r   r   r   r   r   r6   r   r   r=   r>   s   @r+   r  r    s   y   .237(,263715!%04Q
##d*Q
 ))D0Q
 	Q

 ((4/Q
 ))D0Q
 ''$.Q
 $;Q
 &&-Q
 +,Q
 
-	-Q
  Q
f/0r,   r  )r   r   r   r   r  )rs   )=r:   collections.abcr   r3   r   torch.nnr   r   r   activationsr	   cache_utilsr
   r   
generationr   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   configuration_optr   
get_loggerr7   rd   r   r    Moduler   floatrV   rX   r   r   r   r   r   r   r  __all__r   r,   r+   <module>r8     s    $   A A ! . ) / B 9  G & R R 7 5 ( 
		H	%;BLL ;H %II%<<% 
% <<	%
 LL4'% % %.T)299 T)nE0 EP    {
# {
| (
! (
 (
VQ
' Q
h c0#5 c0c0L b00 b0 b0Jr,   