
    iM                        d Z ddlZddlZddlmZ ddlmZmZmZ ddlm	Z	 ddl
mZmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z*  G d de(      Z+ G d de&      Z, G d de$      Z- G d de%      Z.e G d de             Z/e G d de/             Z0 ed !       G d" d#e/e             Z1e G d$ d%e/             Z2 ed&!       G d' d(e/             Z3g d)Z4y)*zPyTorch BioGPT model.    N)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions SequenceClassifierOutputWithPastTokenClassifierOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogger)merge_with_config_defaults)capture_outputs   )BartAttentionBartDecoderLayerBartScaledWordEmbedding)OPTLearnedPositionalEmbedding   )BioGptConfigc                   `     e Zd Z	 	 ddej                  dedej                  dz  f fdZ xZS ) BioGptLearnedPositionalEmbeddingNattention_maskpast_key_values_lengthposition_idsc                 &    t         |   |||      S )z3`input_ids_shape` is expected to be [bsz x seqlen].)superforward)selfr!   r"   r#   	__class__s       z/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/biogpt/modular_biogpt.pyr&   z(BioGptLearnedPositionalEmbedding.forward4   s     w~/E|TT    )r   N)__name__
__module____qualname__torch
LongTensorintr&   __classcell__r(   s   @r)   r    r    3   sG     '(04	U((U !$U &&-	U Ur*   r    c                       e Zd Zy)BioGptScaledWordEmbeddingNr+   r,   r-    r*   r)   r4   r4   >       r*   r4   c                       e Zd Zy)BioGptAttentionNr5   r6   r*   r)   r9   r9   B   r7   r*   r9   c                        e Zd Zddededz  f fdZ	 	 	 	 ddej                  dej                  dz  dedz  de	dz  d	ej                  dz  d
ee   dej                  fdZ xZS )BioGptDecoderLayerNconfig	layer_idxc           	         t         |   |       |j                  | _        t	        | j                  |j
                  |j                  dd||      | _        |j                  | _	        t        |j                     | _        t        j                  | j                  |j                        | _        t        j                  |j                  | j                        | _        | `| `y )NT)	embed_dim	num_headsdropout
is_decoder	is_causalr<   r=   )r%   __init__hidden_sizer?   r9   num_attention_headsattention_probs_dropout_prob	self_attnhidden_dropout_probrA   r   
hidden_actactivation_fnnnLinearintermediate_sizefc1fc2encoder_attnencoder_attn_layer_norm)r'   r<   r=   r(   s      r)   rD   zBioGptDecoderLayer.__init__G   s     ++(nn0077
 11#F$5$5699T^^V-E-EF99V55t~~F(r*   hidden_statesr!   past_key_values	use_cacher#   kwargsreturnc                 D   |}| j                  |      } | j                  d||||d|\  }}t        j                  j	                  || j                  | j
                        }||z   }|}| j                  |      }| j                  |      }| j                  |      }t        j                  j	                  || j                  | j
                        }| j                  |      }t        j                  j	                  || j                  | j
                        }||z   }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            past_key_values (`Cache`): cached past key and value projection states
        )rS   rT   r!   r#   ptrainingr6   )self_attn_layer_normrH   rL   
functionalrA   r[   final_layer_normrO   rK   activation_dropoutrP   )	r'   rS   r!   rT   rU   r#   rV   residual_s	            r)   r&   zBioGptDecoderLayer.forward]   s     !11-@ *4>> 
'+)%	

 
q --mt||VZVcVc-d =0 !--m</**=9--mt?V?Vaeanan-o/--mt||VZVcVc-d =0r*   N)NNTN)r+   r,   r-   r   r0   rD   r.   Tensorr   boolr/   r   r   r&   r1   r2   s   @r)   r;   r;   F   s    )| )d
 )2 /3(,!%04)||) t+) 	)
 $;) &&-) +,) 
)r*   r;   c                   :    e Zd ZU eed<   dZdZdZdZdZ	dZ
eedZy)BioGptPreTrainedModelr<   biogptT)rS   
attentionsN)r+   r,   r-   r   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphr;   r9   _can_record_outputsr6   r*   r)   rf   rf      s9     &*#N!+%r*   rf   c                        e Zd Zdef fdZeee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dedz  d	ej                  dz  d
ee   deez  fd                     Z xZS )BioGptModelr<   c           	         t         |   |       || _        |j                  | _        |j                  | _        |j                  | _        |j                  | _	        |j                  rt        j                  |j                        nd}t        |j                  | j                  | j                  |      | _        t!        |j"                  | j                        | _        t'        j(                  t+        |j,                        D cg c]  }t/        ||       c}      | _        t'        j2                  | j                        | _        d| _        | j9                          y c c}w )Ng      ?)embed_scale)r=   F)r%   rD   r<   	layerdroprI   rA   rE   r?   pad_token_idpadding_idxscale_embeddingmathsqrtr4   
vocab_sizeembed_tokensr    max_position_embeddingsembed_positionsrL   
ModuleListrangenum_hidden_layersr;   layers	LayerNorm
layer_normgradient_checkpointing	post_init)r'   r<   rt   ir(   s       r)   rD   zBioGptModel.__init__   s    ))11++!..7=7M7Mdii 2 23SV5t~~t/?/?[
  @@^@^`d`n`nommV[\b\t\tVu$vQR%7!%L$vw,,t~~6&+# %ws   E"N	input_idsr!   inputs_embedsrT   rU   r#   rV   rW   c           	      `   |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|j	                         d d \  }}	||j                         nd}
|'|
|	z   }t        j                  |||j                        }|}t        | j                  |||      }|5t        j                  |	|j                        |
z   }|j                  d      }| j                  ||
|      }||z   }t        j                  j                  || j                  | j                         }t#        | j$                        D ]D  \  }}| j                   r%t        j&                  g       }|| j(                  k  r7 ||f||||d	|}F | j+                  |      }t-        ||
      S )NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time)r<   r   device)r<   input_embedsr!   rT   )r#   rY   )r!   rT   rU   r#   )last_hidden_staterT   )
ValueErrorr|   r	   r<   sizeget_seq_lengthr.   onesr   r   arange	unsqueezer~   rL   r]   rA   r[   	enumerater   randru   r   r   )r'   r   r!   r   rT   rU   r#   rV   
batch_size
seq_lengthr"   mask_seq_lengthself_attn_cachecausal_mask	positionsrS   idxdecoder_layerdropout_probabilitys                      r)   r&   zBioGptModel.forward   s    -t";<stt  --i8M 0*$++>O!.!3!3!5cr!:
JETE`!?!?!Afg!4zAO"ZZ
OML`L`aN)(;;&)+	
  <<
=;O;OPSiiL'11!4L((9O^j(k	%	1--mt||VZVcVc-d"+DKK"8 	C}}&+jjn#&7)* /#) M	 68++
 	
r*   )NNNNNN)r+   r,   r-   r   rD   r   r   r   r.   r/   FloatTensorr   rd   r   r   tupler   r&   r1   r2   s   @r)   rr   rr      s    | *   .23726(,!%04B
##d*B
 ))D0B
 ((4/	B

 B
 $;B
 &&-B
 +,B
 
:	:B
    B
r*   rr   zR
    BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
    )custom_introc                   <    e Zd ZddiZ fdZd Zd Zee	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  d	e	j                  dz  d
edz  de	j                  dz  dedz  de	j                  dz  dee	j                  z  dee   deez  fd              Z xZS )BioGptForCausalLMzoutput_projection.weightzbiogpt.embed_tokens.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y NF)bias)
r%   rD   rr   rg   rL   rM   rE   r{   output_projectionr   r'   r<   r(   s     r)   rD   zBioGptForCausalLM.__init__   sJ     !&)!#6+=+=v?P?PW\!] 	r*   c                     | j                   S rb   r   r'   s    r)   get_output_embeddingsz'BioGptForCausalLM.get_output_embeddings  s    %%%r*   c                     || _         y rb   r   )r'   new_embeddingss     r)   set_output_embeddingsz'BioGptForCausalLM.set_output_embeddings  s
    !/r*   Nr   r!   r   rT   labelsrU   r#   logits_to_keeprV   rW   c	           	          | j                   |f|||||d|	}
|
d   }t        |t              rt        | d      n|}| j	                  |dd|ddf         }d}|* | j
                  d||| j                  j                  d|	}t        |||
j                  |
j                  |
j                  |
j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        )r!   r   rT   rU   r#   r   N)logitsr   r{   )lossr   rT   rS   rh   cross_attentionsr6   )rg   
isinstancer0   slicer   loss_functionr<   r{   r   rT   rS   rh   r   )r'   r   r!   r   rT   r   rU   r#   r   rV   outputsrS   slice_indicesr   r   s                  r)   r&   zBioGptForCausalLM.forward  s    ( $++
)'+%
 
  
8B>SV8W~ot4]k''a6I(JK%4%%pVFt{{OeOepiopD0#33!//))$55
 	
r*   NNNNNNNr   )r+   r,   r-   _tied_weights_keysrD   r   r   r   r   r.   r/   r   r   rd   r0   rc   r   r   r   r   r&   r1   r2   s   @r)   r   r      s    56RS&0  .23726(,*.!%04-.+
##d*+
 ))D0+
 ((4/	+

 +
   4'+
 $;+
 &&-+
 ell*+
 +,+
 
2	2+
  +
r*   r   c                       e Zd Z fdZee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de	dz  dej                  dz  dej                  dz  d	e
dz  d
ej                  dz  deez  fd              Z xZS )BioGptForTokenClassificationc                 z   t         |   |       |j                  | _        t        |      | _        t        |d      r|j                  |j                  }n|j                  }t        j                  |      | _
        t        j                  |j                  |j                        | _        | j                          y )Nclassifier_dropout)r%   rD   
num_labelsrr   rg   hasattrr   rI   rL   DropoutrA   rM   rE   
classifierr   )r'   r<   r   r(   s      r)   rD   z%BioGptForTokenClassification.__init__@  s      ++!&)6/0V5N5N5Z!'!:!:!'!;!;zz"45))F$6$68I8IJr*   Nr   token_type_idsr!   rT   r   r   rU   r#   rW   c	           	      d    | j                   |f|||||d|	}
|
d   }| j                  |      }| j                  |      }d}|t               }||j	                  d      dk(  }|j	                  d| j
                        }t        j                  ||j	                  d      t        j                  |j                        j                  |            } |||      }n2 ||j	                  d| j
                        |j	                  d            }t        |||
j                  |
j                        S )  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        rT   r!   r   rU   r#   r   Nr   r   )r   r   rS   rh   )rg   rA   r   r   viewr   r.   wheretensorignore_indextype_asr   rS   rh   )r'   r   r   r!   rT   r   r   rU   r#   rV   transformer_outputsrS   r   r   loss_fctactive_lossactive_logitsactive_labelss                     r)   r&   z$BioGptForTokenClassification.forwardN  s.   ( *dkk
+)'%
 
 ,A.]3/')H),11"5: &B @ %R%,,x?T?T2U2]2]^d2e!  }=B @&++b/R$-;;*55	
 	
r*   )NNNNNNNN)r+   r,   r-   rD   r   r   r.   r/   r   r   rd   r   r   r&   r1   r2   s   @r)   r   r   >  s      .22637(,26*.!%042
##d*2
 ((4/2
 ))D0	2

 2
 ((4/2
   4'2
 $;2
 &&-2
 
&	&2
  2
r*   r   a  
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                   0    e Zd Zdef fdZee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  de
dz  dej                  dz  dej                  dz  d	edz  d
ej                  dz  deej                  z  deez  fd              Zd Zd Z xZS )BioGptForSequenceClassificationr<   c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  | j                  d      | _        | j                          y r   )
r%   rD   r   rr   rg   rL   rM   rE   scorer   r   s     r)   rD   z(BioGptForSequenceClassification.__init__  sS      ++!&)YYv114??O
 	r*   Nr   r!   rT   r   r   rU   r#   r   rW   c	           	          | j                   |f|||||d|	}
|
d   }t        |t              rt        | d      n|}| j	                  |dd|ddf         }||j
                  dd \  }}n|j
                  dd \  }}| j                  j                  d}n|Vt        j                  || j                  j                        j                  d      dz
  j                  |j                        }n.d}t        j                  | j                  j                    d       |t        j"                  ||j                        |f   }d}|| j                  j$                  | j&                  dk(  rd	| j                  _        nl| j&                  dkD  rL|j(                  t        j*                  k(  s|j(                  t        j                  k(  rd
| j                  _        nd| j                  _        | j                  j$                  d	k(  rIt-               }| j&                  dk(  r& ||j/                         |j/                               }n |||      }n| j                  j$                  d
k(  r=t1               } ||j3                  d| j&                        |j3                  d            }n,| j                  j$                  dk(  rt5               } |||      }t7        |||
j8                  |
j:                  |
j<                        S )r   r   r   Nr   r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classification)r   r   rT   rS   rh   )rg   r   r0   r   r   shaper<   rv   r.   nesumtor   r   warning_oncer(   r+   r   problem_typer   dtypelongr   squeezer   r   r   r   rT   rS   rh   )r'   r   r!   rT   r   r   rU   r#   r   rV   r   rS   r   r   r   sequence_lengthpooled_logitsr   r   s                      r)   r&   z'BioGptForSequenceClassification.forward  s   ( *dkk
+)'%
 
 ,A.8B>SV8W~ot4]kM!]A*=>? *3//"1*='J*7*=*=bq*A'J;;##+ O$#(88It{{7O7O#P#T#TUW#X[\#\"`"`aganan"o"$##~~../ 0^ ^
 u||Jv}}M^_{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6/ /??-;;*55
 	
r*   c                 .    | j                   j                  S rb   rg   r|   r   s    r)   get_input_embeddingsz4BioGptForSequenceClassification.get_input_embeddings  s    {{'''r*   c                 &    || j                   _        y rb   r   )r'   values     r)   set_input_embeddingsz4BioGptForSequenceClassification.set_input_embeddings  s    #( r*   r   )r+   r,   r-   r   rD   r   r   r.   r/   r   r   rd   r0   rc   r   r   r&   r   r   r1   r2   s   @r)   r   r     s   |   .237(,26*.!%04-.O
##d*O
 ))D0O
 	O

 ((4/O
   4'O
 $;O
 &&-O
 ell*O
 
1	1O
  O
b()r*   r   )r   r   r   rr   rf   )5__doc__ry   r.   torch.nnrL   r   r   r   activationsr   cache_utilsr   r	   
generationr
   masking_utilsr   modeling_outputsr   r   r   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   bart.modeling_bartr   r   r   opt.modeling_optr   configuration_biogptr   r    r4   r9   r;   rf   rr   r   r   r   __all__r6   r*   r)   <module>r      sH       A A ! . ) /  . &  8 5 
 = .U'D U	 7 		m 	@) @F O   [
' [
 [
| 
?
- ?

?
D C
#8 C
 C
L a)&; a)a)Hr*   