
    il                     "   d dl Z d dlmZ d dlZd dlmZ d dlmZmZmZ ddl	m
Z
 ddlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+  e%jX                  e-      Z. G d dej^                        Z0 G d dej^                        Z1	 	 d2dejd                  dejf                  dejf                  dejf                  dejf                  dz  de4dz  de4de e"   fdZ5 G d  d!ejd                        Z6 G d" d#e      Z7e# G d$ d%e             Z8e# G d& d'e8             Z9 e#d()       G d* d+e8e             Z:e# G d, d-e8             Z; e#d.)       G d/ d0e8             Z<g d1Z=y)3    N)Callable)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions SequenceClassifierOutputWithPastTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)capture_outputs   )BioGptConfigc                   x     e Zd ZdZdedef fdZ	 	 d
dej                  dedej                  dz  f fd	Z xZ	S ) BioGptLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                 N    d| _         t        | 	  || j                   z   |       y )N   )offsetsuper__init__)selfr!   r"   	__class__s      {/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/biogpt/modeling_biogpt.pyr'   z)BioGptLearnedPositionalEmbedding.__init__8   s$     $++5}E    Nattention_maskpast_key_values_lengthposition_idsc                     |8t        j                  |d      }||z  dz
  j                         }|dd|df   }t        |   || j
                  z         S )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr   dim)torchcumsumlongr&   forwardr%   )r(   r,   r-   r.   r)   s       r*   r5   z(BioGptLearnedPositionalEmbedding.forward>   s^      <<A>L(>9A=CCEL'+A+B(BCLw|dkk9::r+   )r   N)
__name__
__module____qualname____doc__intr'   r2   
LongTensorr5   __classcell__r)   s   @r*   r    r    3   s]    Fs F3 F '(04	;((; !$; &&-	; ;r+   r    c            
       `     e Zd ZdZd
dededededz  f fdZdej                  f fd	Z	 xZ
S )BioGptScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    r!   r"   padding_idxembed_scaleNc                 6    t         |   |||       || _        y N)r&   r'   rA   )r(   r!   r"   r@   rA   r)   s        r*   r'   z"BioGptScaledWordEmbedding.__init__T   s    D&r+   	input_idsc                 <    t         |   |      | j                  z  S rC   )r&   r5   rA   )r(   rD   r)   s     r*   r5   z!BioGptScaledWordEmbedding.forwardX   s    wy)D,<,<<<r+   )      ?)r6   r7   r8   r9   r:   floatr'   r2   Tensorr5   r<   r=   s   @r*   r?   r?   O   sE    's '3 'S '_dgk_k '= = =r+   r?   modulequerykeyvaluer,   scalingdropoutkwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )N      r$   r   r0   ptrainingr   )
sizer2   matmul	transposenn
functionalsoftmaxrN   rU   
contiguous)
rI   rJ   rK   rL   r,   rM   rN   rO   attn_weightsattn_outputs
             r*   eager_attention_forwardr_   \   s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r+   c                       e Zd ZdZ	 	 	 	 	 	 ddedededededed	edz  d
edz  f fdZ	 	 	 dde	j                  de	j                  dz  dedz  de	j                  dz  dee   dee	j                  e	j                  dz  f   fdZ xZS )BioGptAttentionz=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsrN   
is_decoderbias	is_causalconfig	layer_idxc	                    t         	|           || _        || _        || _        ||z  | _        || _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _	        || _
        || _        |9| j                  r-t        j                  d| j                  j                   d       t!        j"                  |||      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).rR   zInstantiating a decoder z without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.re   )r&   r'   rb   rc   rN   head_dimrg   
ValueErrorrM   rd   rf   rh   loggerwarning_oncer)   r6   rY   Lineark_projv_projq_projout_proj)
r(   rb   rc   rN   rd   re   rf   rg   rh   r)   s
            r*   r'   zBioGptAttention.__init__{   s$    	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$""*4>>+B+B*C D, , ii	94@ii	94@ii	94@		)YTBr+   hidden_stateskey_value_statespast_key_valuesr,   rO   returnc                    |du}|j                   dd }g |d| j                  }| j                  |      j                  |      j	                  dd      }	d}
|St        |t              rA|j                  j                  | j                        }
|r|j                  }n|j                  }n|}|r|n|}|rK|I|
rGj                  | j                     j                  }|j                  | j                     j                  }n| j                  |      }| j!                  |      }g |j                   dd d| j                  }|j                  |      j	                  dd      }|j                  |      j	                  dd      }|Kj#                  ||| j                        \  }}|r)t        |t              rd|j                  | j                  <   t%        j&                  | j(                  j*                  t,              } || |	|||f| j.                  sdn| j0                  | j2                  d|\  }} |j4                  g |d j7                         }| j9                  |      }||fS )	z#Input shape: Batch x Time x ChannelNrQ   r   r$   FT        )rN   rM   )shaperk   rr   viewrX   
isinstancer   
is_updatedgetrh   cross_attention_cacheself_attention_cachelayerskeysvaluesrp   rq   updater   get_interfacerg   _attn_implementationr_   rU   rN   rM   reshaper\   rs   )r(   rt   ru   rv   r,   rO   is_cross_attentioninput_shapehidden_shapequery_statesr}   curr_past_key_valuescurrent_states
key_statesvalue_stateskv_shapeattention_interfacer^   r]   s                      r*   r5   zBioGptAttention.forward   sd    .T9 $))#2.88b8$--8 {{=166|DNNqRST
&/+>?,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML^4J;;~6LF--cr2FBFFH#2<<QBJ',,X6@@AFL*+?+F+FzS_aeaoao+p(
L%*_FY*ZAEO..t~~>(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ *k));;;;FFHmmK0L((r+   )ry   FTFNN)NNN)r6   r7   r8   r9   r:   rG   boolr   r'   r2   rH   r	   r   r   tupler5   r<   r=   s   @r*   ra   ra   x   s   G  &* $%C%C %C 	%C
 %C %C %C t#%C :%CT 15(,.2H)||H)  ,,-H) 	H)
 t+H) -.H) 
u||U\\D00	1H)r+   ra   c                        e Zd Zddededz  f fdZ	 	 	 	 ddej                  dej                  dz  dedz  de	dz  d	ej                  dz  d
ee   dej                  fdZ xZS )BioGptDecoderLayerNrg   rh   c           	      n   t         |           |j                  | _        t	        | j                  |j
                  |j                  dd||      | _        |j                  | _	        t        |j                     | _        |j                  | _        t        j                  | j                        | _        t        j"                  | j                  |j$                        | _        t        j"                  |j$                  | j                        | _        t        j                  | j                        | _        y )NT)rb   rc   rN   rd   rf   rg   rh   )r&   r'   hidden_sizerb   ra   num_attention_headsattention_probs_dropout_prob	self_attnhidden_dropout_probrN   r   
hidden_actactivation_fnactivation_dropoutrY   	LayerNormself_attn_layer_normro   intermediate_sizefc1fc2final_layer_norm)r(   rg   rh   r)   s      r*   r'   zBioGptDecoderLayer.__init__   s    ++(nn0077
 11#F$5$56"(";";$&LL$@!99T^^V-E-EF99V55t~~F "T^^ <r+   rt   r,   rv   	use_cacher.   rO   rw   c                 D   |}| j                  |      } | j                  d||||d|\  }}t        j                  j	                  || j                  | j
                        }||z   }|}| j                  |      }| j                  |      }| j                  |      }t        j                  j	                  || j                  | j
                        }| j                  |      }t        j                  j	                  || j                  | j
                        }||z   }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            past_key_values (`Cache`): cached past key and value projection states
        )rt   rv   r,   r.   rS    )r   r   rY   rZ   rN   rU   r   r   r   r   r   )	r(   rt   r,   rv   r   r.   rO   residual_s	            r*   r5   zBioGptDecoderLayer.forward  s     !11-@ *4>> 
'+)%	

 
q --mt||VZVcVc-d =0 !--m</**=9--mt?V?Vaeanan-o/--mt||VZVcVc-d =0r+   rC   )NNTN)r6   r7   r8   r   r:   r'   r2   rH   r	   r   r;   r   r   r5   r<   r=   s   @r*   r   r      s    =| =d
 =4 /3(,!%04)||) t+) 	)
 $;) &&-) +,) 
)r+   r   c                   :    e Zd ZU eed<   dZdZdZdZdZ	dZ
eedZy)BioGptPreTrainedModelrg   biogptT)rt   
attentionsN)r6   r7   r8   r   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphr   ra   _can_record_outputsr   r+   r*   r   r   1  s9     &*#N!+%r+   r   c                        e Zd Zdef fdZeee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dedz  d	ej                  dz  d
ee   deez  fd                     Z xZS )BioGptModelrg   c           	         t         |   |       || _        |j                  | _        |j                  | _        |j                  | _        |j                  | _	        |j                  rt        j                  |j                        nd}t        |j                  | j                  | j                  |      | _        t!        |j"                  | j                        | _        t'        j(                  t+        |j,                        D cg c]  }t/        ||       c}      | _        t'        j2                  | j                        | _        d| _        | j9                          y c c}w )NrF   )rA   )rh   F)r&   r'   rg   	layerdropr   rN   r   rb   pad_token_idr@   scale_embeddingmathsqrtr?   
vocab_sizeembed_tokensr    max_position_embeddingsembed_positionsrY   
ModuleListrangenum_hidden_layersr   r   r   
layer_normgradient_checkpointing	post_init)r(   rg   rA   ir)   s       r*   r'   zBioGptModel.__init__B  s    ))11++!..7=7M7Mdii 2 23SV5t~~t/?/?[
  @@^@^`d`n`nommV[\b\t\tVu$vQR%7!%L$vw,,t~~6&+# %ws   E"NrD   r,   inputs_embedsrv   r   r.   rO   rw   c           	      `   |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|j	                         d d \  }}	||j                         nd}
|'|
|	z   }t        j                  |||j                        }|}t        | j                  |||      }|5t        j                  |	|j                        |
z   }|j                  d      }| j                  ||
|      }||z   }t        j                  j                  || j                  | j                         }t#        | j$                        D ]D  \  }}| j                   r%t        j&                  g       }|| j(                  k  r7 ||f||||d	|}F | j+                  |      }t-        ||
      S )NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time)rg   rQ   r   device)rg   input_embedsr,   rv   )r.   rS   )r,   rv   r   r.   )last_hidden_staterv   )rl   r   r
   rg   rV   get_seq_lengthr2   onesr   r   arange	unsqueezer   rY   rZ   rN   rU   	enumerater   randr   r   r   )r(   rD   r,   r   rv   r   r.   rO   
batch_size
seq_lengthr-   mask_seq_lengthself_attn_cachecausal_mask	positionsrt   idxdecoder_layerdropout_probabilitys                      r*   r5   zBioGptModel.forwardW  s    -t";<stt  --i8M 0*$++>O!.!3!3!5cr!:
JETE`!?!?!Afg!4zAO"ZZ
OML`L`aN)(;;&)+	
  <<
=;O;OPSiiL'11!4L((9O^j(k	%	1--mt||VZVcVc-d"+DKK"8 	C}}&+jjn#&7)* /#) M	 68++
 	
r+   )NNNNNN)r6   r7   r8   r   r'   r   r   r   r2   r;   FloatTensorr	   r   r   r   r   r   r5   r<   r=   s   @r*   r   r   @  s    | *   .23726(,!%04B
##d*B
 ))D0B
 ((4/	B

 B
 $;B
 &&-B
 +,B
 
:	:B
    B
r+   r   zR
    BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
    )custom_introc                   <    e Zd ZddiZ fdZd Zd Zee	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  d	e	j                  dz  d
edz  de	j                  dz  dedz  de	j                  dz  dee	j                  z  dee   deez  fd              Z xZS )BioGptForCausalLMzoutput_projection.weightzbiogpt.embed_tokens.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y NFrj   )
r&   r'   r   r   rY   ro   r   r   output_projectionr   r(   rg   r)   s     r*   r'   zBioGptForCausalLM.__init__  sJ     !&)!#6+=+=v?P?PW\!] 	r+   c                     | j                   S rC   r   r(   s    r*   get_output_embeddingsz'BioGptForCausalLM.get_output_embeddings  s    %%%r+   c                     || _         y rC   r   )r(   new_embeddingss     r*   set_output_embeddingsz'BioGptForCausalLM.set_output_embeddings  s
    !/r+   NrD   r,   r   rv   labelsr   r.   logits_to_keeprO   rw   c	           	          | j                   |f|||||d|	}
|
d   }t        |t              rt        | d      n|}| j	                  |dd|ddf         }d}|* | j
                  d||| j                  j                  d|	}t        |||
j                  |
j                  |
j                  |
j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        )r,   r   rv   r   r.   r   N)logitsr   r   )lossr   rv   rt   r   cross_attentionsr   )r   r|   r:   slicer   loss_functionrg   r   r   rv   rt   r   r   )r(   rD   r,   r   rv   r   r   r.   r   rO   outputsrt   slice_indicesr   r   s                  r*   r5   zBioGptForCausalLM.forward  s    ( $++
)'+%
 
  
8B>SV8W~ot4]k''a6I(JK%4%%pVFt{{OeOepiopD0#33!//))$55
 	
r+   NNNNNNNr   )r6   r7   r8   _tied_weights_keysr'   r   r   r   r   r2   r;   r   r	   r   r:   rH   r   r   r   r   r5   r<   r=   s   @r*   r   r     s    56RS&0  .23726(,*.!%04-.+
##d*+
 ))D0+
 ((4/	+

 +
   4'+
 $;+
 &&-+
 ell*+
 +,+
 
2	2+
  +
r+   r   c                       e Zd Z fdZee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de	dz  dej                  dz  dej                  dz  d	e
dz  d
ej                  dz  deez  fd              Z xZS )BioGptForTokenClassificationc                 z   t         |   |       |j                  | _        t        |      | _        t        |d      r|j                  |j                  }n|j                  }t        j                  |      | _
        t        j                  |j                  |j                        | _        | j                          y )Nclassifier_dropout)r&   r'   
num_labelsr   r   hasattrr   r   rY   DropoutrN   ro   r   
classifierr   )r(   rg   r   r)   s      r*   r'   z%BioGptForTokenClassification.__init__  s      ++!&)6/0V5N5N5Z!'!:!:!'!;!;zz"45))F$6$68I8IJr+   NrD   token_type_idsr,   rv   r   r   r   r.   rw   c	           	      d    | j                   |f|||||d|	}
|
d   }| j                  |      }| j                  |      }d}|t               }||j	                  d      dk(  }|j	                  d| j
                        }t        j                  ||j	                  d      t        j                  |j                        j                  |            } |||      }n2 ||j	                  d| j
                        |j	                  d            }t        |||
j                  |
j                        S )  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        rv   r,   r   r   r.   r   NrQ   r   )r   r   rt   r   )r   rN   r   r   r{   r   r2   wheretensorignore_indextype_asr   rt   r   )r(   rD   r  r,   rv   r   r   r   r.   rO   transformer_outputsrt   r   r   loss_fctactive_lossactive_logitsactive_labelss                     r*   r5   z$BioGptForTokenClassification.forward  s.   ( *dkk
+)'%
 
 ,A.]3/')H),11"5: &B @ %R%,,x?T?T2U2]2]^d2e!  }=B @&++b/R$-;;*55	
 	
r+   )NNNNNNNN)r6   r7   r8   r'   r   r   r2   r;   r   r	   r   r   r   r5   r<   r=   s   @r*   r   r     s      .22637(,26*.!%042
##d*2
 ((4/2
 ))D0	2

 2
 ((4/2
   4'2
 $;2
 &&-2
 
&	&2
  2
r+   r   a  
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                   0    e Zd Zdef fdZee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  de
dz  dej                  dz  dej                  dz  d	edz  d
ej                  dz  deej                  z  deez  fd              Zd Zd Z xZS )BioGptForSequenceClassificationrg   c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  | j                  d      | _        | j                          y r   )
r&   r'   r   r   r   rY   ro   r   scorer   r   s     r*   r'   z(BioGptForSequenceClassification.__init__<  sS      ++!&)YYv114??O
 	r+   NrD   r,   rv   r   r   r   r.   r   rw   c	           	          | j                   |f|||||d|	}
|
d   }t        |t              rt        | d      n|}| j	                  |dd|ddf         }||j
                  dd \  }}n|j
                  dd \  }}| j                  j                  d}n|Vt        j                  || j                  j                        j                  d      dz
  j                  |j                        }n.d}t        j                  | j                  j                    d       |t        j"                  ||j                        |f   }d}|| j                  j$                  | j&                  dk(  rd	| j                  _        nl| j&                  dkD  rL|j(                  t        j*                  k(  s|j(                  t        j                  k(  rd
| j                  _        nd| j                  _        | j                  j$                  d	k(  rIt-               }| j&                  dk(  r& ||j/                         |j/                               }n |||      }n| j                  j$                  d
k(  r=t1               } ||j3                  d| j&                        |j3                  d            }n,| j                  j$                  dk(  rt5               } |||      }t7        |||
j8                  |
j:                  |
j<                        S )r  r  r   Nr$   rQ   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classification)r   r   rv   rt   r   )r   r|   r:   r   r  rz   rg   r   r2   nesumtor   rm   rn   r)   r6   r   problem_typer   dtyper4   r   squeezer   r{   r   r   rv   rt   r   )r(   rD   r,   rv   r   r   r   r.   r   rO   r	  rt   r   r   r   sequence_lengthpooled_logitsr   r
  s                      r*   r5   z'BioGptForSequenceClassification.forwardE  s   ( *dkk
+)'%
 
 ,A.8B>SV8W~ot4]kM!]A*=>? *3//"1*='J*7*=*=bq*A'J;;##+ O$#(88It{{7O7O#P#T#TUW#X[\#\"`"`aganan"o"$##~~../ 0^ ^
 u||Jv}}M^_{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6/ /??-;;*55
 	
r+   c                 .    | j                   j                  S rC   r   r   r   s    r*   get_input_embeddingsz4BioGptForSequenceClassification.get_input_embeddings  s    {{'''r+   c                 &    || j                   _        y rC   r  )r(   rL   s     r*   set_input_embeddingsz4BioGptForSequenceClassification.set_input_embeddings  s    #( r+   r   )r6   r7   r8   r   r'   r   r   r2   r;   r   r	   r   r:   rH   r   r   r5   r   r"  r<   r=   s   @r*   r  r  -  s   |   .237(,26*.!%04-.O
##d*O
 ))D0O
 	O

 ((4/O
   4'O
 $;O
 &&-O
 ell*O
 
1	1O
  O
b()r+   r  )r   r   r  r   r   )Nry   )>r   collections.abcr   r2   torch.nnrY   r   r   r   activationsr   cache_utilsr	   r
   r   
generationr   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   configuration_biogptr   
get_loggerr6   rm   	Embeddingr    r?   ModulerH   rG   r_   ra   r   r   r   r   r   r  __all__r   r+   r*   <module>r6     s  *  $   A A ! C C ) / B 9  G & R R 7 5 . 
		H	%;r|| ;8
= 
=& !%II%<<% 
% <<	%
 LL4'% T\% % '(%8r)bii r)jA3 AH O   [
' [
 [
| 
?
- ?

?
D C
#8 C
 C
L a)&; a)a)Hr+   