
    i                     >   d Z ddlZddlmZ ddlZddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)  e#jT                  e+      Z, G d dejZ                        Z. G d dejZ                        Z/ G d dejZ                        Z0 G d dejZ                        Z1 G d dejZ                        Z2 G d dejZ                        Z3 G d d ejZ                        Z4 G d! d"ejZ                        Z5 G d# d$e      Z6e! G d% d&e             Z7 G d' d(ejZ                        Z8 G d) d*ejZ                        Z9 G d+ d,ejZ                        Z:e! G d- d.e7             Z; G d/ d0ejZ                        Z<e! G d1 d2e7             Z= G d3 d4ejZ                        Z> e!d56       G d7 d8e7             Z?e! G d9 d:e7             Z@e! G d; d<e7             ZAe! G d= d>e7             ZBg d?ZCy)@zPyTorch ConvBERT model.    N)Callable)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FNget_activation)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)Unpack)apply_chunking_to_forward)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)capture_outputs   )ConvBertConfigc                        e Zd ZdZ fdZ	 	 	 	 d
dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  f
d	Z xZ	S )ConvBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       | j#                  dt%        j*                  | j,                  j/                         t$        j0                        d       y )	N)padding_idxepsposition_idsr   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandzerosr$   sizelongselfconfig	__class__s     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/convbert/modeling_convbert.pyr,   zConvBertEmbeddings.__init__6   s   !||F,=,=v?T?Tbhbubuv#%<<0N0NPVPePe#f %'\\&2H2H&J_J_%`"f&;&;AVAVWzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
    N	input_idsr(   r$   inputs_embedsreturnc                 2   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }|}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }	| j                  |      }
||	z   |
z   }| j                  |      }| j                  |      }|S )Nr&   r   r(   r   r*   device)r@   r$   hasattrr(   r>   r<   r?   rA   rM   r1   r3   r5   r6   r:   )rC   rH   r(   r$   rI   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr3   r5   
embeddingss               rF   forwardzConvBertEmbeddings.forwardF   s,     #..*K',,.s3K ^
,,Q^<L
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M"66|D $ : :> J"%88;PP
^^J/
\\*-
rG   )NNNN)
__name__
__module____qualname____doc__r,   r<   
LongTensorFloatTensorrT   __classcell__rE   s   @rF   r   r   3   s    Q
$ .2260426$##d*$ ((4/$ &&-	$
 ((4/$ 
		$rG   r   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )SeparableConv1DzSThis class implements separable convolution, i.e. a depthwise and a pointwise layerc                    t         |           t        j                  |||||dz  d      | _        t        j                  ||dd      | _        t        j                  t        j                  |d            | _	        | j                  j                  j                  j                  d|j                         | j
                  j                  j                  j                  d|j                         y )N   F)kernel_sizegroupspaddingbiasr   )ra   rd           meanstd)r+   r,   r   Conv1d	depthwise	pointwise	Parameterr<   r?   rd   weightdatanormal_initializer_range)rC   rD   input_filtersoutput_filtersra   kwargsrE   s         rF   r,   zSeparableConv1D.__init__p   s    # 1$
 =.aV[\LL^Q!?@	""**9Q9Q*R""**9Q9Q*RrG   hidden_statesrJ   c                 h    | j                  |      }| j                  |      }|| j                  z  }|S N)rj   rk   rd   )rC   rt   xs      rF   rT   zSeparableConv1D.forward   s0    NN=)NN1	TYYrG   	rU   rV   rW   rX   r,   r<   TensorrT   r[   r\   s   @rF   r^   r^   m   s'    ]S U\\ ell rG   r^   c                        e Zd Z fdZ	 	 d	dej
                  dej                  dz  dej
                  dz  dee   de	ej
                  ej
                  f   f
dZ
 xZS )
ConvBertSelfAttentionc                 j   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  |j                  z  }|dk  r|j                  | _        d| _        n|| _        |j                  | _        |j                  | _        |j                  | j                  z  dk7  rt        d      |j                  | j                  z  dz  | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        ||j                  | j                  | j                        | _        t        j                  | j                  | j                  | j                  z        | _        t        j                  |j                  | j                        | _        t        j&                  | j                  dgt)        | j                  dz
  dz        dg	      | _        t        j,                  |j.                        | _        y )
Nr   r/   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   z6hidden_size should be divisible by num_attention_headsr`   )ra   rc   )r+   r,   hidden_sizenum_attention_headsrN   
ValueError
head_ratioconv_kernel_sizeattention_head_sizeall_head_sizer   Linearquerykeyvaluer^   key_conv_attn_layerconv_kernel_layerconv_out_layerUnfoldintunfoldr8   attention_probs_dropout_probr:   )rC   rD   new_num_attention_headsrE   s      rF   r,   zConvBertSelfAttention.__init__   s>    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 #)"<"<@Q@Q"Q"Q&$88DO'(D$'>D$$//DO & 7 7 8 88A=UVV$*$6$6$:R:R$RWX#X !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
#2F&&(:(:D<Q<Q$
  "$4+=+=t?W?WZ^ZoZo?o!p ii(:(:D<N<NOii..2S$BWBWZ[B[_`A`=acd<e
 zz&"E"EFrG   Nrt   attention_maskencoder_hidden_statesrs   rJ   c                     |j                   d d }g |d| j                  }|#| j                  |      }| j                  |      }n"| j                  |      }| j                  |      }| j	                  |j                  dd            }	|	j                  dd      }	| j                  |      }
|
j                  |      j                  dd      }|j                  |      j                  dd      }|j                  |      j                  dd      }t        j                  |	|
      }| j                  |      }t        j                  |d| j                  dg      }t        j                  |d      }| j                  |      }t        j                  ||d   d| j                  g      }|j                  dd      j!                         j#                  d      }t$        j&                  j)                  || j                  dgd| j                  dz
  dz  dgd      }|j                  dd      j                  |d   d| j                  | j                        }t        j                  |d| j                  | j                  g      }t        j*                  ||      }t        j                  |d| j                  g      }t        j*                  ||j                  dd            }|t-        j.                  | j                        z  }|||z   }t$        j&                  j                  |d      }| j1                  |      }t        j*                  ||      }|j3                  dddd      j!                         }t        j                  ||d   d| j4                  | j                  g      }t        j6                  ||gd      }|j9                         d d | j4                  | j                  z  dz  fz   } |j                  | }||fS )	Nr&   r   r`   dimr   )ra   dilationrc   strider   )shaper   r   r   r   	transposer   viewr<   multiplyr   reshaper   softmaxr   r   
contiguous	unsqueezer   
functionalr   matmulmathsqrtr:   permuter   catr@   )rC   rt   r   r   rs   rO   hidden_shapemixed_key_layermixed_value_layermixed_key_conv_attn_layermixed_query_layerquery_layer	key_layervalue_layerconv_attn_layerr   r   attention_scoresattention_probscontext_layerconv_outnew_context_layer_shapes                         rF   rT   zConvBertSelfAttention.forward   s    $))#2.CCbC$*B*BC !,"hh'<=O $

+@ A"hh}5O $

= 9$($<$<]=T=TUVXY=Z$[!$=$G$G1$M! JJ}5',,\:DDQJ#((6@@AF	',,\:DDQJ..)BDUV 22?C!MM*;b$BWBWYZ=[\!MM*;C,,];~ADL^L^7_`'11!Q7BBDNNrR--..2++a/A5q9 . 
 (11!Q7??NB 2 2D4I4I
 ~D<T<TVZVkVk7lmn6GH~D<N<N7OP !<<Y5H5HR5PQ+dii8P8P.QQ%/.@ --//0@b/I ,,7_kB%--aAq9DDF==[^R1I1I4KcKcd
 		=(";Q? #0"4"4"6s";$$t'?'??!C?
 #
 +**,CDo--rG   NN)rU   rV   rW   r,   r<   ry   rZ   r   r   tuplerT   r[   r\   s   @rF   r{   r{      s}    %GT 4859	O.||O. ))D0O.  %||d2	O.
 +,O. 
u||U\\)	*O.rG   r{   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ConvBertSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr"   )r+   r,   r   r   r~   denser6   r7   r8   r9   r:   rB   s     rF   r,   zConvBertSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rG   rt   input_tensorrJ   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rv   r   r:   r6   rC   rt   r   s      rF   rT   zConvBertSelfOutput.forward  7    

=1]3}|'CDrG   rU   rV   rW   r,   r<   ry   rT   r[   r\   s   @rF   r   r     s1    >U\\  RWR^R^ rG   r   c                        e Zd Z fdZ	 	 d	dej
                  dej                  dz  dej
                  dz  dee   dej
                  f
dZ	 xZ
S )
ConvBertAttentionc                 b    t         |           t        |      | _        t	        |      | _        y rv   )r+   r,   r{   rC   r   outputrB   s     rF   r,   zConvBertAttention.__init__  s&    )&1	(0rG   Nrt   r   r   rs   rJ   c                 \     | j                   ||fd|i|\  }}| j                  ||      }|S )Nr   )rC   r   )rC   rt   r   r   rs   r   _attention_outputs           rF   rT   zConvBertAttention.forward  sL     %499
 #8
 	
q  ;;}mDrG   r   )rU   rV   rW   r,   r<   ry   rZ   r   r   rT   r[   r\   s   @rF   r   r     sg    1 4859	 ||  ))D0   %||d2	 
 +,  
 rG   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )GroupedLinearLayerc                    t         |           || _        || _        || _        | j                  | j                  z  | _        | j                  | j                  z  | _        t        j                  t        j                  | j                  | j
                  | j                              | _        t        j                  t        j                  |            | _        y rv   )r+   r,   
input_sizeoutput_size
num_groupsgroup_in_dimgroup_out_dimr   rl   r<   emptyrm   rd   )rC   r   r   r   rE   s       rF   r,   zGroupedLinearLayer.__init__'  s    $&$ OOt>!--@ll5;;t@Q@QSWSeSe#fgLL[!9:	rG   rt   rJ   c                    t        |j                               d   }t        j                  |d| j                  | j
                  g      }|j                  ddd      }t        j                  || j                        }|j                  ddd      }t        j                  ||d| j                  g      }|| j                  z   }|S )Nr   r&   r   r`   )listr@   r<   r   r   r   r   r   rm   r   rd   )rC   rt   
batch_sizerw   s       rF   rT   zGroupedLinearLayer.forward1  s    -,,./2
MM-"doot?P?P)QRIIaALLDKK(IIaAMM!j"d.>.>?@		MrG   r   r\   s   @rF   r   r   &  s#    ;U\\ ell rG   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ConvBertIntermediatec                    t         |           |j                  dk(  r0t        j                  |j
                  |j                        | _        n1t        |j
                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y )Nr   r   r   r   )r+   r,   r   r   r   r~   intermediate_sizer   r   
isinstance
hidden_actstrr
   intermediate_act_fnrB   s     rF   r,   zConvBertIntermediate.__init__=  s    !6#5#5v7O7OPDJ+!--6;S;S`f`q`qDJ f''-'-f.?.?'@D$'-'8'8D$rG   rt   rJ   c                 J    | j                  |      }| j                  |      }|S rv   )r   r   rC   rt   s     rF   rT   zConvBertIntermediate.forwardJ  s&    

=100?rG   r   r\   s   @rF   r   r   <  s#    9U\\ ell rG   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ConvBertOutputc                    t         |           |j                  dk(  r0t        j                  |j
                  |j                        | _        n1t        |j
                  |j                  |j                        | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        y )Nr   r   r"   )r+   r,   r   r   r   r   r~   r   r   r6   r7   r8   r9   r:   rB   s     rF   r,   zConvBertOutput.__init__Q  s    !6#;#;V=O=OPDJ+!33ASAS`f`q`qDJ f&8&8f>S>STzz&"<"<=rG   rt   r   rJ   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rv   r   r   s      rF   rT   zConvBertOutput.forward\  r   rG   r   r\   s   @rF   r   r   P  s1    	>U\\  RWR^R^ rG   r   c                        e Zd Z fdZ	 	 	 ddej
                  dej                  dz  dej
                  dz  dej
                  dz  dee   dej
                  fd	Z	d
 Z
 xZS )ConvBertLayerc                 b   t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r*| j                  st        |  d      t	        |      | _	        t        |      | _        t        |      | _        y )Nr   z> should be used as a decoder model if cross attention is added)r+   r,   chunk_size_feed_forwardseq_len_dimr   	attention
is_decoderadd_cross_attention	TypeErrorcrossattentionr   intermediater   r   rB   s     rF   r,   zConvBertLayer.__init__d  s    '-'E'E$*62 ++#)#=#= ##??4&(f ghh"3F";D08$V,rG   Nrt   r   r   encoder_attention_maskrs   rJ   c                     | j                   ||fi |}| j                  r3|1t        | d      st        d|  d       | j                  ||fd|i|}t        | j                  | j                  | j                  |      }|S )Nr   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r   r   rN   AttributeErrorr   r   feed_forward_chunkr   r   )rC   rt   r   r   r   rs   r   layer_outputs           rF   rT   zConvBertLayer.forwardr  s     *4>>
 
 ??4@4!12$=dV DD D   3t22 &  '<  	  1##T%A%A4CSCSUe
 rG   c                 L    | j                  |      }| j                  ||      }|S rv   )r   r   )rC   r   intermediate_outputr   s       rF   r   z ConvBertLayer.feed_forward_chunk  s,    "//0@A{{#68HIrG   NNN)rU   rV   rW   r,   r<   ry   rZ   r   r   rT   r   r[   r\   s   @rF   r   r   c  s    -" 48596:|| ))D0  %||d2	
 !&t 3 +, 
@rG   r   c                   d     e Zd ZU eed<   dZdZeedZ	 e
j                          fd       Z xZS )ConvBertPreTrainedModelrD   convbertT)rt   
attentionsc                 b   t         |   |       t        |t              r t	        j
                  |j                         yt        |t              rVt	        j                  |j                  d| j                  j                         t	        j
                  |j                         yt        |t              ryt	        j                  |j                  t        j                   |j                  j"                  d         j%                  d             t	        j
                  |j&                         yy)zInitialize the weightsre   rf   r&   r%   N)r+   _init_weightsr   r^   initzeros_rd   r   ro   rm   rD   rp   r   copy_r$   r<   r=   r   r>   r(   )rC   modulerE   s     rF   r   z%ConvBertPreTrainedModel._init_weights  s     	f%fo.KK$ 23LLSdkk6S6STKK$ 23JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 4rG   )rU   rV   rW   r   __annotations__base_model_prefixsupports_gradient_checkpointingr   r{   _can_record_outputsr<   no_gradr   r[   r\   s   @rF   r   r     s?    "&*#&+
 U]]_
/ 
/rG   r   c                        e Zd Z fdZ	 	 	 d	dej
                  dej                  dz  dej
                  dz  dej
                  dz  def
dZ xZ	S )
ConvBertEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r+   r,   rD   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)rC   rD   r   rE   s      rF   r,   zConvBertEncoder.__init__  sN    ]]5IaIaCb#caM&$9#cd
&+# $ds   A#Nrt   r   r   r   rJ   c                 V    | j                   D ]  } |||f||d|} t        |      S )N)r   r   )last_hidden_state)r  r   )rC   rt   r   r   r   rs   layer_modules          rF   rT   zConvBertEncoder.forward  sP     !JJ 	L( '<'=	
 M	 2+
 	
rG   r   )
rU   rV   rW   r,   r<   ry   rZ   r   rT   r[   r\   s   @rF   r  r    si    , 48596:
||
 ))D0
  %||d2	

 !&t 3
 
,
rG   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ConvBertPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r   )r+   r,   r   r   r~   r   r   r   r   r
   transform_act_fnr6   r7   rB   s     rF   r,   z(ConvBertPredictionHeadTransform.__init__  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STrG   rt   rJ   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rv   )r   r  r6   r   s     rF   rT   z'ConvBertPredictionHeadTransform.forward  s4    

=1--m<}5rG   r   r\   s   @rF   r  r    s$    UU\\ ell rG   r  c                        e Zd ZdZdef fdZ	 d	dej                  dej                  dz  dej                  fdZ	 xZ
S )
ConvBertSequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`ConvBertConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
    rD   c                 f   t         |           t        |dd      | _        | j                  dk(  rt        t        j                         | _        t        |d      rq|j                  ret        |d      r(|j                  r|j                  dkD  r|j                  }n|j                  }t        j                  |j                  |      | _        t        |dd       }|rt        |      nt        j                         | _        t        j                         | _        t        |d      r3|j"                  dkD  r$t        j$                  |j"                        | _        t        j                         | _        t        |d	      r5|j(                  dkD  r%t        j$                  |j(                        | _        y y y )
Nsummary_typelastattnsummary_use_projsummary_proj_to_labelsr   summary_activationsummary_first_dropoutsummary_last_dropout)r+   r,   getattrr  NotImplementedErrorr   IdentitysummaryrN   r  r  
num_labelsr~   r   r   
activationfirst_dropoutr  r8   last_dropoutr   )rC   rD   num_classesactivation_stringrE   s       rF   r,   z ConvBertSequenceSummary.__init__  sU   #FNFC& &%{{}6-.63J3Jv78V=Z=Z_e_p_pst_t$//$0099V%7%7EDL#F,@$GIZN3D$E`b`k`k`m[[]6238T8TWX8X!#F,H,H!IDKKM612v7R7RUV7V "

6+F+F GD 8W2rG   Nrt   	cls_indexrJ   c                    | j                   dk(  r|dddf   }n| j                   dk(  r|dddf   }n| j                   dk(  r|j                  d      }n| j                   d	k(  r|At        j                  |d
ddddf   |j                  d   dz
  t        j
                        }nX|j                  d      j                  d      }|j                  d|j                         dz
  z  |j                  d      fz         }|j                  d|      j                  d      }n| j                   dk(  rt        | j                        }| j                  |      }| j                  |      }| j!                  |      }|S )ak  
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `torch.FloatTensor`: The summary of the sequence hidden states.
        r  Nr&   firstr   rg   r   r   r+  .r   r)   )r&   r  )r  rg   r<   	full_liker   rA   r   r>   r   r@   gathersqueezer"  r'  r$  r&  r(  )rC   rt   r+  r   s       rF   rT   zConvBertSequenceSummary.forward  sn    &"1b5)F')"1a4(F&("''A'.F+- !OO!#rr1*-!''+a/**	 &//3==bA	%,,Uimmo6I-JmN`N`acNdMf-fg	"))"i8@@DF&(%%##F+f%(""6*rG   rv   )rU   rV   rW   rX   r   r,   r<   rZ   rY   rT   r[   r\   s   @rF   r  r    sQ    2H~ H< VZ)"..);@;K;Kd;R)			)rG   r  c                        e Zd Z fdZd Zd Zeee	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
ee   defd                     Z xZS )ConvBertModelc                 "   t         |   |       t        |      | _        |j                  |j
                  k7  r/t        j                  |j                  |j
                        | _        t        |      | _
        || _        | j                          y rv   )r+   r,   r   rS   r/   r~   r   r   embeddings_projectr  encoderrD   	post_initrB   s     rF   r,   zConvBertModel.__init__D  sl     ,V4  F$6$66&(ii0E0EvGYGY&ZD#&v.rG   c                 .    | j                   j                  S rv   rS   r1   rC   s    rF   get_input_embeddingsz"ConvBertModel.get_input_embeddingsP  s    ...rG   c                 &    || j                   _        y rv   r8  )rC   r   s     rF   set_input_embeddingsz"ConvBertModel.set_input_embeddingsS  s    */'rG   NrH   r   r(   r$   rI   rs   rJ   c                    ||t        d      |#| j                  ||       |j                         }n!||j                         d d }nt        d      |\  }}	||j                  n|j                  }
|t	        j
                  ||
      }|pt        | j                  d      r4| j                  j                  d d d |	f   }|j                  ||	      }|}n&t	        j                  |t        j                  |
      }| j                  ||      }| j                  ||||      }t        | d      r| j                  |      } | j                  |fd	|i|}|S )
NzDYou cannot specify both input_ids and inputs_embeds at the same timer&   z5You have to specify either input_ids or inputs_embeds)rM   r(   rL   )rH   r$   r(   rI   r4  r   )r   %warn_if_padding_and_no_attention_maskr@   rM   r<   onesrN   rS   r(   r>   r?   rA   get_extended_attention_maskr4  r5  )rC   rH   r   r(   r$   rI   rs   rO   r   rP   rM   rQ   rR   extended_attention_maskrt   encoder_outputss                   rF   rT   zConvBertModel.forwardV  sz     ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZFCN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z"&"B"B>S^"_l>iv ( 
 4-. 33MBM>Jdll?
2?
 ?
 rG   )NNNNN)rU   rV   rW   r,   r:  r<  r   r   r   r<   rY   rZ   r   r   r   rT   r[   r\   s   @rF   r2  r2  B  s    
/0   .237260426/##d*/ ))D0/ ((4/	/
 &&-/ ((4// +,/ 
,/    /rG   r2  c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )ConvBertGeneratorPredictionszAPrediction module for the generator, made up of two dense layers.c                     t         |           t        d      | _        t	        j
                  |j                  |j                        | _        t	        j                  |j                  |j                        | _
        y )Ngelur"   )r+   r,   r   r&  r   r6   r/   r7   r   r~   r   rB   s     rF   r,   z%ConvBertGeneratorPredictions.__init__  sV    (0f&;&;AVAVWYYv1163H3HI
rG   generator_hidden_statesrJ   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rv   )r   r&  r6   )rC   rG  rt   s      rF   rT   z$ConvBertGeneratorPredictions.forward  s3    

#:;6}5rG   )	rU   rV   rW   rX   r,   r<   rZ   rT   r[   r\   s   @rF   rD  rD    s+    KJu/@/@ UEVEV rG   rD  c                   $    e Zd ZddiZ fdZd Zd Zee	 	 	 	 	 	 dde	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  dee   deez  fd              Z xZS )ConvBertForMaskedLMzgenerator_lm_head.weightz*convbert.embeddings.word_embeddings.weightc                     t         |   |       t        |      | _        t	        |      | _        t        j                  |j                  |j                        | _
        | j                          y rv   )r+   r,   r2  r   rD  generator_predictionsr   r   r/   r.   generator_lm_headr6  rB   s     rF   r,   zConvBertForMaskedLM.__init__  sR     %f-%A&%I"!#6+@+@&BSBS!TrG   c                     | j                   S rv   rM  r9  s    rF   get_output_embeddingsz)ConvBertForMaskedLM.get_output_embeddings  s    %%%rG   c                     || _         y rv   rO  )rC   r1   s     rF   set_output_embeddingsz)ConvBertForMaskedLM.set_output_embeddings  s
    !0rG   NrH   r   r(   r$   rI   labelsrs   rJ   c                 n    | j                   |f||||d|}|d   }	| j                  |	      }
| j                  |
      }
d}|Pt        j                         } ||
j                  d| j                  j                        |j                  d            }t        ||
|j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        r   r(   r$   rI   r   Nr&   losslogitsrt   r   )r   rL  rM  r   r   r   rD   r.   r   rt   r   )rC   rH   r   r(   r$   rI   rS  rs   rG  generator_sequence_outputprediction_scoresrW  loss_fcts                rF   rT   zConvBertForMaskedLM.forward  s    $ GTdmmG
))%'G
 G
 %<A$>! 667PQ 223DE**,H-222t{{7M7MNPVP[P[\^P_`D$1??.99	
 	
rG   NNNNNN)rU   rV   rW   _tied_weights_keysr,   rP  rR  r   r   r<   rY   rZ   r   r   r   r   rT   r[   r\   s   @rF   rJ  rJ    s    46bc&1  .237260426*.(
##d*(
 ))D0(
 ((4/	(

 &&-(
 ((4/(
   4'(
 +,(
 
	(
  (
rG   rJ  c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )ConvBertClassificationHeadz-Head for sentence-level classification tasks.c                 h   t         |           t        j                  |j                  |j                        | _        |j                  |j                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        || _        y rv   )r+   r,   r   r   r~   r   classifier_dropoutr9   r8   r:   r%  out_projrD   rC   rD   ra  rE   s      rF   r,   z#ConvBertClassificationHead.__init__  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHrG   rt   rJ   c                     |d d dd d f   }| j                  |      }| j                  |      }t        | j                  j                     |      }| j                  |      }| j                  |      }|S )Nr   )r:   r   r
   rD   r   rb  )rC   rt   rs   rw   s       rF   rT   z"ConvBertClassificationHead.forward  se    !Q'"LLOJJqM4;;))*1-LLOMM!rG   rx   r\   s   @rF   r_  r_    s&    7	U\\  rG   r_  z
    ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                       e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	e
   d
eez  fd              Z xZS )!ConvBertForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |      | _        t        |      | _        | j                          y rv   )	r+   r,   r%  rD   r2  r   r_  
classifierr6  rB   s     rF   r,   z*ConvBertForSequenceClassification.__init__  sH      ++%f-4V< 	rG   NrH   r   r(   r$   rI   rS  rs   rJ   c                     | j                   |f||||d|}|d   }	| j                  |	      }
d}|| j                  j                  | j                  dk(  rd| j                  _        nl| j                  dkD  rL|j
                  t        j                  k(  s|j
                  t        j                  k(  rd| j                  _        nd| j                  _        | j                  j                  dk(  rIt               }| j                  dk(  r& ||
j                         |j                               }n ||
|      }n| j                  j                  dk(  r=t               } ||
j                  d| j                        |j                  d            }n,| j                  j                  dk(  rt               } ||
|      }t        ||
|j                  |j                   	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        rU  r   Nr   
regressionsingle_label_classificationmulti_label_classificationr&   rV  )r   ri  rD   problem_typer%  r*   r<   rA   r   r   r0  r   r   r   r   rt   r   rC   rH   r   r(   r$   rI   rS  rs   outputssequence_outputrX  rW  r[  s                rF   rT   z)ConvBertForSequenceClassification.forward  s   $ 7Ddmm7
))%'7
 7
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
rG   r\  )rU   rV   rW   r,   r   r   r<   rY   rZ   r   r   r   r   rT   r[   r\   s   @rF   rg  rg    s      .237260426*.8
##d*8
 ))D08
 ((4/	8

 &&-8
 ((4/8
   4'8
 +,8
 
)	)8
  8
rG   rg  c                       e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	e
   d
eez  fd              Z xZS )ConvBertForMultipleChoicec                     t         |   |       t        |      | _        t	        |      | _        t        j                  |j                  d      | _	        | j                          y )Nr   )r+   r,   r2  r   r  sequence_summaryr   r   r~   ri  r6  rB   s     rF   r,   z"ConvBertForMultipleChoice.__init__F  sM     %f- 7 ?))F$6$6: 	rG   NrH   r   r(   r$   rI   rS  rs   rJ   c                    ||j                   d   n|j                   d   }|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|1|j                  d|j                  d      |j                  d            nd} | j                  |f||||d|}	|	d   }
| j	                  |
      }| j                  |      }|j                  d|      }d}|t               } |||      }t        |||	j                  |	j                        S )a\  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:


            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r&   r   rU  r   rV  )
r   r   r@   r   ru  ri  r   r   rt   r   )rC   rH   r   r(   r$   rI   rS  rs   num_choicesrp  rq  pooled_outputrX  reshaped_logitsrW  r[  s                   rF   rT   z!ConvBertForMultipleChoice.forwardP  s   V -6,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 7Ddmm7
))%'7
 7
 "!*--o>/ ++b+6')HOV4D("!//))	
 	
rG   r\  )rU   rV   rW   r,   r   r   r<   rY   rZ   r   r   r   r   rT   r[   r\   s   @rF   rs  rs  D  s      .237260426*.N
##d*N
 ))D0N
 ((4/	N

 &&-N
 ((4/N
   4'N
 +,N
 
*	*N
  N
rG   rs  c                       e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	e
   d
eez  fd              Z xZS )ConvBertForTokenClassificationc                 `   t         |   |       |j                  | _        t        |      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        | j                          y rv   )r+   r,   r%  r2  r   ra  r9   r   r8   r:   r   r~   ri  r6  rc  s      rF   r,   z'ConvBertForTokenClassification.__init__  s      ++%f-)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rG   NrH   r   r(   r$   rI   rS  rs   rJ   c                 F    | j                   |f||||d|}|d   }	| j                  |	      }	| j                  |	      }
d}|<t               } ||
j	                  d| j
                        |j	                  d            }t        ||
|j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        rU  r   Nr&   rV  )	r   r:   ri  r   r   r%  r   rt   r   ro  s                rF   rT   z&ConvBertForTokenClassification.forward  s      7Ddmm7
))%'7
 7
 "!*,,71')HFKKDOO<fkk"oND$!//))	
 	
rG   r\  )rU   rV   rW   r,   r   r   r<   rY   rZ   r   r   r   r   rT   r[   r\   s   @rF   r{  r{    s      .237260426*.&
##d*&
 ))D0&
 ((4/	&

 &&-&
 ((4/&
   4'&
 +,&
 
&	&&
  &
rG   r{  c                   *    e Zd Z fdZee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e	e
   defd              Z xZS )ConvBertForQuestionAnsweringc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y rv   )
r+   r,   r%  r2  r   r   r   r~   
qa_outputsr6  rB   s     rF   r,   z%ConvBertForQuestionAnswering.__init__  sS      ++%f-))F$6$68I8IJ 	rG   NrH   r   r(   r$   rI   start_positionsend_positionsrs   rJ   c                     | j                   |f||||d|}	|	d   }
| j                  |
      }|j                  dd      \  }}|j                  d      j	                         }|j                  d      j	                         }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }t        ||||	j                  |	j                        S )	NrU  r   r   r&   r   )ignore_indexr`   )rW  start_logits
end_logitsrt   r   )r   r  splitr0  r   lenr@   clampr   r   rt   r   )rC   rH   r   r(   r$   rI   r  r  rs   rp  rq  rX  r  r  
total_lossignored_indexr[  
start_lossend_losss                      rF   rT   z$ConvBertForQuestionAnswering.forward  s~    7Ddmm7
))%'7
 7
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J+%!!//))
 	
rG   )NNNNNNN)rU   rV   rW   r,   r   r   r<   rY   rZ   r   r   r   rT   r[   r\   s   @rF   r  r    s      .23726042637152
##d*2
 ))D02
 ((4/	2

 &&-2
 ((4/2
 ))D02
 ''$.2
 +,2
 
&2
  2
rG   r  )rJ  rs  r  rg  r{  r   r2  r   )DrX   r   collections.abcr   r<   r   torch.nnr   r   r    r	   r   activationsr
   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   configuration_convbertr   
get_loggerrU   loggerModuler   r^   r{   r   r   r   r   r   r   r   r  r  r  r2  rD  rJ  r_  rg  rs  r{  r  __all__ rG   rF   <module>r     s+     $   A A & 1 9  . & 6  8 5 2 
		H	%7 7tbii 4w.BII w.t  		  . ,299 (RYY &2. 2j /o / /.
bii 
:bii $`bii `F E+ E EP299 $ =
1 =
 =
@ 0 E
(? E
E
P [
 7 [
 [
| 7
%< 7
 7
t ?
#: ?
 ?
D	rG   