
    i|                        d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&  ejN                  e(      Z) G d dejT                        Z+	 d>dejT                  dejX                  dejX                  dejX                  dejX                  dz  de-de-fdZ. G d dejT                        Z/ G d  d!ejT                        Z0 G d" d#ejT                        Z1 G d$ d%ejT                        Z2 G d& d'ejT                        Z3 G d( d)e      Z4 G d* d+ejT                        Z5e G d, d-e             Z6e G d. d/e6             Z7 G d0 d1ejT                        Z8 G d2 d3ejT                        Z9e G d4 d5e6             Z:e ed67       G d8 d9e                    Z; ed:7       G d; d<e6             Z<g d=Z=y)?zPyTorch Splinter model.    )Callable)	dataclassN)nn)CrossEntropyLoss   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputModelOutputQuestionAnsweringModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)merge_with_config_defaults)capture_outputs   )SplinterConfigc                        e Zd ZdZ fdZ	 	 	 	 d
dej                  dz  dej                  dz  dej                  dz  dej                  dz  def
d	Z	 xZ
S )SplinterEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 |   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       y )N)padding_idxepsposition_idsr   F)
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandselfconfig	__class__s     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/splinter/modeling_splinter.pyr&   zSplinterEmbeddings.__init__*   s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
    N	input_idstoken_type_idsr!   inputs_embedsreturnc                    ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|:t        j                  |t        j                  | j                  j
                        }|| j                  |      }| j                  |      }||z   }| j                  |      }	||	z  }| j                  |      }| j                  |      }|S )Nr#   r   dtypedevice)sizer!   r6   zeroslongrF   r+   r/   r-   r0   r4   )
r:   r?   r@   r!   rA   input_shape
seq_lengthr/   
embeddingsr-   s
             r=   forwardzSplinterEmbeddings.forward8   s      #..*K',,.s3K ^
,,Q^<L!"[[EJJtO`O`OgOghN  00;M $ : :> J"%::
"66|D))
^^J/
\\*-
r>   )NNNN)__name__
__module____qualname____doc__r&   r6   
LongTensorFloatTensortuplerM   __classcell__r<   s   @r=   r   r   '   sz    Q
  .2260426##d* ((4/ &&-	
 ((4/ 
r>   r   modulequerykeyvalueattention_maskscalingr4   c                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )N   r   r#   )dimrE   )ptrainingr   )r6   matmul	transposer   
functionalsoftmaxfloat32torE   r4   ra   
contiguous)
rW   rX   rY   rZ   r[   r\   r4   kwargsattn_weightsattn_outputs
             r=   eager_attention_forwardrl   Z   s     <<s}}Q':;gEL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r>   c                        e Zd Z fdZ	 ddej
                  dej                  dz  dee   de	ej
                  ej
                  dz  f   fdZ
 xZS )	SplinterSelfAttentionc                 $   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                         | _        |j                   | _        | j                  dz  | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()g      )r%   r&   r)   num_attention_headshasattr
ValueErrorr;   intattention_head_sizeall_head_sizer   LinearrX   rY   rZ   r2   attention_probs_dropout_probr4   attention_dropoutr\   r9   s     r=   r&   zSplinterSelfAttention.__init__r   sC    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF!'!D!D//5r>   Nhidden_statesr[   ri   rB   c                 x   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }t        j                  | j                  j                  t              }	 |	| ||||f| j                  sdn| j                  | j                  d|\  }
} |
j                  g |d j!                         }
|
|fS )Nr#   r   r^           )r4   r\   )shaperv   rX   viewrc   rY   rZ   r   get_interfacer;   _attn_implementationrl   ra   rz   r\   reshaperh   )r:   r{   r[   ri   rJ   hidden_shapequery_states
key_statesvalue_statesattention_interfacerk   rj   s               r=   rM   zSplinterSelfAttention.forward   s>    $))#2.CCbC$*B*BCzz-055lCMMaQRSXXm,11,?II!QO
zz-055lCMMaQRS(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHL((r>   N)rN   rO   rP   r&   r6   TensorrS   r   r   rT   rM   rU   rV   s   @r=   rn   rn   q   sd    60 48)||) ))D0) +,	)
 
u||U\\D00	1)r>   rn   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )SplinterSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr   )r%   r&   r   rx   r)   denser0   r1   r2   r3   r4   r9   s     r=   r&   zSplinterSelfOutput.__init__   s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r>   r{   input_tensorrB   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r4   r0   r:   r{   r   s      r=   rM   zSplinterSelfOutput.forward   7    

=1]3}|'CDr>   rN   rO   rP   r&   r6   r   rM   rU   rV   s   @r=   r   r      1    >U\\  RWR^R^ r>   r   c            	            e Zd Z fdZ	 ddej
                  dej                  dz  dee   dej
                  fdZ	 xZ
S )	SplinterAttentionc                 b    t         |           t        |      | _        t	        |      | _        y r   )r%   r&   rn   r:   r   outputr9   s     r=   r&   zSplinterAttention.__init__   s&    )&1	(0r>   Nr{   r[   ri   rB   c                 ^    |} | j                   |fd|i|\  }}| j                  ||      }|S Nr[   )r:   r   )r:   r{   r[   ri   residual_s         r=   rM   zSplinterAttention.forward   sK     !$499
)
 
q
 M8<r>   r   )rN   rO   rP   r&   r6   r   rS   r   r   rM   rU   rV   s   @r=   r   r      sQ    1 48|| ))D0 +,	
 
r>   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )SplinterIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r%   r&   r   rx   r)   intermediate_sizer   
isinstance
hidden_actstrr	   intermediate_act_fnr9   s     r=   r&   zSplinterIntermediate.__init__   s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r>   r{   rB   c                 J    | j                  |      }| j                  |      }|S r   )r   r   )r:   r{   s     r=   rM   zSplinterIntermediate.forward   s&    

=100?r>   r   rV   s   @r=   r   r      s#    9U\\ ell r>   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )SplinterOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r%   r&   r   rx   r   r)   r   r0   r1   r2   r3   r4   r9   s     r=   r&   zSplinterOutput.__init__   s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r>   r{   r   rB   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      r=   rM   zSplinterOutput.forward   r   r>   r   rV   s   @r=   r   r      r   r>   r   c            	            e Zd Z fdZ	 d	dej
                  dej                  dz  dee   dej
                  fdZ	d Z
 xZS )
SplinterLayerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y )Nr   )
r%   r&   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   r9   s     r=   r&   zSplinterLayer.__init__   sI    '-'E'E$*6208$V,r>   Nr{   r[   ri   rB   c                      | j                   |fd|i|}t        | j                  | j                  | j                  |      }|S r   )r   r   feed_forward_chunkr   r   )r:   r{   r[   ri   s       r=   rM   zSplinterLayer.forward   sY     '
)
 
 2##T%A%A4CSCSUb
 r>   c                 L    | j                  |      }| j                  ||      }|S r   )r   r   )r:   attention_outputintermediate_outputlayer_outputs       r=   r   z SplinterLayer.feed_forward_chunk  s,    "//0@A{{#68HIr>   r   )rN   rO   rP   r&   r6   r   rS   r   r   rM   r   rU   rV   s   @r=   r   r      sV    - 48|| ))D0 +,	
 
$r>   r   c            	       n     e Zd Z fdZ	 ddej
                  dej                  dz  dee   de	fdZ
 xZS )	SplinterEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r%   r&   r;   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r:   r;   ir<   s      r=   r&   zSplinterEncoder.__init__  sN    ]]5IaIaCb#caM&$9#cd
&+# $ds   A#Nr{   r[   ri   rB   c                 P    | j                   D ]  } |||fi |} t        |      S )Nlast_hidden_state)r   r   )r:   r{   r[   ri   layer_modules        r=   rM   zSplinterEncoder.forward  sC     !JJ 	L( M	 +
 	
r>   r   )rN   rO   rP   r&   r6   r   rS   r   r   r   rM   rU   rV   s   @r=   r   r     sM    , 48
||
 ))D0
 +,	

 

r>   r   c                   <     e Zd ZU eed<   dZdZeedZ	 fdZ
 xZS )SplinterPreTrainedModelr;   splinterT)r{   
attentionsc                     t         |   |       t        |t              rZt	        j
                  |j                  t        j                  |j                  j                  d         j                  d             y y )Nr#   r"   )r%   _init_weightsr   r   initcopy_r!   r6   r7   r~   r8   )r:   rW   r<   s     r=   r   z%SplinterPreTrainedModel._init_weights2  s[    f%f01JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 2r>   )rN   rO   rP   r   __annotations__base_model_prefixsupports_gradient_checkpointingr   rn   _can_record_outputsr   rU   rV   s   @r=   r   r   (  s/    "&*#&+
i ir>   r   c                   
    e Zd ZdZ fdZd Zd Zeee		 	 	 	 	 dde
j                  dz  de
j                  dz  de
j                  dz  d	e
j                  dz  d
e
j                  dz  dee   deez  fd                     Z xZS )SplinterModela2  
    The model is an encoder (with only self-attention) following the architecture described in [Attention is all you
    need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
    Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
    c                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y r   )r%   r&   r;   r   rL   r   encoder	post_initr9   s     r=   r&   zSplinterModel.__init__@  s;     ,V4&v. 	r>   c                 .    | j                   j                  S r   rL   r+   )r:   s    r=   get_input_embeddingsz"SplinterModel.get_input_embeddingsJ  s    ...r>   c                 &    || j                   _        y r   r   )r:   rZ   s     r=   set_input_embeddingsz"SplinterModel.set_input_embeddingsM  s    */'r>   Nr?   r[   r@   r!   rA   ri   rB   c                 
   ||t        d      |#| j                  ||       |j                         }n!||j                         dd }nt        d      |\  }}	||j                  n|j                  }
|t	        j
                  ||	f|
      }|&t	        j                  |t        j                  |
      }| j                  ||      }| j                  ||||      } | j                  |fd|i|}|d	   }t        |
      S )a  
        token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer#   z5You have to specify either input_ids or inputs_embeds)rF   rD   )r?   r!   r@   rA   r[   r   r   )rt   %warn_if_padding_and_no_attention_maskrG   rF   r6   onesrH   rI   get_extended_attention_maskrL   r   r   )r:   r?   r[   r@   r!   rA   ri   rJ   
batch_sizerK   rF   extended_attention_maskembedding_outputencoder_outputssequence_outputs                  r=   rM   zSplinterModel.forwardP  s7   6  ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!"[[EJJvVN 150P0PQ_al0m??%)'	 + 
 '$,,
2
 

 *!,-
 	
r>   )NNNNN)rN   rO   rP   rQ   r&   r   r   r   r   r   r6   r   r   r   rT   r   rM   rU   rV   s   @r=   r   r   8  s    /0   *..2.2,0-1=
<<$&=
 t+=
 t+	=

 llT)=
 ||d*=
 +,=
 
	 =
    =
r>   r   c                   X     e Zd Zd fd	Zdej
                  dej
                  fdZ xZS )SplinterFullyConnectedLayerc                     t         |           || _        || _        t	        j
                  | j                  | j                        | _        t        |   | _        t	        j                  | j                        | _	        y r   )
r%   r&   	input_dim
output_dimr   rx   r   r	   act_fnr0   )r:   r   r   r   r<   s       r=   r&   z$SplinterFullyConnectedLayer.__init__  sV    "$YYt~~t?
Z(doo6r>   inputsrB   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r0   )r:   r   r{   s      r=   rM   z#SplinterFullyConnectedLayer.forward  s2    

6*M2}5r>   )gelur   rV   s   @r=   r   r     s#    7ell u|| r>   r   c                   (     e Zd ZdZ fdZd Z xZS )QuestionAwareSpanSelectionHeadzf
    Implementation of Question-Aware Span Selection (QASS) head, described in Splinter's paper:

    c                    t         |           t        |j                  |j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t        j                  |j                  |j                  d      | _
        t        j                  |j                  |j                  d      | _        y )NF)bias)r%   r&   r   r)   query_start_transformquery_end_transformstart_transformend_transformr   rx   start_classifierend_classifierr9   s     r=   r&   z'QuestionAwareSpanSelectionHead.__init__  s    %@ASASU[UgUg%h"#>v?Q?QSYSeSe#f :6;M;MvOaOab89K9KVM_M_` "		&*<*<f>P>PW\ ] ii(:(:F<N<NUZ[r>   c                    |j                         \  }}}|j                  d      j                  dd|      }t        j                  |d|      }| j                  |      }| j                  |      }| j                  |      }	| j                  |      }
| j                  |      }|	j                  ddd      }	t        j                  ||	      }| j                  |      }|
j                  ddd      }
t        j                  ||
      }||fS )Nr#   r   )r_   indexr   r^   )rG   	unsqueezerepeatr6   gatherr   r   r   r   r   permuterb   r   )r:   r   	positionsr   r_   r   gathered_repsquery_start_repsquery_end_reps
start_repsend_repsr{   start_logits
end_logitss                 r=   rM   z&QuestionAwareSpanSelectionHead.forward  s    KKM	1c##B'..q!S9V%@55mD11-@))&1
%%f---.>?''1a0
||M:>++N;##Aq!,\\-:
Z''r>   )rN   rO   rP   rQ   r&   rM   rU   rV   s   @r=   r   r     s    
	\(r>   r   c                   P    e Zd Z fdZee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  de	e
   deez  fd              Z xZS )SplinterForQuestionAnsweringc                     t         |   |       t        |      | _        t	        |      | _        |j                  | _        | j                          y r   r%   r&   r   r   r   splinter_qassquestion_token_idr   r9   s     r=   r&   z%SplinterForQuestionAnswering.__init__  C     %f-;FC!'!9!9 	r>   Nr?   r[   r@   r!   rA   start_positionsend_positionsquestion_positionsri   rB   c	                    d}
||Dt        j                  t        j                  || j                        j	                         d      }nJt        j
                  |j                  d      t         j                  |j                  |j                        }|j                  d      }d}
 | j                  |f||||d|	}|d   }| j                  ||      \  }}|
r"|j                  d	      |j                  d	      }}|d|d	|z
  t        j                  |j                        j                   z  z   }|d	|z
  t        j                  |j                        j                   z  z   }d}||t#        |j                               d	kD  r|j                  d      }t#        |j                               d	kD  r|j                  d      }|j                  d	      }|j%                  d|       |j%                  d|       t'        |
      } |||      } |||      }||z   dz  }t)        ||||j*                  |j,                        S )a  
        token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
            num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
            the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
            sequence_length)`.
        FNr#   )r_   r   )rE   layoutrF   Tr[   r@   r!   rA   r   ignore_indexr^   lossr  r  r{   r   )r6   argmaxeqr
  ru   rH   rG   rI   r  rF   r   r   r	  squeezefinforE   minlenclamp_r   r   r{   r   )r:   r?   r[   r@   r!   rA   r  r  r  ri   question_positions_were_none"question_position_for_each_exampleoutputsr   r  r  
total_lossignored_indexloss_fct
start_lossend_losss                        r=   rM   z$SplinterForQuestionAnswering.forward  sF   D (-$%$5:\\XXi)?)?@EEGR62 6;[[!&&q)MDXDXanauau62 "D!M!Mb!Q+/($--
))%'
 
 "!*#'#5#5oGY#Z j''3';';A'>
@R@RST@U*L%'1~+=\M_M_A`AdAd*ddL#q>'9U[[IYIY=Z=^=^&^^J
&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M""1m4  M2']CH!,@J
M:H$x/14J+%!!//))
 	
r>   NNNNNNNN)rN   rO   rP   r&   r   r   r6   r   rR   r   r   rT   r   rM   rU   rV   s   @r=   r  r    s     *..2.2,0-137156:W
<<$&W
 t+W
 t+	W

 llT)W
 ||d*W
 ))D0W
 ''$.W
 ",,t3W
 +,W
 
-	-W
  W
r>   r  zB
    Class for outputs of Splinter as a span selection model.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
eej                     dz  ed<   dZeej                     dz  ed<   y)SplinterForPreTrainingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when start and end positions are provided):
        Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
    start_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
        Span-start scores (before SoftMax).
    end_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
        Span-end scores (before SoftMax).
    Nr  r  r  r{   r   )rN   rO   rP   rQ   r  r6   rS   r   r  r  r{   rT   r    r>   r=   r(  r(  3  s|     &*D%

d
")-1L%##d*1+/J!!D(/59M5**+d2926Je''(4/6r>   r(  z
    Splinter Model for the recurring span selection task as done during the pretraining. The difference to the QA task
    is that we do not have a question, but multiple question tokens that replace the occurrences of recurring spans
    instead.
    c                       e Zd Z fdZee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  de	e
   deez  fd              Zdej                  dej                  fdZ xZS )SplinterForPreTrainingc                     t         |   |       t        |      | _        t	        |      | _        |j                  | _        | j                          y r   r  r9   s     r=   r&   zSplinterForPreTraining.__init__R  r  r>   Nr?   r[   r@   r!   rA   r  r  r  ri   rB   c	                    |||t        d      ||t        d      || j                  |      } | j                  |f||||d|	}
|
d   }|j                         \  }}}| j	                  ||      \  }}|j                  d      }||j                  d      j                  |||      }|d|z
  t        j                  |j                        j                  z  z   }|d|z
  t        j                  |j                        j                  z  z   }d}|||j                  dt        d|dz
               |j                  dt        d|dz
               t        | j                  j                        } ||j!                  ||z  |      |j!                  ||z              } ||j!                  ||z  |      |j!                  ||z              }||z   dz  }t#        ||||
j$                  |
j&                  	      S )
a  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_questions, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `batch_size, num_questions, sequence_length`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `batch_size, num_questions, sequence_length`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        start_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
            num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
            the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
            sequence_length)`.
        NzCquestion_positions must be specified in order to calculate the lossz?question_positions must be specified when inputs_embeds is usedr  r   r   r  r^   r  )	TypeError_prepare_question_positionsr   rG   r	  r   r8   r6   r  rE   r  r  maxr   r;   r*   r   r(  r{   r   )r:   r?   r[   r@   r!   rA   r  r  r  ri   r  r   r   sequence_lengthr_   r  r  num_questions attention_mask_for_each_questionr   r"  r#  r$  s                          r=   rM   zSplinterForPreTraining.forward\  sD   j %/*E-Jcabb'I,=]^^'!%!A!A)!L$--
))%'
 
 "!*+:+?+?+A(
OS#'#5#5oGY#Z j*//2%/=/G/G/J/Q/QM?0, (1/O+OSXS^S^_k_q_qSrSvSv*vvL#q+K'Ku{{[e[k[kOlOpOp&ppJ
&=+D""1c!_q-@&AB  C?Q+>$?@ (T[[5M5MNH!!!*}"<oN$$Z-%?@J  
] :OL"":#=>H %x/14J+%!!//))
 	
r>   c                 4   t        j                  || j                  j                  k(        \  }}t        j                  |      }t        j
                  |j                  d      |j                         f| j                  j                  t         j                  |j                        }t        |j                  d      |j                  d      k(  d       t        j                  |D cg c]  }t        j                  |       c}      }||||f<   |S c c}w )Nr   rD   z?All samples in the batch must have at least one question token.)r6   wherer;   r
  bincountfullrG   r0  r*   rI   rF   r   catr7   )r:   r?   rowsflat_positionsr2  r   ncolss           r=   r/  z2SplinterForPreTraining._prepare_question_positions  s    ${{98U8U+UVnt,JJ^^A 1 1 34KK$$**##	
	 	q!Y^^A%66M	
 yy=Aa%,,q/AB .	$* Bs   )Dr%  )rN   rO   rP   r&   r   r   r6   r   rR   r   r   rT   r(  rM   r/  rU   rV   s   @r=   r+  r+  J  s"     *..2.2,0-137156:n
<<$&n
 t+n
 t+	n

 llT)n
 ||d*n
 ))D0n
 ''$.n
 ",,t3n
 +,n
 
-	-n
  n
`U\\ ell r>   r+  )r  r+  r   r   r   )r}   )>rQ   collections.abcr   dataclassesr   r6   r   torch.nnr    r   r   activationsr	   modeling_layersr
   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   utils.genericr   utils.output_capturingr   configuration_splinterr   
get_loggerrN   loggerModuler   r   floatrl   rn   r   r   r   r   r   r   r   r   r   r   r  r(  r+  __all__r)  r>   r=   <module>rP     s    $ !   % & ! 9 Z Z F & 6 j j 7 5 2 
		H	%/ /t %II%<<% 
% <<	%
 LL4'% % %.3)BII 3)n 		 .299  RYY . D
bii 
2 io i i W
+ W
 W
t")) $#(RYY #(L d
#: d
 d
N 
7; 7 7" L4 LL^r>   