
    ih                     j   d Z ddlZddlZddlZddlmZ ddlmc mZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z)  e!jT                  e+      Z, G d dejZ                        Z. G d dejZ                        Z/ G d dejZ                        Z0 G d dejZ                        Z1 G d dejZ                        Z2 G d de      Z3 G d dejZ                        Z4 G d d ejZ                        Z5 G d! d"ejZ                        Z6e G d# d$e             Z7e G d% d&e7             Z8 G d' d(ejZ                        Z9 ed)*       G d+ d,e7             Z:e G d- d.e7             Z; ed/*       G d0 d1e7             Z<g d2Z=y)3zPyTorch LayoutLMv3 model.    N)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)Unpack)apply_chunking_to_forward)auto_docstringcan_return_tuplelogging	torch_int)TransformersKwargsmerge_with_config_defaults)capture_outputs   )LayoutLMv3Configc                   *     e Zd ZdZ fdZddZ xZS )LayoutLMv3PatchEmbeddingszLayoutLMv3 image (patch) embeddings. This class also automatically interpolates the position embeddings for varying
    image sizes.c                    t         |           t        |j                  t        j
                  j                        r|j                  n|j                  |j                  f}t        |j                  t        j
                  j                        r|j                  n|j                  |j                  f}|d   |d   z  |d   |d   z  f| _        t        j                  |j                  |j                  ||      | _        y )Nr   r   )kernel_sizestride)super__init__
isinstance
input_sizecollectionsabcIterable
patch_sizepatch_shapennConv2dnum_channelshidden_sizeproj)selfconfig
image_sizer&   	__class__s       /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.pyr    z"LayoutLMv3PatchEmbeddings.__init__6   s     &++[__-E-EF ##V%6%67 	 &++[__-E-EF ##V%6%67 	
 'qMZ]:JqMZXY]<Z[IIf1163E3ES]fpq	    c                 l   | j                  |      }||j                  d| j                  d   | j                  d   d      }|j                  dddd      }|j                  d   |j                  d   }}t        j                  |||fd      }||z   }|j                  d      j                  dd      }|S )Nr   r   r      bicubic)sizemode)	r,   viewr'   permuteshapeFinterpolateflatten	transpose)r-   pixel_valuesposition_embedding
embeddingspatch_heightpatch_widths         r1   forwardz!LayoutLMv3PatchEmbeddings.forwardF   s    YY|,
)!3!8!8D<L<LQ<OQUQaQabcQdfh!i!3!;!;Aq!Q!G(2(8(8(;Z=M=Ma=P+L!"/AWbHcjs!t#&88J''*44Q:
r2   N__name__
__module____qualname____doc__r    rE   __classcell__r0   s   @r1   r   r   2   s    r r2   r   c                   F     e Zd ZdZ fdZd Zd Zd Z	 	 	 	 	 ddZ xZ	S )LayoutLMv3TextEmbeddingszm
    LayoutLMv3 text embeddings. Same as `RobertaEmbeddings` but with added spatial (layout) embeddings.
    c                 .   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j
                  |j                        | _
        t        j                  |j                        | _        | j                  dt!        j"                  |j$                        j'                  d      d       |j                  | _        t        j                  |j$                  |j
                  | j(                        | _        t        j                  |j,                  |j.                        | _        t        j                  |j,                  |j.                        | _        t        j                  |j,                  |j4                        | _        t        j                  |j,                  |j4                        | _        y )N)padding_idxepsposition_idsr   r4   F
persistent)r   r    r(   	Embedding
vocab_sizer+   pad_token_idword_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangemax_position_embeddingsexpandrQ   position_embeddingsmax_2d_position_embeddingscoordinate_sizex_position_embeddingsy_position_embeddings
shape_sizeh_position_embeddingsw_position_embeddingsr-   r.   r0   s     r1   r    z!LayoutLMv3TextEmbeddings.__init__Z   s}   !||F,=,=v?Q?Q_e_r_rs%'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 "..#%<<**F,>,>DL\L\$
  &(\\&2S2SU[UkUk%l"%'\\&2S2SU[UkUk%l"%'\\&2S2SU[UfUf%g"%'\\&2S2SU[UfUf%g"r2   c           	      H   	 | j                  |d d d d df         }| j                  |d d d d df         }| j                  |d d d d df         }| j                  |d d d d df         }| j                  t	        j
                  |d d d d df   |d d d d df   z
  dd            }| j                  t	        j
                  |d d d d df   |d d d d df   z
  dd            }t	        j                  ||||||gd      }	|	S # t        $ r}t        d      |d }~ww xY w)	Nr   r   r5   r   z;The `bbox` coordinate values should be within 0-1000 range.i  r4   dim)rk   rl   
IndexErrorrn   rd   clipro   cat)
r-   bboxleft_position_embeddingsupper_position_embeddingsright_position_embeddingslower_position_embeddingsern   ro   spatial_position_embeddingss
             r1   %calculate_spatial_position_embeddingsz>LayoutLMv3TextEmbeddings.calculate_spatial_position_embeddingsq   sN   	c'+'A'A$q!Qw-'P$(,(B(B41a=(Q%(,(B(B41a=(Q%(,(B(B41a=(Q% !% : :5::d1aQR7mVZ[\^_ab[bVcFcefhl;m n $ : :5::d1aQR7mVZ[\^_ab[bVcFcefhl;m n ',ii()))%% 
'
# +*%  	cZ[abb	cs   A,D 	D!DD!c                     |j                  |      j                         }t        j                  |d      j	                  |      |z  }|j                         |z   S )z
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.
        r   rr   )neintrd   cumsumtype_aslong)r-   	input_idsrQ   maskincremental_indicess        r1   "create_position_ids_from_input_idsz;LayoutLMv3TextEmbeddings.create_position_ids_from_input_ids   sP     ||K(,,.$||Da8@@F$N"'')K77r2   c                    |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
        Nr4   r   dtypedevicer   )r7   rd   re   rQ   r   r   	unsqueezerg   )r-   inputs_embedsinput_shapesequence_lengthrT   s        r1   &create_position_ids_from_inputs_embedsz?LayoutLMv3TextEmbeddings.create_position_ids_from_inputs_embeds   s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r2   c                 N   |I|6| j                  || j                        j                  |j                        }n| j	                  |      }||j                         }n|j                         d d }|:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }||z   }| j                  |      }	||	z  }| j                  |      }
||
z   }| j                  |      }| j                  |      }|S )Nr4   r   )r   rQ   tor   r   r7   rd   zerosr   rT   r[   r]   rh   r~   r^   rb   )r-   r   rw   token_type_idsrT   r   r   r]   rB   rh   r}   s              r1   rE   z LayoutLMv3TextEmbeddings.forward   s+    $#FFyRVRbRbcff$$   $JJ=Y #..*K',,.s3K!"[[EJJtO`O`OgOghN  00;M $ : :> J"%::
"66|D))
&*&P&PQU&V#"==
^^J/
\\*-
r2   NNNNN)
rH   rI   rJ   rK   r    r~   r   r   rE   rL   rM   s   @r1   rO   rO   U   s3    h.+48
= 'r2   rO   c                   @     e Zd Z fdZddZ	 	 	 ddee   fdZ xZS )LayoutLMv3SelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        |j"                  | _        |j$                  | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ())r   r    r+   num_attention_headshasattr
ValueErrorr   attention_head_sizeall_head_sizer(   Linearquerykeyvaluer`   attention_probs_dropout_probrb   has_relative_attention_biashas_spatial_attention_biasrp   s     r1   r    z LayoutLMv3SelfAttention.__init__   s8    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF+1+M+M(*0*K*K'r2   c                     ||z  }|j                  d      j                  d      }||z
  |z  } t        j                  d      |      S )a  
        https://huggingface.co/papers/2105.13290 Section 2.4 Stabilization of training: Precision Bottleneck Relaxation
        (PB-Relax). A replacement of the original nn.Softmax(dim=-1)(attention_scores). Seems the new attention_probs
        will result in a slower speed and a little bias. Can use torch.allclose(standard_attention_probs,
        cogview_attention_probs, atol=1e-08) for comparison. The smaller atol (e.g., 1e-08), the better.
        r4   rr   )amaxr   r(   Softmax)r-   attention_scoresalphascaled_attention_scores	max_valuenew_attention_scoress         r1   cogview_attentionz)LayoutLMv3SelfAttention.cogview_attention   sT     #3U":+00b0:DDRH	 7) CuL!rzzb!"677r2   kwargsc                 d   |j                   d   }| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }	t        j                  |t        j                  | j                        z  |j                  dd            }
| j                  r5| j                  r)|
||z   t        j                  | j                        z  z  }
n1| j                  r%|
|t        j                  | j                        z  z  }
||
|z   }
| j                  |
      }| j                  |      }t        j                  ||	      }|j!                  dddd      j#                         }|j%                         d d | j&                  fz   } |j                  | }||fS )Nr   r4   r   r5   r   )r;   r   r9   r   r   r?   r   r   rd   matmulmathsqrtr   r   r   rb   r:   
contiguousr7   r   )r-   hidden_statesattention_maskrel_pos
rel_2d_posr   
batch_sizequery_layer	key_layervalue_layerr   attention_probscontext_layernew_context_layer_shapes                 r1   rE   zLayoutLMv3SelfAttention.forward   s    #((+
JJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 !<<dii@X@X6Y(Y[d[n[noqsu[vw++0O0O:!54C[C[9\ \\--$))D4L4L*M MM%/.@ 001AB ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CDo--r2   )    NNN)	rH   rI   rJ   r    r   r   r   rE   rL   rM   s   @r1   r   r      s.    L(
8 5. +,5.r2   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )LayoutLMv3SelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y NrR   )r   r    r(   r   r+   denser^   r_   r`   ra   rb   rp   s     r1   r    zLayoutLMv3SelfOutput.__init__&  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r2   r   input_tensorreturnc                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rF   r   rb   r^   r-   r   r   s      r1   rE   zLayoutLMv3SelfOutput.forward,  7    

=1]3}|'CDr2   rH   rI   rJ   r    rd   TensorrE   rL   rM   s   @r1   r   r   %  1    >U\\  RWR^R^ r2   r   c                   8     e Zd Z fdZ	 	 	 ddee   fdZ xZS )LayoutLMv3Attentionc                 b    t         |           t        |      | _        t	        |      | _        y rF   )r   r    r   r-   r   outputrp   s     r1   r    zLayoutLMv3Attention.__init__5  s&    +F3	*62r2   r   c                 b    |} | j                   ||f||d|\  }}| j                  ||      }|S Nr   r   )r-   r   )	r-   r   r   r   r   r   residualattention_output_s	            r1   rE   zLayoutLMv3Attention.forward:  sV     !'dii
 !	

 
!  ;;'7Br2   r   )rH   rI   rJ   r    r   r   rE   rL   rM   s   @r1   r   r   4  s(    3   +, r2   r   c                   @     e Zd Z fdZ	 	 	 	 ddee   fdZd Z xZS )LayoutLMv3Layerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y Nr   )
r   r    chunk_size_feed_forwardseq_len_dimr   	attentionLayoutLMv3IntermediateintermediateLayoutLMv3Outputr   rp   s     r1   r    zLayoutLMv3Layer.__init__P  sI    '-'E'E$,V426:&v.r2   r   c                     | j                  ||||      }t        | j                  | j                  | j                  |      }|S r   )r   r   feed_forward_chunkr   r   )	r-   r   r   output_attentionsr   r   r   r   layer_outputs	            r1   rE   zLayoutLMv3Layer.forwardX  sT      >>!	 * 
 1##T%A%A4CSCSUe
 r2   c                 L    | j                  |      }| j                  ||      }|S rF   )r   r   )r-   r   intermediate_outputr   s       r1   r   z"LayoutLMv3Layer.feed_forward_chunkn  s,    "//0@A{{#68HIr2   )NFNN)	rH   rI   rJ   r    r   r   rE   r   rL   rM   s   @r1   r   r   O  s0    /  +,,r2   r   c                   P     e Zd Z fdZddZd Zd Z	 	 	 	 	 ddee   fdZ	 xZ
S )	LayoutLMv3Encoderc                    t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        |j                  | _
        |j                  | _        | j                  rS|j                  | _        |j                  | _        t        j                  | j                  |j                  d      | _        | j                  r|j"                  | _        |j$                  | _        t        j                  | j$                  |j                  d      | _        t        j                  | j$                  |j                  d      | _        y y c c}w )NF)bias)r   r    r.   r(   
ModuleListrangenum_hidden_layersr   layergradient_checkpointingr   r   rel_pos_binsmax_rel_posr   r   rel_pos_biasmax_rel_2d_posrel_2d_pos_binsrel_pos_x_biasrel_pos_y_bias)r-   r.   r   r0   s      r1   r    zLayoutLMv3Encoder.__init__u  s   ]]U6KcKcEd#eOF$;#ef
&+#+1+M+M(*0*K*K'++ & 3 3D%11D "		$*;*;V=W=W^c dD**"("7"7D#)#9#9D "$))D,@,@&B\B\ch"iD"$))D,@,@&B\B\ch"iD	 + $fs   E5c                 6   d}|r4|dz  }||dkD  j                         |z  z  }t        j                  |      }n*t        j                  | t        j                  |            }|dz  }||k  }|t        j
                  |j                         |z        t        j
                  ||z        z  ||z
  z  j                  t        j                         z   }	t        j                  |	t        j                  |	|dz
              }	|t        j                  |||	      z  }|S )Nr   r5   r   )r   rd   absmax
zeros_likelogfloatr   r   min	full_likewhere)
r-   relative_positionbidirectionalnum_bucketsmax_distanceretn	max_exactis_smallval_if_larges
             r1   relative_position_bucketz*LayoutLMv3Encoder.relative_position_bucket  s   AK%)//1K??C		+,A		,,e.>.>?P.QRA  1$	y= !IIaggi)+,txxy8P/QQU`clUlm
"UZZ. yyu|[[\_/]^u{{8Q55
r2   c                    |j                  d      |j                  d      z
  }| j                  || j                  | j                        }t	        j
                         5  | j                  j                  j                         |   j                  dddd      }d d d        |j                         }|S # 1 sw Y   xY w)Nr   r4   r  r	  r   r   r   r5   )r   r  r   r   rd   no_gradr   weighttr:   r   )r-   rT   rel_pos_matr   s       r1   _cal_1d_pos_embz!LayoutLMv3Encoder._cal_1d_pos_emb  s    ",,R0<3I3I"3MM//)))) 0 
 ]]_ 	P''..0027;CCAq!QOG	P$$&	P 	Ps    :B44B=c                    |d d d d df   }|d d d d df   }|j                  d      |j                  d      z
  }|j                  d      |j                  d      z
  }| j                  || j                  | j                        }| j                  || j                  | j                        }t	        j
                         5  | j                  j                  j                         |   j                  dddd      }| j                  j                  j                         |   j                  dddd      }d d d        |j                         }|j                         }||z   }|S # 1 sw Y   0xY w)Nr   r   r   r4   r  r   r5   )r   r  r   r   rd   r  r   r  r  r:   r   r   )	r-   rw   position_coord_xposition_coord_yrel_pos_x_2d_matrel_pos_y_2d_mat	rel_pos_x	rel_pos_yr   s	            r1   _cal_2d_pos_embz!LayoutLMv3Encoder._cal_2d_pos_emb  st   1a=1a=+55b9<L<V<VWY<ZZ+55b9<L<V<VWY<ZZ11,,,, 2 
	
 11,,,, 2 
	 ]]_ 	V++22446yAII!QPQSTUI++22446yAII!QPQSTUI	V ((*	((*	*
	V 	Vs   A3E%%E.r   c                     | j                   r| j                  |      nd }| j                  r| j                  |      nd }	| j                  D ]  }
 |
||f||	d|} t        |      S )Nr   last_hidden_state)r   r  r   r  r   r
   )r-   r   rw   r   rT   rC   rD   r   r   r   layer_modules              r1   rE   zLayoutLMv3Encoder.forward  s     9=8X8X$&&|4^b373R3RT))$/X\
 JJ 	L(  %	
 M	 ??r2   )Tr      r   )rH   rI   rJ   r    r  r  r  r   r   rE   rL   rM   s   @r1   r   r   t  s@    j(."< @ +,@r2   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )r   c                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y rF   )r   r    r(   r   r+   intermediate_sizer   r!   
hidden_actstrr   intermediate_act_fnrp   s     r1   r    zLayoutLMv3Intermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r2   r   r   c                 J    | j                  |      }| j                  |      }|S rF   )r   r)  )r-   r   s     r1   rE   zLayoutLMv3Intermediate.forward  s&    

=100?r2   r   rM   s   @r1   r   r     s#    9U\\ ell r2   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )r   c                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r   r    r(   r   r&  r+   r   r^   r_   r`   ra   rb   rp   s     r1   r    zLayoutLMv3Output.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r2   r   r   r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rF   r   r   s      r1   rE   zLayoutLMv3Output.forward  r   r2   r   rM   s   @r1   r   r     r   r2   r   c                   d     e Zd ZU eed<   dZdZeedZ	 e
j                          fd       Z xZS )LayoutLMv3PreTrainedModelr.   
layoutlmv3)imagetext)r   
attentionsc                 h   t         |   |       t        |t              r| j                  j
                  r>t        j                  |j                         t        j                  |j                         t        |d      rGt        j                  |j                  |j                  |j                  |j                  f             yyt        |t              rZt        j                  |j                   t#        j$                  |j                   j&                  d         j)                  d             yy)zInitialize the weightsvisual_bboxr/   r4   rU   N)r   _init_weightsr!   LayoutLMv3Modelr.   visual_embedinitzeros_	cls_token	pos_embedr   copy_r5  create_visual_bboxr7   rO   rT   rd   re   r;   rg   )r-   moduler0   s     r1   r7  z'LayoutLMv3PreTrainedModel._init_weights  s     	f%fo.{{''F,,-F,,-v}-

6--v/H/HU[U`U`bhbmbmTn/H/op . 89JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh :r2   )rH   rI   rJ   r   __annotations__base_model_prefixinput_modalitiesr   r   _can_record_outputsrd   r  r7  rL   rM   s   @r1   r/  r/    s=    $(,;KbcU]]_
i 
ir2   r/  c                   Z    e Zd Z fdZd Zd ZddZd Zd Ze	e
e	 	 	 	 	 	 	 ddej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dee   deez  fd                     Z xZS )r8  c                 @   t         |   |       || _        |j                  rt	        |      | _        |j                  rt        |      | _        t        |j                  |j                  z        | _        t        j                  t        j                   dd|j"                              | _        t        j                  t        j                   d| j                  | j                  z  dz   |j"                              | _        t        j(                  d      | _        t        j,                  |j"                  |j.                        | _        t        j(                  |j0                        | _        | j                  j4                  s| j                  j6                  r:| j9                  d| j;                  | j                  | j                  f      d       t        j,                  |j"                  d	      | _        t?        |      | _         | jC                          y )
Nr   g        )prR   r5  r6  FrV   gư>)"r   r    r.   
text_embedrO   rB   r9  r   patch_embedr   r"   r&   r7   r(   	Parameterrd   r   r+   r<  r=  r`   pos_dropr^   r_   ra   rb   r   r   rc   r?  normr   encoder	post_initrp   s     r1   r    zLayoutLMv3Model.__init__  s    6v>DO  9@DF--0A0AABDI\\%++aF<N<N*OPDN\\%++aTYY9NQR9RTZTfTf*ghDNJJ-DM\\&*<*<&BWBWXDN::f&@&@ADL{{66$++:`:`$$!4#:#:tyyRVR[R[F\#:#]jo %  V%7%7TBDI(0r2   c                 .    | j                   j                  S rF   rB   r[   r-   s    r1   get_input_embeddingsz$LayoutLMv3Model.get_input_embeddings:  s    ...r2   c                 &    || j                   _        y rF   rP  r-   r   s     r1   set_input_embeddingsz$LayoutLMv3Model.set_input_embeddings=  s    */'r2   c           	         t        j                  t        j                  d||d   dz   z  |      |d   d      }t        j                  t        j                  d||d   dz   z  |      |d   d      }t        j                  |dd j	                  |d   d      |dd j	                  |d   d      j                  dd      |dd j	                  |d   d      |dd j	                  |d   d      j                  dd      gd      j                  dd      }t        j                  dd|dz
  |dz
  gg      }t        j                  ||gd      S )	zJ
        Create the bounding boxes for the visual (patch) tokens.
        r   r   trunc)rounding_modeNr4   rr      )	rd   divre   stackrepeatr?   r9   tensorrv   )r-   r/   max_lenvisual_bbox_xvisual_bbox_yr5  cls_token_boxs          r1   r?  z"LayoutLMv3Model.create_visual_bbox@  s\    		LLGz!}q'897CZPQ]bi
 		LLGz!}q'897CZPQ]bi
 kkcr"))*Q-;cr"))*Q-;EEaKab!((A:ab!((A:DDQJ	 
 $r1+ 	 ueWq['A+&N%OPyy-51==r2   c                     | j                   j                  |dd      }|j                  |      j                  |      }|S r   )r5  r\  r   type)r-   r   r   r   r5  s        r1   calculate_visual_bboxz%LayoutLMv3Model.calculate_visual_bboxW  s;    &&--j!Q?!nnV,11%8r2   c                 6   | j                  |      }|j                         \  }}}| j                  j                  |dd      }t	        j
                  ||fd      }| j                  || j                  z   }| j                  |      }| j                  |      }|S )Nr4   r   rr   )	rI  r7   r<  rg   rd   rv   r=  rK  rL  )r-   r@   rB   r   seq_lenr   
cls_tokenss          r1   forward_imagezLayoutLMv3Model.forward_image\  s    %%l3
 ",!2
GQ^^**:r2>
YY
J7Q?
 >>%#dnn4J]]:.
YYz*
r2   Nr   rw   r   r   rT   r   r@   r   r   c           	      6   |"|j                         }	|	\  }
}|j                  }nL|%|j                         dd }	|	\  }
}|j                  }n%|t        |      }
|j                  }nt        d      |||t	        j
                  |
f|      }|&t	        j                  	t        j                  |      }|<t	        j                  t        t        	      dgz         t        j                  |      }| j                  |||||      }dx}}dx}}|&t        |j                  d   | j                  j                  z        t        |j                  d	   | j                  j                  z        }}| j                  |      }t	        j
                  |
|j                  d
   ft        j                  |      }|t	        j                   ||gd
      }n|}| j                  j"                  s| j                  j$                  r| j                  j$                  r@| j'                  |t        j                  |
      }|t	        j                   ||gd
      }n|}t	        j(                  d|j                  d
   t        j                  |      j+                  |
d
      }||Ut	        j(                  d	d
   |      j-                  d      }|j/                  |	      }t	        j                   ||gd
      }n|}||t	        j                   |gd
      }n|}| j1                  |      }| j3                  |      }n| j                  j"                  s| j                  j$                  rc| j                  j$                  r|}| j                  j"                  r5| j                  j4                  ddd	d
   f   }|j7                  |      }|}| j9                  |dj:                        } | j<                  |f|||||d|}|j>                  }tA        |      S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, token_sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        bbox (`torch.LongTensor` of shape `(batch_size, token_sequence_length, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.
        token_type_ids (`torch.LongTensor` of shape `(batch_size, token_sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, token_sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, token_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModel
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]

        >>> encoding = processor(image, words, boxes=boxes, return_tensors="pt")

        >>> outputs = model(**encoding)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr4   zEYou have to specify either input_ids or inputs_embeds or pixel_values)r   r   rY  )r   rw   rT   r   r   r5   r   r   rr   )r   r   r   )r   )rw   rT   r   rC   rD   r   )!r7   r   lenr   rd   onesr   r   tuplelistrB   r   r;   r.   r&   rh  rv   r   r   rd  re   r\  r   rg   r^   rb   rT   	expand_asget_extended_attention_maskr   rM  r!  r
   )r-   r   rw   r   r   rT   r   r@   r   r   r   
seq_lengthr   embedding_output
final_bboxfinal_position_idsrC   rD   visual_embeddingsvisual_attention_maskr5  visual_position_idsextended_attention_maskencoder_outputssequence_outputs                            r1   rE   zLayoutLMv3Model.forwardm  s   Z  #..*K%0"J
%%F&',,.s3K%0"J
"))F%\*J!((Fdee M$=%!&j*-Ev!V%!&[

SY!Z|{{5k):aS)@#A\bc##)-+  /   +/.
'%)){#,,,Q/$++2H2HHI,,,Q/$++2H2HHI &L !% 2 2< @$)JJ.44Q78

SY%! )!&N<Q+RXY!Z!6{{66$++:`:`;;99"&"<"<V5::bl"<"mK'%*YYk/B%J
%0
&+ll(..q1F'&Q' $ (M,E#(<<;q>&#Q#[#[\]#^L#/#6#6{#CL).LBU3V\])^&)<&$(A#(99.>@Q-RXY#Z #4 #~~.>?#||,<=[[448^8^{{55!
{{66#;;A?OQ?O<OP+55i@%1"040P0PD(8(>(> 1Q 1
 '$,,
+2%#
 
 *;;-
 	
r2   ))   rz  i  )NNNNNNN)rH   rI   rJ   r    rR  rU  r?  rd  rh  r   r   r   rd   
LongTensorFloatTensorr   r   rl  r
   rE   rL   rM   s   @r1   r8  r8    s   >/0>.
"   .2(,3726042615k
##d*k
 %k
 ))D0	k

 ((4/k
 &&-k
 ((4/k
 ''$.k
 +,k
 
	 k
    k
r2   r8  c                   *     e Zd ZdZd fd	Zd Z xZS )LayoutLMv3ClassificationHeadz\
    Head for sentence-level classification tasks. Reference: RobertaClassificationHead
    c                    t         |           || _        |r3t        j                  |j
                  dz  |j
                        | _        n/t        j                  |j
                  |j
                        | _        |j                  |j                  n|j                  }t        j                  |      | _
        t        j                  |j
                  |j                        | _        y )Nr   )r   r    pool_featurer(   r   r+   r   classifier_dropoutra   r`   rb   
num_labelsout_proj)r-   r.   r  r  r0   s       r1   r    z%LayoutLMv3ClassificationHead.__init__#  s    (6#5#5#96;M;MNDJ6#5#5v7I7IJDJ)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHr2   c                     | j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S rF   )rb   r   rd   tanhr  )r-   xs     r1   rE   z$LayoutLMv3ClassificationHead.forward0  sI    LLOJJqMJJqMLLOMM!r2   )FrG   rM   s   @r1   r~  r~    s    Ir2   r~  a  
    LayoutLMv3 Model with a token classification head on top (a linear layer on top of the final hidden states) e.g.
    for sequence labeling (information extraction) tasks such as [FUNSD](https://guillaumejaume.github.io/FUNSD/),
    [SROIE](https://rrc.cvc.uab.es/?ch=13), [CORD](https://github.com/clovaai/cord) and
    [Kleister-NDA](https://github.com/applicaai/kleister-nda).
    )custom_introc                   \    e Zd Z fdZd Zd Zee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dee   deez  fd              Z xZS ) LayoutLMv3ForTokenClassificationc                 p   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        |j                  dk  r0t        j                  |j                  |j                        | _        nt        |d      | _        | j                          y )N
   Fr  )r   r    r  r8  r0  r(   r`   ra   rb   r   r+   
classifierr~  rN  rp   s     r1   r    z)LayoutLMv3ForTokenClassification.__init__B  s      ++)&1zz&"<"<=r! ii(:(:F<M<MNDO:6PUVDOr2   c                 6    | j                   j                         S rF   r0  rR  rQ  s    r1   rR  z5LayoutLMv3ForTokenClassification.get_input_embeddingsO      3355r2   c                 :    | j                   j                  |       y rF   r0  rU  rT  s     r1   rU  z5LayoutLMv3ForTokenClassification.set_input_embeddingsR      ,,U3r2   Nr   rw   r   r   rT   r   labelsr@   r   r   c	           
          | j                   |f||||||d|	}
||j                         }n|j                         dd }|d   }|
d   ddd|f   }| j                  |      }| j                  |      }d}|<t	               } ||j                  d| j                        |j                  d            }t        |||
j                  |
j                        S )a!  
        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForTokenClassification
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = AutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]
        >>> word_labels = example["ner_tags"]

        >>> encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")

        >>> outputs = model(**encoding)
        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```)rw   r   r   rT   r   r@   Nr4   r   r   losslogitsr   r3  )
r0  r7   rb   r  r   r9   r  r   r   r3  )r-   r   rw   r   r   rT   r   r  r@   r   outputsr   rp  ry  r  r  loss_fcts                    r1   rE   z(LayoutLMv3ForTokenClassification.forwardU  s    Z "$//	
))%'%	
 	
  #..*K',,.s3K ^
!!*Q^4,,71')HFKKDOO<fkk"oND$!//))	
 	
r2   NNNNNNNN)rH   rI   rJ   r    rR  rU  r   r   rd   r{  r|  r   r   rl  r   rE   rL   rM   s   @r1   r  r  9  s   64  .2(,37260426*.04J
##d*J
 %J
 ))D0	J

 ((4/J
 &&-J
 ((4/J
   4'J
 &&-J
 +,J
 
&	&J
  J
r2   r  c                   |    e Zd Z fdZd Zd Zee	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dee   deez  fd              Z xZS )LayoutLMv3ForQuestionAnsweringc                     t         |   |       |j                  | _        t        |      | _        t        |d      | _        | j                          y NFr  )r   r    r  r8  r0  r~  
qa_outputsrN  rp   s     r1   r    z'LayoutLMv3ForQuestionAnswering.__init__  sA      ++)&16vERr2   c                 6    | j                   j                         S rF   r  rQ  s    r1   rR  z3LayoutLMv3ForQuestionAnswering.get_input_embeddings  r  r2   c                 :    | j                   j                  |       y rF   r  rT  s     r1   rU  z3LayoutLMv3ForQuestionAnswering.set_input_embeddings  r  r2   Nr   r   r   rT   r   start_positionsend_positionsrw   r@   r   r   c
           
          | j                   |f||||||	d|
}|d   }| j                  |      }|j                  dd      \  }}|j                  d      j	                         }|j                  d      j	                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }t        ||||j                  |j                  	      S )
a  
        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForQuestionAnswering
        >>> from datasets import load_dataset
        >>> import torch

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = AutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> question = "what's his name?"
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]

        >>> encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")
        >>> start_positions = torch.tensor([1])
        >>> end_positions = torch.tensor([3])

        >>> outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)
        >>> loss = outputs.loss
        >>> start_scores = outputs.start_logits
        >>> end_scores = outputs.end_logits
        ```r   r   rT   r   rw   r@   r   r   r4   rr   N)ignore_indexr5   )r  start_logits
end_logitsr   r3  )r0  r  splitsqueezer   rj  r7   clampr   r   r   r3  )r-   r   r   r   rT   r   r  r  rw   r@   r   r  ry  r  r  r  
total_lossignored_indexr  
start_lossend_losss                        r1   rE   z&LayoutLMv3ForQuestionAnswering.forward  s   ` $34??	$
))%'%	$
 	$
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J+%!!//))
 	
r2   )	NNNNNNNNN)rH   rI   rJ   r    rR  rU  r   r   rd   r{  r|  r   r   rl  r   rE   rL   rM   s   @r1   r  r    s3   64  .2372604263715(,04W
##d*W
 ))D0W
 ((4/	W

 &&-W
 ((4/W
 ))D0W
 ''$.W
 %W
 &&-W
 +,W
 
-	-W
  W
r2   r  a
  
    LayoutLMv3 Model with a sequence classification head on top (a linear layer on top of the final hidden state of the
    [CLS] token) e.g. for document image classification tasks such as the
    [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
    c                   \    e Zd Z fdZd Zd Zee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dee   deez  fd              Z xZS )#LayoutLMv3ForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |      | _        t        |d      | _        | j                          y r  )	r   r    r  r.   r8  r0  r~  r  rN  rp   s     r1   r    z,LayoutLMv3ForSequenceClassification.__init__  sH      ++)&16vERr2   c                 6    | j                   j                         S rF   r  rQ  s    r1   rR  z8LayoutLMv3ForSequenceClassification.get_input_embeddings"  r  r2   c                 :    | j                   j                  |       y rF   r  rT  s     r1   rU  z8LayoutLMv3ForSequenceClassification.set_input_embeddings%  r  r2   Nr   r   r   rT   r   r  rw   r@   r   r   c	           
          | j                   |f||||||d|	}
|
d   dddddf   }| j                  |      }d}|| j                  j                  | j                  dk(  rd| j                  _        nl| j                  dkD  rL|j
                  t        j                  k(  s|j
                  t        j                  k(  rd| j                  _        nd| j                  _        | j                  j                  dk(  rIt               }| j                  dk(  r& ||j                         |j                               }n |||      }n| j                  j                  dk(  r=t               } ||j                  d| j                        |j                  d            }n,| j                  j                  dk(  rt               } |||      }t        |||
j                  |
j                   	      S )
a_  
        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForSequenceClassification
        >>> from datasets import load_dataset
        >>> import torch

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]

        >>> encoding = processor(image, words, boxes=boxes, return_tensors="pt")
        >>> sequence_label = torch.tensor([1])

        >>> outputs = model(**encoding, labels=sequence_label)
        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```r  r   Nr   
regressionsingle_label_classificationmulti_label_classificationr4   r  )r0  r  r.   problem_typer  r   rd   r   r   r   r  r   r9   r   r   r   r3  )r-   r   r   r   rT   r   r  rw   r@   r   r  ry  r  r  r  s                  r1   rE   z+LayoutLMv3ForSequenceClassification.forward(  s   X $34??	$
))%'%	$
 	$
 "!*Q1W-1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
r2   r  )rH   rI   rJ   r    rR  rU  r   r   rd   r{  r|  r   r   rl  r   rE   rL   rM   s   @r1   r  r    s   64  .237260426*.(,04T
##d*T
 ))D0T
 ((4/	T

 &&-T
 ((4/T
   4'T
 %T
 &&-T
 +,T
 
)	)T
  T
r2   r  )r  r  r  r8  r/  )>rK   r#   r   rd   torch.nnr(   torch.nn.functional
functionalr<   r   r   r    r   r:  activationsr   modeling_layersr	   modeling_outputsr
   r   r   r   modeling_utilsr   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_layoutlmv3r   
get_loggerrH   loggerModuler   rO   r   r   r   r   r   r   r   r/  r8  r~  r  r  r  __all__ r2   r1   <module>r     s           A A & ! 9  . & 6  L 5 6 
		H	% 		  Fsryy slV.bii V.t299  "))  6"0 "Jn@		 n@dRYY  ryy  i i i( A
/ A
 A
H299 6 `
'@ `
`
F i
%> i
 i
X f
*C f
f
Rr2   