
    iJ                    |   d Z ddlZddlZddlmZ ddlZddlmZ ddlmZ ddl	m
Z ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&  e$jN                  e(      Z)dQdejT                  de+de+de+dejT                  f
dZ,dejT                  de+de+dejT                  fdZ-dQdejT                  de+de+de+dejT                  f
dZ.de+dejT                  fdZ/dejT                  de+dejT                  fdZ0dejT                  de+d ejb                  dejT                  fd!Z2dejT                  d"e+de3ejT                  ejT                  f   fd#Z4dejT                  d"e+dejT                  fd$Z5d%ejT                  d&ejT                  d'e+dejT                  fd(Z6 G d) d*ejn                        Z8	 dd+l9m:Z: e:Z8e)jw                  d,        G d. d/ejn                        Z? G d0 d1ejn                        Z@ G d2 d3ejn                        ZA G d4 d5ejn                        ZB G d6 d7ejn                        ZC G d8 d9ejn                        ZD G d: d;ejn                        ZE G d< d=ejn                        ZF G d> d?ejn                        ZG G d@ dAejn                        ZH G dB dCe      ZIe" G dD dEe             ZJ G dF dGeJ      ZKe" G dH dIeJ             ZL e"dJK       G dL dMeJe             ZMe" G dN dOeJ             ZNg dPZOy# e<$ r Y =e=$ r e)j}                  d-       Y Uw xY w)RzPyTorch LongT5 model.    N)Any)nn)CrossEntropyLoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torchdynamo_compilinglogging   )LongT5Configx	block_lendim	pad_valuereturnc                 t   | j                   |    |z  }t        | j                         sCt        | j                         }||xx   |z  cc<   t        j                  || j
                        S dg| j                  z  }d|f||<   t        |ddd   d      }t        j                  j                  | |d|      } | S )	zHPad a tensor so that a sequence length will be a multiple of `block_len`dtyper   r   r   N constantpadmodevalue)shapealllisttorchzerosr"   ndimsumr   
functionalr(   )r   r   r   r   pad_len	new_shaper(   s          {/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/longt5/modeling_longt5.py_pad_to_multipler6   5   s    wws|mi'Gqww<M	#'!{{9AGG44(QVV
C7|CH
c$B$i
C
!:YGAH    c                 >   | j                   |   |z  dk7  rt        | ||d      } | j                   |   |z  }| j                   d| ||fz   | j                   |dz   d z   }d|v r,t        j                  || j                  | j
                        S | j                  |      S )zSplit an input tensor into blocks of a given `block_len` along the given `dim`. If the dimension length
    is not a multiple of `block_len`, it will be padded first with selected `pad_value`.
    r   )r   Nr   r"   device)r+   r6   r.   emptyr"   r:   reshape)r   r   r   
num_blocksoutput_shapes        r5   _split_into_blocksr?   E   s    
 	wws|i1$Q	3!<*J774C=J	#::QWWcAg[=QQLL{{<qwwqxxHH99\""r7   	block_dimsequence_dimc                    | j                   |   }dg| j                  z  }d||<   t        |ddd   d      }t        j                  j                  | |d|      } g }t        d      D ]M  }t        d	d      g| j                  z  }t        |||z         ||<   t        |      }|j                  | |          O t        j                  ||
      S )zConcatenate three consecutive blocks for each input block for local attentiont.

    For more information, see: https://huggingface.co/papers/2112.07916.
    r#   )r   r   Nr$   r%   r&   r'   r   r   r   )r+   r0   r1   r   r2   r(   rangeslicetupleappendr.   cat)	r   r@   rA   r   r=   r(   blocks_listiindicess	            r5   _concatenate_3_blocksrL   T   s    
 #J(QVV
CC	N
c$B$i
C
!:YGA&(K1X ' D>"QVV+"1a*n5	.1W:&' 99[l33r7   c                     t        j                  d| z  t         j                        }|| |   }|j                  d      |j                  d      z
  }|S )z:Makes 3-blocked relative position ids for local attention.r   r!   r   r   )r.   arangeint32	unsqueeze)r   position_idscenter_position_idsrelative_position_idss       r5   "_make_3block_relative_position_idsrT   m   sR    <<IU[[AL&y)<(22158K8U8UVW8XX  r7   local_attention_maskc                     t        |      }t        j                  |      |k  }|ddddddf   }|j                  | j                        }t        j
                  | |      S )znMask local attention mask to enforce that tokens are not allowed to attend tokens farther than ``local_radius.N)rT   r.   abstor:   logical_and)rU   r   rS   locality_masks       r5   _mask_local_attention_maskr[   v   s_    >yIII34y@M!$a"23M!$$%9%@%@AM1=AAr7   attention_maskr:   c                    t        | |d      }t        |dd      }|j                  d      }|j                  d      }t        j                  ||      }t        ||      }|j                  d      j                  |      S )z;Prepare attention mask to be applied for a local attention.r   rC      r@   rA   r$   )r?   rL   rP   r.   rY   r[   rX   )r\   r   r:   _blocked_attention_mask_3blocked_attention_maskrU   s         r5   _get_local_attention_maskrc      s     1PQR45LXYhij5??C7AA"E ,,-DF^_56JIV))!,//77r7   global_block_sizec                    | j                   dd \  }dt        j                  dt        j                  ffd}t        j                  | | j                        z  }t        j
                  |d      |z
  }t        j                  | d	k7  d
d      j                  | j                        }t        j                  ||z   d
z
        j                  | j                        }t        j                  d|j                  |j                        }t        j                  ||kD  ||      }|| z  | dz
  z   } ||      }z  }|dkD  rBt        j                  |d      j                  j                  |d      j                  dd      }	n-t        j                  |d|j                  |j                        }	t        j
                  t        j                   ||      d      dz
  }
|
j#                  | j                        }
t        j                  |
|	k  dd      }
|j                  t        j$                        |
j                  t        j$                        fS )a  Obtain the "fixed block" global id corresponding to each input token.

    This implementation is a simplified version of the original Flaxformr implementation adopted from:
    https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.

    In our scenario, as we use this strategy only for a decoder, orphan tokens, i.e. those tokens which do not make for
    the whole fixed block, are assigned to the preceding block.

    Padding tokens from the original sequence are represented by -1.
    Nr^   	block_idsr   c                 X   t        j                        z  dz
  k(  }|j                  | j                        }t        j                  || dk\        }|j                  d      j                  d      j                  | j                        dz
  }t        j                  | |k  | |      } | S )Nr   r   r$   )
r.   rN   rX   r:   rY   r1   rP   typer"   where)rf   
block_endstrue_block_endsfull_blocksrd   seq_lens       r5   handle_orphan_tokensz:_make_global_fixed_block_ids.<locals>.handle_orphan_tokens   s    ll7+.??DUXYDYY
]]9#3#34
++J	QG%))"-77;@@QTUUKK	K 7KP	r7   r:   r   )axis              ?g     @r$   r9   r   rC   )r+   r.   Tensor	ones_liker:   cumsumri   rh   r"   floortensormaxvaluesrepeat	transposer/   onesrX   int)r\   rd   
batch_sizern   fixed_block_maskmaskglobal_block_ids_global_block_ids_lower_boundnum_globals_sequence_block_ids_maxglobal_segment_idsrm   s    `         @r5   _make_global_fixed_block_idsr      s    )..r2J   ~n>S>STWhh||$41=@PP;;~,c7;@@AUAUVD{{4*:#:S#@AFF~G[G[\$)LL;K;Q;QZjZqZq$r!{{88:JLi )>9nq>PQ+,<=..KQ"')),<""E"L"L"S"ST_ab"c"m"mnoqr"s"'++!1!7!7@P@W@W#
 ejj[&IrRUVV+..~/D/DE%7;R%RTUWXY  +-?-D-DUYY-OOOr7   c                     t        | |      \  }}|j                  d   }t        j                  ||j                        }||d   z
  }|j                  t        j                        S )zBCreate the relative position tensor for local -> global attention.r$   ro   .N)r   r+   r.   rN   r:   rh   int64)r\   rd   rf   r   global_seq_lenglobal_positionsside_relative_positions          r5    _make_side_relative_position_idsr      sa    $@Qb$c!I!'--b1N||N9;K;KL-	)0DD!&&u{{33r7   hidden_statesrf   r   c                 x   |j                  |dk\  t        j                  ||j                  |j                              }t
        j                  j                  |j                  t        j                        |dz         ddddddf   }t        j                  d| |j                  | j                              S )zFCompute individual block aggregates by summing over individual blocks.r   r9   r   Nr$   z...nd,...ng->...gd)ri   r.   rw   r"   r:   r   r2   one_hotrh   r   einsum)r   rf   r   one_hot_block_idss       r5   _create_global_aggregatesr      s    
 Q^9??S\ScScdI --innU[[.I>\]K]^_`bcehfheh_hi<<,m=N=S=STaTgTg=hiir7   c                   &     e Zd Zd fd	Zd Z xZS )LongT5LayerNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zg
        Construct a layernorm module in the LongT5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parameterr.   r|   weightvariance_epsilon)selfhidden_sizeeps	__class__s      r5   r   zLongT5LayerNorm.__init__   s1     	ll5::k#:; #r7   c                    |j                  t        j                        j                  d      j	                  dd      }|t        j
                  || j                  z         z  }| j                  j                  t        j                  t        j                  fv r%|j                  | j                  j                        }| j                  |z  S )Nr^   r$   T)keepdim)rX   r.   float32powmeanrsqrtr   r   r"   float16bfloat16)r   r   variances      r5   forwardzLongT5LayerNorm.forward   s     !##EMM266q9>>r4>P%Ht?T?T4T(UU ;; ??),,T[[->->?M{{]**r7   )gư>)__name__
__module____qualname__r   r   __classcell__r   s   @r5   r   r      s    $+r7   r   )FusedRMSNormzSDiscovered apex.normalization.FusedRMSNorm - will use it instead of LongT5LayerNormzFdiscovered apex but it failed to load, falling back to LongT5LayerNormc                   *     e Zd Zdef fdZd Z xZS )LongT5DenseActDenseconfigc                 ^   t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _
        t        |j                     | _        y NFbias)r   r   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr   dense_act_fnactr   r   r   s     r5   r   zLongT5DenseActDense.__init__   sn    ))FNNFKKeD))FKKeDzz&"5"56&--.r7   c                    | j                  |      }| j                  |      }| j                  |      }t        | j                  j
                  t        j                        r|j                  | j                  j
                  j                  k7  r`| j                  j
                  j                  t        j                  k7  r/|j                  | j                  j
                  j                        }| j	                  |      }|S N)r   r   r   
isinstancer   r   r.   rs   r"   int8rX   )r   r   s     r5   r   zLongT5DenseActDense.forward  s    ./]3tww~~u||4##tww~~';';;$$

2),,TWW^^-A-ABM.r7   r   r   r   r   r   r   r   r   s   @r5   r   r      s    /| /r7   r   c                   *     e Zd Zdef fdZd Z xZS )LongT5DenseGatedActDenser   c                    t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _        t        |j                     | _        y r   )r   r   r   r   r   r   wi_0wi_1r   r   r   r   r   r   r   r   s     r5   r   z!LongT5DenseGatedActDense.__init__  s    IIfnnfkkF	IIfnnfkkF	))FKKeDzz&"5"56&--.r7   c                     | j                  | j                  |            }| j                  |      }||z  }| j                  |      }| j	                  |      }|S r   )r   r   r   r   r   )r   r   hidden_geluhidden_linears       r5   r   z LongT5DenseGatedActDense.forward  sS    hhtyy78		-0#m3]3.r7   r   r   s   @r5   r   r     s    /| /r7   r   c                   *     e Zd Zdef fdZd Z xZS )LongT5LayerFFr   c                    t         |           |j                  rt        |      | _        nt        |      | _        t        |j                  |j                        | _	        t        j                  |j                        | _        y )Nr   )r   r   is_gated_actr   DenseReluDenser   r   r   layer_norm_epsilon
layer_normr   r   r   r   r   s     r5   r   zLongT5LayerFF.__init__'  s_    ":6"BD"5f"=D)&..f>W>WXzz&"5"56r7   c                 r    | j                  |      }| j                  |      }|| j                  |      z   }|S r   )r   r   r   )r   r   forwarded_statess      r5   r   zLongT5LayerFF.forward1  s=    ??=9../?@%5E(FFr7   r   r   s   @r5   r   r   &  s    7| 7r7   r   c                   `     e Zd Z	 	 ddededz  f fdZed	d       Zd
dZ	 	 	 	 	 ddZ	 xZ
S )LongT5AttentionNr   	layer_idxc                    t         |           |j                  | _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        |j                  | _
        |j                  | _        | j                  | j                  z  | _        || _        |9| j                  r-t        j!                  d| j"                  j$                   d       t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        | j                  r/t'        j2                  | j                  | j                        | _        d| _        y )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr   )r   r   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancer   d_kvkey_value_proj_dim	num_headsn_headsr   r   	inner_dimr   loggerwarning_oncer   r   r   r   qkvo	Embeddingrelative_attention_biasgradient_checkpointingr   r   r   r   r   s       r5   r   zLongT5Attention.__init__:  si    	 +++F(.4.S.S+/5/U/U,~~"(++''**(?(??"*4>>+B+B*C D, , 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(&+#r7   c                 T   d}|rC|dz  }|| dkD  j                  t        j                        |z  z  }t        j                  |       } n*t        j                  | t        j
                  |              } |dz  }| |k  }|t        j                  | j                         |z        t        j                  ||z        z  ||z
  z  j                  t        j                        z   }t        j                  |t        j                  ||dz
              }|t        j                  || |      z  }|S a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r^   r   rX   r.   longrW   min
zeros_likelogfloatmath	full_likeri   relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r5   _relative_position_bucketz)LongT5Attention._relative_position_bucket\  s(   , AK!2Q!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$y0 &/II'--/);<hh|i/01Y&( "UZZ.	&"
 &+YY&8RT_bcTc(d&
" 	EKK2CE_``r7   c                    | | j                   j                  j                  }t        j                  |t        j
                  |      dddf   |z   }t        j                  |t        j
                  |      dddf   }||z
  }| j                  || j                   | j                  | j                        }| j                  |      }	|	j                  g d      j                  d      }	|	S )%Compute binned relative position biasNr9   r   r   r   r^   r   r   r   )r   r   r:   r.   rN   r   r  r   r   r   permuterP   )
r   query_length
key_lengthr:   past_seen_tokenscontext_positionmemory_positionr   relative_position_bucketry   s
             r5   compute_biaszLongT5Attention.compute_bias  s    >1188??F <<EJJvVWXZ^W^_brr,,zFSTXZ[T[\+.>>#'#A#A#.;;==	 $B $
  --.FG	*44Q7r7   c                    |j                   dd }g |d| j                  }	||j                  | j                        nd}
t	        |
t
        j                        r|
j                         n|
}
|du}| j                  |      j                  |	      j                  dd      }d}t	        |t              rA|j                  j                  | j                        }|r|j                  }n|j                  }n|}|r|n|}|rK|I|rG|j                   | j                     j"                  }|j                   | j                     j$                  }ng |j                   dd d| j                  }| j'                  |      j                  |      j                  dd      }| j)                  |      j                  |      j                  dd      }|K|j+                  ||| j                        \  }}|r)t	        |t              rd|j                  | j                  <   t        j,                  ||j                  dd            }||j                   d	   }| j.                  s`t        j0                  d|j                   d   |d   |f|j2                  |j4                  
      }| j6                  r6| j8                  r*d|_        n"| j=                  |d   ||j2                  |
      }|#|ddddddd|j                   d	   f   }||z   }|}||z  }t>        j@                  jC                  |jE                         d      jG                  |      }t>        j@                  jI                  || jH                  | j8                        }t        j,                  ||      }|j                  dd      jK                         } |jL                  g |d }| jO                  |      }||f}|r||fz   }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nr$   r   r   r^   FTr   r`   r:   r"   )r:   r  rC   ptraining)(r+   r   get_seq_lengthr   r   r.   rs   cloner   viewr{   r   
is_updatedgetcross_attention_cacheself_attention_cachelayerskeysry   r   r   updatematmulr   r/   r:   r"   r   r  requires_gradr  r   r2   softmaxr   type_asr   
contiguousr<   r   )r   r   r   key_value_statesposition_biaspast_key_valuesoutput_attentionskwargsinput_shapehidden_shaper  is_cross_attentionquery_statesr  curr_past_key_valuescurrent_states
key_statesvalue_stateskv_shapescoresr
  causal_maskposition_bias_maskedattn_weightsattn_outputoutputss                             r5   r   zLongT5Attention.forward  s    $))#2.BBbB$*A*ABM\Mh?99$..Ino7ABRTYT`T`7a+113gw .T9vvm,11,?II!QO 
o':;(3377GJ!'6'L'L$'6'K'K$#2 -?)]/"=*-44T^^DIIJ/66t~~FMMLP--cr2PBP8O8OPH/44X>HHANJ66.166x@JJ1aPL*+?+F+FzS_aeaoao+p(
L%*_FY*ZAEO..t~~> lJ,@,@A,FG #))"-J33 %**1-{1~zJSYS`S`hnhtht! ..4==26M/ $ 1 1NJv}}Wg !2 ! "1a,Bj.>.>r.B,B#BC - ;,&& }},,V\\^,DLLVT}},,\T\\TXTaTa,bll<>!++Aq1<<>)k));;;;ff[)./Gr7   FNT       )Nr   )NNNNF)r   r   r   r   r}   r   staticmethodr  r  r   r   r   s   @r5   r   r   9  sX     %* $	 , , :	 ,D -  - ^( [r7   r   c                   Z     e Zd Zd
dededdf fdZedd       ZdefdZ		 	 	 dd	Z
 xZS )LongT5LocalAttentionr   r   r   Nc                    t         |           |j                  | _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        |j                  | _
        |j                  | _        | j                  dz   | _        |j                  | _        | j                  | j                  z  | _        t!        j"                  | j                  | j                  d      | _        t!        j"                  | j                  | j                  d      | _        t!        j"                  | j                  | j                  d      | _        t!        j"                  | j                  | j                  d      | _        | j                  r/t!        j,                  | j                  | j                        | _        d| _        y )Nr   Fr   )r   r   r   r   r   r   r   r   r   r   r   local_radiusr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   s      r5   r   zLongT5LocalAttention.__init__  sG    +++F(.4.S.S+/5/U/U,~~"(++''"//**Q.**(?(??4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(&+#r7   c                 T   d}|rC|dz  }|| dkD  j                  t        j                        |z  z  }t        j                  |       } n*t        j                  | t        j
                  |              } |dz  }| |k  }|t        j                  | j                         |z        t        j                  ||z        z  ||z
  z  j                  t        j                        z   }t        j                  |t        j                  ||dz
              }|t        j                  || |      z  }|S r   r   r   s           r5   r  z.LongT5LocalAttention._relative_position_bucket  (   . AK!2Q!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$y0 &/II'--/);<hh|i/01Y&( "UZZ.	&"
 &+YY&8RT_bcTc(d&
" 	EKK2CE_``r7   block_lengthc                    | j                   j                  j                  j                  dk7  r | j                   j                  j                  nd}t	        j
                  d|z  t        j                  |      }|||  }|dddf   |dddf   z
  }| j                  || j                   | j                  | j                        }| j                  |      }|j                  g d      j                  d      j                  d      }|S r  metaNr   r9   r  r  r   r   r   r:   rh   r.   rN   r   r  r   r   r   r  rP   r   rD  target_devicer  r  r   r  ry   s           r5   r  z!LongT5LocalAttention.compute_biasE      ++2299>>&H ((//66 	
  ,,q<'7uzzR_`*<F ,D!G47G47PP#'#A#A#.;;==	 $B $
  --.FG	*44Q7AA!Dr7   c                     |j                   d d \  } fd} fd} | j                  |            } | j                  |            }	 | j                  |            }
t	        | j
                  d      }t	        |	 j
                  d      }	t	        |
 j
                  d      }
t        |	dd      }	t        |
dd      }
t        j                  d||	      }|ʉ j                  srt        j                  dd j                   j
                  d j
                  z  f|j                  |j                  	      } j                  r/ j                  r#d
|_        n j#                   j
                        }|/t        j$                  |dkD  dd      }||j'                  dd      z   }||z  }t(        j*                  j-                  |j/                         d      j1                  |      }t(        j*                  j3                  | j2                   j                        }|j5                  |
j                        } |t        j                  d||
            }|d d d |d d f   } j7                  |      }||f}|r||fz   }|S )Nr^   c                 T    | j                  dj                  j                        S 
projectionr$   r  r   r   statesr~   r   s    r5   r+   z+LongT5LocalAttention.forward.<locals>.shapef  "    ;;z2t||T=T=TUUr7   c                 Z    | j                         j                  dj                        S r<   r$   r#  r  r   rQ  s    r5   unshapez-LongT5LocalAttention.forward.<locals>.unshapej  %    $$&++JDNNKKr7   r   rC   r_   ...qhd,...khd->...hqkr   r  Tr   rq       _r$   r  ...hqk,...khd->...qhd)r+   r   r   r   r?   r   rL   r.   r   r   r/   r   r:   r"   r   r  r   r  ri   r{   r   r2   r!  r   r"  r   rh   r   )r   r   r   r%  r'  
seq_lengthr+   rW  r,  r/  r0  r2  r5  r6  r7  r~   s   `              @r5   r   zLongT5LocalAttention.forward]  sQ    "/!4!4Ra!8
J	V	L
 TVVM23466-01
TVVM23 *,AN'
DNNJ
),AN +:QRS
,\QUVW #\:
  33 %4<<T^^9KLU[UbUbjpjvjv! ..4==26M/ $ 1 1$.. A{{4!8S%8 -q!0D D-}},,V\\^,DLLVT}},,\T\\TXTaTa,b#((););<ell+BLR^_`!![j[!"34ff[) 

 /Gr7   Fr9  NNF)r   r   r   r   boolr   r<  r  r}   r  r   r   r   s   @r5   r>  r>    sP    ,| ,$ ,[_ ,0 -  - ^ 6 Gr7   r>  c                        e Zd Zddededdf fdZedd       ZdefdZ	d	e
j                  d
e
j                  de
j                  fdZ	 	 	 ddZ xZS )LongT5TransientGlobalAttentionr   r   r   Nc                    t         |           |j                  | _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        |j                  | _
        |j                  | _        | j                  dz   | _        |j                  | _        |j                  | _        | j                  | j                  z  | _        t#        j$                  | j                  | j                   d      | _        t#        j$                  | j                  | j                   d      | _        t#        j$                  | j                  | j                   d      | _        t#        j$                  | j                   | j                  d      | _        | j                  r/t#        j.                  | j                  | j                        | _        | j                  r/t#        j.                  | j                  | j                        | _        t5        |j                  |j6                        | _        y )Nr   Fr   r   )r   r   r   r   r   r   r   r   r   r   r   r@  r   rd   r   r   r   r   r   r   r   r   r   r   r   global_relative_attention_biasr   r   global_input_layer_normrA  s      r5   r   z'LongT5TransientGlobalAttention.__init__  s    +++F(.4.S.S+/5/U/U,~~"(++''"//**Q.!'!9!9**(?(??4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD( ++24,,t?b?bdhdpdp2qD/'6v~~6KdKd'e$r7   c                 T   d}|rC|dz  }|| dkD  j                  t        j                        |z  z  }t        j                  |       } n*t        j                  | t        j
                  |              } |dz  }| |k  }|t        j                  | j                         |z        t        j                  ||z        z  ||z
  z  j                  t        j                        z   }t        j                  |t        j                  ||dz
              }|t        j                  || |      z  }|S r   r   r   s           r5   r  z8LongT5TransientGlobalAttention._relative_position_bucket  rC  r7   rD  c                    | j                   j                  j                  j                  dk7  r | j                   j                  j                  nd}t	        j
                  d|z  t        j                  |      }|||  }|dddf   |dddf   z
  }| j                  || j                   | j                  | j                        }| j                  |      }|j                  g d      j                  d      j                  d      }|S rF  rH  rI  s           r5   r  z+LongT5TransientGlobalAttention.compute_bias  rK  r7   r   r   c                 v   t        j                  |d   |d d d d d f         d d d df   }t        j                  |dkD  dd      }t        || j                        }| j                  || j                   | j                  | j                        }| j                  |      }|j                  g d      }||z   }|S )Nr   .r   rq   rZ  r  )r   r   r   r^   )r.   eqri   r   rd   r  r   r   r   rc  r  )r   r   r   side_attention_maskattention_side_biasr   side_relative_position_bucket	side_biass           r5   compute_side_biasz0LongT5TransientGlobalAttention.compute_side_bias  s    #hhtI8J1dTU:8VWXY[_adXde#kk*=*A3N!A$H^H^!_(,(F(F"#.;;==	 )G )
% 778UV	 %%l3	1I=""r7   c                 6	    |j                   d d \  } fd} fd}t        ||n!t        j                  |j                   d d        j                        \  }}	|	j                   d   }
t        |||
      } j                  |      } | j                  |            } | j                  |            } | j                  |            } | j                  |            } | j                  |            }t        | j                  d      }t        | j                  d      }t        | j                  d      }t        |dd      }t        |dd      }dg|j                  dz   z  }|j                   d   |d<   |j                  d      j                  |      }|j                  d      j                  |      }t        j                   ||gd      }t        j                   ||gd      }t        j"                  d||      }|<t%        | j                  |j&                        }t        j(                  |d	kD  d
d      }nd }|j j*                  srt        j,                  dd j.                   j                  d j                  z  f|j&                  |j0                        } j2                  r/ j4                  r#d|_        n j9                   j                        }|||j;                  dd      z   }|j=                  |j0                        }|t        j                  |      } j?                  ||	      }t        | j                  d      j;                  dd      }|j=                  |j0                        jA                  |j&                        }t        j                   ||gd      }||z  }tB        jD                  jG                  |jI                         d      jK                  |      }tB        jD                  jM                  | jL                   j4                        }|j=                  |j0                        } |t        j"                  d||            }|d d d |d d f   } jO                  |      }||f}|r||fz   }|S )Nr^   c                 T    | j                  dj                  j                        S rN  rP  rQ  s    r5   r+   z5LongT5TransientGlobalAttention.forward.<locals>.shape+  rS  r7   c                 Z    | j                         j                  dj                        S rU  rV  rQ  s    r5   rW  z7LongT5TransientGlobalAttention.forward.<locals>.unshape/  rX  r7   r$   r   rC   r_   rY  r   rq   rZ  r   r  Tr`   r  r[  )(r+   r   r.   r|   rd   r   rd  r   r   r   r?   r   rL   r0   rP   rz   rH   r   rc   r:   ri   r   r/   r   r"   r   r  r   r  r{   rh   rm  rX   r   r2   r!  r   r"  r   r   )r   r   r   r%  r'  r\  r+   rW  rf   r   _global_seq_lenglobal_inputsr,  r/  r0  side_key_statesside_value_statesrepsr2  rU   side_position_biasr5  r6  r7  r~   s   `                       @r5   r   z&LongT5TransientGlobalAttention.forward"  s0    "/!4!4Ra!8
J	V	L )E$D%**]5H5H"5M*N"")
%	%
 -22261-O\44]C TVVM23466-01
TVVM23} 56!$&&"78 *,AN'
DNNJ
),AN +:QRS
,\QUVW so**Q./""1%Q)33A6==dC-77:AA$G YY
O<!D
yy,0A!BJ 5|ZP#<T4>>S`SgSg#h #(;;/Ca/Ge#T #'  33 %4<<T^^9KL!== ,,!
 ..4==26M/ $ 1 1$.. A#/ -0D0N0NqRS0T T)..v||<M |zz*j9!%!7!7>P!Q!34F\^!_!i!ijkmn!o!3!8!8!F!I!I&--!X!II}6H&IrRM-}},,V\\^,DLLVT}},,\T\\TXTaTa,b#((););<ell+BLR^_`!![j[!"34ff[)./Gr7   r]  r9  r^  )r   r   r   r   r_  r   r<  r  r}   r  r.   rs   rm  r   r   r   s   @r5   ra  ra    s}    f| f$ f[_ f8 -  - ^ 0#ell # #Y^YeYe #0 qr7   ra  c                   >     e Zd Zddedz  f fdZ	 	 	 	 	 ddZ xZS )LongT5LayerSelfAttentionNr   c                     t         |           t        |||      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )Nr   r   r   )r   r   r   SelfAttentionr   r   r   r   r   r   r   r   r   s       r5   r   z!LongT5LayerSelfAttention.__init__  sT    ,0KW`
 *&..f>W>WXzz&"5"56r7   c                     | j                  |      }| j                  ||||||      }	|| j                  |	d         z   }|f|	dd  z   }
|
S )N)r   r%  r&  	use_cacher'  r   r   )r   r{  r   )r   r   r\   r%  r&  r}  r'  r(  normed_hidden_statesattention_outputr7  s              r5   r   z LongT5LayerSelfAttention.forward  sp      $}=-- '+/ . 
 &5Ea5H(II "%5ab%99r7   r8  )NNNFFr   r   r   r}   r   r   r   r   s   @r5   rx  rx    s*    7SSWZ 7 r7   rx  c                   D     e Zd ZdZddedz  f fdZ	 	 	 ddefdZ xZS )	LongT5LayerLocalSelfAttentionz$Local self attention used in encoderNr   c                     t         |           t        ||      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y N)r   r   )r   r   r>  LocalSelfAttentionr   r   r   r   r   r   r   r   r   s       r5   r   z&LongT5LayerLocalSelfAttention.__init__  sL    "6v[v"w)&..f>W>WXzz&"5"56r7   r(  c                     | j                  |      }| j                  ||||      }|| j                  |d         z   }|f|dd  z   }|S N)r   r%  r'  r   r   )r   r  r   	r   r   r\   r%  r'  r(  r~  r  r7  s	            r5   r   z%LongT5LayerLocalSelfAttention.forward  sj      $}=22 '/	 3 
 &5Ea5H(II "%5ab%99r7   r8  r^  	r   r   r   __doc__r}   r   r   r   r   r   s   @r5   r  r    s1    .7SSWZ 7  r7   r  c                   D     e Zd ZdZddedz  f fdZ	 	 	 ddefdZ xZS )	'LongT5LayerTransientGlobalSelfAttentionz/Transient-Global self attention used in encoderNr   c                     t         |           t        ||      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y r  )r   r   ra  TransientGlobalSelfAttentionr   r   r   r   r   r   r   r   r   s       r5   r   z0LongT5LayerTransientGlobalSelfAttention.__init__  sQ    ,J0K-
) *&..f>W>WXzz&"5"56r7   r(  c                     | j                  |      }| j                  ||||      }|| j                  |d         z   }|f|dd  z   }|S r  )r   r  r   r  s	            r5   r   z/LongT5LayerTransientGlobalSelfAttention.forward  sj      $}=<< '/	 = 
 &5Ea5H(II "%5ab%99r7   r8  r^  r  r   s   @r5   r  r    s1    97SSWZ 7  r7   r  c                   <     e Zd Zddedz  f fdZ	 	 	 	 ddZ xZS )LongT5LayerCrossAttentionNr   c                     t         |           t        |d|      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )NFrz  r   )r   r   r   EncDecAttentionr   r   r   r   r   r   r   r   )r   r   r   r   s      r5   r   z"LongT5LayerCrossAttention.__init__  sO    .vSXdmn)&..f>W>WXzz&"5"56r7   c                     | j                  |      }| j                  ||||||      }	|| j                  |	d         z   }
|
f|	dd  z   }|S )N)r   r$  r%  r&  r'  r   r   )r   r  r   )r   r   r$  r\   r%  r&  r'  r(  r~  r  layer_outputr7  s               r5   r   z!LongT5LayerCrossAttention.forward  so      $}=// -'+/ 0 
 %t||4DQ4G'HH/$4QR$88r7   r   )NNNFr  r   s   @r5   r  r    s&    7#* 7 r7   r  c                   F     e Zd Zddedz  f fdZ	 	 	 	 	 	 	 	 	 ddZ xZS )LongT5BlockNr   c                    t         |           |j                  | _        |j                  rt        }nE|j                  dk(  rt
        }n/|j                  dk(  rt        }nt        d|j                   d      t        j                         | _
        | j                  j                   ||||             | j                  r&| j                  j                  t        ||             | j                  j                  t        |             y )Nlocalztransient-globalzjFor encoder attention mechanism, either `local` or `transient-global` attention type is expected, but got .rz  )r   )r   r   r   rx  encoder_attention_typer  r  
ValueErrorr   
ModuleListlayerrG   r  r   )r   r   r   r   attention_layerr   s        r5   r   zLongT5Block.__init__  s     ++6O**g5;O**.@@EO!889<  ]]_


F@[gpq	
 ??JJ7)TU

-/0r7   c                     | j                   d   ||||||	      }|d   }|dd  }|j                  t        j                  k(  rht        j                  |      j                         rEt        j                  |j                        j                  dz
  }t        j                  || |      }| j                  xr |d u}|r | j                   d   ||||||	      }|d   }|j                  t        j                  k(  rht        j                  |      j                         rEt        j                  |j                        j                  dz
  }t        j                  || |      }||dd  z   } | j                   d   |      }|j                  t        j                  k(  rht        j                  |      j                         rEt        j                  |j                        j                  dz
  }t        j                  || |      }|f|z   S )Nr   )r\   r%  r&  r}  r'  r   i  )r   rx   )r$  r\   r%  r&  r'  r$   )
r  r"   r.   r   isinfanyfinforx   clampr   )r   r   r\   r%  encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr&  r}  r'  return_dictr(  self_attention_outputsattention_outputsclamp_valuedo_cross_attentioncross_attention_outputss                    r5   r   zLongT5Block.forward,  s    "/A)'+/"
 /q12126 %--/EKK4N4R4R4T++m&9&9:>>EK!KKK<[YM!__R1Fd1R&3djjm!65; /"3'# 4A6M ""emm3M8R8V8V8X#kk-*=*=>BBTI %M|Q\ ] !24KAB4O O '

2}5 %--/EKK4N4R4R4T++m&9&9:>>EK!KKK<[YM 00	
r7   r8  )	NNNNNNFFTr  r   s   @r5   r  r    s6    1SSWZ 14 "#&*<
r7   r  c                   n    e Zd ZU eed<   dZdZdgZdZe	d        Z
 ej                         d        Zd Zy	)
LongT5PreTrainedModelr   transformerTr  Fc                 v    t        j                  t              }t        j                  t              }|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r.   rw   r   r   )r   r  
input_maskdummy_inputss       r5   r  z"LongT5PreTrainedModel.dummy_inputst  s8     LL.	\\*-
!*"&0

 r7   c                    | j                   j                  }t        |t              r$t	        j
                  |j                  |dz         yt        |t        t        t        f      rt	        j                  |j                  j                  d|dz         t        |d      rG| j                   j                  s0t	        j                  |j                  j                  d|dz         yyyt        |t              r9t	        j                  |j                   j                  d|| j                   j"                  dz  z         t        |j                   d      r?|j                   j$                  )t	        j&                  |j                   j$                         t	        j                  |j(                  j                  d|| j                   j*                  dz  z         t        |j(                  d      rA|j(                  j$                  *t	        j&                  |j(                  j$                         yyyt        |t,              rt	        j                  |j.                  j                  d|| j                   j"                  dz  z         t        |j.                  d      r?|j.                  j$                  )t	        j&                  |j.                  j$                         t	        j                  |j0                  j                  d|| j                   j"                  dz  z         t        |j0                  d      r?|j0                  j$                  )t	        j&                  |j0                  j$                         t	        j                  |j(                  j                  d|| j                   j*                  dz  z         t        |j(                  d      rA|j(                  j$                  *t	        j&                  |j(                  j$                         yyyt        |t2        t4        t6        f      r| j                   j"                  }| j                   j8                  }| j                   j:                  }t	        j                  |j<                  j                  d|||z  dz  z         t	        j                  |j>                  j                  d||dz  z         t	        j                  |j@                  j                  d||dz  z         t	        j                  |jB                  j                  d|||z  dz  z         |jD                  rvt	        j                  |jF                  j                  d||dz  z         t        |t6              r3t	        j                  |jH                  j                  d||dz  z         yyyy)zInitialize the weightsrr   rq   )r   stdlm_head      r   N)%r   initializer_factorr   r   init	constant_r   LongT5ModelLongT5ForConditionalGenerationLongT5EncoderModelnormal_sharedhasattrtie_word_embeddingsr  r   r   r   r   zeros_r   r   r   r   r   r   r>  ra  r   r   r   r   r   r   r   r   rc  )r   modulefactorr   r   r   s         r5   _init_weightsz#LongT5PreTrainedModel._init_weights  s    //fo.NN6==&3,7.LN` abLL--CVc\Jvy)$++2Q2QV^^22&3,O 3R) 34LL))DKKDWDW\`C`9abvyy&)fiinn.HFIINN+LL))DKKDTDTY]C]9^_vyy&)fiinn.HFIINN+ /I) 89LL++#6dkkFYFY^bEb;cdv{{F+0@0@0LFKK,,-LL++#6dkkFYFY^bEb;cdv{{F+0@0@0LFKK,,-LL))DKKDTDTY]C]9^_vyy&)fiinn.HFIINN+ /I)2FHf ghkk))G!%!1!1kk++GLLs7M_C_dhBh8ijLLs'4-8PQLLs'4-8PQLLs7M_C_dhBh8ij11V;;BBRX]dim\mRnof&DELL==DD3TZ_fko^oTp F 2 ir7   c                 8   | j                   j                  }| j                   j                  }|t        d      |j	                  |j
                        }|dd df   j                         |ddd f<   ||d<   |t        d      |j                  |dk(  |       |S )Nzself.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the pad_token_id. See LongT5 docs for more information..r$   r   ).r   z1self.model.config.pad_token_id has to be defined.)r   decoder_start_token_idpad_token_idr  	new_zerosr+   r  masked_fill_)r   r  r  r  shifted_input_idss        r5   _shift_rightz"LongT5PreTrainedModel._shift_right  s    !%!C!C{{//!)8 
 &//	@%.sCRCx%8%>%>%@#qr'"$:&!PQQ&&'8D'@,O  r7   N)r   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_can_compile_fullgraphpropertyr  r.   no_gradr  r  r%   r7   r5   r  r  k  sV    %&*#&"  U]]_' 'T!r7   r  c                   @     e Zd Z fdZd Z	 	 	 	 	 	 	 	 	 	 ddZ xZS )LongT5Stackc                 `   t         |   |       t        j                  |j                  |j
                        | _        |j                  | _        |j                  | _        | j                  dz   | _	        t        j                  t        |j                        D cg c]  }t        |t        |dk(        |       c}      | _        t!        |j
                  |j"                        | _        t        j&                  |j(                        | _        d| _        | j/                          y c c}w )Nr   r   rz  r   F)r   r   r   r   
vocab_sizer   embed_tokensr   r@  r   r  rD   
num_layersr  r_  blockr   r   final_layer_normr   r   r   r   	post_init)r   r   rJ   r   s      r5   r   zLongT5Stack.__init__  s     LL):):FNNK ++"//**Q.]] v001 FQ!VXYZ

 !0FD]D] ^zz&"5"56&+# 	s   !D+c                     || _         y r   )r  r   new_embeddingss     r5   set_input_embeddingsz LongT5Stack.set_input_embeddings  s
    *r7   c                    ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
|$|"| j
                  rdnd}t        d| d| d      |&|j                         }|j                  d|d         }n8||j                         d d }n"| j
                  rdnd}t        d| d| d	      | j                  r%| j                  r|rt        j                  d
       d}|$| j                  J d       | j                  |      }|\  }}| j
                  rf|rr|p| j                   j                  r5t        t!        | j                         t!        | j                               }n%t!        | j                         }n| j
                  sd }||j#                         nd}|1t%               s'||z   }t'        j(                  |||j*                        }| j
                  rt-        | j                   |||      }n=| j                   j.                  dk(  r"t1        || j2                  |j*                        }n|}| j
                  rO|M|j                         \  }}}||f}|!t'        j(                  ||j*                        }| j5                  |      }nd }|	rdnd }|rdnd }|r| j
                  rdnd }d }d }| j7                  |      }t9        | j:                        D ]b  \  }}|	r||fz   } |||||||||||

      } | d   }| d   }| j
                  r|	| |rdnd   }|sD|| d   fz   }| j
                  sZ|| d   fz   }d | j=                  |      }| j7                  |      }|	r||fz   }|
st?        d |||||fD              S tA        |||||      S )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer$   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddings)r   r   ro   )r   r  r\   r&  r  r%   )r&  r}  r'  r  r   r   r^      c              3   $   K   | ]  }|| 
 y wr   r%   ).0r   s     r5   	<genexpr>z&LongT5Stack.forward.<locals>.<genexpr>j  s      
 = 
s   )last_hidden_stater&  r   
attentionscross_attentions)!r   r}  r'  output_hidden_statesr  r   r  sizer  r   r  r   r   r  is_encoder_decoderr   r
   r  r   r.   r|   r:   r   r  rc   r   invert_attention_maskr   	enumerater  r  rF   r   )!r   r  r\   r  r  r  r&  r}  r'  r  r  r(  err_msg_prefixr)  r~   r\  past_key_values_lengthmask_seq_lengthr3  encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr%  r  r   rJ   layer_modulelayer_outputss!                                    r5   r   zLongT5Stack.forward  sK    "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY ]%>+/??ZN*>*:.HXXvw  "#..*K!r;r?;I&',,.s3K+/??ZN:>:J-XfWggtuvv&&4==##p "	 $$0p2pp0 --i8M!,
J??_4;;11&9$DKK8,dkk:Z'O '3$++&FO #OETE`!?!?!Afg!*B*D4zAO"ZZ
OML`L`aN??,{{+- /	K [[//7:3NDNNTaThThiK(K ??4@=R=W=W=Y: 7$68O#P %-).4HQ^QeQe)f&.2.H.HI_.`+.2+"6BD0d&7DOOrRV(,%]3(4  	VOA|#$58H$H!(%/- /#"3'M  *!,M
 *!,M#8#D0=CTaZ[0\- !/=3C2E!E??+?=QRCSBU+U(A 	VD --m<]3   1]4D D 
 "#%"(
 
 
 9+++%1
 	
r7   )
NNNNNNNNNN)r   r   r   r   r  r   r   r   s   @r5   r  r    s3    0+
 "#!^
r7   r  c                       e Zd ZdgZdddZdef fdZd Zd Ze		 	 	 	 	 	 	 	 	 	 	 	 dd	e
j                  dz  d
e
j                  dz  de
j                  dz  de
j                  dz  deee
j                        dz  dedz  de
j                   dz  de
j                   dz  dedz  dedz  dedz  dedz  dee
j                     ez  fd       Z xZS )r  Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightshared.weight)encoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                    t         |   |       t        j                  |j                  |j
                        | _        t        j                  |      }d|_	        d|_
        t        |      | _        t        j                  |      }d|_	        |j                  |_        t        |      | _        | j!                          y )NFT)r   r   r   r   r  r   r  copydeepcopyr   r}  r  encodernum_decoder_layersr  decoderr  r   r   encoder_configdecoder_configr   s       r5   r   zLongT5Model.__init__  s     ll6#4#4fnnEv.$)!#( ">2v.$(!$*$=$=!">2 	r7   c                     | j                   S r   r  r   s    r5   get_input_embeddingsz LongT5Model.get_input_embeddings      {{r7   c                 ~    || _         | j                  j                  |       | j                  j                  |       y r   r  r
  r  r  r  s     r5   r  z LongT5Model.set_input_embeddings  -    $)).9)).9r7   Nr  r\   r  r  encoder_outputsr&  r  decoder_inputs_embedsr}  r'  r  r  r   c                 D   |	|	n| j                   j                  }	||n| j                   j                  }|| j                  ||||
||      }nI|rGt	        |t
              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }|d   }| j                  |||||||	|
||
      }|s||z   S t        |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                        S )	a	  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5Model.from_pretrained("google/long-t5-local-base")

        >>> # Let's try a very long encoder input.
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1

        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr  r\   r  r'  r  r  r   r   r^   r  r   r  
r  r\   r  r&  r  r  r}  r'  r  r  )r  r&  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentions)r   r}  r  r
  r   r   lenr  r   r  r&  r   r  r  )r   r  r\   r  r  r  r&  r  r  r}  r'  r  r  r(  r   decoder_outputss                   r5   r   zLongT5Model.forward  sO   D "+!6IDKK<Q<Q	%0%<k$++BYBY ""ll#-+"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO (* ,,'1/+"/#1/!5# ' 
 "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r7   )NNNNNNNNNNNN)r   r   r   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr   r   r  r  r   r.   
LongTensorFloatTensor
BoolTensorrF   r	   rs   r_  r   r   r   r   s   @r5   r  r  ~  sx    	R*& (7'6
| ":
  .23759:>BF(,-159!%)-,0#'q
##d*q
 ))D0q
 !++d2	q

 !& 0 04 7q
 uU%6%6784?q
 q
 ||d*q
  %||d2q
 $;q
  $;q
 #Tkq
 D[q
 
u  	!$6	6q
 q
r7   r  z>
    LONGT5 Model with a `language modeling` head on top.
    )custom_introc                       e Zd ZdgZddddZdef fdZd Zd Ze		 	 	 	 	 	 	 	 	 	 	 	 	 dd	e
j                  dz  d
e
j                  dz  de
j                  dz  de
j                  dz  deee
j                        dz  dedz  de
j                  dz  de
j                  dz  de
j                  dz  dedz  dedz  dedz  dedz  dee
j                     ez  fd       Zde
j                  fdZ xZS )r  r  r  )r  r  zlm_head.weightr   c                    t         |   |       |j                  | _        t	        j
                  |j                  |j                        | _        t        j                  |      }d|_
        d|_        t        |      | _        t        j                  |      }d|_
        |j                  |_        t        |      | _        t	        j"                  |j                  |j                  d      | _        | j'                          y )NFTr   )r   r   r   	model_dimr   r   r  r  r  r	  r   r}  r  r
  r  r  r  r   r  r  r  s       r5   r   z'LongT5ForConditionalGeneration.__init__%  s     ll6#4#4fnnEv.$)!#( ">2v.$(!$*$=$=!">2yy1B1BO 	r7   c                     | j                   S r   r  r  s    r5   r  z3LongT5ForConditionalGeneration.get_input_embeddings:  r  r7   c                 ~    || _         | j                  j                  |       | j                  j                  |       y r   r  r  s     r5   r  z3LongT5ForConditionalGeneration.set_input_embeddings=  r  r7   Nr  r\   r  r  r  r&  r  r  labelsr}  r'  r  r  r   c                    |
|
n| j                   j                  }
||n| j                   j                  }|| j                  ||||||      }nI|rGt	        |t
              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }|d   }|	||| j                  |	      }| j                  |||||||
|||
      }|d   }| j                   j                  r|| j                  dz  z  }| j                  |      }d}|	^t        d	
      }|	j                  |j                        }	 ||j                  d|j!                  d            |	j                  d            }|s|f|dd z   |z   }||f|z   S |S t#        |||j$                  |j&                  |j(                  |j*                  |j,                  |j&                  |j(                  	      S )a7  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
        >>> model = LongT5ForConditionalGeneration.from_pretrained(
        ...     "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps"
        ... )

        >>> # Let's try a very long input.
        >>> inputs = tokenizer(100 * "studies have shown that owning a dog is good for you ", return_tensors="pt")
        >>> input_ids = inputs.input_ids

        >>> outputs = model.generate(input_ids)
        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        abstractthe aim of this article is to provide an overview of the literature on the role of dog
        ```Nr  r   r   r^   r  r  r  r  )ignore_indexr$   )	losslogitsr&  r  r  r  r   r  r!  )r   r}  r  r
  r   r   r"  r  r  r  r,  r  r   rX   r:   r  r  r   r&  r   r  r  r  )r   r  r\   r  r  r  r&  r  r  r/  r}  r'  r  r  r(  r   r#  sequence_output	lm_logitsr2  loss_fctoutputs                         r5   r   z&LongT5ForConditionalGeneration.forwardB  s   L "+!6IDKK<Q<Q	%0%<k$++BYBY ""ll#-+"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO (*"3";@U@] $ 1 1& 9 ,,'1/+"/#1/!5# ' 
 *!,;;**-1EFOLL1	'T:HYYy//0FINN2y~~b/ABFKKPROTD\OAB$77/IF)-)9TGf$EvE+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r7   c                 $    | j                  |      S r   )r  )r   r/  s     r5   %prepare_decoder_input_ids_from_labelszDLongT5ForConditionalGeneration.prepare_decoder_input_ids_from_labels  s      ((r7   )NNNNNNNNNNNNN)r   r   r   r$  r%  r   r   r  r  r   r.   r&  r'  r(  rF   rs   r	   r_  r   r   r9  r   r   s   @r5   r  r    s    	R*& (7'6)| *:
  .23759:>=A(,26:>*.!%)-,0#'J
##d*J
 ))D0J
 !++d2	J

 !& 0 04 7J
 uU\\23d:J
 J
 ((4/J
  %0047J
   4'J
 $;J
  $;J
 #TkJ
 D[J
  
u  	!O	3!J
 J
X)ELL )r7   r  c                        e Zd ZddiZdgZdef fdZd Zd Ze		 	 	 	 	 	 dd	e
j                  dz  d
e
j                  dz  de
j                  dz  dedz  dedz  dedz  dee
j                     ez  fd       Z xZS )r  r  r  r  r   c                     t         |   |       t        j                  |j                  |j
                        | _        t        j                  |      }d|_	        t        |      | _        | j                          y )NF)r   r   r   r   r  r   r  r  r	  r}  r  r
  r  )r   r   r  r   s      r5   r   zLongT5EncoderModel.__init__  sZ     ll6#4#4fnnEv.#( ">2 	r7   c                     | j                   S r   r  r  s    r5   r  z'LongT5EncoderModel.get_input_embeddings  r  r7   c                 H    || _         | j                  j                  |       y r   )r  r
  r  r  s     r5   r  z'LongT5EncoderModel.set_input_embeddings  s    $)).9r7   Nr  r\   r  r'  r  r  r   c                 h    ||n| j                   j                  }| j                  ||||||      }|S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base")
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you ", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```r  )r   r  r
  )	r   r  r\   r  r'  r  r  r(  r  s	            r5   r   zLongT5EncoderModel.forward  sH    F &1%<k$++BYBY,,)'/!5# ' 
 r7   )NNNNNN)r   r   r   r%  r$  r   r   r  r  r   r.   r&  r'  r_  rF   r   r   r   r   s   @r5   r  r    s     	& +5&	| 	:  .23726)-,0#'-##d*- ))D0- ((4/	-
  $;- #Tk- D[- 
u  	!O	3- -r7   r  )r  r  r  r  )r   )Pr  r  r   typingr   r.   r   torch.nnr   r  r   r  activationsr   cache_utilsr	   r
   r   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   utilsr   r   r   r   r   configuration_longt5r   
get_loggerr   r   rs   r}   r6   r?   rL   rT   r[   r:   rc   rF   r   r   r   Moduler   apex.normalizationr   infoImportError	Exceptionwarningr   r   r   r   r>  ra  rx  r  r  r  r  r  r  r  r  r  __all__r%   r7   r5   <module>rR     s         % & ! C C ) / 9  .  / 
		H	%  3 3 W\WcWc  #%,, #3 #S #U\\ #4U\\ 4c 4 4Y\ 4ejeqeq 42!# !%,, !BU\\ Bc BV[VbVb B8ell 8s 8TYT`T` 8ejeqeq 8 .PLL.P58.P
5<<%&.Pb4U\\ 4VY 4^c^j^j 4	j<<	j,1LL	jJM	j
\\	j+bii +2
]/"O
KKef")) ,ryy &BII &bii Di299 iXlRYY l`ryy BBII :bii @		 >T
, T
n R!O R! R!jz
' z
z T
' T
 T
n 
u)%:O u)
u)p F. F FR kQ1  	 ]
NN[\]s   L L;"L;:L;