
    i                     j    d Z ddlmZ ddlmZ ddlmZ  ed      e G d d	e                    Zd	gZy
)zLongT5 model configuration    )strict   )PreTrainedConfig)auto_docstringzgoogle/long-t5-local-base)
checkpointc                       e Zd ZU dZdZdgZdddddZd	Zee	d
<   dZ
ee	d<   dZee	d<   dZee	d<   dZee	d<   dZedz  e	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZeez  e	d<   dZee	d<   dZee	d <   d!Zee	d"<   d#Zee	d$<   d%Zee	d&<   d#Zee	d'<   d(Zedz  e	d)<   d*Zee e   z  dz  e	d+<   dZ!edz  e	d,<   d-Z"ee	d.<   d#Z#ee	d/<    fd0Z$d1 Z% xZ&S )2LongT5Configa!  
    d_ff (`int`, *optional*, defaults to 2048):
        Size of the intermediate feed forward layer in each `LongT5Block`.
    local_radius (`int`, *optional*, defaults to 127):
        Number of tokens to the left/right for each token to locally self-attend in a local attention mechanism.
    global_block_size (`int`, *optional*, defaults to 16):
        Length of blocks an input sequence is divided into for a global token representation. Used only for
        `encoder_attention_type = "transient-global"`.
    relative_attention_num_buckets (`int`, *optional*, defaults to 32):
        The number of buckets to use for each attention layer.
    relative_attention_max_distance (`int`, *optional*, defaults to 128):
        The maximum distance of the longer sequences for the bucket separation.
    feed_forward_proj (`string`, *optional*, defaults to `"relu"`):
        Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. LongT5v1.1 uses the
        `"gated-gelu"` feed forward projection. Original LongT5 implementation uses `"gated-gelu"`.
    encoder_attention_type (`string`, *optional*, defaults to `"local"`):
        Type of encoder attention to be used. Should be one of `"local"` or `"transient-global"`, which are
        supported by LongT5 implementation.
    longt5past_key_valuesd_model	num_heads
num_layersd_kv)hidden_sizenum_attention_headsnum_hidden_layershead_dimi}  
vocab_sizei   @   i   d_ff   Nnum_decoder_layers      local_radius   global_block_size    relative_attention_num_buckets   relative_attention_max_distanceg?dropout_rategư>layer_norm_epsilong      ?initializer_factorrelufeed_forward_projTis_encoder_decoderlocalencoder_attention_type	use_cacher   pad_token_id   eos_token_idbos_token_idF
is_decodertie_word_embeddingsc                    | j                   | j                   n| j                  | _         | j                  j                  d      }|d   | _        |d   dk(  | _        | j                  dk(  rd| _        t        |   di | y )N-r   gatedz
gated-gelugelu_new )r   r   r&   splitdense_act_fnis_gated_actsuper__post_init__)selfkwargsact_info	__class__s      /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/longt5/configuration_longt5.pyr;   zLongT5Config.__post_init__N   s~    =A=T=T=`$"9"9fjfufu))//4$RL$QK72!!\1 *D''    c                     | j                   j                  d      }t        |      dkD  r|d   dk7  st        |      dkD  rt        d| j                    d      y)	zOPart of `@strict`-powered validation. Validates the architecture of the config.r2   r,   r   r4      z`feed_forward_proj`: z is not a valid activation function of the dense layer. Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. 'gated-gelu' or 'relu'N)r&   r7   len
ValueError)r<   r>   s     r@   validate_architecturez"LongT5Config.validate_architectureY   sf    ))//4x=1!!73x=1;L'(>(>'? @) )  <MrA   )'__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapr   int__annotations__r   r   r   r   r   r   r   r   r   r!   r"   floatr#   r$   r&   strr'   boolr)   r*   r+   r-   listr.   r/   r0   r;   rF   __classcell__)r?   s   @r@   r	   r	      sC   ( J#4"5 *)	M JGSD#ND#J%)d
)IsL#s*,"C,+.#S. #L%#+# $$ ###s###")C)It L#* +,L#S	/D(,#L#*#J $$	(rA   r	   N)	rJ   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r	   __all__r6   rA   r@   <module>rY      sK    ! . 3 # 67I# I  8IX 
rA   