
    i                         d Z ddlmZ ddlmZ ddlmZmZ  ej                  e	      Z
 ed      e G d d	e                    Zd	gZy
)zXLNet configuration    )strict   )PreTrainedConfig)auto_docstringloggingzxlnet/xlnet-large-cased)
checkpointc                   :    e Zd ZU dZdZdgZdddddZd	Zee	d<   d
Z
ee	d<   dZee	d<   dZee	d<   dZee	d<   dZedz  e	d<   dZee	d<   dZee	d<   dZee	d<   dZee	d<   dZeez  e	d<   dZedz  e	d<   dZedz  e	d<   dZee	d<   d Zee	d!<   d Zee	d"<   d#Zee	d$<   d Zee	d%<   d&Zee	d'<   dZee	d(<   d)Z ee	d*<   dZ!eez  e	d+<   d,Z"ee	d-<   d,Z#ee	d.<   d,Z$edz  e	d/<   d0Z%edz  e	d1<   d2Z&ee'e   z  dz  e	d3<   dZ(ee	d4<    fd5Z)d6 Z*e+d7        Z,e,jZ                  d8        Z, xZ.S )9XLNetConfiga  
    ff_activation (`str` or `Callable`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the If string, `"gelu"`, `"relu"`, `"silu"` and
        `"gelu_new"` are supported.
    attn_type (`str`, *optional*, defaults to `"bi"`):
        The attention type used by the model. Set `"bi"` for XLNet, `"uni"` for Transformer-XL.
    mem_len (`int` or `None`, *optional*):
        The number of tokens to cache. The key/value pairs that have already been pre-computed in a previous
        forward pass won't be re-computed. See the
        [quickstart](https://huggingface.co/transformers/quickstart.html#using-the-past) for more information.
    reuse_len (`int`, *optional*):
        The number of tokens in the current batch to be cached and reused in the future.
    use_mems_eval (`bool`, *optional*, defaults to `True`):
        Whether or not the model should make use of the recurrent memory mechanism in evaluation mode.
    use_mems_train (`bool`, *optional*, defaults to `False`):
        Whether or not the model should make use of the recurrent memory mechanism in train mode.
        <Tip>
        For pretraining, it is recommended to set `use_mems_train` to `True`. For fine-tuning, it is recommended to
        set `use_mems_train` to `False` as discussed
        [here](https://github.com/zihangdai/xlnet/issues/41#issuecomment-505102587). If `use_mems_train` is set to
        `True`, one has to make sure that the train batches are correctly pre-processed, *e.g.* `batch_1 = [[This
        line is], [This is the]]` and `batch_2 = [[ the first line], [ second line]]` and that all batches are of
        equal size.
        </Tip>
    bi_data (`bool`, *optional*, defaults to `False`):
        Whether or not to use bidirectional input pipeline. Usually set to `True` during pretraining and `False`
        during finetuning.
    clamp_len (`int`, *optional*, defaults to -1):
        Clamp all relative distances larger than clamp_len. Setting this attribute to -1 means no clamping.
    same_length (`bool`, *optional*, defaults to `False`):
        Whether or not to use the same attention length for each token.
    summary_type (`str`, *optional*, defaults to "last"):
        Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
        Has to be one of the following options:
            - `"last"`: Take the last token hidden state (like XLNet).
            - `"first"`: Take the first token hidden state (like BERT).
            - `"mean"`: Take the mean of all tokens hidden states.
            - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
            - `"attn"`: Not implemented now, use multi-head attention.
    summary_use_proj (`bool`, *optional*, defaults to `True`):
        Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
        Whether or not to add a projection after the vector extraction.
    summary_activation (`str`, *optional*):
        Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
        Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
    summary_last_dropout (`float`, *optional*, defaults to 0.1):
        Used in the sequence classification and multiple choice models.
        The dropout ratio to be used after the projection and activation.
    start_n_top (`int`, *optional*, defaults to 5):
        Used in the SQuAD evaluation script.
    end_n_top (`int`, *optional*, defaults to 5):
        Used in the SQuAD evaluation script.

    Examples:

    ```python
    >>> from transformers import XLNetConfig, XLNetModel

    >>> # Initializing a XLNet configuration
    >>> configuration = XLNetConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = XLNetModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```xlnetmems
vocab_sized_modeln_headn_layer)n_tokenhidden_sizenum_attention_headsnum_hidden_layersi }  i         i   d_innerNd_headgeluff_activationbi	attn_typeg{Gz?initializer_rangeg-q=layer_norm_epsg?dropouti   mem_len	reuse_lenTuse_mems_evalFuse_mems_trainbi_data	clamp_lensame_lengthlastsummary_typesummary_use_projtanhsummary_activationsummary_last_dropout   start_n_top	end_n_toppad_token_id   bos_token_id   eos_token_idtie_word_embeddingsc                 |    | j                   xs | j                  | j                  z  | _         t        |   di | y )N )r   r   r   super__post_init__)selfkwargs	__class__s     ~/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/xlnet/configuration_xlnet.pyr:   zXLNetConfig.__post_init__   s0    kk@T\\T[[%@''    c                 :   | j                   | j                  z  dk7  r&t        d| j                   | j                  z   d      | j                  | j                   | j                  z  k7  r3t        d| j                   d| j                   | j                  z   d      y)zOPart of `@strict`-powered validation. Validates the architecture of the config.r   z'd_model % n_head' (z) should be equal to 0z
`d_head` (z*) should be equal to `d_model // n_head` ()N)r   r   
ValueErrorr   r;   s    r>   validate_architecturez!XLNetConfig.validate_architecture   s    <<$++%*3DLL4;;4N3OOefgg;;$,,$++55T[[M)STXT`T`dhdodoToSppqr  6r?   c                 J    t         j                  d| j                   d       y)N
The model < is one of the few models that has no sequence length limit.r%   )loggerinfo
model_typerC   s    r>   max_position_embeddingsz#XLNetConfig.max_position_embeddings   s     j 11mnor?   c                 4    t        d| j                   d      )NrF   rG   )NotImplementedErrorrJ   )r;   values     r>   rK   z#XLNetConfig.max_position_embeddings   s#     "))ef
 	
r?   )/__name__
__module____qualname____doc__rJ   keys_to_ignore_at_inferenceattribute_mapr   int__annotations__r   r   r   r   r   r   strr   r   floatr   r   r    r!   r"   boolr#   r$   r&   r'   r)   r*   r,   r-   r/   r0   r1   r3   r5   listr6   r:   rD   propertyrK   setter__classcell__)r=   s   @r>   r
   r
      s   BH J#)( '&	M JGSGSFCGSFC$JM3Is#u#!NE!GUS[GS4Z IsTz M4 ND GTIsKL#!d!$$(+%#++KIs L#*  L#* +,L#S	/D(, $$(   ##
 $
r?   r
   N)rR   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r   
get_loggerrO   rH   r
   __all__r8   r?   r>   <module>rc      s^     . 3 , 
		H	% 45B
" B
  6B
J /r?   