
    i                         U d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	 ddl
mZmZ  ej                  e      Z e       rd dlZerddlmZ d Z	 	 	 	 d!d	ed
   ded   dedz  dedz  dedef   f
dZ	 	 	 	 	 d"d	ed
   ded   dedz  dedz  dededef   fdZ	 	 	 	 d!d	ed
   ded   dedz  dedz  dedef   f
dZ	 	 	 d#d	d
ded   dedz  dedz  dedef   f
dZ	 	 	 d#d	d
ded   dedz  dedz  dedef   f
dZ	 	 	 d#d	d
ded   dedz  dedz  dedef   f
dZeeeeeedZeeededef   f   f   e d<    G d de	      Z! G d d      Z"d$d	e"de#dz  fd Z$y)%    N)Callablewraps)TYPE_CHECKINGOptional	TypedDict   )is_torch_availablelogging)PreTrainedConfigc                 H     ddddt               d fd	       }|S )ad  
    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).

    Args:
        rope_forward (Callable):
            The forward pass of the RoPE implementation.

    Returns:
        The decorated forward pass.
    c                 P   t        j                  |      dz   }|4| j                  }| j                  }d}| j                  j
                  d   }n?| j                  |   }t        | | d      }| d}| j                  j
                  |   d   }||kD  r\t        | | d      s%t        |   }	 |	| j                  ||dz   |      \  }
}| j                  | d	
d
       t        | | d|
       y|j                  |      }| j                  | d	|d
       t        | | d|       y)zbLongrope uses long factor if sequence is larger than original pretraining length, short otherwise.r	   N  original_max_position_embeddings_original_inv_freq__long_inv_freqseq_len
layer_typeinv_freqF
persistentlong_inv_freqoriginal_inv_freq)torchmax	rope_typer   configrope_parametersgetattrhasattrROPE_INIT_FUNCTIONSregister_buffersetattrto)selfposition_idsdevicer   r   r   r   prefixr   rope_init_fnr   r   s               q/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/modeling_rope_utils.pylongrope_frequency_updatez6dynamic_rope_update.<locals>.longrope_frequency_update/   sY   ))L)A-I $ 6 6F/3{{/J/JKm/n,z2I '<N.O P"|1%F/3{{/J/J:/V20, 554J<~!>?29=#/KK<q@)	$ q   F88!4mPU VDVHM2MB !2 4 4V <  F88!46GTY ZDVH$568IJ    c                    t        j                  |      dz   }|'| j                  }| j                  }| j                  }d}n=| j                  |   }t        | | d| j                        }t        | | d      }| d}||kD  rNt        |   }	 |	| j                  |||      \  }
| _        | j                  | d|
d	
       t        | | d|       || j                  k  rc|| j                  kD  rS|j                  |      }| j                  | d|d	
       t        | | d|       t        | | d| j                         yyy)a  
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        r	   Nr   _max_seq_len_cachedr   r   r   r   Fr   r   )r   r   r   max_seq_len_cachedr   r!   r#   r   attention_scalingr$   r%   original_max_seq_lenr&   )r'   r(   r)   r   r   r   r1   r   r*   r+   r   s              r,   dynamic_frequency_updatez5dynamic_rope_update.<locals>.dynamic_frequency_updateR   s|    ))L)A-I!%!8!8 $ 6 6Fz2I!(*=P/QSWSjSj!k '<N.O P"|1%F''.y9L/;%	0,Hd,   F88!4h5 QDZL(;<gFT...3EHaHa3a !2 4 4V <  F88!46GTY ZDVH$568IJDZL(;<d>W>WX 4b.r.   c                     || j                   n| j                   |   }|d|ini }d|v r | |fd|j                  i| n|dk(  r | |fd|j                  i|  | ||fi |S )Nr   dynamicr)   longrope)r   r)   )	r'   xr(   r   r   kwargsr4   r-   rope_forwards	         r,   wrapperz$dynamic_rope_update.<locals>.wrapperx   s    &0&8DNNdnnZ>X	/9/E,
+2	!$T<SSFS*$%dLTTVTD!\<V<<r.   Nr   )r:   r;   r4   r-   s   ` @@r,   dynamic_rope_updater=   "   s1    !KF$YL <= = Nr.   r   r   r)   ztorch.devicer   r   returnztorch.Tensorc                    | j                          || j                  |   n| j                  }|d   }|d   }|j                  dd      }t        | dd      xs | j                  | j
                  z  }t        ||z        }	d}
d|t        j                  d|	dt        j                  	      j                  |t        j                  
      |	z  z  z  }||z  }||
fS )aX  
    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    Nfactor
rope_thetapartial_rotary_factor      ?head_dimr      dtyper)   rG   )standardize_rope_paramsr    getr!   hidden_sizenum_attention_headsintr   arangeint64r&   float)r   r)   r   r   rope_parameters_dictr@   baserB   rD   dimattention_factorr   s               r,   '_compute_linear_scaling_rope_parametersrU      s    B ""$AKAW611*=]c]s]s!(+F  -D0445LcRvz40dF4F4F&JdJd4dH
h..
/C du||AsAU[[ILLTZbgbmbmLnqttuvH
 H%%%r.   head_dim_keyc                 t   | j                          || j                  |   n| j                  }t        | |d      xs | j                  | j                  z  }|d   }|j                  dd      }|j                  dd      }	d}
t        |	|z  dz        }d|t        j                  dd|z  dt        j                        j                  |t        j                  	      |z  z  z  }|dz  |z
  }|dkD  r>t        j                  |t        j                  |t        j                  |
      fd      }n|}||z  }||
fS )a  
    Computes the inverse frequencies with proportional RoPE.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): The proportion of the embedding dimension
                to apply rotary positional encoding, e.g., [0.0, 0.25, 0.5, 0.75, 1.0]. Unlike other RoPE functions
                that use this parameter, proportional RoPE will always return an encoding that is the size of
                `head_dim`.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    NrA   r@   rC   rB   rE   r   rF   rH   rG   r)   )rS   )rI   r    r!   rK   rL   rJ   rM   r   rN   rO   r&   rP   catzerosfloat32)r   r)   r   r   rV   rQ   rD   rR   r@   rope_proportionrT   rope_anglesinv_freq_rotatednope_anglesr   s                  r,   %_compute_proportional_rope_parametersr`      sK   J ""$AKAW611*=]c]s]sv|T2ff6H6HFLfLf6fH-D!%%h4F*../FLOo0A56KLLAOQekkBEEV[`[f[fEgjrr	t
 a-+-KQ99 Ku}}VL 
 $H%%%r.   c                    | j                          || j                  |   n| j                  }|d   }|j                  dd      }t        | d| j                  | j
                  z        }t        ||z        }|d   }	d}
|| j                  }n{t        |t        j                        rKt        j                  |t        j                  | j                  |j                  |j                              }nt        || j                        }||	|z  | j                  z  |	dz
  z
  ||dz
  z  z  z  }d|t        j                   d	|dt        j"                  
      j%                  |t        j&                        |z  z  z  }||
fS )a
  
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The default sequence length used to update the dynamic RoPE at
                inference time
            *   rope_parameters (`dict[str, float]`): The standard RoPE scaling parameters, from which `factor`
                will be accessed. The value of `factor` is used to determine the new base frequency, along with the
                current sequence length (seq_len), the maximum positional embeddings (max_position_embeddings), and the
                computed dimensionality (dim) of the rotary embeddings. If seq_len <= max_position_embeddings, this
                factor has no effect. If seq_len <= max_position_embeddings, this factor effectively stretches the
                context window using an exponent derived from `dim`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time. If `None` or shorter than
            max_position_embeddings, this value will be overridden by max_position_embeddings.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    rA   rB   rC   rD   r@   rX   r	   rE   r   rF   rH   )rI   r    rJ   r!   rK   rL   rM   max_position_embeddings
isinstancer   TensormaximumtensorrG   r)   r   rN   rO   r&   rP   )r   r)   r   r   rQ   rR   rB   rD   rS   r@   rT   r   s               r,   _compute_dynamic_ntk_parametersrg     sw   V ""$AKAW611*=]c]s]s-D0445LcRvz6+=+=A[A[+[\H
h..
/C!(+F 00	GU\\	*--LL77w}}U\UcUcd

 gv==> FW$v'E'EE&ST*U[^behibi[jkkDdu||AsAU[[ILLTZbgbmbmLnqttuvH%%%r.   c                    | j                          || j                  |   n| j                  }|d   }|j                  dd      }t        | d| j                  | j
                  z        }t        ||z        }|d   }	|j                  d      }
|j                  d      }|j                  d      }|d	   }|	| j                  |z  }	dd}|
)|r|rt         ||	|       ||	|      z        }
n ||	      }
|j                  d      xs d}|j                  d      xs d
}d fd}d }|t        j                  d|d      j                  |t        j                        |z  z  }d|z  }d|	|z  z  }| j                  j                  dd      } |||||||      \  }}d
 ||||dz        j                  |t        j                        z
  }|d
|z
  z  ||z  z   }||
fS )aD  
    Computes the inverse frequencies with NTK scaling. Please refer to the
    [original paper](https://huggingface.co/papers/2309.00071)

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   rope_parameters (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied to the computed cos/sin.
                    If None, the value is inferred from `factor`, `mscale`, and `mscale_all_dim` as available.
                *   `beta_fast` (`float`, *optional*, defaults to 32): Parameter to set the boundary for extrapolation
                    (only) in the linear ramp function.
                *   `beta_slow` (`float`, *optional*, defaults to 1): Parameter to set the boundary for interpolation
                    (only) in the linear ramp function.
                *   `factor` (`float`, *optional*): The scaling factor applied when interpolating the position IDs to
                    extend the possible context length. Additionally, if `attention_factor` is None, the log of this
                    value is used to compute a value for `attention_factor`, possibly in conjunciton with `mscale` and
                    `mscale_all_dim`, if provided.
                *   `mscale` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale` acts scalar augmenting `log(factor)` when computing the
                    numerator for the inferred value of `attention_factor`. If not provided, `attention_factor` will be
                    calculated based on `factor` only.
                *   `mscale_all_dim` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale_all_dim` acts scalar augmenting `log(factor)` when computing
                    the denominator for the inferred value of `attention_factor`. If not provided, `attention_factor`
                    will be calculated based on `factor` only.
                *   `original_max_position_embeddings` (`int`): The original max position embeddings used during pretraining.
                *   `truncate` (`bool`, *optional*): Whether to truncate the correction range.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    rA   rB   rC   rD   r@   rT   mscalemscale_all_dimr   r	   c                 J    | dk  ryd|z  t        j                  |       z  dz   S )Nr	   rC   g?)mathlog)scaleri   s     r,   
get_mscalez,_compute_yarn_parameters.<locals>.get_mscale  s(    A:V|dhhuo-33r.   	beta_fast    	beta_slowc                     |t        j                  || dz  t         j                  z  z        z  dt        j                  |      z  z  S )zPInverse dimension formula to find the dimension based on the number of rotationsrE   )rl   rm   pi)num_rotationsrS   rR   rb   s       r,   find_correction_dimz5_compute_yarn_parameters.<locals>.find_correction_dim  sB    dhh6-!:Kdgg:UVWW\]`d`h`him`n\noor.   c                      | |||      } ||||      }|r*t        j                  |      }t        j                  |      }t        |d      t	        ||dz
        fS )z.Find dimension range bounds based on rotationsr   r	   )rl   floorceilr   min)	low_rothigh_rotrS   rR   rb   truncatelowhighrv   s	           r,   find_correction_rangez7_compute_yarn_parameters.<locals>.find_correction_range  s^    !'36MN"8S$8OP**S/C99T?D3{CcAg...r.   c                     | |k(  r|dz  }t        j                  |t         j                        | z
  || z
  z  }t        j                  |dd      }|S )NgMbP?rF   r   r	   )r   rN   r[   clamp)rz   r   rS   linear_func	ramp_funcs        r,   linear_ramp_factorz4_compute_yarn_parameters.<locals>.linear_ramp_factor  sL    #:5LC||Cu}}=Cc	RKKQ2	r.   r   rE   rH   r}   T)r	   )rI   r    rJ   r!   rK   rL   rM   rb   rP   r   rN   r&   )r   r)   r   r   rQ   rR   rB   rD   rS   r@   rT   ri   rj   r   ro   rp   rr   r   r   	pos_freqsinv_freq_extrapolationinv_freq_interpolationr}   r~   r   inv_freq_extrapolation_factorr   rv   s                              @r,   _compute_yarn_parametersr   G  s9   t ""$AKAW611*=]c]s]s-D0445LcRvz6+=+=A[A[+[\H
h..
/C!(+F+//0BC!%%h/F)--.>?N';<^'_$
 ~//2RR4 n$Z%?*VUcBd%de)&1 %((5;I$((5:Ip/ aa03363UX[[\I 9_ FY$67%%))*d;H%iCGgiqrIC %&(:3cQh(O(R(RZ`hmhshs(R(t$t!!&C"CD
 #@
@	A  %%%r.   c                 @   | j                          || j                  |   n| j                  }|d   }|j                  dd      }t        | d| j                  | j
                  z        }t        ||z        }|d   }	|d   }
|j                  d      }|j                  d      }|d	   }|| j                  |z  }|I|dk  rd}nAt        j                  d
t        j                  |      t        j                  |      z  z         }|r,||kD  r't        j                  |	t        j                  |      }n&t        j                  |
t        j                  |      }t        j                  d|dt        j                  |      j!                         |z  }d|||z  z  z  }||fS )a  
    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   original_max_position_embeddings (`int`, *optional*): The original max position embeddings used during
                pretraining. If not provided, defaults to `max_position_embeddings`.
            *   rope_parameters (`dict[str, float]`): The standard RoPE scaling parameters, from which the following keys
                will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, inferred from
                    the value of `factor`.
                *   `factor` (`float`, *optional*): The scaling factor to apply to the RoPE embeddings. If both
                    `max_position_embeddings` and `original_max_position_embeddings` are provided, this value will be
                    overridden s the ratio between those values.
                *   `long_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is provided and greater than `original_max_position_embeddings`.
                *   `short_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    rA   rB   rC   rD   long_factorshort_factorr@   rT   r   r	   rX   r   rE   )rI   r    rJ   r!   rK   rL   rM   rb   rl   sqrtrm   r   rf   r[   rN   rO   rP   )r   r)   r   r   rQ   rR   rB   rD   rS   r   r   r@   rT   r   ext_factorsinv_freq_shaper   s                    r,   _compute_longrope_parametersr     s   d ""$AKAW611*=]c]s]s-D0445LcRvz6+=+=A[A[+[\H
h..
/C&}5K'7L!%%h/F+//0BC';<^'_$
 ~//2RR S="#yyTXXf-=Ii@j-j)jk 7==ll;emmFSll<u}}VT\\!S!5;;vNTTVY\\NkD.$889H%%%r.   c                    | j                          || j                  |   n| j                  }|d   }|j                  dd      }t        | dd      xs | j                  | j
                  z  }t        ||z        }d}	d|t        j                  d|dt        j                        j                  |t        j                  	      |z  z  z  }
|d
   }|d   }|d   }|d   }||z  }||z  }dt        j                  z  |
z  }t        j                  ||kD  |
|z  |
      }||z  |z
  ||z
  z  }d|z
  |z  |z  ||z  z   }||k   ||kD   z  }t        j                  |||      }||	fS )a
  
    Computes the inverse frequencies for llama 3.1.

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   rope_parameters (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `factor` (`float`, *optional*): The scaling factor applied to the inverse frequencies when 1) the
                    wavelength is greater than `low_freq_wavelen` prior to smoothing, and 2) to all inverse frequencies
                    during smoothing.
                *   `high_freq_factor` (`float`): The scale factor used to compute `high_freq_wavelen` and
                    the value for the denominator of the smoothing factor prior to the `low_freq_factor` shift.
                *   `low_freq_factor` (`float`): The scale factor used to compute `low_freq_wavelen` and
                    the shift applied to the numerator and denominator of the smoothing factor.
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.
                *   `original_max_position_embeddings` (`int`): The original max position embeddings used
                    during pretraining. If not provided, the function falls back to `max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    NrA   rB   rC   rD   r   rE   rF   rH   r@   low_freq_factorhigh_freq_factorr   r	   )rI   r    rJ   r!   rK   rL   rM   r   rN   rO   r&   rP   rl   rt   where)r   r)   r   r   rQ   rR   rB   rD   rS   rT   r   r@   r   r   old_context_lenlow_freq_wavelenhigh_freq_wavelenwaveleninv_freq_llamasmooth_factorsmoothed_inv_freqis_medium_freqs                         r,   _compute_llama3_parametersr   &  s   Z ""$AKAW611*=]c]s]s  -D0445LcRvz40dF4F4F&JdJd4dH
h..
/C du||AsAU[[ILLTZbgbmbmLnqttuvH!(+F*+<=O+,>?*+MNO&8'*::$''kH$G [[+;!;X=NPXYN$w.@EUXgEghM]*n<vEXfHff!223BR8R6SSN[[1BNSN+++r.   )linearr6   yarnr7   llama3proportional.r#   c                       e Zd ZU dZedz  ed<   edz  ed<   edz  ed<   edz  ed<   edz  ed<   edz  ed<   edz  ed	<   edz  ed
<   ee   dz  ed<   ee   dz  ed<   edz  ed<   edz  ed<   y)RopeParametersu  
    Args:
        rope_theta (`float`, *optional*, defaults to `RotaryEmbeddingConfigMixin.default_theta`):
            The base period of the RoPE embeddings. Optional in serialized configs — if omitted,
            the model's `default_theta` (typically 10000.0) is used.
        rope_type (`str`, *optional*, defaults to "default"):
            The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
            'llama3'], with 'default' being the original RoPE implementation.
        partial_rotary_factor (`float`, *optional*):
            The percentage of the query and key head embedding on which RoPE will be applied.
        factor (`float`, *optional*):
            Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
            most scaling types, a `factor` of x will enable the model to handle sequences of length x *
            original maximum pre-trained length.
        original_max_position_embeddings (`int`, *optional*):
            Used with 'yarn', 'longrope' and 'llama3'. The original max position embeddings used during
            pretraining.
        attention_factor (`float`, *optional*):
            Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
            computation. If unspecified, it defaults to value recommended by the implementation, using the
            `factor` field to infer the suggested value.
        beta_fast (`float`, *optional*):
            Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
            ramp function. If unspecified, it defaults to 32.
        beta_slow (`float`, *optional*):
            Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
            ramp function. If unspecified, it defaults to 1.
        short_factor (`list[float]`, *optional*):
            Only used with 'longrope'. The scaling factor to be applied to short contexts (<
            `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
            size divided by the number of attention heads divided by 2
        long_factor (`list[float]`, *optional*):
            Only used with 'longrope'. The scaling factor to be applied to long contexts (<
            `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
            size divided by the number of attention heads divided by 2
        low_freq_factor (`float`, *optional*):
            Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
        high_freq_factor (`float`, *optional*):
            Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    NrA   r   rB   r@   r   rT   rp   rr   r   r   r   r   )	__name__
__module____qualname____doc__rP   __annotations__strrM   list r.   r,   r   r     s    'R Tz 4<'DL&)Dj0dl"t|t|u+$$et##T\!dl"r.   r   c                      e Zd ZdZdZ e       Zd Zd ZddZ	dde
dedz  fd	Zdde
dedz  fd
Zdde
dedz  fdZdde
dedz  fdZdde
dedz  fdZdde
dedz  fdZdde
dedz  fdZe	 	 ddededededz  dedz  f
d       Zy)RotaryEmbeddingConfigMixinz[
    A Mixin containing the functionality to standardize and validate RoPE parameters.
    g     @c                    |j                  dd       }|xs | j                  | _        | j                  | j                  ni | _        |j                  dt        | d| j                              }| j                  j	                  d|       |j                  dt        | dd             }|1| j                  j	                  d|       | j                  dhz  | _        | j                          |S )Nrope_scalingrA   rB   )popr    r!   default_theta
setdefaultrJ   ignore_keys_at_rope_validationrI   )r'   r9   r   rA   rB   s        r,   convert_rope_params_to_dictz6RotaryEmbeddingConfigMixin.convert_rope_params_to_dict  s    zz.$7+Ct/C/C7;7K7K7Wt33]_ ZZgdL$J\J\.]^
''jA &

+BGDRikoDp q ,  ++,CEZ[262U2UYpXq2qD/$$&r.   c                 d   t        | dd      }t        | dd      }t        | dd      xs i }t        | dd      }|s|st        j                  d       y|-|i k(  s(t        |j	                               j                  |      s|j                  d|j                  dd	             |j                  d|       |||d<   |d   d
v rt        | d      r!| j                  | j                  d<   || _
        y| j                  j                  d| j                         || _
        yt        |      D ]}  }||   j                  d||   j                  dd	             ||   j                  d|       ||||   d<   ||   d   d
v sU| j                  |   j                  d| j                          || _
        y)z
        Helper to standardize the config's rope params field by ensuring the params are defined for each
        later type. For old model the fn will duplicate a single rope param in each layer type (backward compatibility)
        rA   NrB   r    layer_typeszG`standardize_rope_params` was called but no RoPE parameters were found.r   typedefault)r   r   r7   r   )r!   loggerwarningsetkeysissubsetr   rJ   r"   r   r    rb   )r'   rA   rB   r    r   r   s         r,   rI   z2RotaryEmbeddingConfigMixin.standardize_rope_params  s    T<6
 '.Et L!$(94@FBdM48  :NNde Or$9_EYEYE[A\AeAefqAr&&{O4G4GPY4Z[&&|Z@$0;P 78 {+/MM4!CD PTOtOtD(()KL"  / ((334VX\XtXtu  / "+. 	

+66{OT^D_DcDcdjluDvw
+66|ZP(4K`OJ/0GH":.{;?]]((4??:D<X<X	  /r.   c                    t        | dd      }|syt        | dd      3t        |j                               j                  | j                        rnd|i}|j                         D ]j  }|j                  d|j                  dd            }t        | d| d	d      }||d<   | ||| j                  
       Rt        j                  d| d       l y)zY
        Validate the RoPE config arguments, given a `"PreTrainedConfig"` object
        r    Nr   full_attentionr   r   r   
_validate__rope_parametersignore_keyszMMissing validation function in 'RotaryEmbeddingConfigMixin' for 'rope_type'='')
r!   r   r   r   r   valuesrJ   r   r   r   )r'   rQ   r    r   validation_fns        r,   validate_ropez(RotaryEmbeddingConfigMixin.validate_rope  s      't->E#4-9cBVB[B[B]>^>g>g?
 $46J#K 3::< 
	O'++K9L9LVU^9_`I#DJykAQ*RTXYM+4OK((o4;^;^_cdmcnnop
	r.   Nr    r   c                 x    dh}dh}t        |j                               }|d   }| j                  |||||       y )Nr   rA   optional_keysr   )r   r   _check_received_keys)r'   r    r   required_keysr   received_keysr   s          r,   !_validate_default_rope_parametersz<RotaryEmbeddingConfigMixin._validate_default_rope_parameters$  sL    $%O0023#K0	!!}m=^i 	" 	
r.   c                     ddh}dh}t        |j                               }|d   }| j                  |||||       |d   }|t        |t              r|dk  rt
        j                  d|        y y Nr   r@   rA   r   rC   ;`rope_parameters`'s factor field must be a float >= 1, got r   r   r   rc   rP   r   r   r'   r    r   r   r   r   r   r@   s           r,    _validate_linear_rope_parametersz;RotaryEmbeddingConfigMixin._validate_linear_rope_parameters-      $h/%O0023#K0	!!}m=^i 	" 	
 !*>FE!:fslNNXY_X`ab ?Kr.   c                     ddh}dh}t        |j                               }|d   }| j                  |||||       |d   }|t        |t              r|dk  rt
        j                  d|        y y r   r   r   s           r,   !_validate_dynamic_rope_parametersz<RotaryEmbeddingConfigMixin._validate_dynamic_rope_parameters:  r   r.   c           	      4   h d}h d}t        |j                               }|d   }| j                  |||||       |d   }|t        |t              r|dk  rt
        j                  d|        |j                  d      }|-t        |t              r|d	k  rt
        j                  d
|        |j                  d      }	|	(t        |	t              st
        j                  d|	        |j                  d      }
|
(t        |
t              st
        j                  d|
        |	xs d|
xs dk  rt
        j                  d|	 d|
 d       | j                  d   }| j                  |z  }||k7  r&|dk7  r t
        j                  d| d| d| d       y y y )N>   r@   r   r   >   ri   r}   rp   rr   rA   rj   rT   r   r   r@   rC   r   rT   r   O`rope_parameters`'s attention_factor field must be a float greater than 0, got rp   z9`rope_parameters`'s beta_fast field must be a float, got rr   z9`rope_parameters`'s beta_slow field must be a float, got rq   r	   zR`rope_parameters`'s beta_fast field must be greater than beta_slow, got beta_fast=z( (defaults to 32 if None) and beta_slow=z (defaults to 1 if None)r   zKThe explicitly set RoPE scaling factor (config.rope_parameters['factor'] = z) does not match the ratio implicitly set by other parameters (implicit factor = post-yarn context length / pre-yarn context length = config.max_position_embeddings / config.rope_parameters['original_max_position_embeddings'] = z). Using the explicit factor (z) in YaRN. This may cause unexpected behaviour in model usage, please correct the 'original_max_position_embeddings' fields in the model config.)r   r   r   rc   rP   r   r   rJ   r    rb   warning_once)r'   r    r   r   r   r   r   r@   rT   rp   rr   r   implicit_factors                r,   _validate_yarn_rope_parametersz9RotaryEmbeddingConfigMixin._validate_yarn_rope_parametersG  s   S
 O0023#K0	!!)]M=fq!r *>FE!:fslNNXY_X`ab*../AB'<Le1TXhklXlNNabrast $''4	 Iu)ENNVW`Vabc#''4	 Iu)ENNVW`VabcO	Q/NNdendo p::CD\^ ,0+?+?@b+c(669YYf$A)=]^d]e fq ###A& J~	~ *>$r.   c                    h d}h d}t        |j                               }|d   }| j                  |||||       |j                  dd      }t	        | d| j
                  | j                  z        }t        ||z        }	|j                  d      }
t        |
t              s*t        d	 |
D              rt        j                  d
|
        t        |
      |	dz  k7  r't        j                  d|	dz   dt        |
              |j                  d      }t        |t              s*t        d |D              rt        j                  d|        t        |      |	dz  k7  r't        j                  d|	dz   dt        |              |j                  d      }|d   }||t        j                  d       nG||t        j                  d       n-t        |t              r|dk  rt        j                  d|        |j                  d      }|/t        |t              r|dk  rt        j                  d|        y y y )N>   r   r   r   r   >   r@   rA   rT   r   r   rB   rC   rD   r   c              3   H   K   | ]  }t        |t        t        f        y wr<   rc   rM   rP   .0r8   s     r,   	<genexpr>zPRotaryEmbeddingConfigMixin._validate_longrope_rope_parameters.<locals>.<genexpr>  s     5hVWjS%L6Q5h    "zF`rope_parameters`'s short_factor field must be a list of numbers, got rE   z8`rope_parameters`'s short_factor field must have length z, got r   c              3   H   K   | ]  }t        |t        t        f        y wr<   r   r   s     r,   r   zPRotaryEmbeddingConfigMixin._validate_longrope_rope_parameters.<locals>.<genexpr>  s     4fUVZC<5P4fr   zE`rope_parameters`'s long_factor field must be a list of numbers, got z7`rope_parameters`'s long_factor field must have length r@   r   av  This model config has set a `rope_parameters['original_max_position_embeddings']` field, to be used together with `max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_parameters`with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, as it is compatible with most model architectures.z4Missing required keys in `rope_parameters`: 'factor'r   rT   g        r   )r   r   r   rJ   r!   rK   rL   rM   rc   r   allr   r   lenr   rP   )r'   r    r   r   r   r   r   rB   rD   rS   r   r   r@   r   rT   s                  r,   "_validate_longrope_rope_parametersz=RotaryEmbeddingConfigMixin._validate_longrope_rope_parameters{  sF   hDO0023#K0	!!)]M=fq!r / 3 34KS Q4T-=-=AYAY-YZ(223&**>:,-#5h[g5h2hNNcdpcqrs|q(NNJ3RS8*TZ[^_k[lZmn &))-8+t,4fZe4f1fNNbcnbopq{sax'NNI#QR(SYZ]^iZjYkl !$$X.+:;]+^( >>JE ^ @ HNNQRFE*fslNNXY_X`ab*../AB'<Le1TXhknXnNNabrast Yo'r.   c                    h d}|d   }t        |j                               }| j                  ||||       |d   }|t        |t              r|dk  rt
        j                  d|        |d   }|d   }|t        |t              st
        j                  d	|        |t        |t              st
        j                  d
|        ||k  rt
        j                  d| d|        |d   }	|	t        |	t              st
        j                  d|	        |	| j                  k\  r&t
        j                  d|	 d| j                          y y )N>   r@   r   rA   r   r   r   r   r   r@   rC   r   r   r   z?`rope_parameters`'s low_freq_factor field must be a float, got z@`rope_parameters`'s high_freq_factor field must be a float, got zf`rope_parameters`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=z and low_freq_factor=r   zS`rope_parameters`'s original_max_position_embeddings field must be an integer, got zj`rope_parameters`'s original_max_position_embeddings field must be less than max_position_embeddings, got z and max_position_embeddings=)	r   r   r   rc   rP   r   r   rM   rb   )
r'   r    r   r   r   r   r@   r   r   r   s
             r,    _validate_llama3_rope_parametersz;RotaryEmbeddingConfigMixin._validate_llama3_rope_parameters  sz   
 $K0	O0023!!)]MWb!c *>FE!:fslNNXY_X`ab)*;<*+=>"*_e*LNN\]l\mno#:6F+NNN]^n]opq.NNx#$$9/9JL
 ,;;]+^(+3:Ffhk;lNNe346 ,t/K/KKNN|344QRVRnRnQoq Lr.   c                     ddh}|d   }t        |j                               }| j                  ||||       |j                  d      }|t        j                  d       y y )Nr   rA   r   rB   z`rope_parameters`'s partial_rotary_factor is None. This will default to 1.0 in the computation, making this equivalent to the linear_scaling RoPE type. Provide a value in the range [0.0, 1.0) to make use of the proportional RoPE funcitonality.)r   r   r   rJ   r   r   )r'   r    r   r   r   r   rB   s          r,   &_validate_proportional_rope_parameterszARotaryEmbeddingConfigMixin._validate_proportional_rope_parameters  sp    $l3#K0	O0023!!)]MWb!c / 3 34K L (NNC )r.   r   r   r   r   c                    d|v r|dhz  }|j                  d       |xs
 t               }d|vr|j                  d       ||t        |      z  }||z
  }|rt        d|  d|       ||z
  |z
  }|rt        j	                  d|  d|        yy)z\Compare the received keys in `config.rope_parameters` against the expected and optional keysr   r   rB   Nz<Missing required keys in `rope_parameters` for 'rope_type'='z': z8Unrecognized keys in `rope_parameters` for 'rope_type'=')addr   KeyErrorr   r   )r   r   r   r   r   missing_keysunused_keyss          r,   r   z/RotaryEmbeddingConfigMixin._check_received_keys  s     ]"fX%Mk*%."-756 "S--M$}4YZcYddghtguvww#m3mCNNUV_U``cdocpqr r.   )r'   r   r<   )NN)r   r   r   r   r   r   r   r   rI   r   dictr   r   r   r   r   r   r   staticmethodr   r   r   r.   r,   r   r     s>    M%(U"*./`:
 
TWZ^T^ 
c cSVY]S] cc cTWZ^T^ c2d 2QTW[Q[ 2h0$ 0UX[_U_ 0d' 'SVY]S] 'Rd Y\_cYc  
 %)"&sss s Tz	s
 4Zs sr.   r   r   c                 x    t        j                  dt               | j                          | j	                          y)zq
    This is a deprecated function.
    It has been kept for backward compatibility with custom code models.
    aX  `rope_config_validation` is deprecated and has been removed. Its functionality has been moved to RotaryEmbeddingConfigMixin.validate_rope method. PreTrainedConfig inherits this class, so please call self.validate_rope() instead. Also, make sure to use the new rope_parameters syntax. You can call self.standardize_rope_params() in the meantime.N)warningswarnFutureWarningrI   r   )r   r   s     r,   rope_config_validationr     s5    
 MM	G
 	 ""$
r.   )NNNN)NNNNrD   )NNNr<   )%rl   r   collections.abcr   	functoolsr   typingr   r   r   utilsr
   r   
get_loggerr   r   r   configuration_utilsr   r=   rM   r   tuplerP   rU   r`   rg   r   r   r   r#   r   r   r   r   r   r   r   r.   r,   <module>r      s
     $  5 5 . 
		H	% 5`H ,0'+!	3&'(3&^$3& 4Z3& d
	3&
 >5 !3&n ,0'+!"C&'(C&^$C& 4ZC& d
	C&
 C& >5 !C&N ,0'+!	C&'(C&^$C& 4ZC& d
	C&
 >5 !C&P (,!	D&D&^$D& 4ZD& d
	D&
 >5 !D&R (,!	U&U&^$U& 4ZU& d
	U&
 >5 !U&t (,!	L,L,^$L, 4ZL, d
	L,
 >5 !L,f 6.$,(9O T#xU>53H-I(IJJK 5#Y 5#pFs FsR
#= CRVJ r.   