
    ii                         d dl mZ ddlmZmZmZmZ ddlmZ ddl	m
Z
  e       rd dlZerddlmZ dd	lmZ  ej                   e      Z G d
 de      Zy)    )TYPE_CHECKING   )is_accelerate_availableis_torch_availableis_torch_xpu_availablelogging   )HfQuantizer)get_module_from_nameN)PreTrainedModel)FineGrainedFP8Configc                        e Zd ZU dZdZded<    fdZd Zddd	ed
e	fdZ
ddd	eddd
ef fdZ	 	 ddZd Zd Zed
e	fd       Zed
e	fd       Zd Zd Z xZS )FineGrainedFP8HfQuantizerz
    FP8 quantization implementation supporting both standard and MoE models.
    Supports both e4m3fn formats based on platform.
    Fr   quantization_configc                 &    t        |   |fi | y )N)super__init__)selfr   kwargs	__class__s      /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/quantizers/quantizer_finegrained_fp8.pyr   z"FineGrainedFP8HfQuantizer.__init__   s    ,77    c                 *   t               st        d      | j                  j                  ry t        j
                  j                         sHt               s>| j                  r't        j                  d       d| j                  _        y t        d      t        j
                  j                         r`t        j
                  j                         }|\  }}|dk  s
|dk(  r3|dk  r.t        j                  d| d| d	       d| j                  _        y |j                  d
      }|t        j                  d       y t        |t              rJ| j                  s t!        |      dkD  rd|j#                         v sd|j#                         v rt%        d      y y )NzMLoading an FP8 quantized model requires accelerate (`pip install accelerate`)zUsing FP8 quantized models requires a GPU or XPU, we will default to dequantizing the model to bf16 since no GPU or XPU is availableTzANo GPU or XPU found. A GPU or XPU is needed for FP8 quantization.   	   ziFP8 quantized models is only supported on GPUs with compute capability >= 8.9 (e.g 4090/H100), actual = `.z`. We will default to dequantizing the model to bf16. Feel free to use a different quantization method like bitsandbytes or torchao
device_mapzYou have loaded an FP8 model on CPU and have a CUDA or XPU device available, make sure to set your model on a GPU or XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or 'xpu'. r	   cpudiskzYou are attempting to load an FP8 model with a device_map that contains a cpu/disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the cpu/disk device from the device_map.)r   ImportErrorr   
dequantizetorchcudais_availabler   pre_quantizedloggerwarning_onceRuntimeErrorget_device_capabilityget
isinstancedictlenvalues
ValueError)r   argsr   compute_capabilitymajorminorr   s          r   validate_environmentz.FineGrainedFP8HfQuantizer.validate_environment   s{   &(mnn##..zz&&(1G1I!!## [ 7;((3"#fgg::""$!&!A!A!C-LE5	uzeai####('5' 2Z[
 7;((3ZZ-
6
 
D)&&
Oa'Z..00Z..00 k  1 *r   modelr   
param_namereturnc                 t    ddl m}m} t        ||      \  }}t	        |||f      r| j
                  s|dk(  ryyy)Nr   )
FP8Experts	FP8LinearbiasFT)integrations.finegrained_fp8r9   r:   r   r+   r%   )r   r5   r6   r   r9   r:   moduletensor_names           r   param_needs_quantizationz2FineGrainedFP8HfQuantizer.param_needs_quantizationO   s>    H25*Efy*56!![F%:r   paramztorch.Tensorc                 L    | j                  ||      ryt        | 	  |||      S )z4Return the element size (in bytes) for `param_name`.r	   )r?   r   param_element_size)r   r5   r6   r@   r   s       r   rB   z,FineGrainedFP8HfQuantizer.param_element_sizeZ   s*    ((
;w)%UCCr   c                     ddl m} | j                  || j                  j                  |j
                        | _         ||| j                  | j                  | j                        }y )Nr   )replace_with_fp8_linear)modules_to_not_convertr   r%   )r<   rD   get_modules_to_not_convertr   rE   _keep_in_fp32_modulesr%   )r   r5   r   rD   s       r   $_process_model_before_weight_loadingz>FineGrainedFP8HfQuantizer._process_model_before_weight_loadinga   s^    
 	K&*&E&E4++BBED_D_'
# (#'#>#> $ 8 8,,	
r   c                 f    d|j                   j                  v rddddddddddddddd}||_        |S )NQwen3colwiserowwise)z layers.*.self_attn.q_proj.weightz*layers.*.self_attn.q_proj.weight_scale_invz layers.*.self_attn.k_proj.weightz*layers.*.self_attn.k_proj.weight_scale_invz layers.*.self_attn.v_proj.weightz*layers.*.self_attn.v_proj.weight_scale_invz layers.*.self_attn.o_proj.weightz*layers.*.self_attn.o_proj.weight_scale_invzlayers.*.mlp.gate_proj.weightz'layers.*.mlp.gate_proj.weight_scale_invzlayers.*.mlp.up_proj.weightz%layers.*.mlp.up_proj.weight_scale_invzlayers.*.mlp.down_proj.weightz'layers.*.mlp.down_proj.weight_scale_inv)r   __name__base_model_tp_plan)r   config	text_plans      r   update_tp_planz(FineGrainedFP8HfQuantizer.update_tp_plans   sV    f&&///4=>G4=>G4=>G4=>G1:;D/89B1:;DI" )2F%r   c                      yNT r   s    r   is_serializablez)FineGrainedFP8HfQuantizer.is_serializable   s    r   c                      y)NFrT   rU   s    r   is_trainablez&FineGrainedFP8HfQuantizer.is_trainable   s    r   c                      yrS   rT   rU   s    r   is_compileablez(FineGrainedFP8HfQuantizer.is_compileable   s    r   c                     ddl m}  ||       S )Nr   )Fp8Quantize)r<   r\   )r   r\   s     r   get_quantize_opsz*FineGrainedFP8HfQuantizer.get_quantize_ops   s    >4  r   c                     ddl m} ddlm} | j                  r+| j
                  j                  r |g dd ||       g      gS g S )Nr   )WeightConverter)Fp8Dequantize)zweight$weight_scale_invactivation_scaleweight)source_patternstarget_patterns
operations)core_model_loadingr_   r<   r`   r%   r   r!   )r   r_   r`   s      r   get_weight_conversionsz0FineGrainedFP8HfQuantizer.get_weight_conversions   sK    8@$":":"E"E  $W$, -d 34  	r   )r5   r   )rM   
__module____qualname____doc__requires_calibration__annotations__r   r4   strboolr?   floatrB   rH   rQ   rV   propertyrX   rZ   r]   rh   __classcell__)r   s   @r   r   r      s    
 !//8/b	.? 	S 	_c 	D(9 Ds DSa Dfk D
 
$. d     !
r   r   )typingr   utilsr   r   r   r   baser
   quantizers_utilsr   r"   modeling_utilsr   utils.quantization_configr   
get_loggerrM   r&   r   rT   r   r   <module>rz      sH      ` `  2 0@			H	%V Vr   