
    i3                         d dl mZ ddlmZ erddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZ ddlmZ  e       r
d d	lZdd
lmZ  ej&                  e      Zd	Z G d de      Zy	)    )TYPE_CHECKING   )HfQuantizer   )PreTrainedModel)Mxfp4Config)is_accelerate_availableis_kernels_availableis_torch_availableis_triton_availablelogging)get_module_from_nameN)WeightConverterc                        e Zd ZU dZdZded<    fdZd Zd Zdd	d
e	de
fdZddZ	 ddd	de
fdZd Zd Zd Zd Zede
fd       Zd Zd Z xZS )Mxfp4HfQuantizerz/
    FP4 quantization using fbgemm kernels
    Fr   quantization_configc                 4    t        |   |fi | d | _        y N)super__init__triton_kernels_hub)selfr   kwargs	__class__s      x/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/quantizers/quantizer_mxfp4.pyr   zMxfp4HfQuantizer.__init__2   s    ,77"&    c                     | j                    	 ddlm}  |d      | _         | j                   S | j                   S # t        $ r t        d      w xY w)z3Lazy import and initialize kernels only when neededr   )
get_kernelz(kernels-community/gpt-oss-triton-kernelsz2kernels package is required for MXFP4 quantization)r   integrations.hub_kernelsr   ImportError)r   r   s     r   _lazy_import_kernelsz%Mxfp4HfQuantizer._lazy_import_kernels6   s]    ""*XA*45_*`' &&&t&&&  X!"VWWXs	   9 Ac                 >   t               st        d      | j                  j                  ry t	               st        d      t
        j                  j                         xs t        j                  d      }|j                  dvrF| j                  r+t        j                  d| d       d| j                  _        y t        d| d	      t
        j                  j                         rd}t!        d
      }t#               }nt
        j$                  j                         r9t
        j$                  j'                         }|dk\  }t!        d      }t#               }n-|j                  dk(  rd}t!        d
      }t#               }nd}d}d}| j                  r{|s't        j                  d       d| j                  _        y |s't        j                  d       d| j                  _        y |sNt        j                  d       d| j                  _        y |st)        d      |st)        d      |st)        d      | j                  s| j+                          |j-                  d      }|<t/        |t0              r+| j                  sd|j3                         v rt)        d      y y y y )NzqUsing mxfp4 quantization requires torchPlease install the latest version of torch ( pip install --upgrade torch )z9Using mxfp4 requires Accelerate: `pip install accelerate`cpu)cudaxpur#   zGUsing MXFP4 quantized models requires model on cuda/xpu/cpu, but found zj, we will default to dequantizing the model to bf16. To use mxfp4, please disable the current accelerator.TzIQuantizing a model using MXFP4 requires model on cuda/xpu/cpu, but found z7. To use mxfp4, please disable the current accelerator.z3.5.0)      z3.4.0Fu   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series). We will default to dequantizing the model to bf16.zMXFP4 quantization requires Triton: CUDA requires Triton >= 3.4.0, XPU/CPU requires Triton >= 3.5.0. Please install triton: `pip install triton`. We will default to dequantizing the model to bf16.zMXFP4 quantization requires the `kernels` package: `pip install kernels>=0.12.0`. We will default to dequantizing the model to bf16.u   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series) or CPUzMXFP4 quantization requires Triton: CUDA requires Triton >= 3.4.0, XPU/CPU requires Triton >= 3.5.0. Please install triton: `pip install triton`zPMXFP4 quantization requires the `kernels` package: `pip install kernels>=0.12.0`
device_mapdiskzYou are attempting to load an FP4 model with a device_map that contains a disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the disk device from the device_map.)r   r    r   
dequantizer	   torchacceleratorcurrent_acceleratordevicetypepre_quantizedloggerwarning_onceRuntimeErrorr%   is_availabler   r
   r$   get_device_capability
ValueErrorr!   get
isinstancedictvalues)	r   argsr   r.   is_device_supported_mxfp4triton_availablekernels_installedcompute_capabilityr(   s	            r   validate_environmentz%Mxfp4HfQuantizer.validate_environmentA   s   !#] 
 ##..&(YZZ""668OELL<O;;44!!##]^d]e  fP  Q 7;((3"_`f_g  h_  `  99!!#(,%27; 4 6ZZ$$&!&!A!A!C(:f(D%27; 4 6[[E!(,%27; 4 6(-%$ %,##I
 7;((3###I
 7;((3$##I
 7;((3*l  "`  #opp!!%%'ZZ-
!jT&B%%&J4E4E4G*G g  +H% 'C!r   modelr   
param_namereturnc                 R    ddl m} t        ||      \  }}t        ||      r|dv ryyy)Nr   Mxfp4GptOssExperts)down_proj_biasgate_up_proj_biasFT)integrationsrF   r   r8   )r   rA   rB   r   rF   moduletensor_names          r   param_needs_quantizationz)Mxfp4HfQuantizer.param_needs_quantization   s3    525*Ef01EEr   c                     t         j                  j                         rt         j                  j                          y t         j                  j                         rt         j                  j                          y y r   )r+   r$   r4   empty_cacher%   )r   rA   r   s      r   #_process_model_after_weight_loadingz4Mxfp4HfQuantizer._process_model_after_weight_loading   sG    ::""$JJ""$YY##%II!!# &r   use_kernelsc                    ddl m} t        j                  j	                         xs t        j
                  d      }|r4|j                  dvr&t        j                  d       d| j                  _
        |s4|j                  dv r&t        j                  d       d| j                  _
        | j                  || j                  j                  |j                        | _         ||| j                  | j                        }y )	Nr   )replace_with_mxfp4_linearr#   )r#   zYou are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=FalseTzMXFP4 inference on CPU requires use_kernels=True, but use_kernels is disabled. We will dequantize the model to bf16. To run MXFP4 natively on CPU, please set use_kernels=True.)modules_to_not_convertr   )rI   rR   r+   r,   r-   r.   r/   r1   r2   r   r*   get_modules_to_not_convertrS   _keep_in_fp32_modules)r   rA   rP   r   rR   r.   s         r   $_process_model_before_weight_loadingz5Mxfp4HfQuantizer._process_model_before_weight_loading   s     	= ""668OELL<O6;;g5e 37D$$/v{{g5s 37D$$/&*&E&E4++BBED_D_'
# *$*E*E[_[s[s
r   c                     d|j                   j                  v r-t        |dd        |j                  j	                  ddddd       |S )NGptOssConfigbase_model_tp_plangrouped_gemmz(layers.*.mlp.experts.gate_up_proj_blocksz(layers.*.mlp.experts.gate_up_proj_scalesz%layers.*.mlp.experts.down_proj_blocksz%layers.*.mlp.experts.down_proj_scales)r   __name__getattrrY   updater   configs     r   update_tp_planzMxfp4HfQuantizer.update_tp_plan   R    V--666v3T:F))00DRDRAOAO	 r   c                     d|j                   j                  v r-t        |dd        |j                  j	                  ddddd       |S )NrX   base_model_ep_planrZ   r[   )r   r\   r]   rd   r^   r_   s     r   update_ep_planzMxfp4HfQuantizer.update_ep_plan   rb   r   c                 0   ddl m} |j                         }t        |j                  dd      }t        |j                  dd      }|j                         D ]9  \  }}t        ||      rt        |d      rt        |d      s,d	D ]  }t        ||      }	t        || d
      }
|	j                  j                  j                  |	j                  j                        j                  dd      }|dk(  r|j                  |ddd      }n|j                  ||dd      }|
j                  j                  j                  j                  |
j                  j                  j                        j                  dd      }||| d| d<   ||| d| d<    < i }||fS )Nr   rE   num_local_experts    hidden_sizei@  gate_up_proj	down_proj)rj   rk   _precision_configZ      ._blocks_scales)rI   rF   
state_dictr]   r`   named_modulesr8   hasattrstoragelayoutunswizzle_datadata	transposereshapeweight_scale)r   rA   rF   rt   rg   ri   namerJ   projtriton_tensorprecision_configblocksscalesmetadatas                 r   get_state_dict_and_metadataz,Mxfp4HfQuantizer.get_state_dict_and_metadata   s   5%%'
#ELL2ErJellM4@!//1 	=LD&6#56FN3FK05 = ' 5#*6dV;L3M#N &..55DD]EZEZE_E_`jjkmoqr>)#^^,=r2rJF#^^,={BPRSF)66>>EETT$1199>>)B#  7=
dV1TF'236<
dV1TF'23=	=2 8##r   c                      y)NT r   s    r   is_serializablez Mxfp4HfQuantizer.is_serializable  s    r   c                 .    t         j                  d       y)NzMXFP4 quantization don't support training, please consider dequantizing the model first by passing quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()F)r1   r2   r   s    r   is_trainablezMxfp4HfQuantizer.is_trainable  s     x	
 r   c                     ddl m}  ||       S )Nr   )Mxfp4Quantize)integrations.mxfp4r   )r   r   s     r   get_quantize_opsz!Mxfp4HfQuantizer.get_quantize_ops  s    6T""r   c                    ddl m}m} | j                  rE| j                  j
                  r/t        ddgd ||       g      t        ddgd	g ||       g      gS t        ddgd	 ||       g      t        ddgd ||       g      gS )
Nr   )Mxfp4DequantizeMxfp4Deserializedown_proj_blocksdown_proj_scalesz
down_proj$)source_patternstarget_patterns
operationsgate_up_proj_blocksgate_up_proj_scaleszgate_up_proj$)r   r   r   r0   r   r*   r   )r   r   r   s      r   get_weight_conversionsz'Mxfp4HfQuantizer.get_weight_conversions  s    J$":":"E"E%79K$L$1 / 56
  %:<Q$R%4$5 / 56  !68M N 0,T23
 !35G H -,T23
 	
r   )rA   r   )F)r\   
__module____qualname____doc__requires_calibration__annotations__r   r!   r@   strboolrL   rO   rV   ra   re   r   r   propertyr   r   r   __classcell__)r   s   @r   r   r   *   s     !&&'	'^@.? S _c $ "
 
 
B!$F d  #

r   r   )typingr   baser   modeling_utilsr   utils.quantization_configr   utilsr	   r
   r   r   r   quantizers_utilsr   r+   core_model_loadingr   
get_loggerr\   r1   r   r   r   r   r   <module>r      s[    !  07  3 4			H	% Q
{ Q
r   