
    i)              
       ~   d Z ddlmZmZ ddlmZ ddlmZmZ  e       r
ddl	Z	ddl
mZ  ej                  e      Zdad Z G d d	ej"                        Z	 	 	 dd
ee   dz  defdZde	j.                  dedefdZde	j.                  de	j.                  de	j.                  dedef
dZ G d de      Z G d de      Zy)a  
Metal affine quantization integration for transformers.

This module provides:
  - ``MetalLinear``: a drop-in replacement for ``nn.Linear`` that stores weights
    as affine-quantized uint32 packed tensors and uses the ``quantization-mlx``
    Metal kernels for the forward pass.
  - ``replace_with_metal_linear``: walks a model and swaps every eligible
    ``nn.Linear`` with ``MetalLinear``.
  - ``MetalQuantize`` / ``MetalDequantize``: weight conversion operations that
    participate in the new ``WeightConverter`` pipeline.

Weight layout (transposed, matching ``affine_qmm_t``):
  - ``weight``: ``[N, K_packed]`` (``uint32``) -- K is the packed dimension.
  - ``scales``:  ``[N, K // group_size]`` (``float16 / bfloat16``)
  - ``qbiases``: ``[N, K // group_size]`` (same dtype as scales)

The kernel call is ``affine_qmm_t(x, weight, scales, qbiases, group_size, bits)``
which computes ``y = x @ dequant(weight).T``, identical to ``nn.Linear``.
   )ConversionOps_IdentityOp)should_convert_module)is_torch_availablelogging    Nc                      t         	 ddlm}   | d      a t         S t         S # t        $ r}t	        d| d      |d}~ww xY w)z>Lazily load the quantization-mlx kernel from Hugging Face Hub.N   )
get_kernelz0kernels-community/mlx-quantization-metal-kernelsz9Failed to load the quantization-mlx kernel from the Hub: zm. Make sure you have `kernels` installed (`pip install kernels`) and are running on an Apple Silicon machine.)_metal_kernelhub_kernelsr   	ExceptionImportError)r   es     }/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/integrations/metal_quantization.py_get_metal_kernelr   3   s`     		/&'YZM =  	KA3 O? ? 		s   " 	A ;A c                       e Zd ZdZdej
                  ddfdedededed	ef
d
Zdej                  dej                  fdZ
y)MetalLinearz
    A quantized linear layer that stores weights in affine uint32 packed format
    and uses the ``quantization-mlx`` Metal kernels for the forward pass.

    Parameters match ``nn.Linear`` with additional quantization metadata.
    F      in_featuresout_featuresbiasbits
group_sizec                 :   t         j                  j                  |        || _        || _        || _        || _        d|z  }||z  }||z  }	|t        j                  k(  rAt        j                  t        j                  ||t        j                        d      | _        n2t        j                  t        j                  |||      d      | _        |t        j                  k(  rt        j                  nd }
t        j                  t        j                  ||	|
      d      | _        t        j                  t        j                  ||	|
      d      | _        |r.t        j                  t        j                  |            | _        y | j!                  dd        y )N    )dtypeF)requires_gradr   )nnModule__init__r   r   r   r   torchuint32	Parameterzerosweightfloat32scalesqbiasesr   register_parameter)selfr   r   r   r   r   r   elems_per_intk_packedn_groupsscales_dtypes              r   r"   zMetalLinear.__init__Q   s'    			4 &(	$d
-/*,ELL ,,u{{<QVQ]Q]'^nstDK,,u{{<TY'ZjopDK(-(=u}}4ll5;;|X\#Zjop||EKKhl$[kpqU[[%>?DI##FD1    inputreturnc                    | j                   j                  t        j                  k7  r5t        j
                  j                  || j                   | j                        S t               }|j                  || j                   | j                  j                  |j                        | j                  j                  |j                        | j                  | j                        }| j                  || j                  z   }|S N)r'   r   r#   r$   r    
functionallinearr   r   affine_qmm_tr)   tor*   r   r   )r,   r2   kerneloutputs       r   forwardzMetalLinear.forwards   s    ;;,==''t{{DIIFF"$$$KKKKNN5;;'LLOOEKK(OOII
 99 dii'Fr1   N)__name__
__module____qualname____doc__r#   r$   intboolr"   Tensorr<    r1   r   r   r   I   sj     ll 2 2  2 	 2  2  2DU\\ ell r1   r   modules_to_not_convertpre_quantizedc           
         |j                   r| S |j                  }|j                  }d}| j                         D ]z  \  }}t	        ||      st        |t        j                        s.|ri nddi}	t        d|j                  |j                  |j                  du||d|	}
| j                  ||
       d}| |st        j                  d       | S )a`  
    Replace every eligible ``nn.Linear`` with ``MetalLinear``.

    Args:
        model: the ``PreTrainedModel`` (on the meta device at this point).
        modules_to_not_convert: module names to leave untouched.
        quantization_config: the ``MetalConfig`` instance.
        pre_quantized: ``True`` when loading from a quantized checkpoint.
    Fr   N)r   r   r   r   r   TzYou are loading a model with Metal quantization but no nn.Linear modules were found. Please double check your model architecture.rD   )
dequantizer   r   named_modulesr   
isinstancer    Linearr   r   r   r   set_submoduleloggerwarning)modelrE   quantization_configrF   r   r   has_been_replacedmodule_namemodulemodule_kwargs
new_modules              r   replace_with_metal_linearrV      s     %%##D$//J$224 %V$[2HIfbii("/Bgt_M$ "..#00[[,%  J Z8 $!%$ ;	

 Lr1   r'   r   r   c                 
   | j                   \  }}d|z  }d|z  dz
  }||z  }| j                         j                  |||      }|j                  d      j                  }	|j                  d      j                  }
|
|	z
  |z  j                  d      }|	}||j                  d      z
  |j                  d      z  }|j                         j                  d|      j                  t        j                        j                  ||      }||z  }t        j                  ||t        j                  | j                        }t        |      D ]  }||d	d	|d	|f   ||z  z  z  } |j                  t        j                        ||fS )
aP  
    Quantize a 2-D float weight ``[N, K]`` into packed uint32 + scales + biases.

    Returns ``(w_packed, scales, biases)`` with:
      - ``w_packed``: ``[N, K // (32 // bits)]`` uint32
      - ``scales``:   ``[N, K // group_size]`` float32/float16/bfloat16
      - ``biases``:   ``[N, K // group_size]`` float32/float16/bfloat16
    r   r
   )dimg:0yE>)minr   r   deviceN)shapefloatreshaperZ   valuesmaxclamp	unsqueezeroundr9   r#   int32r&   r\   ranger$   )r'   r   r   NKr-   max_valr/   	w_groupedw_minw_maxr)   biasesw_intr.   w_packedis                    r   _affine_quantize_tensorrq      sm    <<DAq$JMDyAoGJH&&q(J?IMMbM!((EMMbM!((Eu}'..4.8FF))"--1A1A"1EEEKKM7+..u{{;CCAqIE M!H{{1hekk&--PH=! =E!Q---.4!8<<= ;;u||$ff44r1   ro   r)   rm   c                 2   | j                   d   }d|z  }d|z  dz
  }| j                   d   |z  }| j                  t        j                        }	t        j                  ||t        j
                  | j                        }
t        |      D ]%  }|	||z  z	  |z  j                         |
dd|d|f<   ' |
j                  |d|      }||j                         j                  d      z  |j                         j                  d      z   }|j                  ||      S )zv
    Dequantize a packed uint32 weight ``[N, K_packed]`` back to float.

    Returns a ``[N, K]`` float32 tensor.
    r   r   r
   r[   NrX   )r]   r9   r#   re   r&   r(   r\   rf   r^   r_   rc   )ro   r)   rm   r   r   rg   r-   ri   rh   
w_packed_iw_flatrp   rj   w_deqs                 r   _affine_dequantize_tensorrv      s    	qA$JMDyAoGqM)AU[[)J[[AU]]8??KF=! U(2tax(@G'K&R&R&Tq!"]""#U q"j1I0044v||~7O7OPR7SSE==Ar1   c                   &    e Zd ZdZd ZdedefdZy)MetalQuantizez
    Quantize a full-precision weight tensor into (weight, scales, qbiases).

    Used during quantize-on-the-fly.  The float ``weight`` is replaced in-place
    by the packed uint32 tensor.
    c                     || _         y r5   hf_quantizerr,   r{   s     r   r"   zMetalQuantize.__init__   
    (r1   
input_dictr3   c                    t        t        |j                                     \  }}t        |t              r|d   n|}| j
                  j                  j                  }| j
                  j                  j                  }t        |||      \  }}}	d|v r|j                  dd      d   nd}
|
r|
 dnd}|
r|
 dnd}|j                  }||||j                  |      ||	j                  |      iS )	Nr   .r
    z.scalesr)   z.qbiasesr*   )nextiteritemsrJ   listr{   rP   r   r   rq   rsplitr   r9   )r,   r~   kwargs
target_keyvaluer   r   ro   r)   rm   base	scale_keybias_key
orig_dtypes                 r   convertzMetalQuantize.convert   s     j&6&6&8!9:
E&ud3a  4499&&::EE
#:5*d#S &&/2j/@z  a(+b(,tfG$(	(,dV8$)[[
vyy,fii
+
 	
r1   N)r=   r>   r?   r@   r"   dictr   rD   r1   r   rx   rx      s    )
$ 
T 
r1   rx   c                   D    e Zd ZdZd Zd	dededz  defdZed
d       Z	y)MetalDequantizez
    Dequantize (weight, scales, qbiases) back to a full-precision tensor.

    Used when ``dequantize=True`` is set in the config to fall back to a normal
    ``nn.Linear`` on devices without MPS.
    c                     || _         y r5   rz   r|   s     r   r"   zMetalDequantize.__init__  r}   r1   Nr~   full_layer_namer3   c                 4   | j                   j                  j                  }| j                   j                  j                  }t	        |      dk  r||d   iS |d   d   }|d   d   }|d   d   }t        |||||      }	||	j                  |j                        iS )Nr   zweight$r   r)   r*   )r{   rP   r   r   lenrv   r9   r   )
r,   r~   r   r   r   r   	quantizedr)   r*   ru   s
             r   r   zMetalDequantize.convert  s      4499&&::EE
z?Q#Z	%:;;y)!,	H%a(Y'*))VWjRVW&,,!788r1   c                     t               S r5   )r   )r,   s    r   
reverse_opzMetalDequantize.reverse_op*  s
    }r1   r5   )r3   r   )
r=   r>   r?   r@   r"   r   strr   propertyr   rD   r1   r   r   r     s?    )9$ 9t 9Y] 9  r1   r   )NNF)r@   core_model_loadingr   r   quantizers.quantizers_utilsr   utilsr   r   r#   torch.nnr    
get_loggerr=   rM   r   r   rK   r   r   r   rB   rV   rC   rA   rq   rv   rx   r   rD   r1   r   <module>r      s   * < ? /  
		H	%,;")) ;@ 04	/ I,/ 	/d5ELL 5c 5 5Bll$)LL:?,,TW_b.
M 
@m r1   