
    i                        d dl Z d dlmZ d dlZd dlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ ddlmZ ddlmZmZ d	d
lmZ d	dlmZmZ  ej.                  e      Ze j4                  Z e j8                  e      j:                  Z e j8                  e      j>                  Z dZ!da"da#da$da%da&da'da(da)da*d Z+d Z,e jZ                  fde j\                  de j\                  de j\                  de j\                  de/e0   de jb                  de j\                  fdZ2 G d dejf                        Z4de j                  jj                  de j\                  de j\                  de j\                  de j\                  f
dZ6de j                  jj                  de j\                  de j\                  de j\                  de j\                  f
dZ7de j\                  d e0d!e0de8fd"Z9de j\                  d#e j\                  d$e j\                  d%e0de8e j\                  e j\                  f   f
d&Z:d'e j\                  d$e j\                  de j\                  fd(Z;de j                  jj                  de j\                  de j\                  de j\                  de j\                  f
d)Z< G d* d+ejj                        Z= G d, d-e      Z> e>       Z?	 d4d.e/e@   dz  fd/ZA G d0 d1e	      ZB G d2 d3e	      ZCy)5    N)
functional   )ACT2FN)ConversionOps_IdentityOp)should_convert_module)logging)get_cuda_runtime_versionresolve_internal_import   )lazy_load_kernel)ExpertsInterfaceuse_experts_implementation   c                  b   t         t         st        d      yda t        d      } t        | d      at        | d      at        | d      at        | d      adt        fdt
        fdt        fdt        ffD cg c]	  \  }}|| }}}|rt        d	d
j                  |       d      da yc c}}w )zLazily load the finegrained-fp8 Triton kernel and extract functions.

    Uses the hub kernels lazy loading pattern. Raises an error if the kernel
    cannot be loaded or required functions are missing. Only attempts loading once.
    NzGfinegrained-fp8 kernel is not available (previous load attempt failed).Fzfinegrained-fp8w8a8_fp8_matmulfp8_act_quantw8a8_fp8_matmul_batchedw8a8_fp8_matmul_groupedz6finegrained-fp8 kernel is missing required functions: , A. Please update the `kernels` package (`pip install -U kernels`).T)	_triton_availableImportErrorr   getattrtriton_fp8_matmultriton_fp8_act_quanttriton_batched_fp8_matmultriton_grouped_fp8_matmuljoin)kernelnameattrmissings       z/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/integrations/finegrained_fp8.py_load_triton_kernelr%   8   s     $ ghh/0F(9:"6?; '0I J '0I J
  1223&(AB&(AB	
	D$ < 		G 	 DTYYwEWDX YN N
 	

 !	s   7B+c                  V   t         t         st        d      yda t        j                  j	                         st        d      t        j                  j                         d   } | dk  rt        d|  d      t               \  }}|d	k  s
|d	k(  r|d
k  rt        d| d| d      t        d      }t        |d      a	t        |d      a
t        |d      adt        fdt        fdt        ffD cg c]	  \  }}|| }}}|rt        ddj                  |       d      da yc c}}w )a  Lazily load the DeepGEMM kernel and extract functions with proper names.

    Uses the hub kernels lazy loading pattern. Raises an error if the kernel
    cannot be loaded, required functions are missing, or the hardware is insufficient.
    Only attempts loading once.
    Nz@DeepGEMM kernel is not available (previous load attempt failed).FzcDeepGEMM kernel requires CUDA, but CUDA is not available. Use a different `experts_implementation`.r   	   z_DeepGEMM requires a Hopper (SM90+) or newer GPU, but the current device has compute capability z-.x. Use a different `experts_implementation`.      z0DeepGEMM requires CUDA runtime 12.3+, but found .zO. Please upgrade your CUDA toolkit or use a different `experts_implementation`.z	deep-gemmfp8_gemm_nt m_grouped_fp8_gemm_nt_contiguouszutils.per_token_cast_to_fp8)chained_pathz/DeepGEMM kernel is missing required functions: r   r   T)_deepgemm_availabler   torchcudais_availableget_device_capabilityr
   r   r   deepgemm_fp8_matmuldeepgemm_grouped_fp8_matmulr   deepgemm_per_token_cast_to_fp8r   )major
cuda_major
cuda_minorr    r!   r"   r#   s          r$   _load_deepgemm_kernelr9   e   s    &"`aa ::""$q
 	

 JJ,,.q1Eqy&&+W,Y[
 	
 67J
B:+
Q>zl!J< X\ \
 	

 k*F!&-8")&2T"U%<VRo%p"
 /0/1LM*,JK
D$
 < 	G  =dii>P=Q RN N
 	

 s   1D%ABAsBs
block_sizeoutput_dtypereturnc                 n   ||d   |d   cxk(  rdk(  rn n	 t                | j                  d| j                  d         }|j                  d|j                  d         }t        j                  |j                  d   |j                  d   | j
                  |      }t        ||j                         f||j                         f|       |j                  | j                  dd |j                  d   fz         S t                t        | |||||      S # t        $ r t        j                  d       Y :w xY w)u  FP8 matmul: C = dequant(A, As) @ dequant(B, Bs)^T.

    Supports both per-tensor and block-wise quantization:
      - block_size=None or block_size=[N, K]: per-tensor mode (As is scalar/per-row, Bs is scalar)
      - block_size=[block_n, block_k]: block-wise mode (As and Bs are per-block scale grids)

    Dispatch order:
      1. DeepGEMM (Hopper+, block_size 128x128) if available
      2. Triton finegrained-fp8 kernel (universal fallback)

    Args:
        A:  (M, K) float8_e4m3fn — quantized activations
        B:  (N, K) float8_e4m3fn — quantized weights
        As: block-wise: (M, K//block_k) float32; per-tensor: (M,) per-row scales
        Bs: block-wise: (N//block_n, K//block_k) float32; per-tensor: scalar or (1,) single weight scale
        block_size: [block_n, block_k] for block-wise quantization, or None/[N, K] for per-tensor
        output_dtype: desired output dtype
    Nr   r   r   devicedtypea  DeepGEMM kernel is not available or compatible, falling back to Triton finegrained-fp8 kernel. To use DeepGEMM FP8 matmul, ensure you have a Hopper (SM90+) or newer GPU with CUDA runtime 12.3+, and that the `kernels` package is installed and up to date (`pip install -U kernels`).)r9   viewshaper/   emptyrD   r3   floatr   loggerwarning_oncer%   r   )	r:   r;   r<   r=   r>   r?   A_2dAs_2doutputs	            r$   r   r      s   4 *Q-:a="GC"G	=!# 66"aggbk*DGGB-E[[A
188S_`Fu{{} 5288:O;;qwws|qwwqzm;<<Q2r:|DD!  	i	s   
D D43D4c                        e Zd Zdddefdededeeef   dz  dedef
 fd	Zd
e	j                  de	j                  fdZ xZS )	FP8LinearNdynamicFin_featuresout_featuresr>   activation_schemehas_biasc                    t         	|   ||       || _        || _        || _        t
        j                  j                  t        j                  |||            | _	        | j                  >t        j                  t        j                  dt
        j                              | _        n|| j                  d   z   dz
  | j                  d   z  }|| j                  d   z   dz
  | j                  d   z  }t        j                  t        j                  ||t
        j                              | _        | j                  dk(  r>t        j                  t        j                  dt
        j                              | _        n| j                  dd        | j                  r8t        j                  t        j                  | j                              | _        y | j                  dd        y )NrE         ?r   r   staticactivation_scalebias)super__init__rU   r>   rT   r/   nn	ParameterrH   weighttensorfloat32weight_scale_invrZ   register_parameterrS   r[   )
selfrR   rS   r>   rT   rU   rE   scale_out_featuresscale_in_features	__class__s
            r$   r]   zFP8Linear.__init__   sf    	l3 $!2hh((\;V[)\]??"$&LLc1W$XD!".1C"Ca"GDOO\]L^!^!,tq/A!AA!E$//Z[J\ \$&LL.0AW%D! !!X-$&LLc1W$XD!##$6===U[[1B1B%CDDI##FD1    inputr@   c                 Z   | j                   j                         dkD  r+t        j                  || j                   | j                        S t        | j                   t        j                  j                  j                        rI| j                   j                  j                         }| j                  j                  j                         }n4| j                   j                         }| j                  j                         }| j                  dk(  rBt                t        || j                   | j                   d   n|j"                  d         \  }}n| j                  dk(  r[| j$                  j'                  t        j(                        }||z  j+                  t,        t.              j'                  t0              }nt3        d| j                         t5        ||||| j                   |j6                        }| j                  || j                  z   }|j'                  |j6                        S )	Nr   rQ   rB   rY   minmaxzUnsupported activation scheme: r?   rW   )r`   element_sizeFlinearr[   
isinstancer/   distributedra   DTensor_local_tensor
contiguousrc   rT   r%   r   r>   rG   rZ   torb   clamp_FP8_MIN_FP8_MAX
_FP8_DTYPENotImplementedErrorr   rE   )re   rj   r`   	scale_invqinputscalerN   s          r$   forwardzFP8Linear.forward   s   ;;##%)88E4;;		::dkk5#4#4#;#;#C#CD[[..99;F--;;FFHI [[++-F--88:I!!Y.!0T__-Htq)ekkZ\oMFE ##x/)),,U]];Eem**xX*FII*UF%(GH^H^G_&`aa OO
 99 dii'Fyyu{{y++ri   )__name__
__module____qualname__r|   inttuplestrboolr]   r/   Tensorr   __classcell__rh   s   @r$   rP   rP      so    
 .2!*"2"2 "2 #s(Od*	"2
 "2 "2H$,U\\ $,ell $,ri   rP   re   hidden_statestop_k_indextop_k_weightsc                    | j                   dk(  rt        d      t                |j                  }|j	                  d      }|j	                  d      }|j	                  d      }t        j                  ||      j                  d      j                  d|      j                  d      }|j                  d      }	|j                  d      }
||   }t        || j                  r| j                  n| j                  | j                  r| j                  n| j                  | j                   |
      }| j                  r| j#                  |      }n| j%                  |      }t        || j&                  | j(                  | j                   |
      }||	j+                  |j,                        j                  d      z  }|j/                  |||      j1                  d      }|j+                  |j,                        S )	NrY   zbatched_mm experts dispatch does not support activation_scheme='static'. Use the default eager dispatch or switch to activation_scheme='dynamic'.rB   r   rD   r   )r>   
expert_idsdim)rT   r}   r%   rD   sizer/   arange	unsqueezeexpandreshaper   has_gategate_up_projup_projgate_up_proj_scale_invup_proj_scale_invr>   _apply_gateact_fn	down_projdown_proj_scale_invrx   rE   rF   sum)re   r   r   r   rD   	num_top_k
num_tokens
hidden_dim	token_idxsample_weightsr   selected_hidden_statesproj_outweighted_outfinal_hidden_statess                  r$   fp8_batched_mm_experts_forwardr     s    )!W
 	

  !!F  $I##A&J##B'J Z7AA!DKKBPYZbbcefI"**2.N$$R(J +95 )!]]'+}}##$:P:P??H }}##H- ;;x( )  ??H n//?II"MML '++J	:NRRWXRY!!-"5"566ri   c                    | j                   dk(  rt        d      t                |j                  }|j	                  d      }|j	                  d      }|j	                  d      }t        j                  ||      j                  d      j                  d|      j                  d      }|j                  d      }	|j                  d      }
t        j                  |
      }t        j                  |      }t        j                  |j	                  d      |      ||<   |
|   }|	|   }|||      }|j                  dk(  r|j                         n|j                         }t        j                  || j                   d| j                   dz
        }t        j"                  |dt
        j$                  	      }t'        || j(                  r| j*                  n| j,                  | j(                  r| j.                  n| j0                  || j2                  |
      }| j(                  r| j5                  |      }n| j7                  |      }t'        || j8                  | j:                  || j2                  |
      }||j=                  |j>                        j                  d      z  }||   }|jA                  |||      jC                  d      }|j=                  |j>                        S )NrY   zgrouped_mm experts dispatch does not support activation_scheme='static'. Use the default eager dispatch or switch to activation_scheme='dynamic'.rB   r   r   r   cpubinsrm   rn   )r   rE   )tokens_per_expertr>   offsetsr   )"rT   r}   r%   rD   r   r/   r   r   r   r   argsort
empty_liketyperI   r   histcnum_expertscumsumint32r   r   r   r   r   r   r>   r   r   r   r   rx   rE   rF   r   )re   r   r   r   rD   r   r   r   r   r   r   perminv_permexpert_ids_gsample_weights_gselected_hidden_states_ghistc_inputr   r   r   r   r   s                         r$   fp8_grouped_mm_experts_forwardr   _  s    )!W
 	

  !!F  $I##A&J##B'J Z7AA!DKKBPYZbbcefI"**2.N$$R(J ==$D%H\\$))A,v>HTNd#L%d+,Yt_=
 +1++*>,$$&LDTDTDVKKd6F6FASWScScfgSghll,!5;;GG ) !]]'+}}##$:P:P+??H }}##H- ;;x( )  +??H .11(..AKKBOOL  )L '++J	:NRRWXRY!!-"5"566ri   expert_ids_sortedr   	alignmentc                 h   | j                   }| j                  d      }t        j                  | j	                         |d|dz
        j                         }||z   dz
  |z  |z  }|t        ||      |dz
  z  z   }||z
  }|j                  d      |z
  }	t        j                  ||      |	|    z   }
t        j                  j                  |      d   dk\  r |j                  d      j	                         }n;t        j                  |fd|t        j                        }| j	                         ||
<   |
||fS )a&  Build a TMA-aligned contiguous layout for DeepGEMM grouped GEMM.

    DeepGEMM requires M-dimension alignment per expert for TMA. This computes
    the mapping from sorted token positions to padded row positions, and the
    layout tensor that DeepGEMM uses to identify expert boundaries.

    Returns:
        sorted_to_padded: (num_tokens,) index map from sorted position to padded row
        grouped_layout: expert layout tensor (format depends on GPU architecture)
        total_padded_rows: total number of rows including alignment padding
    r   r   r   r   
   rB   rC   )rD   r   r/   r   r   longrm   r   r   r0   r2   fullr   )r   r   r   rD   r   r   aligned_tokens_per_experttotal_padded_rowspadding_per_expertcumulative_paddingsorted_to_paddedgrouped_layouts               r$   !_build_deepgemm_contiguous_layoutr     s>    %%F"''*J$5$9$9$;+STZehiZijooq"3i"?!"C	!QU^ ^"S[%AYQR]%SS25FF+22158JJ||Jv>ASTeAffzz''/2b8*11!488: %6$8"VSXS^S^_+<+@+@+B'(^->>>ri   scalesr   r   c                    t        j                  || j                  d   | j                  | j                        }| ||<   t        j                  ||j                  d   | j                  t         j
                        }|||<   ||fS )zKPad sorted hidden states and scales into the TMA-aligned contiguous layout.r   rC   )r/   zerosrG   rD   rE   rb   )r   r   r   r   hidden_paddedscales_paddeds         r$   "_pad_to_deepgemm_contiguous_layoutr     s     KK=..q1-:N:NVcViViM '4M"#KK 16<<?=K_K_glgtgtuM&,M"#-''ri   hidden_states_paddedc                     | |   S )z;Remove padding rows from the TMA-aligned contiguous layout. )r   r   s     r$   &_unpad_from_deepgemm_contiguous_layoutr     s       011ri   c                 F   | j                   dk(  rt        d      | j                  t        d      | j                  d   dk7  s| j                  d   dk7  rt        d| j                         t	                |j
                  }|j                  d      }|j                  d      }|j                  d      }t        j                  ||	      j                  d      j                  d|      j                  d      }|j                  d      }	|j                  d      }
t        j                  |
      }t        j                  |      }t        j                  |j                  d      |	      ||<   |
|   }|	|   }|||      }t        || j                  t         
      \  }}}| j"                  r| j$                  n| j&                  }| j"                  r| j(                  n| j*                  }t-        |d      \  }}t/        ||||      \  }}t        j0                  ||j2                  d   |t        j4                        }t        j6                  j9                  |      d   dk\  }t;        ||f||j=                         f|||       | j"                  r| j?                  |      }n| jA                  |      }| jB                  }| jD                  }t-        |d      \  }}t        j0                  |||t        j4                        }t;        ||f||j=                         f|||       tG        ||      }||jI                  |jJ                        j                  d      z  }||   }|jM                  |||      jO                  d      }|jI                  |jJ                        S )NrY   zdeepgemm experts dispatch does not support activation_scheme='static'. Use the default eager dispatch or switch to activation_scheme='dynamic'.zuDeepGEMM requires block-wise quantization (block_size=[128, 128]), but got per-tensor quantization (block_size=None).r   r   r   z-DeepGEMM requires block_size=(128, 128), got rB   r   )r   F)	use_ue8m0rC   r   )use_psum_layoutr   )(rT   r}   r>   
ValueErrorr9   rD   r   r/   r   r   r   r   r   r   r   r   _DEEPGEMM_M_ALIGNMENTr   r   r   r   r   r5   r   r   rG   bfloat16r0   r2   r4   rI   r   r   r   r   r   rx   rE   rF   r   )re   r   r   r   rD   r   r   r   r   r   r   r   r   r   r   r   r   r   r   w_upws_upact_fp8
act_scalesr   r   w_downws_downproj_fp8proj_scalesr   r   s                                  r$   fp8_deepgemm_experts_forwardr     si    )!W
 	
 A
 	
 qS DOOA$6#$=HHYZ[[ !!F  $I##A&J##B'J Z7AA!DKKBPYZbbcefI"**2.N$$R(J ==$D%H\\$))A,v>HTNd#L%d+,Yt_= ;\d&&2G;7n&7
 !%44<<D+/==D''d>T>TE89Q]bcGZ<WjRbduvGZ{{,djjmFRWR`R`aHjj66v>qARGO	*ekkm4h`o
 }}##H-;;x( ^^F&&G:8uUHk{{,ju~~^H	;&'--/!:Hnfu
 6h@PQH .11(..AKKBOOL  )L '++J	:NRRWXRY!!-"5"566ri   c                   v    e Zd Zddddefdeeef   dz  dededef fd	Zd
e	j                  de	j                  fdZde	j                  de	j                  de	j                  de	j                  fdZ	 dde	j                  de	j                  de	j                  de	j                  dz  de	j                  f
dZ xZS )
FP8ExpertsNrQ   FTr>   rT   rU   r   c                 	   t         |           |du sJ d       || _        || _        || _        || _        |j                  | _        || _        t        |d|j                        | _
        t        |d|j                        | _        t        t        |d|j                           | _        | j                  rd| j                  z  | j                  }}t!        j"                  t%        j&                  | j                  |||            | _        | j
                  #t+        j,                  || j
                  d         nd	}	| j
                  #t+        j,                  || j
                  d	         nd	}
t!        j"                  t%        j&                  | j                  |	|
t$        j.                              | _        | j3                  d
d        n| j                  | j                  }}t!        j"                  t%        j&                  | j                  |||            | _        | j
                  #t+        j,                  || j
                  d         nd	}| j
                  #t+        j,                  || j
                  d	         nd	}t!        j"                  t%        j&                  | j                  ||t$        j.                              | _        | j3                  dd        | j                  | j                  }}t!        j"                  t%        j&                  | j                  |||            | _        | j
                  #t+        j,                  || j
                  d         nd	}| j
                  #t+        j,                  || j
                  d	         nd	}t!        j"                  t%        j&                  | j                  ||t$        j.                              | _        | j3                  dd        | j                  dk(  rt!        j"                  t%        j<                  | j                  t$        j.                              | _        t!        j"                  t%        j<                  | j                  t$        j.                              | _         y y )NFzWFP8Experts does not support bias for now, please open an issue if you want this featurenum_local_expertsmoe_intermediate_sizehidden_activationr   rW   r   r   gate_up_proj_biasup_proj_biasdown_proj_biasrY   )!r\   r]   configrU   r   r>   hidden_sizer   rT   r   r   intermediate_sizeintermediate_dimr   
hidden_actr   r^   r_   r/   rH   r   tritoncdivrb   r   rd   r   r   r   r   onesgate_up_proj_activation_scaledown_proj_activation_scale)re   r   r>   rT   rU   r   rE   gu_proj_out
gu_proj_ingu_scale_outgu_scale_in
u_proj_out	u_proj_inu_scale_out
u_scale_in
d_proj_out	d_proj_ind_scale_out
d_scale_inrh   s                      r$   r]   zFP8Experts.__init__B  sd    	5  	
e	
    $ ,,!2"6+>@R@RS '0GIaIa bWV-@&BSBSTU==&'$*?*?&?K "U[[9I9I;Xbjo-p qDKO??Kf6;;{DOOA4FGlmLIMId&++j$//!2DEjkK*,,,D,,lKu}}]+D' ##$7>$($9$94??	J<<D4D4DjR[ch(ijDLIMId&++j$//!2DEjkKGKGbY0BChiJ%'\\D,,k:U]][&D" ##ND9 $1F1FI
ekk$2B2BJPYaf&ghEI__E`fkk*dooa.@AfgCG??C^V[[DOOA,>?de
#%<<KK((+zW$
  	 0$7!!X-13ejjIYIYafanan>o1pD..0ll5::dFVFV^c^k^k;l.mD+ .ri   gate_upr@   c                 V    |j                  dd      \  }}| j                  |      |z  S )Nr   rB   r   )chunkr   )re   r  gateups       r$   r   zFP8Experts._apply_gate{  s,    ===+b{{4 2%%ri   r   r   r   c                 
   t        j                  |t         j                        }t        j                         5  t         j                  j
                  j                  || j                        }|j                  ddd      }t        j                  |j                  d      d      j                  d	      j                  d
      }d d d        D ]  }|| j                  k(  rt        j                  |         \  }}	||	   }
| j                  dk(  r| j                  |   nd }| j!                  |
| j"                  r| j$                  |   n| j&                  |   | j"                  r| j(                  |   n| j*                  |   |      }| j"                  r| j-                  |      n| j/                  |      }| j                  dk(  r| j0                  |   nd }| j!                  || j2                  |   | j4                  |   |      }||	|d f   }||j7                  |j8                        z  }|j;                  d|	|j7                  |j8                                |j7                  |j8                        S # 1 sw Y   xY w)NrW   )num_classesr   r   r   )rB   r   F)as_tuplerB   rY   )rZ   )r/   
zeros_likerb   no_gradr^   r   one_hotr   permutegreaterr   nonzerorF   whererT   r   rr   r   r   r   r   r   r   r   r   r   r   rx   rE   
index_add_)re   r   r   r   r   expert_mask
expert_hit
expert_idx	top_k_posr   current_stategate_up_act_scaler   down_act_scalerouting_weightsr   s                   r$   r   zFP8Experts.forward  sX   
 $..}EMMR]]_ 	j((--55ktO_O_5`K%--aA6K{8'DaHPPZ_P`eefhiJ	j
 % 	eJT---#(;;{:/F#G Iy))4MBFBXBX\dBd22:>jn  {{15!!*-DLLQ[D\;?==++J7dNdNdeoNp!2	 # H 6:]]t''1T\H]H?C?U?UYa?a//
;gk  {{z*((4!/	 # H ,Iy$,FGO#o&8&8&HHL**1iI\IbIb9cd7	e8 #%%m&9&9::C	j 	js   BI88Jrj   r`   rc   rZ   c                    |j                         dkD  rt        j                  ||d       S | j                  dk(  rS|Q|j	                  t
        j                        }||z  j                  t        t              j	                  t              }nAt                t        || j                  | j                  d   n|j                  d         \  }}t        ||||| j                  |j                         }|j	                  |j                         S )Nr   rY   rl   rB   ro   rW   )rp   rq   rr   rT   rx   r/   rb   ry   rz   r{   r|   r%   r   r>   rG   r   rE   )re   rj   r`   rc   rZ   r   r   rN   s           r$   rr   zFP8Experts.linear  s      1$88E6400!!X-2B2N$''6Eem**xX*FII*UF!0T__-Htq)ekkZ\oMFE !OO
 yyu{{y++ri   N)r   r   r   r|   r   r   r   r   r]   r/   r   r   r   rr   r   r   s   @r$   r   r   A  s     .2!*7n #s(Od*7n 	7n
 7n 7nr&5<< &ELL &(;"\\(;8=(;UZUaUa(;	(;^ 15,||, ,  ,,	,
  ,,-, 
,ri   r   c                       e Zd ZdZeeedZy)FP8ExpertsInterfacez?Interface for registering custom FP8 experts forward functions.)
batched_mm
grouped_mmdeepgemmN)r   r   r   __doc__r   r   r   _global_mappingr   ri   r$   r"  r"    s    I 540Ori   r"  modules_to_not_convertc                 
   |j                   r| S d}| j                         D ]:  \  }}t        ||      s|ri nddi}d}t        j                  d      5  |j                  d      ryt        |dd      }	t        |dd      }
t        |d	| j                  j                               }t        t        t        |
|	
      } |d||j                  |j                  |
|	d|}n_t        |t        j                         rEt#        d|j$                  |j&                  |j                  |j                  |j(                  dud|}|| j+                  ||       d}ddd       = |st,        j/                  d       | S # 1 sw Y   axY w)a  
    A helper function to replace all `torch.nn.Linear` modules by `FP8Linear` modules.

    Parameters:
        model (`torch.nn.Module`):
            Input model or `torch.nn.Module` as the function is run recursively.
        modules_to_not_convert (`list[`str`]`, *optional*, defaults to `None`):
            Names of the modules to not convert. In practice we keep the `lm_head` in full precision for numerical stability reasons.
        quantization_config (`FbgemmFp8Config`):
            The quantization config object that contains the quantization parameters.
        pre_quantized (`book`, defaults to `False`):
            Whether the model is pre-quantized or not
    FrE   Nmetaz.expertsr   TrU   r   )experts_classexperts_interfacerU   r   )r   r>   rT   rU   r   )rR   rS   r>   rT   rU   zYou are loading your model using fp8 but no linear modules were found in your model. Please double check your model architecture.r   )
dequantizenamed_modulesr   r/   rD   endswithr   r   get_text_configr   r   ALL_FP8_EXPERTS_FUNCTIONSweight_block_sizerT   rs   r^   LinearrP   rR   rS   r[   set_submodulerJ   warning)modelr(  quantization_configpre_quantizedhas_been_replacedmodule_namemodulemodule_kwargs
new_moduler   rU   r   	new_classs                r$   replace_with_fp8_linearr?    s   " %%$224 %)V$[2HI ,'4
\\&! 	)##J/"6:t<"6:u= 5<<3O3O3QR6",&?%%		 ' !2DD&9&K&K%% $
 FBII.&  & 2 2!'!4!42DD&9&K&K#[[4 $
 %##K<$(!=	) 	)%)N <	
 LK	) 	)s   D E88F	c                   X    e Zd ZdZd Zdej                  deeej                  f   fdZ	y)Fp8Quantizez^
    A quantization operation that creates two tensors, weight and scale out of a weight.
    c                     || _         y r   hf_quantizerre   rD  s     r$   r]   zFp8Quantize.__init__  
    (ri   
input_dictr@   c                 l   t        |j                               d   \  }}|d   }d }| j                  j                  kt	        | j                  j                  t
              r&| j                  j                  j                  d      }n!t        | j                  j                  dd       }||j                  d   |j                  d   f}|\  }}|j                  d   |j                  d   }	}||z  dk7  s|	|z  dk7  rt        d| d|	 d| d| d| 
      |j                  d d }
||z  }|	|z  }|j                  }|j                  t        j                        } |j                  g |
|||| }|j                         j                  d	
      }t        j                   |dkD  |t        j"                  |            }t$        |z  }t        j                   |dkD  |t        j"                  |            }|j'                  d      j'                  d      }||z  }t        j(                  |t*        t$              j                  t,              }|j                  |      }d|z  j                  t        j                        }|j/                  d      r|j1                  dd      d   dz   }n|dz   }||||iS )Nr   r2  r  rB   Matrix dimensions (r   $) must be divisible by block sizes (z). for )rB   r   rK  rl   rX   r`   r*   r   z.weight_scale_inv
_scale_inv)r   itemsrD  r7  rs   dictgetr   rG   r   rx   r/   rb   r   absamaxr  	ones_liker{   r   ry   rz   r|   r/  rsplit)re   rG  kwargstarget_keysvaluer>   block_mblock_nrowscolsleading_shape
rows_tiles
cols_tilesoriginal_shape
value_fp32reshapedmax_abssafe_max_absr   scales_broadcastscaled	quantized
inv_scales	scale_keys                           r$   convertzFp8Quantize.convert!  s   ":#3#3#56q9Ua 
00<$++??F!..BBFFGZ[
$T%6%6%J%JL_aef
++b/5;;r?;J%[[_ekk"od '>Q$.A"5%dV2dV3WX_W``bcjbkkrs~r  A 
 CR(W_
W_
XXemm,
 &:%%_}_j_'_:_W^_ ,,.%%(%3{{7Q;9QR L(Wq[&%//&2IJ "++B/99"=,,KKH(CFFzR	%%n5	Fl&&u}}5
)#**32158KKI#l2I z
 	
ri   N)
r   r   r   r&  r]   r/   r   rN  r   rh  r   ri   r$   rA  rA    s1    )?
%,, ?
T#u||BS=T ?
ri   rA  c            	           e Zd ZdZd Z	 d	deeej                  f   dedz  deeej                  f   fdZ	e
d
d       Zy)Fp8DequantizeziInverse operation of :class:`Fp8Quantize`. Takes a pair (weight, scale) and reconstructs the fp32 tensor.c                     || _         y r   rC  rE  s     r$   r]   zFp8Dequantize.__init__f  rF  ri   NrG  full_layer_namer@   c                 l   t        |      dk  r||d   iS |d   d   }|d   d   }|j                  dd  \  }}| j                  j                  j                  }||j                  d   |j                  d   f}|\  }	}
||	z  dk7  s||
z  dk7  rt        d| d| d	|	 d|
 d
	      |j                  |j                        }|j                  d||	z  |	||
z  |
      }|j                  d||	z  ||
z        }|j                  d      j                  d      }||z  }||j                  |j                        iS )Nr   zweight$r   rc   r  rB   rI  r   rJ  z).)
lenrG   rD  r7  r2  r   rx   rE   r   r   )re   rG  rl  rT  re  r   rY  rZ  r>   rW  rX  r`  expanded_scalesdequantizeds                 r$   rh  zFp8Dequantize.converti  ss    z?Q#Z	%:;;y)!,	./2__RS)
d&&::LL
#//"-yr/BCJ%'>Q$.A"5%dV2dV3WX_W``bcjbkkmn  LL.	$$R'47?T[\ ..TW_dgoN)33B7AA!D0 [00A
 	
ri   c                     t               S r   )r   )re   s    r$   
reverse_opzFp8Dequantize.reverse_op  s
    }ri   r   )r@   r   )r   r   r   r&  r]   rN  r   r/   r   rh  propertyrr  r   ri   r$   rj  rj  c  se    s) '+ 
ell*+ 
 t 

 
c5<<	  
D  ri   rj  )NNF)Dr/   torch.nnr^   r   r   rq   activationsr   core_model_loadingr   r   quantizers.quantizers_utilsr   utilsr	   utils.import_utilsr
   r   hub_kernelsr   moer   r   
get_loggerr   rJ   float8_e4m3fnr|   finform   rz   rn   r{   r   r   r   r   r   r   r3   r4   r5   r.   r%   r9   rb   r   listr   rE   r   r3  rP   Moduler   r   r   r   r   r   r   r   r"  r1  r   r?  rA  rj  r   ri   r$   <module>r     sD      $   ; ?  R ) = 
		H	%   
5;;z"&&5;;z"&&
            " !%  *Z:F !&.E||.E||.E 	.E 		.E
 S	.E ++.E \\.EbI,		 I,X=7
((//=7<<=7 =7 <<	=7
 \\=7@O7
((//O7<<O7 O7 <<	O7
 \\O7d? ?TW ?dg ?lq ?B(<<(LL( ll( 	(
 5<<%&( 2,,2:?,,2
\\2U7
((//U7<<U7 U7 <<	U7
 \\U7pD, D,N*  01  ejA#'9t#3AHG
- G
T*M *ri   