
    i(I                      U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZmZmZmZmZm Z  d dl!m!Z! d dlm"Z" d dl	m#Z# d dl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 d dl2m3Z3m4Z4m5Z5 d d	lm6Z6 d dl7Z7d dl8Z8d dl9m:c m;Z< d d
l=m>Z> d dl?m@Z@ d dlAmBZB d dlCmDZD d dlEmFZF d dl9mGZGmHZH ddgZId dlJmKZKmLZLmMZMmNZN e-rhd dlmOZOmPZPmQZQ d dlRmSZS d dl8mTZTmUZUmVZV d dlWmXZX d dlYmZZZ d dl[m\Z\ d dl]m^Z^ ddl_m`Z` ddlambZb ddlcmdZd ddlemfZf ddlgmhZhmiZimjZjmkZkmlZlmmZm dd lnmoZo dd!lpmqZqmrZr g d"Zs e0d#      Ztej                  dGd$       Zvd d%lwmxZx d d&lymzZz d d'l{m|Z| d d(l}m~Z~ d d)lmZ d d*lmZ d d+lmZmZmZmZmZ d d,lmZmZ d d-lmZmZ dd.lmZ dd/lmZ ej                  d0k(  Z ej*                  e      Z e0d1      Zee7j4                  e7j4                  f   Ze+e1e8j8                  ee8j                  f      Zerd2n ej>                  d3d4      Zd5d6d7e d8Zd9Zd9Zd9Zd:Z eFe8jL                  e8jN                  e8jP                  e8jR                  e8jT                  e8jV                  e8jX                  e8jZ                  e8j\                  e8j^                  e8j`                  e8jb                  e8jd                  e8jf                  e8jh                  g      Zd;ed<<   d=Zeedz
  z  d k(  red>k\  sJ d?       dHd@ZdIdAZ G dB dCe7jt                        Z ejx                  dDE       G dF dG             ZdJdKdHZ	 	 	 dL	 	 	 	 	 	 	 	 	 dMdJZ	 	 	 dL	 	 	 	 	 	 	 	 	 dMdKZej                  dNdL       ZdOdMZdPdNZÐdQdOZĐdRdPZ	 	 	 	 	 	 dSdQZdTdRZ	 	 	 	 dUdSZǐdVdTZȐdWdUZ	 	 	 	 dXdVZʐdYdWZdX f	 	 	 	 	 dZdYZ	 	 	 	 	 	 	 	 d[d[Z͐d\d]d\Z	 	 d^	 	 	 	 	 	 	 	 	 d_d]Z	 	 	 	 	 d`	 	 	 	 	 	 	 	 	 	 	 	 	 dad^ZАdbd_Zѐdcd`ZҐdddaZӐdedbZԐdfdcZ e4dd      Z e0dedDf      Zee'e%ef   ef   Z G dg dhe,e(eef         ZِdgdiZ	 	 	 	 dgdjZ	 	 	 	 dhdkZ	 	 	 	 didlZ	 	 	 	 	 	 djdmZ	 	 	 	 	 	 dkdnZ	 dl	 	 	 	 	 dmdoZ	 	 	 	 	 	 dndpZdodqZdpdrZdqdsZdrdtZdsduZdtdvZdudwZdvdxZdwdyZ eg dz      Z	 	 	 	 dxd{Zdyd|Zdzd}Zd dlZd{d~Zg ZdZed<   d|dZd{dZej                  d}d       Zej                  	 	 	 d~	 	 	 	 	 	 	 dd       ZeZeZeZdIdddZdId	 	 	 	 	 	 	 ddZ ejD                  d>      dd       Z G d de*      Zejx                   G d d             Z G d d      Z  G d de       Zej                  dd       Z G d d      Z G d de      Zej                  ddd       ZejD                  dd       ZejD                  dNd       ZddZ	 dl	 	 	 	 	 	 	 ddZ		 	 	 	 	 	 ddZ
ddZddZdIdIdDd	 	 	 	 	 	 	 	 	 ddZddId	 	 	 	 	 	 	 ddZdId	 	 	 	 	 	 	 ddZdId	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 ddZ ejD                  d      dNd       Z ejD                  d      dNd       Z ejD                  d      dNd       Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddZ	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddZe1ee7j4                  f   Zded<   ej                  	 d	 	 	 	 	 	 	 	 	 dd       Zej                  dd       Zej                  dd       Zej                  dd       Zej                  dd       ZddZddZ ddZ!ddZ"ddZ#	 	 	 	 	 	 	 	 ddZ$	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ%dNdZ& G d d      Z'	 	 	 	 	 	 	 	 ddZ(	 	 	 	 	 	 	 	 ddZ)ddZ*ddZ+ddZ,	 	 	 	 	 	 	 	 ddÄZ-	 	 	 	 	 	 	 	 ddĄZ.ej                  	 	 	 	 	 	 ddń       Z/	 dl	 	 	 	 	 ddƄZ0ddǄZ1ddȄZ2ddɄZ3ddʄZ4dd˄Z5dd̄Z6ej                  dd̈́       Z7dd΄Z8ej                  ddτ       Z9ej                  ddЄ       Z:ej                  ddф       Z;dd҄Z<ddӄZ=ddԄZ>ddՄZ?dNdքZ@dNdׄZAdd؄ZBdwdلZC G dڄ dej                        ZE	 	 	 	 	 	 	 	 	 	 dd܄ZFdd݄ZG	 	 	 	 ddބZH	 dl	 	 	 	 	 dd߄ZIddZJ	 dl	 	 	 	 	 ddZKddZL	 	 	 	 	 	 ddZM	 	 	 	 	 	 	 	 ddZNd f	 	 	 	 	 	 	 	 	 	 	 ddZOd f	 	 	 	 	 	 	 	 	 	 	 ddZPddZQddZRejx                   G d d             ZSej                  dd       ZTddZUddZVdNdZWddZXddZY	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZZddZ[ddZ\ddZ]ddZ^	 	 	 	 	 	 	 	 ddZ_ddZ`	 	 	 	 	 	 	 	 ddZaddZb	 dl	 	 	 	 	 	 	 ddZc	 	 	 	 	 	 ddZdddZe	 	 	 	 	 	 ddZfdNd ZgddZhddddddddZieij                         D  ci c]  \  } }|| 
 c}} Zk ej                  d	      Zmdːd
Znd̐dZod͐dZpd͐dZqej                  dΐd       Zrejx                   G d d             Zsi Ztded<   	 	 	 	 	 	 	 	 dϐdZu eF       Zvded<   dАdZwdldѐdZxdҐdZy e0d      Zz e0d      Z{ G d deeze{f         Z| e3dD      dldDdEdӐd       Z}dԐdZ~ G d  d!ej                        Zej                  dՐd"       ZdNd#Zd֐d$Zdאd%Zdאd&Zdؐd'ZdGd(Zdِd)ZdNd*Zdڐd+Zd,Zdېd-Zdېd.Zdܐd/Z	 	 d	 	 	 	 	 	 	 	 	 dݐd0Zdސd1Zdߐd2ZdNd3Zdd4Zdd5Z ejx                  dDE       G d6 d7             Zed8e%f   Zeeegef   Z G d9 d:      Z e       Zdd;Zdd<Zdd=Zdd>Zdd?Z eFg d@      ZddAZe"ddB       ZddCZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddDZ	 	 	 	 	 	 ddEZ	 d	 	 	 	 	 	 	 ddFZyc c}} w (      )annotationsN)Callable
Collection	GeneratorIteratorMappingMutableMapping
MutableSet)datetime)	lru_cache)StringIO)AnycastConcatenateGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKING	TypeAlias	TypeGuardTypeVarUnion)dataclass_transform	ParamSpecSelf)mock)datasheet_tops)DeviceProperties)_needs_inductor_compile)dtype_abbrs)
OrderedSet)tree_flattentree_map_only!activation_quantization_aten_passinductor_autotune_lookup_table)free_symbolsfree_unbacked_symbolsIterateExprsShapeEnv)IterableSequence
ValuesView)Path)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)Node)ScalingType   )WorkspaceArgPythonWrapperCodegen)DepGraphLowering)BufferExternKernelIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpumtiaTc                     t         D  cg c]#  } t        t        |       j                         s"| % }} t	        |      dk  sJ t	        |      dk(  rd}|S |j                         }|S c c} w )Nr7   r   rH   )	GPU_TYPESgetattrtorchis_availablelenpop)x
avail_gpusgpu_types      f/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/torch/_inductor/utils.pyget_gpu_typerX   j   sg    &K'%*;*H*H*J!KJKz?aZA-vHO 4>>>3CHO Ls
   #A'A')get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesconfig)ceildivwin32_TspvTORCHINDUCTOR_XPU_KERNEL_FORMATzebinz.cubinz.hsaco.)rH   hiprJ         zOrderedSet[torch.dtype]_TMA_SUPPORTED_DTYPES@      zmust be power of 2c                *    | t         z   dz
  t          z  S )z/Round up to the nearest multiple of ALIGN_BYTESr7   )ALIGN_BYTES)nbytess    rW   _alignrz      s    [ 1$44    c                   t        | t        j                  t        j                  f      r#t	        t        t        | j                              S t        | t              xs! t        j                  | t              t        k(  S )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdrx   )vs    rW   r   r      sQ    !eii+,3{AFF+,,aK599Q#<#KKr{   c                  *    e Zd ZdZdZdZedd       Zy)r   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr7   Tc                    t        |t        t        j                  f      rt	        t        |            S t        |      r|S y N)r}   intr~   Integerrz   r   )clsvalues     rW   evalz
align.eval   s6    ec5==12#e*%%uL r{   N)r   
sympy.ExprreturnzOptional[sympy.Expr])__name__
__module____qualname____doc__nargs
is_integerclassmethodr    r{   rW   r   r      s!    FEJ r{   r   T)frozenc                  :    e Zd ZU dZded<   ded<   ded<   ded<   y	)
GraphPartitionMapzP
    Mapping from the partition info (e.g., input/output) to the graph info
    r   idzlist[Optional[int]]input_index_mappingoutput_index_mapping	list[str]constant_namesNr   r   r   r   __annotations__r   r{   rW   r   r      s$    
 	G -,-- r{   r   c           
         |         t         j                  j                          t        j                  t	        d      t         j
                  d      }t         j                  j                  d      }t         j                  j                  d      }|j                          t        d      D ]  }|j                           |          |j                          t         j                  j                          |j                  |      dz  }t        dt	        ||z              }t        dt	        ||z              }	t        |      D ]	  } |          t        |	      D cg c]"  }t         j                  j                  d      $ }}t        |	      D cg c]"  }t         j                  j                  d      $ }}t         j                  j                  t         j                  j                  j                  g      5 }
t         j                  j                          t        |	      D ]q  }|j                          ||   j                          t         j                  j                   j                  d	      5   |         d
d
d
       ||   j                          s t         j                  j                          t        j"                  t%        ||      D cg c]  \  }}|j                  |       c}}      }d
d
d
       t        j&                        j)                         }t*        j-                  d       t*        j-                  
j/                         j1                  dd             t3        |
j5                         D cg c]A  }|j6                  t8        j                  k(  r"t;        j<                  d|j>                        |C c}      }|r"|tA        j&                  d |D              dz  z  }t*        j-                  d|       |S c c}w c c}w # 1 sw Y   xY wc c}}w # 1 sw Y   3xY wc c}w )R  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
        ArH   dtypedeviceTenable_timing   r7   
activitiesRunCudaModuleN
raw eventsself_device_time_totalsort_by	row_limitzfused_abs_max_\dc              3  4   K   | ]  }|j                     y wr   device_time_total.0events     rW   	<genexpr>zfp8_bench.<locals>.<genexpr>(  s     QE33Q        @@profiling results: %s ms)!rP   rH   synchronizeemptyr   float16Eventrecordrangezero_elapsed_timemaxprofilerprofileProfilerActivityCUDAnvtxtensorzipmeanitemlogdebugkey_averagestabler\   eventsdevice_typer[   rematchname
statistics)fnwarmuprepcachestart_event	end_event_estimate_msn_warmupn_repeatpisetimesresr   filtered_eventss                     rW   	fp8_benchr      sT    D	JJKKJu}}VLE **"""6K

  t 4I1X 
 	JJ**959K 1c&;./0H1c#+,-H 8_ 
 BGxQA5::##$#7QKQ?DXO!!!!5OIO			NN++00
 
  
 
 


 x 	"AKKMN!!#&&7 aL!	" 	

 +.{I+FG41aQ^^AG

" **U

 
 
"CIIlIIann$$-EQS$TU 	
!!Z__4HH0%**=I	 	
	O OOQQQ	

 II(#.JO RO 
 H
 
*	
sE   "'P'PA9P2=PAP2P,9P2AP?P)$P22P<Fc                8    ddl m}   |t              | |||      S )Nr   )may_distort_benchmarking_result)$torch._inductor.runtime.benchmarkingr   _do_bench_using_profiling)r   r   r   is_vetted_benchmarkingr   s        rW   do_bench_using_profilingr   0  s(    " UE*+DE
FC/ r{   c           
        |sddl m}  |        t               }|j                         }t	        |      } |         |j                          t        j                  t        d      t        j                  |      }|j                  d      }	|j                  d      }
|	j                          t        d      D ]  }|j                           |          |
j                          |j                          |	j                  |
      dz  }t        dt        ||z              }t        dt        ||z              }t        |      D ]	  } |          |j                          t        j                  j!                  t#        t        j                  j$                  |      g	      5 }t        |      D ]  }|j                           |          |j                          d
d
d
       t&        j)                  d       t&        j)                  j+                         j-                  dd             t/        |j1                         D cg c]0  }|j2                  t#        t4        |      k(  r|j6                  dk7  r|2 c}      }t9        |      |z  dk7  rt;        d|t9        |      |      t9        |      |z  }t/        t=        |      D cg c]  \  }}||z  dk7  r| c}}      }|j?                          |j+                         }t&        j)                  d       t&        j)                  |j-                  d             tA        d |D              dz  |z  }t&        j)                  d|       |S # 1 sw Y   xY wc c}w c c}}w )r   r   )may_ban_benchmarkingr   r   Tr   r   r7   r   Nr   r   r   r   zContext SynczWFailed to divide all profiling events into #repeat groups. #%s events: %d, #repeats: %szprofiling time breakdown)r   c              3  4   K   | ]  }|j                     y wr   r   r   s     rW   r   z,_do_bench_using_profiling.<locals>.<genexpr>  s     A%e%%Ar   r   r   )!r   r   rX   upperrY   r   rP   r   r   r   r   r   r   r   r   r   r   rO   r   r   r   r   r   r\   r   r   r[   r   rR   RuntimeError	enumerate_build_treesum)r   r   r   r   r   r   device_type_upperdevice_interfacer   r   r   r   r   r   r   r   r   r   num_event_per_groupr   actual_eventsr   s                         rW   r   r   H  s'    "M.K#))+/<D  "KKJuyyME #((t(<K &&T&:I1X 
   "**959K 1c&;./0H1c#+,-H 8_ 
   "			ENN335FG
 
  
 ' 
x 	AKKMD		 	$$&' IIlIIann$$-EQS$TU 	
  GJ8I$JJ

n, 	
O ?h&!++ 
 	
 o.9 &o6	
5&&!+ 	
M !..0MII()IIm!!B!/0
A=A
AF
JX
UCII(#.Jc' '$	
"	
s   $8M/5M<	N
/M9c                    	 ddl m}  t        j                  j	                  dd       | d uxr% t        t        t        j                  dd       d      S # t        $ r Y yt        $ r}dt        |      v sJ Y d }~yd }~ww xY w)	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr  Fztorchvision::nms does not exist)torchvision.opsr  rP   _C%_dispatch_has_kernel_for_dispatch_keyhasattrrO   opsImportErrorr   str)r  r   s     rW   has_torchvision_roi_alignr    s|    -667I6R$ 
EII}d3[*
 	
   0CF:::s   AA 	A?A?&A::A?c                b   | t        j                  d      j                  S t        | t              rt        j                  |       } | j
                  dvrZ| j                  Nt        | j
                        }t        j                  | j
                  |j                  j                               S | S )Ng        )cpumeta)index)
rP   r   r   r}   r  typer  rY   Workercurrent_devicer   r   s     rW   decode_devicer    s    ~||C '''&#f%{{/)fll.B3FKK@||FKK/?/F/F/U/U/WXXMr{   c                |    t        j                  t        j                  | t        j
                  j                        S r   )	functoolsreduceoperatormulr~   SOne)its    rW   sympy_productr    s#    HLL"eggkk::r{   c           	         t        |       t        |      k(  sJ t        j                  t        d t	        | |      D                    S )Nc              3  ,   K   | ]  \  }}||z    y wr   r   )r   abs      rW   r   zsympy_dot.<locals>.<genexpr>  s     >daAE>s   )rR   r~   expandr   r   )seq1seq2s     rW   	sympy_dotr&    s8    t9D	!!!<<>c$o>>??r{   c                \    | D ci c]  }t        |      | c}j                         S c c}w r   )r   values)r  rT   s     rW   uniquer)    s'     !BqE1H!((**!s   )c           
     n   t        | t        j                        st        |t        j                        r2t        t        j                  |       t        j                  |            S t        | t
              rt        |t
              s$J |  dt        |        d| dt        |              t        | |      S )Nz: , )r}   r~   Exprr_   sympifyr   r  runtime_ceildiv)numberdenoms     rW   rj   rj     s     &%**%E5::)Fu}}V,emmE.BCC fc"z%'= ("T&\N"UG2d5k];= 65))r{   c                f   | yt        |       j                  d      d   }i dddddd	d
ddddddd	dddddddddddddddddd d!d"dd#d$d%d&}|j                  t        |j	                               D ci c]  }|| c}       t        | t               r| S d'||    S c c}w )(Nz*i8rp   r   booli1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float8_e8m0fnuu8float4_e2m1fn_x2r   fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64u16u32u64)uint8uint16uint32uint64*)r  splitupdatelistr(  r}   )key	dtype_strtysr   s       rW   _type_ofr^    sR   
 {Cs#B'Ii 	G 	z	
 	 	 	w 	$ 	D 	6 	F 	6 	6  	!" 	#$ 	%& 	'( /C4 JJd3::<01112S#&3@aI/?,@@ 2s   
B.c                R    | D cg c]  }t        j                  |       c}S c c}w )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    )r~   r-  lstr   s     rW   convert_shape_to_inductorrb    s!     '**EMM!***s   $c                f    t        | t        j                        r| j                  j                  S | S )z
    Convert SymInt to sympy.Expr, leave int as is.

    Unlike sympy.sympify() which converts int to sympy.Integer,
    this function preserves int as int and only converts SymInt to Expr.
    )r}   rP   r2   nodeexprvals    rW   convert_symint_to_exprrh    s%     #u||$xx}}Jr{   c                    ddl m} t        | t              r| S t        | t        j
                        rt        |       S |j                  j                  j                  j                  | d      S )zL
    Like convert_shape_to_symint, but operates on a single expression.
    r7   VN)hint)
virtualizedrk  r}   r   r~   r   graphsizevars	shape_envcreate_symintnode)r   rk  s     rW   convert_to_symintrr    se      a 	

 !U]]+ F	 !!++==ad=Kr{   c                >    | D cg c]  }t        |       c}S c c}w )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    )rr  r`  s     rW   convert_shape_to_symintrt  .  s     +..Qa ...s   c                N    t        d | j                  j                  D              S )z-
    Does this op overload have aliasing
    c              3  8   K   | ]  }|j                   d u  y wr   )
alias_infor   r!  s     rW   r   zis_view.<locals>.<genexpr><  s     FAq||4'Fs   )any_schema	argumentsops    rW   is_viewr~  8  s     F1E1EFFFr{   c                     yNFr   )r   s    rW   <lambda>r  A      r{   c                   | j                   dk7  ryt        | j                  t        j                  j
                        s| j                  t        j                  u syt        t        j                  j
                  | j                        }|t        j                  u st        |      rt        fd| j                  D              S t        j                  j                  |j                  v xs  |      S )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc              3  6   K   | ]  }t        |        y wr   )is_pointwise_use)r   uis_pointwise_fns     rW   r   z#is_pointwise_use.<locals>.<genexpr>R  s     KA#A7Ks   )r}  r}   targetrP   _ops
OpOverloadr  getitemr   r~  r   usersTag	pointwisetags)user  r  s    ` rW   r  r  ?  s     vv 3::uzz445xGWGW9W%**''4F!!!WV_KKKK99&++-H1HHr{   	list[Any]c           	        t         j                  j                         g dfd} j                  | gt	        t         j
                  |||f       }t        | j                  j                        dk(  r2t        | j                  j                  d   j                        dk(  r|f}j                  |       t         j                  j                  i       }|fS )Nc                `    j                  |        j                  dt                     S )Narg)appendplaceholderrR   )r  g
graph_argss    rW   add_tensor_argz)gen_gm_and_inputs.<locals>.add_tensor_arg]  s,    #}}s3z?"3455r{   r7   r   Tensor)r  torch.Tensorr   r5   )rP   fxGraphr  r%   r  rR   rz  returnsr  r  outputr4   )r  r   kwargsr  rd  gmr  r  s         @@rW   gen_gm_and_inputsr  W  s     	A%'J6 1??u||^dF^LD 	FNN""#q(&&q)../8;wHHTN			b!	$Bz>r{   c                h    | dk(  ry t        |       }|j                         r|j                          y y Nr  )rY   rQ   r   r  s     rW   r   r   o  s4    /7$$&$$& 'r{   c                    t        |       t        j                  d       t        j                         }t        |      D ]  } | | }t        |        t        j                         }J ||z
  S )Ni9  )r   rP   manual_seedtimeperf_counterr   )modelexample_inputsr   r   t0r   resultt1s           rW   timedr  w  sr     	d				B5\ 'F 
			B7Nr{   c                    t        j                  t        |      D cg c]  }t        | |||       c}      }t        j                  |      |z  }t        ||z  d       |j                         S c c}w )Nz.6f)rP   r   r   r  medianprintr   )	r  r  r   repeatbaseliner   r   timingstooks	            rW   print_performancer    sg     ll>CFmLuneV	4LG << 5(D	TH_S!#99;	 	Ms   A1c                H     t        | |             t        | |fd       y)zKReplace obj.method() with a new method that returns a precomputed constant.c                      S r   r   )r  s   rW   r  z#precompute_method.<locals>.<lambda>  s     r{   N)rO   setattr)objmethodr  s     @rW   precompute_methodr    s     !WS&!#FC(r{   c                *    |D ]  }t        | |        y)zFReplace methods with new methods that returns a precomputed constants.N)r  )r  methodsr  s      rW   precompute_methodsr    s     '#v&'r{   c                <    t        | |kD        t        | |k        z
  S r   )r   r!  r"  s     rW   cmpr    s    q1u:AE
""r{   c                ~    t        | t              r| g|z  S t        |       dk(  r t        |       | d   g      |z  S | S )Nr7   r   )r}   r   rR   r  )rT   sizes     rW   pad_listliker    sC    !SsTz
1v{tAw!v%%Hr{   c                D    t        |       dk(  rg S dd}t        | |      S )Nr   c                n    t        | t              r| S ddlm} t        | |      sJ | j	                         S )Nr7   )rF   )r}   r  	schedulerrF   get_name)elemrF   s     rW   	sort_funcztuple_sorted.<locals>.sort_func  s1    dC K0$ 1222}}r{   r[  )r  rl   r   r  )rR   sorted)rT   r  s     rW   tuple_sortedr    s&    
1v{	 !##r{   PRV)	covariantc                  &    e Zd Zedd       ZddZy)CachedMethodc                     y r   r   )r   s    rW   clear_cachezCachedMethod.clear_cache  s    ),r{   c                     y r   r   selfr   r  s      rW   __call__zCachedMethod.__call__  r  r{   N)r   r   r   None)r   P.argsr  P.kwargsr   r  )r   r   r   staticmethodr  r  r   r{   rW   r  r    s    , ,Dr{   r  c           	         | j                   }d| dd| i}t        d| d d dj                         |        t        j                  |       || d         }d
fd	}||_        |S )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        _cache_on_selfc                8    t        |       rt        |        y y r   r  delattrr  r[  s    rW   r  z"cache_on_self.<locals>.clear_cache  s    4D# r{   r  r   r   r  r   execlstripr  wrapsr  )r   r   ctxwrapperr  r[  s        @rW   cache_on_selfr    s    ;;DtfF
C *CF  E "' (+e ,			 FH "ioob!#n&=">?G &GNr{   c                    t        |       S )z]
    Variant of cache_on_self for properties. The only difference is the type signature.
    )r  )r   s    rW   cache_property_on_selfr    s     r{   c                     	 	 	 	 d fd}|S )Nc           	         d d| j                    dd| i}t        d d d dj                         |        t        j                  |       |d	         }dfd
}||_        |S )Nr  r   r  r   z            def inner(self: Any, *args: P.args, **kwargs: P.kwargs) -> RV:
                args_kwargs = (args, tuple(sorted(kwargs.items())))

                if not hasattr(self, "z2"):
                    object.__setattr__(self, "z%", {})

                cache = self.z

                try:
                    return cache[args_kwargs]
                except KeyError:
                    pass

                rv = fn(self, *args, **kwargs)

                cache[args_kwargs] = rv
                return rv
            innerc                8    t        |       rt        |        y y r   r  r  s    rW   r  z<cache_on_self_and_args.<locals>.wrapper.<locals>.clear_cache  s    tS!c" "r{   r  r  )r   r  r  r  r[  
class_names       @rW   r  z'cache_on_self_and_args.<locals>.wrapper  s     :,a}F3 Rj' (+e ,//2e 4!U #$ )	
, $	#CL1	# (r{   )r   FN_TYPE[P, RV]r   r  r   )r  r  s   ` rW   cache_on_self_and_argsr    s     
$$	$L Nr{   c           
     ^   ddl m} t        | t              rgt	        j
                  t        j                  | D cg c]0  }t        |d      r"|j                  r|j                  j                  2 c}t                     S t        | |j                        r| j                  S t               S c c}w )Nr7   irrd  ) r  r}   rZ  r  r  r  or_r  rd  originsr#   r?   )node_scheduler  rd  s      rW   aggregate_originsr  $  s     -&LL * 4(TYY 		!! L	
 		
 
M2??	3$$$|s   5B*
c                   t        |       }|dk(  rYd }|D cg c]6  }|j                  dk(  r%d|j                  v r|j                  d    ||      8 }}t        t	        |            }n|dk(  rg }|D ]  }|j                  dk(  sd }d}d|j                  v r|j                  d   d   }n"d|j                  v r|j                  d   d   }d	}|s]t        |d
   t              r|j                  |d
   |z          |j                  |d
   j                  |z           t        t	        |            }n5|dk(  r*|D cg c]  }|j                  dk(  s|j                    }}nt        dj                  dg|z         S c c}w c c}w )Noriginal_atenc                   | j                   d   }d}t        |t        j                  j                        r|j
                  j                  }|S t        |t        j                  j                        rt        |j                               }|S )Nr  r  )
r  r}   rP   r  r  _overloadpacketr   HigherOrderOperatorr  r   )originr  r[  s      rW   get_origin_meta_strz2get_fused_kernel_name.<locals>.get_origin_meta_strA  so    "KK8MC-)>)>?#33<< J M5::+I+IJ-,,./Jr{   r  rP   r  source_fn_stackr   fwd_source_fn_stackbackwardr7   inductor_noder   fused)r  r}  r  r  r#   r}   r  r  r   r   NotImplementedErrorjoin)r  descriptive_namesall_originsr  r  sources	source_fnsuffixs           rW   get_fused_kernel_namer  :  s    $M2KO+	 &
yyO+6;;.O,8	  '
 
 G,-	g	%! 	CFyyO+ 	$3 &,= >r BI*fkk9 &,A B2 FI'F ilC0NN9Q<&#89NN9Q<#8#86#AB	C" G,-	o	-&1
"VYY/5QFKK
 
 "!88WI'((G
<
s   ;E=<FFc                   ! t        |       }|D cg c]  }|j                  dk(  s| }}t        j                  t              }t        j                  t              }d |rt        d |D              }t        |      dk(  r_|d   j                   t         d      s/t         j                        D 	ci c]  \  }}	|	|
 }
}}	|
 _        |j                   fd       |D ]Z  }d	|j                  v r|j                  d	   |j                  d	   }d}t        |t        j                   j"                        rt%        |j&                        }n=t        |t        j                   j(                        rt%        |j+                               }|r||   j-                  |j*                         d
|j                  v r<|j                  d
   d   j*                  }||   j-                  |j*                         |j                  j/                  d      dk(  s3||j*                     j-                  |j*                         ]  dnd}|j0                   d| ddj3                  |j5                                ddj3                  |j5                                d}|j0                   dg}t7        |j9                               D ]@  \  }}|j-                  |j0                   d| ddj3                  t7        |                    B  dddlm |j-                  |j0                   d       t               }g }t        | j>                        sddl m!} 	 	 	 	 	 	 d)fd}d*d!d+!fd}| D ]  }	t        |	d      r|	jD                  t        |	jD                  d      r|	jD                  jF                  |	jD                  jF                  D ]  }|j*                  |v r|jI                  |j*                         |j                  jK                  |j*                        }|U |||j*                        \  }}|j-                  |j0                   d| d  ||       d!| d        t        |	jD                  d"      s|	jD                  jL                  )|	jD                  jL                  D ]T  }|j                  jK                  |j*                        }|+ |||j*                        \  }}|j-                  d#|z          V  |D ]2  }|j-                  |j0                   d|jO                  d$%              4 |j-                  |j0                   d&d'j3                  |              |d(j3                  |      fS c c}w c c}	}w ),aH  
    Retrieves metadata information for a kernel.
    Args:
        node_schedule (Union[Sequence[BaseSchedulerNode], ExternKernel]):
            Either a sequence of BaseSchedulerNode objects or an ExternKernel instance.
        wrapper (PythonWrapperCodegen):
            An instance of PythonWrapperCodegen, used to define the code comment format.
    Returns:
        tuple[str, str]:
            A tuple containing two strings:
                - The first string represents the kernel's metadata.
                - The second string represent the kernel's detailed metadata.
    r  Nc              3  4   K   | ]  }|j                     y wr   )rn  )r   ns     rW   r   z&get_kernel_metadata.<locals>.<genexpr>  s     "Cq177"Cr   r7   r   )_inductor_kernel_metadata_node_to_idx_mapc                "    j                   |    S r   )r  )r  single_graphs    rW   r  z%get_kernel_metadata.<locals>.<lambda>  s    lTTUVW r{   r  r  	from_nodepartitioner_tagis_backwardzTopologically SortedUnsorted z Source Nodes: [r+  z], Original ATen: []z" Source node to ATen node mapping:z   z => r  z Graph fragment:rj  c                >   t        | j                        rAt        | j                  j                        r!| j                  j                  j                  }n| j                  }||}n|j
                  }	 | j                         }||fS # t        $ r d }Y ||fS w xY wr   )r}   	TensorBoxdata
StorageBoxorigin_noder   
get_layoutr  )bufferrw_namer$  r   layoutr  s        rW   get_buffer_infoz,get_kernel_metadata.<locals>.get_buffer_info  s     fbll3
KK9 #)++"2"2">">K"("4"4K&"D&++D"#..0F V|# + "!FV|#"s   7B BBc           	     d    ddj                  | D cg c]  }t        |       c}       dS c c}w )N[r+  r  )r  r  )shaperT   s     rW   stringify_shapez,get_kernel_metadata.<locals>.stringify_shape  s-    499e%<c!f%<=>a@@%<s   -
c                    | y | j                          } | j                         }| j                   }dt        | j                      | | | dS )Nr  ")r  strider   r"   r   )r(  shape_annotationstride_annotationdevice_annotationr-  s       rW   stringfy_layoutz,get_kernel_metadata.<locals>.stringfy_layout  sl    >&5fkk&B%C '6v}}'E&F!'-}}o! FLL123C2D()*;)<A?r{   read_writesreadsz   %z
 : Tensor z = PlaceHolder[target=writes%T)include_tensor_metadataz
   return ,
)r&  z2Union[ir.TensorBox, ir.Buffer, ir.TorchBindObject]r'  r  r   ztuple[str, ir.Layout | None])r,  zIterable[int]r   r  )r(  zir.Layout | Noner   r  )(r  r}  collectionsdefaultdictrZ  r#   rR   rn  r  r   nodesr  sortr  r}   rP   r  r  r  r  r  r   r  getcommentr  keysr  itemsr  r  r?   rm  rk  r5  r6  addtry_get_bufferr7  format_node)"r  r  r  r  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsidxr  node_to_idx_maprd  r  r[  sort_strmetadatadetailed_metadataoriginal_noder>  	all_reads
all_writesrk  r)  r4  rr&  
input_namer(  woutput_namer   r  r  r-  s"                                  @@@rW   get_kernel_metadatarW  q  s   $ $M2K+6W&)):VfWNW ,,T2N$006
 L""CN"CC}")!,22L<)TU8A,BTBT8U"Vfc11c6"V"VIXFW     8dii'DIIo,F,R IIo6MC-)>)>?-778M5::+I+IJ-,,./"3'..tyy9$))#))K(+00C3&&tyy1YY]],->499%,,TYY78  *6)A%zH??
1XJ&6tyyATATAV7W6X Y99%7%<%<%>?@	C  $OO,,NOP &~';';'= > 
u  s=/diiu6N5OP	

   GOO#44D!EF%/\	 "
-9&$J$UX$-$(A
 # =q-0AMM4I1=='2q}}7J7J7V]]00 66Y.$!aff-!"!7!7!?!>$-<VQVV-L*
F)00&/tJ<z.v677Mj\YZ\ AMM84,,8]]11 =!"!7!7!?!>$)8)HQ"))#*;<=-=< # 	D$$??#3t'7'7PT'7'U&VW	
 	  GOO#4Jsxx
?S>T!UVTYY0111I X #Ws   VV7Vc                    t        |       } t        |       }| rV| j                         }|j                  D ]4  }|r	 ||      r||vs|j	                  |       | j                  |       6 | rV|S )zJReturns the set of nodes whose values depend on those within initial_queue)rZ  r#   rS   r  rD  r  )initial_queueskip_filterdominated_setrd  users        rW   dominated_nodesr]    sz    
 'M}-M
  "JJ 	+D{40=(!!$'$$T*	+  r{   c                4  	 ddl m d	fd	t        |      \  }}|D cg c]  } 	|      s|j                   }}t        |       \  }}|D cg c]  } 	|      s|j                   }}t	        t        j                  g ||       S c c}w c c}w )Nr7   r  c                F   t        | j                        r | j                        S t        | j                        r | j                        S t        | j                        xr9 t        | j
                  j                  j                  j                  f       S r   )	r}   r!  r"  r#  r@   ComputedBufferInputsKernelInputBufferTemplateBuffer)r  r  is_unrealized_nodes    rW   rd  z*gather_origins.<locals>.is_unrealized_node$  s    a&%aff--a'%aff--!RYY' 

!!!!	1
 -
 	
r{   )r  r@   r   r2  )r  r  r$   r  r#   	itertoolschain)
r   r  kwargs_flattenr   rg  kwargs_originsargs_flattenargs_originsr  rd  s
           @@rW   gather_originsrk    s     
" %V,NA-;Wc?QRU?VckkWNW"4(OL!+7SC;Mc;RCKKSLSiooE|EnEFF XSs   BBB Bc                J    dddfddfddfd |       S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    c                    t        | t        j                        xr, t        | j                        dk(  xr | j                  d   dk(  S )N   r   r   )r}   r~   MulrR   r   )re  s    rW   is_neg_leadzsympy_str.<locals>.is_neg_leadC  s:    tUYY'VC		Na,?VDIIaLTVDV	
r{   c                `   t        | t        j                        rt        | j                        dk(  rO | j                  d         r: | j                  d          d | j                  d   j                  d          S dj                  t        | j                              S  |       S )Nrn  r7   r   z - z + )r}   r~   r   rR   r   r  r   )re  rp  sympy_str_muls    rW   sympy_str_addz sympy_str.<locals>.sympy_str_addH  s    dEII& 499~"{499Q<'@'		!56c-		RSHYHYZ[H\:]9^__zz#mTYY"?@@ &&r{   c                    t        | t        j                        rE |       rd | j                  d          S dj	                  t        | j                              S  |       S )N-r7   z * )r}   r~   ro  r   r  r   )re  rp  sympy_str_atoms    rW   rr  z sympy_str.<locals>.sympy_str_mulS  s[    dEII&4  >$))A,7899zz#ndii"@AA!$''r{   c                   t        | t        j                        r| j                  S t        | t        j                  t        j
                  f      rd |        dS t        | t        t        t        t        f      rC| j                  j                   ddj                  t        t        | j                               dS t!        |       S )N()r+  )r}   r~   Symbolr   r   ro  rc   r`   ra   rb   funcr   r  r   	sympy_strr   r  )re  rs  s    rW   rv  z!sympy_str.<locals>.sympy_str_atom^  s    dELL)99uyy%))45}T*+1--(HMNii(()499SDII5N+O*PPQRRt9r{   )re  r   r   r2  re  r   r   r  r   )re  rp  rs  rv  rr  s    @@@@rW   r|  r|  <  s$    

	'	( r{   c                    ddl m} t        j                  r3t	        |j
                  dd       x}r|j                  dk7  rt        |       S t        j                         S )Nr7   rj  current_node
index_expr)
rm  rk  ri   compute_all_boundsrO   interpreterr  rf   rg   unknown)r  rk  fx_nodes      rW   get_bounds_index_exprr  k  sN     	!!~tDDWDNNl*5!!""$$r{   c                    | d   dk(  S )Nr   rS  r   )prefixs    rW   prefix_is_reductionr  y  s    !9r{   c                J    | t         j                  k7  sJ t        | |dd      S )9
    Used to generate an integer-nonnegative symbol.
    Tintegernonnegative)re   SIZErd   )r  rK  s     rW   sympy_index_symbol_with_prefixr  }  s)     TYY vsDdCCr{   c                N    | xs t         j                  xr t         j                  S r   )ri   debug_index_assertsassert_indirect_indexing)checks    rW   generate_assertr    s    /V//TV5T5TTr{   c                F    | d   dk7  sJ t        j                  | dd      S )r  r   r   Tr  )r~   rz  r   s    rW   sympy_index_symbolr    s)     7c>> <<d==r{   c                    	 	 	 	 	 	 dd}t        j                  |       j                  |j                         D ci c]  \  }}| |||       c}}      S c c}}w )z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    c                    t        | t        j                        sJ t        |t              r,t        j                  || j
                  | j                        S |S )Nr  )r}   r~   r,  r  rz  r   is_nonnegative)replacedreplacements     rW   	to_symbolzsympy_subs.<locals>.to_symbol  sP     (EJJ///k3'<< ++$33  r{   )r  r   r  zUnion[sympy.Expr, str]r   sympy.Symbol)r~   r-  xreplacerC  )re  replacementsr  kr   s        rW   
sympy_subsr    sf    +A	 ==''(4(:(:(<=1IaO	= =s   A
c                    t        | t        j                        xs( t        | t        j                        xr | j                  S r   )r}   rP   r2   r  _has_symbolic_sizes_strides)r!  s    rW   is_symbolicr    s3    a& 1ell#E(E(Er{   c                 &    t        d | D              S )Nc              3  2   K   | ]  }t        |        y wr   )r  rx  s     rW   r   z"any_is_symbolic.<locals>.<genexpr>  s     ,!{1~,   ry  )r   s    rW   any_is_symbolicr    s    ,t,,,r{   )z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultrun_and_save_rng_staterun_with_rng_statezaten._local_scalar_densezaten._assert_scalarc                    ddl m} | j                  j                  D ];  }t	        |      r|c S |j
                  j                  d      x}0 ||      s9|c S  y )Nr   )r)   rg  )%torch.fx.experimental.symbolic_shapesr)   rn  r>  is_cudagraph_unsafe_fx_noder  r@  )r  r)   rd  rg  s       rW   %get_first_incompatible_cudagraph_noder    sV     L &t,K99==''C49Ns9SK r{   c                    t        t        t        | j                  j                                    }|j
                  dk(  sJ |S )z$Get the output node from an FX graphr  )nextiterreversedrn  r>  r}  )r  	last_nodes     rW   output_noder    s6    T(288>>234I<<8###r{   c                    | j                   j                  d      }t        d |D              }t        |       j                  d   }t        |t              r|n|f}t        d |D              }||z  S )Nr  r|  c              3     K   | ]P  }t        |j                  j                  d       t        j                        r|j                  d    j
                   R ywrg  N)r}   r  r@  rP   r  r   )r   rd  s     rW   r   z"get_all_devices.<locals>.<genexpr>  sB      9diimmE*ELL9 			%9s   AAr   c              3     K   | ]t  }t        |t        j                  j                        rNt        |j                  j                  d       t        j                        r|j                  d    j                   v ywr  )r}   rP   r  r5   r  r@  r  r   )r   r  s     rW   r   z"get_all_devices.<locals>.<genexpr>  sS      7c588==)sxx||E*ELL9 	7s   A:A<)rn  
find_nodesr#   r  r   r}   tuple)r  placeholder_nodesinput_devicesout_argout_argsout_devicess         rW   get_all_devicesr    s}    ++}+=.8 9%9 /M "o""1%G$We4w7*H,6 77 -K ;&&r{   c                 |   t        t        j                  j                               D ]  } | j	                  d      st        j                  |    }|j
                  D ]  }|j	                  d      st        ||      }t        |t        j                  j                  j                  j                        sZ|j                  D ]i  }t        |t        j                  j                  j                  j                        s<|j                  j                   j"                  j%                          k  t        j                  | =  dt        j                  v rRt        j                  d   }t'        |j(                  j*                  j,                        `|j(                  j*                  `t1        j2                          y )Nz&torch._inductor.runtime.compile_tasks.triton_ztriton.runtime.driver)rZ  sysmodulesrB  
startswith__dict__rO   r}   rP   	_inductorruntimetriton_heuristicsCachingAutotunercompile_resultsTritonCompileResultkernelrunmod__del__r  driveractiveutilsinstancegccollect)module_namem	attr_namer  r  r  s         rW   unload_xpu_triton_pydsr    sK   CKK,,./ %%%&NOKK$ 	<I##I. I.EOO33EEVV #)"8"8 <%"!OO33EEYY
 #MM--1199;<	< KK$#%( #++-kk12""(()2JJ#JJLr{   _registered_cachesc                    t        | d      rt        | j                        st        |  d      t        j                  |        | S )zh
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_cache().
    cache_clearz# does not have a cache_clear method)r  callabler  AttributeErrorr  r  r  s    rW   clear_on_fresh_cacher    s?    
 3&hs.Gu$GHIIc"Jr{   c                 :    t         D ]  } | j                           y)z&
    Clear all registered caches.
    N)r  r  r  s    rW   clear_cachesr  )  s     " r{   c              #  \  K   t         j                  j                  |       }	 |t         j                  | <   d |!t         j                  j                  | d       y|t         j                  | <   y# |!t         j                  j                  | d       w |t         j                  | <   w xY ww)a  Thread-safe env var set/restore using atomic C-level lookups.

    We avoid mock.patch.dict(os.environ, ...) because it internally calls
    os.environ.copy(), which iterates all env var keys then fetches values in
    separate steps. That approach is not atomic and can race with background threads
    (e.g. Triton async compilation) modifying the environment, causing KeyError,
    so we use os.environ.get() for individual keys which is an atomic C-level lookup.
    N)osenvironr@  rS   )r[  r   olds      rW   _set_envr  1  sz      **..
C"

3;JJNN3%!BJJsO ;JJNN3%!BJJsOs    B,A1 7B,18B))B,c              #    K   t                ddlm}  |t        j                  |            	 t        d      5  t        j                  d        |t        j                  j                  d            }t        d|      5  d t        | t              rt        |       dk(  sJ d	       t        j                  j                  |      rtt        j                  |      }| j!                  |D ci c]D  }d
|vr>|t        j                  j#                  t        j                  j                  ||            F c}       ddd       ddd       |rUt%               r(t&        j(                  j+                         r
t-                t/        j0                  t%               fd       t                yc c}w # 1 sw Y   xxY w# 1 sw Y   |xY w# t2        $ r t        j5                  d        w xY w# t                w xY ww)z
    Contextmanager that provides a clean tmp cachedir for pt2 caches.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    r   )normalize_path_separator)dirTORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNz!expected empty cache_entries dictz.lockc                4    t         j                  d|      S )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)r{  pathr  inductor_cache_dirs      rW   r  zfresh_cache.<locals>.<lambda>t  s    S[[@&% 6A 6 r{   )ignore_errorsonerrorz(on error, temporary cache dir kept at %s)r  torch._inductor.cpp_builderr  tempfilemkdtempr  r   r   r  r  r  r}   dictrR   existslistdirrY  getsize
is_windowsrP   rJ   rQ   r  shutilrmtree	Exceptionr  )cache_entriesr  deleter  triton_cache_dirfilesfr  s          @rW   fresh_cacher  F  s     ND1(2B2Bs2KL'/1CD 	II35GH7/:  ,.>? mT2}-2W4WW2ww~~&67 "

+; <%,, */$%#*!#3 !"277??277<<@PRS3T#U U	$ |		 6 6 8&(MM" )l  	5 	 	D  >@RS 	sm   -HG A	GA-F:3A	F5<F:GAG *H5F::G	?GGG !G33G6 6HH)reversec                   | j                   }t        t        |             }t        t	        ||d            }|st        t        |            S |S )NTr[  r	  )__getitem__r   rR   rZ  r  r  )seqr	  gettera_rsort_idxs        rW   argsortr    sE    __F
C/C F3FD9:HHX&''Or{   c          	     4    d fd}t        |      D cg c]9  \  }}|t        |t        j                        r|j                  j
                  n|f; }}}t        |t        j                  |      |      }|D cg c]  \  }}|	 }}}|S c c}}w c c}}w )Nc                n    | \  }}|\  }}dfd} |||k        ry |||kD        ry||k  ry||kD  ryy)Nc                N    t        | t              r| S j                  | d      S )NT)size_oblivious)r}   r2  evaluate_expr)re  rp  s    rW   evaluatez*argsort_sym.<locals>.cmp.<locals>.evaluate  s(    $%**4*EEr{   r   r7   r   )re  z%Union[bool, torch.SymInt, sympy.Expr]r   r2  r   )r!  r"  a_idxa_valb_idxb_valr  rp  s          rW   r  zargsort_sym.<locals>.cmp  sT    uu	F
 EEM"EEM"
 5=5=r{   r  )r!  tuple[int, sympy.Expr]r"  r  r   r   )	r   r}   rP   r2   rd  re  r  r  
cmp_to_key)	rp  r  r	  r  rK  r   exprsr   r  s	   `        rW   argsort_symr    s    4  nC 
Z5<<8affkka@E  5i2237IE %&fc1c&F&M
 's   >B=Bc                t    | t         j                  k(  ryt        j                  d|       j                         S )Nrv   r   r   )rP   rV  r   element_sizer!  s    rW   get_dtype_sizer#    s-     ;;r'4466r{   c                      e Zd ZU ded<   y)LineContextr   contextNr   r   r   r   r   r{   rW   r%  r%    s    Lr{   r%  c                  "    e Zd ZU ded<   ded<   y)ValueWithLineMapr  r   zlist[tuple[int, LineContext]]line_mapNr'  r   r{   rW   r)  r)    s    J++r{   r)  c                      e Zd ZdZdddZej                  dd       ZddZddZ	ddZ
ddZddZdd	Zdd
ZddZ	 	 	 	 ddZdd dZdd!dZdd!dZ	 d"	 	 	 	 	 d#dZd$dZddZd%dZd&dZy)'IndentedBuffer   c                     g | _         || _        y r   )_lines_indent)r  initial_indents     rW   __init__zIndentedBuffer.__init__  s    GI%r{   c              #  b   K   | j                   }	 || _         d  || _         y # || _         w xY wwr   )tabwidth)r  r4  prevs      rW   set_tabwidthzIndentedBuffer.set_tabwidth  s,     }}	!$DM DMDDMs   /# /	,/c                   t               }d}g }| j                  D ]  }t        |t              r
 |       }|1t        |t              r|j                  ||j                  f       K|}t        |t              sJ |j                  |       |j                  d       |d|j                  d      z   z  } t        |j                         |      S )Nr7   r;  )r   r/  r}   DeferredLineBaser%  r  r&  r  writecountr)  getvalue)r  bufr   linemaplilines         rW   getvaluewithlinemapz"IndentedBuffer.getvaluewithlinemap  s    j13++ 	&B"./t<B,2::/dC(((IIdOIIdOTZZ%%%A	&  88r{   c                6    | j                         j                  S r   )r@  r   r  s    rW   r;  zIndentedBuffer.getvalue  s    '')///r{   c                f   t               }| j                  D ]  }t        |t              r
 |       }|t        |t              r.|}t        |t
              sJ |j                  d      r|j                  |d d        h|j                  |       |j                  d        |j                         S )N\r   r;  )	r   r/  r}   r8  r%  r  endswithr9  r;  )r  r<  r>  r?  s       rW   getrawvaluezIndentedBuffer.getrawvalue  s    j++ 	 B"./t<B,dC(((}}T"		$s)$		$		$	   ||~r{   c                8    | j                   j                          y r   )r/  clearrB  s    rW   rH  zIndentedBuffer.clear  s    r{   c                ,    t        | j                        S r   )r2  r/  rB  s    rW   __bool__zIndentedBuffer.__bool__  s    DKK  r{   c                :    d| j                   | j                  z  z  S )Nr  )r0  r4  rB  s    rW   r  zIndentedBuffer.prefix  s    dllT]]233r{   c                &    | j                  d       y )Nr;  	writelinerB  s    rW   newlinezIndentedBuffer.newline  s    tr{   c                   t        |t              r| j                  j                  |       y t        |t              r9| j                  j                  |j                  | j                                      y |j                         r.| j                  j                  | j                          |        y | j                  j                  d       y Nr  )r}   r%  r/  r  r8  with_prefixr  stripr  r?  s     rW   rN  zIndentedBuffer.writeline  s    dK(KKt$./KKt//>?ZZ\KK$++-78KKr"r{   c                4    |D ]  }| j                  |        y r   rM  )r  linesr?  s      rW   
writelineszIndentedBuffer.writelines"  s      	!DNN4 	!r{   c                H     t         j                  d fd       } |       S )Nc               3     K   xj                    z  c_         	 d  xj                    z  c_         y # xj                    z  c_         w xY wwr   r0  )offsetr  s   rW   r  z"IndentedBuffer.indent.<locals>.ctx)  s9     LLF"L'&&s   A4 AAAr   Iterator[None])
contextlibcontextmanager)r  r[  r  s   `` rW   indentzIndentedBuffer.indent(  s$    		"	"	' 
#	' ur{   c                .    | xj                   |z  c_         y r   rZ  r  r[  s     rW   	do_indentzIndentedBuffer.do_indent3      r{   c                .    | xj                   |z  c_         y r   rZ  rb  s     rW   do_unindentzIndentedBuffer.do_unindent6  rd  r{   c           	        t        |t              rt        d      }|j                  D ]E  }t        |t              r|st        |t        |      t        |j                               z
        }G t        j                  |      rd}|j                  D ]P  }t        |t              r| j                  j                  |       /t        j                  | |t        |      d         R y t        j                  |      }|r|j                         }|sy |j                         }|j!                  d      D ]  }| j                  |        y )Ninfr   r;  )r}   r,  floatr/  r%  minrR   r  mathisinfr  rN  r   textwrapdedentrstriprX  )r  
other_coderS  rn  r?  r   s         rW   splicezIndentedBuffer.splice9  s    j.15\F")) I!$4 TS5G)GHFI zz&!")) HdK0KK&&t,",,T4F3FG	H "4J'..0
#**,J%%d+ "q!"r{   c                    t        | j                        }| j                  D cg c]
  } ||       c}|_        |S c c}w N)r1  )r,  r0  r/  )r  r{  r   r?  s       rW   r   zIndentedBuffer.mapS  s4    DLL9-1[[9Td4j9

 :s   >c                @    t        |        d| j                          dS )Nrx  ry  )r  r;  rB  s    rW   __repr__zIndentedBuffer.__repr__X  s     t*Qt}}/q11r{   c                    | j                   |j                   k(  sJ t        | j                         }|j                  | j                         |j                  |j                         |S rs  )r0  r,  rW  r/  )r  otherr   s      rW   __add__zIndentedBuffer.__add__[  sK    ||u}},,,DLL9t{{#u||$
r{   c                    || j                   v S r   )r/  )r  new_lines     rW   containszIndentedBuffer.containsc  s    4;;&&r{   Nr   )r1  r   r   r  )r4  r   r   r]  )r   r)  r   r  r   r  r   r2  )r?  z)Union[LineContext, DeferredLineBase, str]r   r  )rV  z3Sequence[Union[LineContext, DeferredLineBase, str]]r   r  r   )r[  r   r   'contextlib.AbstractContextManager[None])r[  r   r   r  F)rp  zUnion[IndentedBuffer, str]rS  r2  r   r  )r{  zCallable[[Any], Any]r   r,  )rw  r   r   r,  )rz  z)Union[DeferredLineBase, LineContext, str]r   r2  )r   r   r   r4  r2  r^  r_  r6  r@  r;  rF  rH  rJ  r  rO  rN  rW  r`  rc  rf  rq  r   ru  rx  r{  r   r{   rW   r,  r,    s    H& ! !9(0(!4#!H!	!	 EJ"4"=A"	"4
2'r{   r,  c                  (     e Zd Zd fdZddZ xZS )FakeIndentedBufferc                "    t         |           y r   )superr2  )r  	__class__s    rW   r2  zFakeIndentedBuffer.__init__h  s    r{   c                V    |dk(  rt         j                  | |      S t        d| d      )Nr  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )r  r   s     rW   r  z#FakeIndentedBuffer.__getattribute__k  s;    ;**466!$ (= =
 	
r{   r~  )r   r  r   r   )r   r   r   r2  r  __classcell__r  s   @rW   r  r  g  s    
r{   r  c               #     K   t         j                  t         j                  }} 	 d  | |ct         _        t         _        y # | |ct         _        t         _        w xY wwr   )r  stdoutstderr)initial_stdoutinitial_stderrs     rW   restore_stdout_stderrr  v  s@     %(ZZNN@!/
CJ
CJs   !AA  A AAc                  P    e Zd ZdZddZddZddZddZddZddZ	ddZ
dd	Zy
)r8  z.A line that can be 'unwritten' at a later timec                6    |j                         sd}|| _        y rQ  )rS  r?  rT  s     rW   r2  zDeferredLineBase.__init__  s    zz|D	r{   c                    t         )zJReturns either self.line or None to indicate the line has been 'unwritten'r  rB  s    rW   r  zDeferredLineBase.__call__      !!r{   c                    t         )z3Returns a new deferred line with the same conditionr  rT  s     rW   	_new_linezDeferredLineBase._new_line  r  r{   c                @    | j                  | | j                         S r   r  r?  )r  r  s     rW   rR  zDeferredLineBase.with_prefix  s    ~~455r{   c                T    | j                  | j                  j                               S r   )r  r?  r  rB  s    rW   r  zDeferredLineBase.lstrip  s    ~~dii..011r{   c                >    | j                  | j                  |         S r   r  )r  r  s     rW   r  zDeferredLineBase.__getitem__  s    ~~dii.//r{   c                ,    t        | j                        S r   )r2  r?  rB  s    rW   rJ  zDeferredLineBase.__bool__  s    DIIr{   c                ,    t        | j                        S r   )rR   r?  rB  s    rW   __len__zDeferredLineBase.__len__  s    499~r{   N)r?  r  )r   zUnion[str, None])r?  r  r   r   )r  r  r   r   )r   r   )r  zUnion[int, slice]r   r   r  r   r   )r   r   r   r   r2  r  r  rR  r  r  rJ  r  r   r{   rW   r8  r8    s-    8
""620r{   r8  c                  4     e Zd ZdZd fdZddZddZ xZS )DelayReplaceLinez6At end of codegen call `line.replace(key, value_fn())`c                @    t         |   |       || _        || _        y r   )r  r2  r[  value_fn)r  r[  r  r?  r  s       rW   r2  zDelayReplaceLine.__init__  s     r{   c                j    | j                   j                  | j                  | j                               S r   )r?  replacer[  r  rB  s    rW   r  zDelayReplaceLine.__call__  s#    yy  4==?;;r{   c                D    t        | j                  | j                  |      S r   )r  r[  r  rT  s     rW   r  zDelayReplaceLine._new_line  s    $-->>r{   )r[  r  r  zCallable[[], str]r?  r  r}  )r?  r  r   r  )r   r   r   r   r2  r  r  r  r  s   @rW   r  r    s    @!
<?r{   r  c                   t        | t        j                        r| }nt        j                  t               |       }t	        j
                  |      }t        j                  j                  rC|j                  J |j                  dk  s|j                  dk(  rt        j                  d       yy|j                  dk(  rdnd}|j                  }||k  rt        j                  d	||d
       yy)N	   
   z6GPU arch does not support max_autotune_gemm mode usageFTrJ   rr   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)r}   rP   r   rX   r    createversionrq   majorr   r  r  multi_processor_count)index_or_devicer   propr  r  s        rW   
is_big_gpur    s    /5<<0 lno>""6*D }}zz%%%::>TZZ2-KKPQKK5(bbG**I7:%I> 	 	
 r{   c                     t         j                  j                         r(t         j                  j                         j                  S t         j
                  j                  d      j                  S )NrH   )rP   rJ   rQ   get_device_propertiesgpu_subslice_countrH   r  r   r{   rW   get_max_num_smsr    sF    yyyy..0CCC::++F3IIIr{   c                     t         j                  j                         syt         j                  j                  t         j                  j	                               } | j
                  dk(  S )zEReturns true if the device is a NVIDIA B200, otherwise returns false.Fr  )rP   rH   rQ   r  r  r  )device_propertiess    rW   
using_b200r    sJ     ::""$

889R9R9TU""b((r{   c                     t         j                  j                         r
t               S t         j                  j                         } t               | | z
  S dz
  S )zFHandle experimental carveout if set otherwise return hardware SM countr   )rP   rJ   rQ   r  r  _get_sm_carveout_experimental)carveouts    rW   get_num_smsr    sJ     yy  xx557HH,@HHaHHr{   c                    ddl m}m} |
t               }|j	                  d      }|| z  t
        z  } |||| |j                               S )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r7   )r8   WorkspaceZeroModeF)r:  	zero_moder   
outer_name)codegen.commonr8   r  r  	from_boolTMA_DESCRIPTOR_SIZEunique_name)num_tma_descriptorsr   num_programsr8   r  r  r  s          rW   get_tma_workspace_argr    sZ     @"}!++E2I--0CCD+<++-	 r{   c                    | j                   |vr!t        j                  d| j                   |       t        | j                  j
                        xr% | j                   |v xr t        | j                        S )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r  r  )r(  allowed_layout_dtypess     rW   _use_template_for_gpur    sf     ||00		RLL!	
 	v}}!!" 	&LL11	&v}}%r{   c                    | j                         t        j                  j                         j                  d      D cg c]  }|j	                          c}v S c c}w Nr:  )r   ri   max_autotune_gemm_backendsrX  rS  backendrT   s     rW   _use_autotune_backendr  	  M    ==?!<<BBDJJ3O	      Ac                    | j                         t        j                  j                         j                  d      D cg c]  }|j	                          c}v S c c}w r  )r   ri   max_autotune_conv_backendsrX  rS  r  s     rW   _use_conv_autotune_backendr    r  r  )enable_int32enable_float8check_max_autotunec                  ddl m}m} t        j                  t        j
                  t        j                  g}|r>t        j                  t        j
                  t        j                  t        j                  g}|r/|j                  t        j                  t        j                  g       t        | j                  j                        xr t        | |      xs) | j                  j                  dk(  xr | j                  |v xrS t         j"                  xs t         j$                  xs | xr* t'        d      xr  || j                  |j(                        S )Nr7   )BackendFeaturehas_backend_featurer  TRITON)r  r  r  rP   r   rB  rD  rL  extendr<  r=  r  r   r  r  r   ri   max_autotunemax_autotune_gemmr  TRITON_TEMPLATES)r(  r  r  r  r  r  layout_dtypess          rW   use_triton_templater    s    D]]ENNEMMBMu{{Se1153D3DEF v}}))* A)&-@O ""e+M0M
	P   VF$<$<VDV@V
	P "(+
	P  ~/N/NOr{   output_layout
add_guardsc                   	 ddl m} ddlm dfddfd}d	fd	 	 	 	 	 	 	 	 dfd	 	 	 	 	 	 	 	 dfd		 |       xr t	        fd
|D              xr  ||       S )u^  
    Return True iff *all* supplied tensors satisfy the CUDA TMA constraints
    that Triton relies on today.
    * https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

    A tensor is accepted when:
      * 1 ≤ rank ≤ 5 (cuTensorMapEncodeTiled)
      * dtype in _TMA_SUPPORTED_DTYPES (CUtensorMapDataType enum)
      * Base pointer 16-byte aligned
      * Exactly one contiguous ("inner") dim with stride 1
      * All "outer" dims have 16-byte aligned strides
      * Inner dim size × itemsize is a multiple of 16
      * For 1-byte dtypes (e.g. FP8), inner dim ≥ 32
    r   )has_triton_tma_devicer7   rj  c                X    j                   j                  j                  | t              S r   )rn  ro  statically_known_multiple_ofTMA_ALIGNMENT)
expr_bytesrk  s    rW   _alignedzcan_use_tma.<locals>._alignedG  s     ww<<ZWWr{   c                    | y| j                   }| j                  }| j                  } | j                        sy |||      S )NTF)r  r0  r   r[  )r(  sizesstridesr   r  _is_tma_compatibles       rW   _is_tma_compatible_layoutz.can_use_tma.<locals>._is_tma_compatible_layoutJ  sF    >-- &!%%88r{   c                   | j                         }| j                         }| j                         }| j                         j                  j
                  v ry| j                         x}|j                  dk(  r
 |||      S  |||      S )NFrJ   )get_size
get_stride	get_dtyper  rn  unaligned_buffers
get_devicer  )r  r  r  r   m_devicerk  r  _is_tma_compatible_xpus        rW   _is_tma_compatible_matrixz.can_use_tma.<locals>._is_tma_compatible_matrixW  s{    

,,. ::<177444&H38N)%%@@!%%88r{   c                R   t        |       }|j                  }|dk  s|dkD  ry|t        vryrKj                  j                  j                  |       }j                  j                  j                  |      }nd| D cg c]'  }j                  j                  j                  |      ) }}|D cg c]'  }j                  j                  j                  |      ) }}t        |      D 	cg c]-  \  }	}j                  j                  j                  |d      r|	/ }
}	}t        |
      dk7  ry|
d   }t        |      D ]  \  }	}|	|k(  r ||z        r y ||   } ||z        sy|dk(  r'j                  j                  j                  |d      syyc c}w c c}w c c}}	w )Nr7   r   Fr       T)
rR   itemsizert   rn  ro  guard_int_seqsymbolic_hintr   statically_known_equalsstatically_known_geq)r  r  r   rankr  sizes_i	strides_ir   str   r  	inner_idx	inner_dimrk  r  r  s                rW   r  z'can_use_tma.<locals>._is_tma_compatiblee  s   
 5z>>!8tax--gg&&44U;G((66w?IBGHQqww''55a8HGHFMN))77;NIN
 #9-
2ww77A> 
 

 u:?!H	 y) 	EArI~BM*		 I&		H,- q=!1!1!F!FyRT!U; IN
s   >,F0,F,2F#c                T   |d   }j                   j                  j                  |      }j                   j                  j                  |d      syd}| D ]O  }j                   j                  j                  |      }j                   j                  j	                  ||      sO y y)Nr   r7   Fl    T)rn  ro  r  r  statically_known_gt)	r  r  r   last_stridelast_stride_hint
MAX_UINT32r  	size_hintrk  s	           rW   r  z+can_use_tma.<locals>._is_tma_compatible_xpu  s     bk77++99+Fww778H!L 
 	D((66t<Iww33IzJ	
 r{   c              3  .   K   | ]  } |        y wr   r   )r   r  r  s     rW   r   zcan_use_tma.<locals>.<genexpr>  s     ?)!,?   )r  Union[int, sympy.Expr]r   r2  )r(  Optional[Layout]r   r2  )r  r@   r   r2  )r  Sequence[sympy.Expr]r  zSequence[_IntLike]r   torch.dtyper   r2  )torch.utils._tritonr  rm  rk  r   )
r  r  matricesr  r  rk  r  r  r  r  s
    `   @@@@@rW   can_use_tmar  2  s    " :X99/#/#/ / 
	/b##  
	* 	 	5?h??	5%m4r{   )r  c                    t         j                  j                  r| nd }t        d |D              xr( t	        |||dxr t         j                  j
                  S )Nc              3  T   K   | ]   }t        |j                               d k(   " yw)rn  N)rR   r  )r   r  s     rW   r   z*use_triton_tma_template.<locals>.<genexpr>  s      5qC

"5s   &(r  )ri   r  enable_template_tma_storer   r  enable_persistent_tma_matmul)r  r  r  r(  s       rW   use_triton_tma_templater     sM     %mmEE]4F5H55 	7JO	7MM66r{   c                T    t        || |dsyddlm} ddlm}  |       xr  |       S )Nr  Fr   )%has_triton_tensor_descriptor_host_tmar7   is_datacenter_blackwell_arch)r   r  r"  codegen.cuda.cuda_envr$  )r  r  r  r"  r$  s        rW   !use_triton_blackwell_tma_templater&    s2     #	: IC 12U7S7UUr{   c                    | |v xr ||v S r   r   )scale_option_ascale_option_bscaling_typess      rW   use_triton_scaling_templater+    s    
 ]*N~/NNr{   )maxsizec                 d    	 t         j                  j                  d      duS # t        $ r Y yw xY w)zCheck if CuTeDSL is importable; cache the result for reuse.

    Call ensure_cute_available.cache_clear() after installing CuTeDSL
    in the same interpreter to retry the import.
    cutlassNF	importlibutil	find_specr
  r   r{   rW   ensure_cute_availabler3    s3    ~~''	2$>>     # 	//c                 d    	 t         j                  j                  d      duS # t        $ r Y yw xY w)zCheck if NVIDIA Universal GEMM (cutlass_api) is importable; cache the result for reuse.

    Call ensure_nv_universal_gemm_available.cache_clear() after installing cutlass_api
    in the same interpreter to retry the import.
    cutlass_apiNFr/  r   r{   rW   "ensure_nv_universal_gemm_availabler7    s3    ~~''6dBB r4  c                 d    	 t         j                  j                  d      duS # t        $ r Y yw xY w)aG  Check if nvMatmulHeuristics is importable; cache the result for reuse.

    nvMatmulHeuristics provides performance model-based kernel selection
    for NVIDIA GEMM operations.

    Call ensure_nvmatmul_heuristics_available.cache_clear() after installing
    nvMatmulHeuristics in the same interpreter to retry the import.
    nvMatmulHeuristicsNFr/  r   r{   rW   $ensure_nvmatmul_heuristics_availabler:    s4    ~~''(<=TII r4  c                n   t               syt        d      syddlm} t	        |j
                  j                        sy |       syt        j                  g}	t        ||	      syt        j                  st        j                  syt        | ||      syt        d | |fD              ry|r|ry|y||yy)a  
    Returns True if we can use the blackwell kernel for grouped mm.
    Required conditions:
        1. CuTeDSL backend is enabled
        2. CuTeDSL is available
        3. We are on a blackwell arch
        4. The dtype is bf16
        5. Max autotune or max autotune gemm is enabled
        6. A, B, and the output are 16B aligned
        7. We are not using dynamic shapes
        8. A is 2d
        9. B is 3d
        10. Offsets are provided
        11. Bias and Scale are not provided
    FCUTEDSLr7   r#  )r  c              3  2   K   | ]  }t        |        y wr   )
is_dynamicr   rT   s     rW   r   z3use_blackwell_cutedsl_grouped_mm.<locals>.<genexpr>0  s     
1Q:a=
1r  T)r3  r  r%  r$  r  r   r  rP   rB  r  ri   r  r  r  ry  )
mat_amat_br(  a_is_2db_is_2doffsbiasscale_resultr$  r  s
             rW    use_blackwell_cutedsl_grouped_mmrG    s    2 !" +C&--$$%')^^$M 76#;#; ue6:

15%.
11g|<3r{   c                <   ddl m} |j                  j                  j	                  ||z  |z  d      }|dk  s|t
        j                  j                  k  ryddlm	} t        j                  j                  ryt        j                  t        j                  t        j                  g}t!        | |      xr/ t
        j"                  xs t
        j$                  xr t'        d      }|r6 |       s/t(        j+                  d	t
        j                  j,                         y|S )
Nr7   rj  r   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cutlass.cutlass_dir %s is set correctly. Skipping CUTLASS backend for now.)rm  rk  rn  ro  optimization_hintri   r.  cutlass_backend_min_gemm_sizecodegen.cutlass.utilsrK  rP   r  rq   r   rB  rL  r  r  r  r  r   r  cutlass_dir)	r(  r  r  r  rk  	gemm_sizerK  r  r   s	            rW   use_cutlass_templaterR  ?  s      221q519r2JIA~V^^%Q%QQ9 }} ]]ENNEKK@Mfm4 	-  <F$<$<	-!),  !#KK4 **	 Jr{   _IntLikec                  
 ddl m t               syt               syt	        d      syddlm
 
j                  ry| j                  j                  dk7  st        j                  j                  ryt        j                  st        j                  sy|||g}||j!                  |       t#        fd|D              ry||g}	||	j!                  |       t#        
fd	|	D              ryy
)a3  
    Return True if we can use the NVIDIA Universal GEMM Template.

    Required conditions:
        1. NVGEMM backend is enabled
        2. cutlass_api is available
        3. We are on a NVIDIA GPU
        4. Max autotune or max autotune gemm is enabled
        5. Not in AOT Inductor mode (requires runtime JIT compilation)
        6. Base pointers are 16-byte aligned
        7. Shape dimensions are not unbacked symbols

    Note:
        - Shape and stride constraints are handled internally by
          cutlass_api.get_kernels() which filters incompatible kernels.
        - GroupedGemm currently only supports TN layout (column-major B).
          Any other layout will act as a noop and fall back to ATen.
        - Dynamic shapes are supported as long as they have hints
          (from example inputs).
    r   )has_free_unbacked_symbolsFNVGEMMr7   rj  rH   c              3  .   K   | ]  } |        y wr   r   )r   dimrU  s     rW   r   z1use_nv_universal_gemm_template.<locals>.<genexpr>  s     
Cc$S)
Cr  c              3  j   K   | ]*  }|j                         j                  j                  v  , y wr   )r  rn  r  )r   trk  s     rW   r   z1use_nv_universal_gemm_template.<locals>.<genexpr>  s&     
O1::<177444
Os   03T)r  rU  r3  r7  r  rm  rk  aot_compilationr   r  rP   r  rq   ri   r  r  r  ry  )r(  r  r  r  r@  rA  rD  r  dims_to_checktensors_to_checkrk  rU  s             @@rW   use_nv_universal_gemm_templater^  `  s    < P "-/ *}}V#u}}'8'86#;#;
 1IM}Q

C]
CC u~%

O>N
OOr{   c                    t         j                  j                  j                         }|dk(  ry| j                         |j	                  d      D cg c]  }|j                          c}v S c c}w )z8Check if CUTLASS should be used for the given operation.ALLTr:  )ri   r.  cutlass_enabled_opsr   rX  rS  )op_nameenabled_opsrT   s      rW   _use_cutlass_for_oprd    sU    ..44::<Ke==?+2C2CC2HIQqwwyIIIIs   A,r   c           
        ddl m} t        j                  j                  |z  }|j
                  j                  j                  t        j                  t        j                  ||| z        t        j                  |||z                    xrO |j
                  j                   xr6 |j
                  j                   xr t        j                  j                  dkD  S )Nr   rj  )torch._inductor.virtualizedrk  ri   r  decompose_k_thresholdrn  ro  statically_known_truer~   AndGeaot_modecpp_wrappernum_decompose_k_splits)r  r  r  threshold_multiplerk  rg  s         rW   use_decompose_k_choicero    s     ."MM??BTT 	
..II1A561A56	
 	5    	5 ###	5 MM0014
r{   c           
        t         j                  j                  }ddlm} t        t        j                  j                        xr |j                  j                  j                  t        j                  t        j                  ||| z        t        j                  |||z                    xr0 |j                  j                   xr |j                  j                    S )z
    Check if we should use the contiguous subgraph transform.
    This transform makes the second matrix contiguous before the matmul.
    r   rj  )ri   rocmcontiguous_thresholdrf  rk  r2  rP   r  rq   rn  ro  rh  r~   ri  rj  rk  rl  )r  r  r  rr  rk  s        rW   use_contiguousrs    s     ";;;; . 	U]] 	$GG22II01450145
	$    	$ ###
r{   c                   t         j                  j                  }g d}t        |t        j
                        r|j                  s|S |dk(  rg S t        | t        j
                        r| j                  r&t        |t        j
                        r|j                  sd}nt        || z  ||z        }d}t	        j                  |      }|D cg c]  }||k  r||k\  r| }}g g g }}
}	|D ]Z  }||z  }|dk  r||dz
  z  dk(  r|dk\  r|	j                  |       0|dz  dk(  r|
j                  |       J|j                  |       \ t         j                  dk(  r|	|
z   |z   S |	|
z   |z   }|d | S c c}w )	N)rr   r  ru   rs      r   ru  rn  rs   r7   r  
EXHAUSTIVE)ri   r  rm  r}   r~   r,  	is_numberrj  divisorsr  max_autotune_gemm_search_space)r  r  r  k_splits_limitdefault_k_splitsmax_k_splitmin_k_splitrx  divisorpow_of_2_divisorsmul_of_32_divisorsrest_of_splitsdkPartbest_splitss                  rW   get_k_splitsr    s    ]]99N .!UZZ 	1		1ejj!!++1ejj!!++!q&!q&)K~~a H  k!g&< 	H  =?B>) %Q 3; EAI!#$$Q'RZ1_%%a( !!!$%" ,,< #55FF#&88>IK''=s   
E,c                T    t         j                  j                  |       j                  S r   )rP   rH   r  gcnArchNamer   s    rW   _rocm_native_device_arch_namer  	  s    ::++F3???r{   c                     	 dd l } ddlm}m} ddlm} t        j                  j                  | j                        }||||fS # t        $ r dd}dd} G d d      }d }Y %w xY w)	Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationc                     g S r   r   r   r{   rW   r  z*try_import_ck_lib.<locals>.gen_ops_library/	      Ir{   c                     g S r   r   r   r{   rW   r  z.try_import_ck_lib.<locals>.gen_ops_preselected2	  r  r{   c                      e Zd Zy)*try_import_ck_lib.<locals>.CKGemmOperationN)r   r   r   r   r{   rW   r  r  5	  s    r{   r  )r   r  )ck4inductor(ck4inductor.universal_gemm.gen_instancesr  r  ck4inductor.universal_gemm.opr  r  r  dirname__file__r
  )r  r  r  r  package_dirnames        rW   try_import_ck_libr  	  sl    	
	
 ''//+*>*>? O-@/QQ  			 	 s   ;A A#"A#c                    t         j                  st         j                  syt        j                  j
                  sy| j                  j                  dk7  ryt        | j                        }t         j                  j                  D ci c]  }|j                  d      d   | c}xs |j                  d      d   |i}|j                         t         j                  j                  z  D cg c]  }||   	 }}|sy| j                  t        j                  t        j                   t        j"                  fvryt%               \  }}}}|st&        j)                  d       y|t         j                  _        yc c}w c c}w )NFrH   :r   z,Please pip install Composable Kernel packageT)ri   r  r  rP   r  rq   r   r  r  rq  archrX  rB  ck_supported_archr   r   rB  rD  r  r   r  ck_dir)r(  native_archr  requested_archsrequested_supported_archsck_package_dirnamer   s          rW   use_ck_templater  <	  s<   6#;#;==}}V# 0>K39;;3C3CDaqwws|A)D #q!;IO
 !%%'&++*G*GG! 	! ! %||EMM5>>5==II"3"51aBC+FKK+ E!s   E6,E;c                    ddl m} t        d      xr= t        |       xr0 |j                  j
                  j                  ||z  |z  d      dkD  S )Nr7   rj  CKr   rI  r   rm  rk  r  r  rn  ro  rM  r(  r  r  r  rk  s        rW   use_ck_gemm_templater  a	  sR     	d# 	KF#	KGG..q1uqy2.FJr{   c                    ddl m} t        d      xr= t        |       xr0 |j                  j
                  j                  ||z  |z  d      dkD  S )Nr7   rj  CKTILEr   rI  r   r  r  s        rW   use_ck_tile_gemm_templater  k	  sR     	h' 	KF#	KGG..q1uqy2.FJr{   c                2    t        d      xr t        |       S )Nr  )r  r  r(  s    rW   use_ck_conv_templater  u	  s    %d+G0GGr{   c                |    t         j                  xs t         j                  xr | j                  j                  dk(  S r  )ri   r  r  r   r  r  s    rW   _use_template_for_cpur  y	  s2    7v77&
--


%&r{   c                   ddl m} t        |j                  |      sJ |j                  j                  }|j                  j
                  }t        |       xrX |j                         t        j                  k(  xr5 t        |      dk(  xr% t        |      dk(  xr |d   |d   k(  xr |d   dk(  }t        | ||d      xr |j                  j                         xs |S )Nr7   )rA      rn  F)require_constant_mat2)r  rA   r}   r(  r  r0  r  r  rP   rD  rR   use_cpp_gemm_templateis_contiguous)r(  mat1mat2rA   	mat1_sizemat1_stridemat1_each_batch_is_contiguouss          rW   use_cpp_bmm_templater  	  s     dkk6***
   I++$$Kf% 	"NN-	"^q 	" "	" ^y|+		"
 ^q  " !t5Q !!#D'Dr{   c                ~   ddl m} ddlm} ddlm}	 ddlm}
 t        |       rt        d      syt        j                  j                  sy|j                         t        j                  t        j                   fv }t        j"                  t        j$                  t        j&                  t        j                  t        j                   g} |
|||r| j(                  nd ||      \  }}}} }}t+        ||f      ryt-        ||j.                        r|j1                         } |	|j                               \  }} |d	||||j                         |j                         |t3               | |

      }dd}| j(                  |v xr= |d uxr7  ||      xr- t-        ||j4                        xr |j7                         xs | S )Nr7   r  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtypemat2_transposeduse_4x2_dim
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsuse_refq_group_sizec                N    | j                          | j                         d   dk(  S )Nr   r7   )freeze_layoutr  rT   s    rW   is_last_dim_stride1z2use_cpp_gemm_template.<locals>.is_last_dim_stride1	  s"    	||~b!Q&&r{   )rT   r@   r   r2  )r  r  codegen.cpp_micro_gemmr  codegen.cpp_utilsr  kernel.mm_commonr  r  r  ri   cppweight_prepackr  rP   rS  rH  rD  rB  halfr   has_free_symbolsr}   BaseViewunwrap_viewparallel_num_threadsr#  is_module_buffer)r(  r  r  r  r  is_woq_int4r  r  r  r  r  	int8_gemmr  r  r  r  r  r   r  r  s                       rW   r  r  	  s    9M) (0Ee0L::$$ U[[%**$==I]]ENNEJJUZZXM")"+&,,'#Aq!VT4 A$$!@AQROL!"			NN$^^%!(*!J'
 	% 	Cd"	C%	C tR]]+	C ""$A,A(Ar{   c                 b    t         j                  xs t         j                   xs t        d      S )NATEN)ri   r  r  r  r   r{   rW   use_aten_gemm_kernelsr  	  s-    7v77 '	v	&'r{   c                  T    e Zd ZU  ej                  d      Zded<   ddZddZd	dZ	y)
DebugDirManagerr   r  prev_debug_namec                @    t        t        j                        | _        y r   )r  r  counterr   rB  s    rW   r2  zDebugDirManager.__init__	  s    ../r{   c                    t         j                  j                  j                  | _        | j                   d| j
                   | _        | j                  t         j                  j                  _        y )N_tmp_)rP   _dynamori   debug_dir_rootr  r   new_namerB  s    rW   	__enter__zDebugDirManager.__enter__	  sM    $}}33BB//0dggY?.2mm+r{   c                    t        j                  | j                         | j                  t        j
                  j                  _        y r   )r   r  r  r  rP   r  ri   r  )r  r   s     rW   __exit__zDebugDirManager.__exit__	  s*    dmm$.2.B.B+r{   Nr~  )r   r   r   r  )
r   r   r   re  r:  r  r   r2  r  r  r   r{   rW   r  r  	  s(    iooa G0<
Cr{   r  c                   ddl m} t               dfd}t        j                  j                  |d|      5  t        j                  j                           | |i |}d d d        t              fS # 1 sw Y   xY w)Nr7   r<   c                (    j                  |        y r   )rD  codesource_codess    rW   save_output_codez*run_and_get_code.<locals>.save_output_code	  s    r{   r  r  r  r   r  )
rn  r=   r#   r   patchr  rP   r  resetrZ  )r   r   r  r=   r  r  r  s         @rW   run_and_get_coder  	  st    
 %$.LL 
		=*<>N	O %T$V$% 4%%%% %s   'A55A>c                    |j                  dd      }t        | g|i |\  }}g }|D ]K  }|j                  t        j                  d|t        j
                               |s:|D cg c]  }|dd 	 }}M ||fS c c}w )Nremove_quoteFz	'''.*?'''r  )rS   r  r  r   findallDOTALL)	r   r   r  r  r  r  kernelsr  r  s	            rW   run_and_get_kernelsr  
  s     ::ne4L+B@@@FLG ;rzz,bii@A29:va|:G:; 7? ;s   'A;c                &     d fd}t        |      S )Nc                 R            } | j                         j                          | S r   )r   r	  )r  r   s    rW   run_with_backwardz1run_fw_bw_and_get_code.<locals>.run_with_backward
  s!    

r{   )r   r   )r  )r   r  s   ` rW   run_fw_bw_and_get_coder   
  s    
 -..r{   c                X   ddl m} g dfdd	fd}t        j                  j	                  |d|      5  t        j                  j	                  |d      5  t
        j                  j                           | |i |}ddd       ddd       S # 1 sw Y   xY w# 1 sw Y   S xY w)
zLGet the inductor-generated code, but skip any actual compilation or running.r7   r<   c                (    j                  |        y r   r  r  s    rW   r  z"get_code.<locals>.save_output_code
  s    D!r{   c                     G d d      }| j                   r| j                         n| j                         \  }} |j                         |r |j                          |       S )Nc                       e Zd ZdZddZddZy)@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton modulec                     y r   r   rB  s    rW   r2  zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__%
  s    r{   c                     y r   r   r  s      rW   callzEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.call(
  s    r{   Nr~  r   r   r  r   r   r  )r   r   r   r   r2  r	  r   r{   rW   DummyModuler  "
  s    Fr{   r  )rl  codegen_with_cpp_wrappercodegenr   )r  r  wrapper_codekernel_coder  s       rW   patched_compile_to_modulez+get_code.<locals>.patched_compile_to_module!
  s]    	 	 04/?/?D))+T\\^ 	"k 	++,[../}r{   compile_to_moduler  Nr  )r  r=   r   r   )rn  r=   r   r  r  rP   r  r  )r   r   r  r=   r  r   r  r  s         @@rW   get_coder  
  s    $ L", 	

.0I	
  	

-);=MN	  	          s#   "B'BBB	BB)c                |    t        | g|i |}dt        |      cxk  rdk  sn J dt        |              |d   S Nr7   rn  z%expected one or two code outputs got r   )r  rR   )r   r   r  r  s       rW   get_triton_coder  C
  sQ    B000LL!&Q& 
/L0A/BC& ?r{   c                    t        | g|i |\  }}dt        |      cxk  rdk  sn J dt        |              |d   S r  )r  rR   )r   r   r  r   r  s        rW   run_and_get_triton_coder  M
  sW     'r;D;F;OA|L!&Q& 
/L0A/BC& ?r{   c                    ddl m ddlm} |j                  g dfd}t
        j                  j                  |d|      5   | |i |}d d d        |fS # 1 sw Y   fS xY w)Nr   r<   rD   c                 ^     | i | | d   }t        |      sJ j                  |       y )Nrn  )r}   r  )r   r  rn  r=   graph_lowerings	real_inits      rW   	fake_initz-run_and_get_graph_lowering.<locals>.fake_initb
  s7    4"6"Q%///u%r{   r2  r
  )torch._inductor.graphr=   torch._inductor.output_coderE   r2  r   r  r  )	r   r   r  rE   r  r  r=   r  r  s	         @@@rW   run_and_get_graph_loweringr  Y
  sq     4;((IO& 
		?J		B %T$V$% ?""% ?""s   	AA(c              #     K   ddl m} |j                  |    }	 t        j                  ||      |j                  | <   d ||j                  | <   y# ||j                  | <   w xY ww)z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)torch._inductorr!  	loweringsr  partial)aten_opoverride_fnr!  orig_fns       rW   override_loweringr(  n
  s`      )  )G.&/&7&7W&M7#&-7#g7#s   A$'A  A$A!!A$c                     ddl m} |j                  d fd}t        j                  j
                  j                  |d|      S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerc                B     | |        | |      }r	 | |       |S r   r   )r  r>  outr'  post_fnpre_fns      rW   r  z(add_scheduler_init_hook.<locals>.wrapper
  s+    y% i'Iu%
r{   r2  )r  r   r>  r   r   r   )torch._inductor.schedulerr*  r2  unittestr   r  r  )r.  r-  r*  r  r'  s   ``  @rW   add_scheduler_init_hookr1  
  s9     4  G ==%%iWEEr{   c                z    t         j                  rt        j                  |        yt        j	                  |        y)z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)ri   developer_warningsr   r  info)msgs    rW   developer_warningr6  
  s$       Cr{   c                    	 t         j                  j                  d      } | dz   t        t         j                        k  rTt        t         j                  | dz            dkD  r2t         j                  | dz      d   dk7  rt         j                  | dz      S t         j                  D ]#  }|j                  d      s|t        d      d c S  y# t        $ r Y Bw xY w)a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr7   r   ru  z--only=N)r  argvr  rR   
ValueErrorr  )rK  r  s     rW   get_benchmark_namer:  
  s    	hhnnX&!Gc#((m#CHHS1W%&*q!!$+88C!G$$ xx )>>)$s9~'(()   s   BC 	CCc                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    ywr7   Nr   r?  s     rW   r   zis_ones.<locals>.<genexpr>
       %!qAv%   r   rC  s    rW   is_onesrB  
      %u%%%r{   c                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    yw)r   Nr   r?  s     rW   r   zis_zeros.<locals>.<genexpr>
  r>  r?  r@  rA  s    rW   is_zerosrF  
  rC  r{   c                &    t        d | D              S )Nc              3     K   | ]@  }t        |t        j                        r$|j                  t        j                  d       k(   B yw)r  N)r}   rP   r  r   )r   r   s     rW   r   z is_cpu_device.<locals>.<genexpr>
  s8      dELL) 	u||E**s   AAr@  )inputss    rW   is_cpu_devicerJ  
  s       r{   c                    t        | t        j                        sJ d       | j                  rt        j
                  S t        j                  S )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)r}   r~   r,  r   rP   rN  rF  rf  s    rW   get_sympy_Expr_dtyperL  
  s=    c5::& B& ~~{{}}r{   c              /     K   | r-t        j                  j                  |i |5 }| d d d        y d  y # 1 sw Y   y xY wwr   )rP   r   r   )should_profiler   r  r   s       rW   maybe_profilerO  
  sE     ^^##T4V4 	G	 	 		 	s   "A7AA Ac                 l    t         j                  j                  } | dk  rt        j                         } | S Nr7   )ri   r  threadsrP   get_num_threads)rR  s    rW   r  r  
  s+    jj  G{'')Nr{   c                     ddl m}   |        }|j                  dt        j                  j
                  rd      S d      S )Nr7   )get_backend_options
num_stagesrn  r  )runtime.triton_helpersrU  r@  rP   r  rq   )rU  optionss     rW   get_backend_num_stagesrY  
  s2    ;!#G;;|%--*;*;QCCCCr{   c                   t        | t        j                  j                  j                  j
                  dk(        }||S ddlm}m} t        j                  j                         xr! t        j                  j                         dk\  }| t        j                  t        j                  t        j                  fv sJ t        j                  |      j                   j#                  d      rddlm}  |       }| t        j                  t        j                  fv r|r	 || |      S t        j                  j                  j                  j
                  dk(  r |t        j                  |      S  |t        j                  |      S | t        j                  t        j                  fv r
|r ||       S t        j                  j                  j                  j
                  dk(  r |t        j                        S  |t        j                        S )z
    We don't want to throw errors in this function. First check to see if the device is in device_info.py,
    then fall back to the inaccurate triton estimation.
    tf32)is_tf32r   )get_max_simd_tflopsget_max_tensorcore_tflops)rv   r   
clock_rate)max_clock_rate)r   rP   backendsrH   matmulfp32_precisiontriton.testingr]  r^  rQ   get_device_capabilityr   rB  rD  inspect	signature
parametersr@  torch._utils_internalr`  )r   ds_topsr]  r^  SM80OrLaterr`  sm_clocks          rW   get_device_tflopsrm  
  s    u~~**11@@FJG M**))+ 

0P0P0R W 1K
 U]]ENNEMMBBBB,-88<<\J8!#U]]ENN33,UH==>>%%44>,U]]HEE&u}}h??U]]ENN33,U33>>%%44>,U]];;&u}}55r{   c                     ddl m}   |        S )Nr   get_dram_gbps)rd  rp  ro  s    rW   get_gpu_dram_gbpsrq    s    ,?r{   c                 x    ddl m}  | j                  j                  j	                  d      j                  dd      S )Nr   r  max_shared_mem)triton.runtimer  r  r  r  r@  rs  s    rW   get_gpu_shared_memoryrv  &  s.    %==44Q7;;<LaPPr{   c                     t         j                  j                         rUt         j                  j                         j                  } t         j                  j                         j
                  }|| z  S d} d}|| z  S )Nr  i   )rP   rH   rQ   r  	warp_sizemax_threads_per_block)rx  ry  s     rW   get_max_numwarpsrz  ,  sh    zz JJ446@@	 %

 @ @ B X X
 !I-- 	 $ I--r{   c                $    | j                  d      S )Nwelford)r  reduction_types    rW   is_welford_reductionr  8  s    $$Y//r{   c                (    t        |       ry| dk(  ryy)Nr  online_softmax_reducern  r7   )r  r}  s    rW   reduction_num_outputsr  <  s    N+	2	2r{   c                 0    t        j                         dk(  S )NLinux)platformsystemr   r{   rW   is_linuxr  E  s    ??''r{   c                 (    t         j                  dk(  S )Nrk   )r  r  r   r{   rW   r  r  I  s    <<7""r{   c                &    t        d | D              S )Nc              3  n   K   | ]-  }t        |t        j                        xr |j                    / y wr   )r}   r~   r,  rw  r?  s     rW   r   z#has_free_symbols.<locals>.<genexpr>N  s)     Jz!UZZ(<_<Js   35r  )itrs    rW   r  r  M  s    JcJJJr{   c            	     x   ddl m} | D ]  }t        ||j                  |j                  |j
                  |j                  |j                  f      r=t        |j                         xs d      st        |j                         xs d      s yt        ||j                        st        dt        |              y)Nr7   r  r   Tzunexpected type for is_dynamic F)r  r  r}   r!  r#  r  r`  r>   r  maybe_get_sizemaybe_get_strider@   	TypeErrorr  )r   r  rZ  s      rW   r>  r>  Q  s     IbmmR[[":K:KRYYW
   0 0 2 8b9=M""$*> Aryy)=d1gYGHHI r{   c                      e Zd ZdZdZy)PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r   r   r   r  r  r   r{   rW   r  r  e  s      K *r{   r  c                ~   ddl m} t        j                  dd      5 }t	        j
                         }t	        j
                         } t        |t        |            j                  |  t        d|j                   |       t        |j                  |       t        j                         }t        ||      5   | |j                         d d d        t        j                         |z
  }	 ||j                         |j                  j                          |j                          t        d	|j                   |       t        |j                  |       |j!                         |j!                         k(  }
t"        j%                  d
||j&                  |
|	       d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)Nr7   )stable_topological_sortrU  zutf-8)modeencoding)r  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherr  r  NamedTemporaryFileior   r^   rZ   	propagater  rn  r   nowr]   lint	recompiler;  r   r4  r   )r{  r  inpr5  r  r  	before_ioafter_io
start_timetime_elapsedrZ  s              rW   pass_execution_and_saver  o  sU    9		$	$
 
 
KKM	;;=C	R#3C#89CCSI	"(($1-bhhY'\\^
#B, 	N	||~
2)


#!,bhhX& H$5$5$77hFF	
+
 
	 	
 
s%   BF3;F'CF3'F0	,F33F<c                ~    ddl m} t        | |j                        xr  t        | j                  |j
                        S )zB
    Check if input buffer is a multi-outputs template buffer
    r7   r  )r  r  r}   CppTemplateBufferr(  MultiOutputLayout	input_bufr  s     rW   is_multi_outputs_templater    s9     i!5!56 :"..< r{   c                    ddl m} t        | |j                        xr2 t	        | j
                        dk(  xr t        | j
                  d         S )zL
    Check if input buffer is a output of multi-outputs template buffer
    r7   r  r   )r  r  r}   MultiOutputrR   rI  r  r  s     rW   #is_output_of_multi_outputs_templater    sL      	9bnn- 	;	  !Q&	;%i&6&6q&9:r{   c                &   | yddl m} t        | |j                        xr- t        | |j                         xr |d u xs | j
                  |u xsA t        |       |j                  u xr' t        t        j                  j                  d      xr; | j
                  t        j                  j                  j                  j                  k(  xs t        t        j                  j                  d      xr; | j
                  t        j                  j                  j                  j                  k(  xsa t        t        j                  j                  d      xr; | j
                  t        j                  j                  j                  j                  k(  S )NFr7   r  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  r}   _CollectiveKernel_WaitKernelop_overloadr  FallbackKernelr  rP   r	  torchrecr  defaultr  r  rd  r}  r  s      rW   is_collectiver    sG    | 	4--. 	3400	34Z14++r1 	T
b''' 	

 		**,?@ U$$		(:(:(L(L(T(TT
 		**,DE E$$99%%<<DDE 		**,CD Y$$		(:(:(P(P(X(XX/r{   c                <    ddl m} t        |       |j                  u S Nr7   r  )r  r  r  r  )rd  r  s     rW   is_waitr    s    :''r{   c                    ddl m} t        | |      rt        d | j                  D              S t        | j                        xr |d u xs  ||       S )Nr   GroupedSchedulerNodec              3  2   K   | ]  }t        |        y wr   )contains_collectiver?  s     rW   r   z&contains_collective.<locals>.<genexpr>  s     @a&q)@r  )r/  r  r}   ry  snodesr  rd  )snode	filter_fnr  s      rW   r  r    sJ     ?%-.@5<<@@@$P)t*;*Oy?OPr{   c                    ddl m} t        | |      rt        d | j                  D              S t        | j                        S )Nr   r  c              3  2   K   | ]  }t        |        y wr   )contains_waitr?  s     rW   r   z contains_wait.<locals>.<genexpr>  s     :=#:r  )r/  r  r}   ry  r  r  rd  )r  r  s     rW   r  r    s4    >%-.:U\\:::uzz""r{   c                    ddl m} t        |t        j                  j
                        r|g}t        | |j                        xr | j                  |v S r  )r  r  r}   rP   r  r  r  r  r  s      rW   is_fallback_opr    sE     "ejj++,TdB--.I43C3Cr3IIr{   c                B    |||    j                   j                            S r   )defining_opr  )buf_namename_to_bufname_to_fused_nodes      rW   buf_name_to_fused_snoder    s#     k(3??HHJKKr{   c                     yr  r   r  s    rW   r  r    r  r{   c                     ||       ry |j                  |        | j                  D ].  }t        |j                  ||      }||v rt	        |||||       0 y )Ncriteria_cb)rD  unmet_dependenciesr  r   find_recursive_deps_of_node)r  collected_node_setr  r  r  depdefining_op_for_deps          rW   r  r    sn     55!'' 
5HHk#5
 "44##	

r{   c                     yr  r   r  s    rW   r  r    r  r{   c           	     z    ||       ry |j                  |        | j                         D ]  }|j                  D ]}  }|j                  J |j                  j	                         dk(  r/|j                  j	                         |vrL||j                  j	                            }||v rnt        |||||         y )NOUTPUTr  )rD  get_outputsr  rd  r  find_recursive_users_of_node)r  r  r  r  r  or\  user_ops           rW   r  r    s     55!  GG 	D99(((yy!!#x/yy!!#+==(););)=>G,,(""'	r{   c                b    t         j                  j                  j                  rdnd}|| z
  |z
  S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)rn  r   )rP   
_functorchri   functionalize_rng_ops)dynamo_gm_num_inputsaot_fw_gm_num_inputsnum_rng_seed_offset_inputss      rW   num_fw_fixed_argumentsr  4  s6     $$::   "669SSSr{   c                    dd}d}g }| j                   j                  D ]0  }|j                  dk(  s ||      r|j                  |       |dz  }2 |t	        t        t        |                  k(  sJ t        |      S )z>
    Infers which inputs are static for a backwards graph
    c                ~    d| j                   vxr. d| j                   vxr d| j                   vxr d| j                   vS )Ntangentsbwd_seedbwd_base_offsetbwd_rng_stater  r  s    rW   is_saved_tensorz'count_tangents.<locals>.is_saved_tensorD  sH    aff$ .!&&(.!/.  qvv-		
r{   r   r  r7   )rT   r5   r   r2  )rn  r>  r}  r  rZ  r   rR   )fx_gr  	arg_countstatic_arg_idxsr  s        rW   count_tangentsr  ?  s    

 IOZZ 44= q!&&y1NI	 d5_)=#>????r{   c                  2    e Zd ZU ded<   ddZedd       Zy)	BoxedBoolr2  r   c                    | j                   S r   )r   rB  s    rW   rJ  zBoxedBool.__bool__\  s    zzr{   c                6    t        | t              r	d| _        | S yr  )r}   r  r   r  s    rW   disablezBoxedBool.disable_  s    c9%CIJr{   Nr  )r  r   r   zUnion[BoxedBool, bool])r   r   r   r   rJ  r  r  r   r{   rW   r  r  X  s     K  r{   r  c              #      K   ddl m} |j                  	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fd}t        j                  j                  |d|      5  d  d d d        y # 1 sw Y   y xY ww)Nr7   r9   c                @    j                  |        | |||||      S r   r  )r  kernel_namer  rN  gpucpp_definitionkernel_listorig_define_kernels         rW   define_kernelz.collect_defined_kernels.<locals>.define_kernelm  s-     	;'!+{Hc>
 	
r{   r  )NTN)r  r:   r  r  r  r  rN  Optional[str]r  r2  r  r  r   r   )codegen.wrapperr:   r  r   r  r  )r  r:   r  r   s   `  @rW   collect_defined_kernelsr  g  s     5-;; #'(,
"

 
  	

 
 &
 

 
		/-	P   s   AA*A	A*A'#A*c                    | dz   S )N__original__r   r  s    rW    get_cloned_parameter_buffer_namer  ~  s    .  r{   c                    | t         v S r   )rN   r  s    rW   r  r    s    Yr{   c                 :    t         j                  j                  duS )z,Check if we're running on ROCm/HIP platform.N)rP   r  rq   r   r{   rW   is_rocmr
    s    ==D((r{   c                &    | dk7  xr t        |       S )NrI   )r  r  s    rW   device_need_guardr    s    U?-vf~-r{   c                N   | t         j                  k(  r?t         j                  j                         r!t         j                  j	                         dk  S | t         j                  k(  rt         j
                  j                         ry| t         j                  t         j                  fv S )N)r  r   T)rP   rB  rH   rQ   re  rJ   rN  r2  r!  s    rW   ,needs_fallback_due_to_atomic_add_limitationsr    sk    5::#:#:#<zz//1F::	%..	 UYY%;%;%=ejj111r{   c                   | j                   t        j                  j                  j                  t        j                  j                  j
                  fv r|y| j                   t        j                  j                  j                  k(  rdnd}|d |fvxs |xr t        |      xr t        |      xs | j                   t        j                  j                  j                  k(  xrW |dk(  xrP |xrL |dk(  xrE t        j                  j                  xr) t        j                  j                  xs t               dk7  xs? ||k(  xr" |t        j                  t        j                  fv xs t        j                          S )NFrD  r   r  r7   )overloadpacketrP   r	  atenscatter_reduce_scatter_reducescatter_r  r  ri   r  fallback_scatter_reduce_sumdynamic_threadsr  r2  rN  $are_deterministic_algorithms_enabled)r  r~  
self_dtype	src_dtypesrc_device_typesrc_is_tensor	reduce_tys          rW   use_scatter_fallbackr    sZ    	""IINN**EIINN,I,IJ	K" ++uyy~~/F/FFE 
 	tY// 	8 H'H<YG		8 &&%))..*H*HH L%'LL  5(L 

66	L
 ++J/C/E/J	8 i'SJ5::u{{:S,S	8 557!r{   c                   ddl m}m} ddlm} t        dt        |        d       t        |       D ]  \  }}t        d|dd       ||u rt        d	       '||u rt        d
       7t        ||      r|j                         }t        |rdnd d       |r:|j                  J t        d|j                  j                  j                          t        d       |j                  j                  D ]  }t        |        t        d       |j                  j                  D ]  }t        |        t!        dt#        |              y)z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr  3r  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdr  r   r/  r!  r  rR   r   r}   is_reductionrd  r"  reduction_hintr5  r6  r7  r   r  )r  r  r   r!  rK  rd  is_redr  s           rW   dump_node_scheduler)    s=   
 O7	M 236
:;}- H	T#al?"$%%%%&m,&&(FfU$/?@yy,,,01N1N0OPQ*''-- c
+''.. c
 !9$t*FGG+Hr{   c                z    ddl m}  || j                         t        | j                        z  t
        z  dk(        S )Nr   )rh  )r  rh  storage_offsetr#  r   GPU_ALIGN_BYTES)r   rh  s     rW   tensor_is_alignedr-    s:     L 				 >&,,#?	??RVWW r{   c                |    t        | j                  j                        syt        j                  xs t        |       S r  )r  r   r  ri   assume_aligned_inputsr-  )example_inputs    rW   should_assume_input_alignedr1    s2     -&&++,''K+<]+KKr{   c                 6   t         j                  j                  j                         } | st	        j
                         S | j                  r| j                  j                  st	        j
                         S | j                  j                  }|j                         S r   )	rP   _guardsTracingContexttry_getr^  nullcontextr  rp  suppress_guards)tracing_contextrp  s     rW   #maybe_get_suppress_shape_guards_ctxr9    sv    
 mm22::<O%%'' $$O,E,E,O,O%%''))33I$$&&r{   c                   t         j                  j                  j                  t        dd      5  t
        j                  j                          dd l}dd l	} |j                         } |j                  |      }ddlm} |j                  |       |j                  }|j!                  |j"                          | |i |}	|j%                         }
|j!                  |       |j'                  |       d d d        |	|
fS # 1 sw Y   	
fS xY w)Nr   Tr   )output_code_log)r0  r   r  r  ri   rP   r  r  r  loggingr   StreamHandlertorch._inductor.codecacher;  
addHandlerlevelsetLevelDEBUGr;  removeHandler)r   r   r  r  r<  log_capture_stringchr;  
prev_levelr  r   s              rW   run_and_get_cpp_coderG    s     
			#	#FGT	: *(R[[]"W""#56=""2&$**
  /T$V$'')  ,%%b)*  19!*  19s   CC>>D
c                   t        |       }||j                  S | D ]  }t        |t        j                        r|j
                  j                  c S t        |t        j                        sP|j                         D ]6  }t        |t        j                        s|j
                  j                  c c S  |j                         D ]6  }t        |t        j                        s|j
                  j                  c c S   y r   )	rZ   rp  r}   rP   r2   rd  r  r  r0  )rI  r  inputr  r0  s        rW   shape_env_from_inputsrJ     s     (I """  1eU\\*::''' eU\\*

 /dELL199.../  ,,. 1fell3!;;00011 r{   c                <     t              dk(  r S d fd}|S )Nr   c                z    t        |       \  }} |       }t        |      rt        j                  ||       |S r   )copy_misaligned_inputsrR   rP   _foreach_copy_)
new_inputsold_tensorsnew_tensorsr,  inputs_to_checkr  mutated_input_idxss       rW   r  z)align_inputs_from_check_idxs.<locals>.runE  sE    #9);$
 [ J {  k:
r{   )rO  list[InputType]r   r   )rR   )r  rR  rS  r  s   ``` rW   align_inputs_from_check_idxsrU  =  s#    
 ?q  Jr{   c                T   d| j                         v rd}n;t        d t        | j                         | j                               D              dz   }t	        j
                  | |fd      j                         }t	        j
                  || j                         | j                               S )Nr   c              3  2   K   | ]  \  }}|d z
  |z    ywr=  r   )r   r,  r0  s      rW   r   z)clone_preserve_strides.<locals>.<genexpr>[  s     Tf$Tr  r7   r   )r  r   r   r0  rP   
as_stridedclone)rT   needed_sizer&  s      rW   clone_preserve_stridesr[  U  s    AFFH} T#affh
:STTWXX 	 a+6<<>FFAFFHahhj99r{   c                2   g }g }|du}|D ]  }| |   }t        |t        j                        sJ dt        |              |j	                         t
        z  sMt        |      | |<   |s^||v sc|j                  |       |j                  | |           ||fS )z
    Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
    cloned tensor which is in `return_pair_idxs`.
    Nz Expected tensors only, but got: )r}   rP   r  r  data_ptr	ALIGNMENTr[  r  )rO  check_inputs_idxsreturn_pair_idxsrP  rQ  ret_pair_definedr   _inps           rW   rM  rM  a  s     ')K&(K (t3 
2!}$- 	
.tDzl;	
- ==?Y&248JqMA)9$9""4("":a=1
2 ##r{   c                    g }|D ]N  }| |   }t        |t        j                        s#|j                         t        z  dk(  s>|j                  |       P t        |      t        |      k7  r|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )r}   rP   r  r]  r^  r  rR   )rI  static_input_idxsaligned_static_input_idxsrK  rI  s        rW   remove_unaligned_input_idxsrf    st     !#  2seU\\*0@90LQR/R%,,S12 $%->)??((r{   c                P   ddl m} t        j                  t        j                        j
                  }|j                  j                  j                  }|j                  j                  j                  j                  }t        j                  r'|j                  j                  j                  | |       y|j                  j                  j                  | |k        ry|j                  r)|j                  j                  j                  | dk        ry ||       xr  ||       |k  S )Nr7   rj  Tg@xDF)rm  rk  rP   iinforL  r   rn  ro  r  rp  has_hintri   assume_32bit_indexing	check_leqrh  r[  )r   rk  int_maxr  ri  s        rW   expr_fits_within_32bitrm    s    kk%++&**G  **Iww))22H##	""1g. 	ww--a7l; 	 7711!d(;  A;29Q<722r{   c                   t         j                  j                  j                         }||j                  t        |j                        dk(  sJ t        |       |j                  J |j                  D ]  }||j                  j                  d        !dt         j                  j                  j                         x}r|j                  dfd|j                  j                  t        fd|D                      y y y )Nr   Fc                f    t        |       S rj                  |       S j                  |       S r   )r   deserialize_symexprevaluate_symexpr)r   fakify_first_callrp  s    rW   map_exprz4set_tracing_context_output_strides.<locals>.map_expr  s7     ("1v((<<Q??$55a88r{   c              3  .   K   | ]  } |        y wr   r   )r   r   rs  s     rW   r   z5set_tracing_context_output_strides.<locals>.<genexpr>  s     5!(1+5r  )r   r   r   z,Union[float, int, SymInt, SymFloat, SymBool])
rP   r3  r4  r5  output_stridesrR   rJ  r  rr  r  )r  compiled_graphr&  r  r  rr  rs  rp  s        @@@rW   "set_tracing_context_output_stridesrw    s     mm**224Gw55A7))*a///).9	,,888#22 	E}&&--d3$)!--66>>@@3@(+(=(=%9 &&--5u55		  Br{   c                    t         j                  t         j                  S t        j                         syt        j                  j                         ry	 ddlm}  | t        j                  j                  d      k\  S # t        $ r Y yw xY w)NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
ri   fx_graph_remote_cache	is_fbcoderP   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacherz  ModuleNotFoundErrorjustknobs_getval_intry  s    rW    should_use_remote_fx_graph_cacher    s    ##/+++,,.H  5#8#8#M#M8$    s   A> >	B
	B
c                0    t        j                  dd|       S )Nz[^a-zA-Z0-9_]r   )r   subr  s    rW   normalize_namer    s    66"C..r{   ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2z^.*[.]c                l    t         j                  dt        |             }t        j	                  ||      S )z"Convert torch.dtype to triton typetl.)_triton_type_rer  r  _triton_type_mappingr@  )r   triton_type_names     rW   triton_typer    s.    &**5#e*=##$46FGGr{   c                    t         j                  | |       }|j                  dd      }t        t        |      }t        |t        j                        sJ |S )Nr  r  )_torch_triton_mappingr@  r  rO   rP   r}   r   )r   adjusted_type	type_namer  s       rW   triton_type_to_torchr    sL    )--eU;M%%eR0Iy)Ii---r{   c                   | j                    xr | j                         |j                         k(  xr | j                         |j                         k(  xr | j                  |j                  k(  xr{ | j                  |j                  k(  xr` | j                         j                         |j                         j                         k(  xr! | j                         |j                         k(  S r   )	is_mkldnnr  r0  r   r   untyped_storager]  r+  r"  r   s     rW   is_same_tensorr    s    NN 	<IIK5::<'	<KKMU\\^+	< JJ%++%	< KK5<<'		<
   "++-1F1F1H1Q1Q1SS	< !U%9%9%;;r{   c                v   | j                   xr | j                         |j                         k(  xr | j                  |j                  k(  xrn | j                  |j                  k(  xrS t        j
                  j                  j                  |       t        j
                  j                  j                  |      k(  S r   )r  r  r   r   rP   r	  mkldnnr]  r  s     rW   is_same_mkldnn_tensorr    s     	PIIK5::<'	PJJ%++%	P KK5<<'	P II%%d+uyy/?/?/H/H/OOr{   c                      y)N)rl  isnanlogical_notlogical_andsignbitand_leltgegteqner  xorr   r   r{   rW   boolean_opsr  "  s    r{   c                  "    e Zd ZU ded<   ded<   y)OpDtypeRuler3   type_promotion_kindOptional[torch.dtype]override_return_dtypeNr'  r   r{   rW   r  r  6  s    8800r{   r  zdict[str, OpDtypeRule]op_dtype_propagation_rulesc                *    t        ||      t        | <   y r   )r  r  )r   r  r  s      rW   #register_op_dtype_propagation_rulesr  ?  s    
 (32(t$r{   zOrderedSet[str]op_requires_libdevice_fp64c                .    t         j                  |        y r   )r  rD  r  s    rW   #register_op_requires_libdevice_fp64r  L  s    ""4(r{   c                    ddl m} | s$|j                  j                         j                  } | dk(  rt
        j                  S | dk(  ry| dk(  rt
        j                  S t
        j                  S )Nr   rj  r  rI   rJ   )	rf  rk  rn  get_current_device_or_throwr  ri   cpu_backendxpu_backendcuda_backend)r   rk  s     rW   get_current_backendr  P  s_    -gg99;@@e!!!				!!!"""r{   c                    | t         j                  t         j                  fv r7t        j                  j
                  rt               dk(  rt         j                  S | S )z"Maybe upcast [b]float16 to float32r  )rP   r   rB  ri   r  codegen_upcast_to_fp32r  rD  r!  s    rW   upcast_compute_typer  _  s@     	%--00MM00!X-}}Lr{   KeyTypeValTypec                  Z    e Zd ZdZddZddZddZddZdddZddZ	dd	Z
dd
ZddZy)
ScopedDictz
    A dictionary-like object that allows for scoped updates. It maintains
    an original dictionary and a set of new items that can override
    the original items within the scope.  The original dictionary is
    unmodified.
    c                     || _         i | _        y r   original_dict	new_items)r  r  s     rW   r2  zScopedDict.__init__v  s    *13r{   c                Z    || j                   v r| j                   |   S | j                  |   S r   r  r  r  s     rW   r  zScopedDict.__getitem__z  s.    $.. >>#&&!!#&&r{   c                "    || j                   |<   y r   )r  )r  r[  r   s      rW   __setitem__zScopedDict.__setitem__  s    #sr{   c                >    || j                   v xs || j                  v S r   r  r  s     rW   __contains__zScopedDict.__contains__  s!    dnn$At/A/A(AAr{   Nc                t    || j                   v r| j                   |   S | j                  j                  ||      S r   )r  r  r@  )r  r[  r  s      rW   r@  zScopedDict.get  s6    $.. >>#&&!!%%c733r{   c                z    t        | j                        }| j                  D ]  }|| j                  vs|dz  } |S rQ  )rR   r  r  )r  r  r  s      rW   r  zScopedDict.__len__  sC    ""# 	A***Q	 r{   c              #     K   | j                   E d {    | j                  D ]  }|| j                   vs|  y 7 )wr   r  )r  r  s     rW   __iter__zScopedDict.__iter__  s@     %%%% 	A***	 	&s   ><!>>c                H    t        | j                  xs | j                        S r   )r2  r  r  rB  s    rW   rJ  zScopedDict.__bool__  s    D&&8$..99r{   c                    t         r   r  r  s     rW   __delitem__zScopedDict.__delitem__  s    !!r{   )r  Mapping[KeyType, ValType])r[  r  r   r  )r[  r  r   r  r   r  )r[  r  r   r2  r   )r[  r  r  Optional[ValType]r   r  r  )r   zIterator[KeyType]r  )r[  r  r   r  )r   r   r   r   r2  r  r  r  r@  r  r  rJ  r  r   r{   rW   r  r  n  s5    4'
$B4
:"r{   r  )frozen_defaultc              (    dfd}| |S  ||       S )Nc                4    t        j                  | d      S )NT)kw_onlyr   )dataclasses	dataclass)r   r   s    rW   wrapzir_dataclass.<locals>.wrap  s    $$S$vFFr{   )r   rl   r   rl   r   )r   r   r  s    ` rW   ir_dataclassr    s    G {9r{   c                     t         j                  j                  j                         } | "| j                  r| j                  j
                  S y r   )rP   r3  r4  r5  fw_metadatabw_donated_idxs)r8  s    rW   get_donated_idxsr    s=    mm22::<O"'B'B**:::r{   c                       e Zd ZdZdZdZdZdZy)TritonAttrsDescriptorVersionr   r7   rn  r  r-  N)r   r   r   V0_NO_TRITONV1_COMPILERV2_BACKENDSV3_BACKENDS_TUPLEV4_DICTr   r{   rW   r  r    s     LKK	  Gr{   r  c                 P   t         j                  j                  d      t        j                  S dd l} dd l} t        | j                  j                  d      rt        j                  S t        | j                  j                  d      rt        j                  S t        j                  S )Nr  r   AttrsDescriptor)r0  r1  r2  r  r  triton.backends.compilertriton.compiler.compilerr  ra  compilerr  r  r  )r  s    rW   #get_triton_attrs_descriptor_versionr    s{    ~~)1+888##v''):; ,777	))+<	=+777 ,333r{   c                 8    t               t        j                  k(  S r   )r  r  r  r   r{   rW   triton_version_uses_attrs_dictr    s    .04P4X4XXXr{   c                    | j                         }t        | t        j                  j                        r| d| j
                   n|}||fS )Nrp   )r   r}   rP   r  r  _overloadname)r}  op_overload_packet_nameop_overload_names      rW   get_op_namesr    sR    #%779 b%**//0 #
#1R%5%5$67$ 
 #$444r{   c                j   ddl m} | j                  }t        |t        j
                  j                        sy|t        j                  j                  j                  j                  t        j                  j                  j                  j                  t        j                  j                  j                  j                  fv rq ||| j                  | j                  d      }|O|\  }}|d   }|D ]@  }||j                  d   j                   t        j"                  t        j$                  fv s@ y y)a  
    Check if an FX node is cudagraph-unsafe based on its input arguments.

    Some ops are only cudagraph-unsafe depending on their inputs (e.g., index_put
    with boolean indices triggers .nonzero() during capture, but integer indices
    are safe).
    r   )normalize_functionFT)normalize_to_only_use_kwargsindicesrg  )torch.fx.operator_schemasr  r  r}   rP   r  r  r	  r  	index_putr  
index_put__unsafe_index_putr   r  r  r   r2  rS  )r  r  r  
normalizedr   r  r  rK  s           rW   ,_fx_node_is_input_dependent_cudagraph_unsafer    s     =^^Ffejj334 		  ((		!!))		((00 
 (GLL'..t

 !"IAvY'G  ?sxx'<'<JJKKA (    r{   c                   | j                   }t        |      t        v ryt        |t        j
                  j                        r1t        j                  j                  j                  |j                  v ryt        |       ry| j                  j                  d      x}Kt        |t        t        f      s|gn|}|D ]+  }t        |t        j                         s|j"                  s+ y y)a   
    Check if an FX node is cudagraph-unsafe.

    This includes:
    - Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
    - Ops with the cudagraph_unsafe tag
    - Input-dependent unsafe ops (e.g., index_put with boolean indices)
    - Ops with sparse tensor outputs
    Trg  F)r  r  FORBIDDEN_CUDAGRAPH_OPSr}   rP   r  r  r  r  cudagraph_unsafer  r  r  r@  rZ  r  r  	is_sparse)r  r  rg  valsr   s        rW   r  r    s     ^^F 6{-- 	65::001HHLL))V[[8 4G< ||&&3&sT5M:u 	A!U\\*q{{	 r{   c                    ddl m} t        | |j                  |j                  f      ryt        | |j
                  |j                  f      syt        | dd      }|t        |      ryy)ah  
    Returns True if the node is an op that is not cudagraphable.
    This includes:
    - Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
    - Ops with the cudagraph_unsafe tag
    - index_put_ with boolean indices (triggers .nonzero() during capture)
    - Control flow nodes (Conditional, WhileLoop)
    - Ops with sparse tensor outputs
    r7   r  TFr  N)	r  r  r}   Conditional	WhileLoopr  r?   rO   r  )rd  r  r  s      rW   is_cudagraph_unsafe_opr  *  sa      $67dR..@AdIt,G:7Cr{   c                    t         j                  j                  dd      } t        j                         rUddlm}  |       }|rFt         j                  j                  |dd      }| r!t         j                  j                  || g      n|} | S )NLD_LIBRARY_PATHr  r   )get_runtime_pathr  lib)
r  r  r@  ri   r|  libfb.py.parutilr  r  r  pathsep)r  r  runtime_pathlib_paths       rW   get_ld_library_pathr  D  sg    ::>>+R0D5')ww||L)UCH8<2::??Hd#34(DKr{   c                F    ddl m} t        | |      xr | j                  d uS )Nr   )SubgraphPythonWrapperCodegen)torch._inductor.codegen.wrapperr  r}   partition_signatures)r  r  s     rW   #is_codegen_graph_partition_subgraphr  Q  s*    L 	789 	5((4r{   c                     t         j                  j                  j                  j                  xs t
        j                  d uxr$ t         j                  j                  j                  S r   )rP   r  ri   r  
cudagraphs&_unstable_customized_partition_wrapperr  graph_partitionr   r{   rW   is_using_cudagraph_partitionr  Z  sN    %%00 	F199E1 //
 
 
0
01r{   c                    ddl m} |j                  j                  j	                  | d      r6|j                  j                  j                  | d      rt        j                  S t        j                  S )Nr7   rj  l        i   )	rm  rk  rn  ro  statically_known_ltr  rP   rL  rN  )r  rk  s     rW   dtype_from_sizer  a  sP    ww++e
''


/
/h
?{{{{r{   )r  rJ   c                h    | dk(  r(t         j                  j                  j                         S d| v ryy)z;
    Returns True if the device supports MKL-DNN BF16.
    r  rJ   TF)rP   r	  r  _is_mkldnn_bf16_supportedr   s    rW   is_mkldnn_bf16_supportedr  o  3     eyy99;;	+	r{   c                h    | dk(  r(t         j                  j                  j                         S d| v ryy)z;
    Returns True if the device supports MKL-DNN FP16.
    r  rJ   TF)rP   r	  r  _is_mkldnn_fp16_supportedr  s    rW   is_mkldnn_fp16_supportedr"  {  r  r{   c           
     n   |D cg c]  }t        t        |             }}| D ]R  }t        |      t        |      k(  sJ t        |      D ])  \  }}t        ||   t        t        |                  ||<   + T g }|j	                  dj                  d t        ||      D                     t        |      t        |      dz  z   t        |      dz
  z   }|j	                  d|z         | D ]3  }|j	                  dj                  d t        ||      D                     5 dj                  |      S c c}w )N|c              3  6   K   | ]  \  }}d || dd   ywr  r  Nr   )r   hrU  s      rW   r   ztabulate_2d.<locals>.<genexpr>  s$     H41aAa0tWA,H   rn  r7   ru  c              3  6   K   | ]  \  }}d || dd   ywr&  r   )r   r   rU  s      rW   r   ztabulate_2d.<locals>.<genexpr>  s$     Htq!!QCp4lHr(  r;  )rR   r  r   r   r  r  r   r   )elementsheadersr   widthsrowr   rV  total_widths           rW   tabulate_2dr/    s   #*+ac#a&k+F+ 43x3w<'''cN 	4DAqF1Is3q6{3F1I	44 E	LLH3w3GHHIf+Vq1S[1_EK	LL{"# JSXXHs37GHHIJ99U ,s   D2c              #     K   t        | j                               t        |j                               z  }|D ]3  }| j                  |      }|j                  |      }|||n|||n|f 5 yw)a  
    Zip two dictionaries together, replacing missing keys with default values.

    Args:
        dict1 (dict): The first dictionary.
        dict2 (dict): The second dictionary.
        d1_default (Any): the default value for the first dictionary
        d2_default (Any): the default value for the second dictionary

    Yields:
        tuple: A tuple containing the key, the value from dict1 (or d1_default if missing),
               and the value from dict2 (or d2_default if missing).
    N)r#   rB  r@  )dict1dict2
d1_default
d2_defaultall_keysr[  value1value2s           rW   	zip_dictsr8    sv     ( %**,'*UZZ\*BBH  	
33 (Fj(Fj
 	
	
s   A-A/c                T   	 	 	 	 	 	 	 	 dd}	 	 	 	 	 	 	 	 dd}| j                  dt        j                  j                        }| j	                         } |rm || dd        || dd        || dt
        j                  j                           || dd	        || d
t        j                  j                          || dd       | j                  dt        j                  j                        }| j                  dt        j                  j                        }|dk(  r|rt        d      | S )a6  
    Ensures the configuration is internally consistent for standalone AOTInductor.

    If `aot_inductor_mode.compile_standalone` is set to True in the provided
    `config_patches` (or falls back to the global config), this function ensures
    that the following configs are also enabled:
        - `aot_inductor.package_cpp_only`

    Args:
        config_patches (dict[str, Any]): A dictionary of user-provided config
            overrides for AOTInductor compilation.

    Returns:
        dict[str, Any]: The possibly-updated `config_patches` dictionary.
    c                    | j                  |t        t        |            }||| |<   y |s||k7  rt        d| d| d      y y )NzInvalid config: =z3 when aot_inductor_mode.compile_standalone is True.)r@  rO   ri   r   config_patchesconfig_nameconfig_valuer   s       rW   patch_configz2maybe_aoti_standalone_config.<locals>.patch_config  s]     "";0LM=*6N;'5L0";-q>qr  1r{   c                    | j                  |t        t        |            }||k7  rt        j	                  d||       || |<   y )NzDOverriding: %s=%s when aot_inductor_mode.compile_standalone is True.)r@  rO   ri   r   r  r<  s       rW   force_patch_configz8maybe_aoti_standalone_config.<locals>.force_patch_config  sF     "";0LML KKV
 '3{#r{   z$aot_inductor_mode.compile_standalonezaot_inductor.package_cpp_onlyTz aot_inductor.embed_kernel_binaryz#aot_inductor.emit_multi_arch_kernelz+aot_inductor.model_name_for_generated_files
aoti_modelzaot_inductor.link_libtorchzaot_inductor.dynamic_linkageFz"aot_inductor.cross_target_platformz$aot_inductor.package_constants_in_sowindowszconfig.aot_inductor.package_constants_in_so is not supported for windows cross-compilation. Please use config.aot_inductor.package_constants_on_disk_format = binary_blob.)r=  dict[str, Any]r>  r  r?  r   r   r  )r@  ri   aot_inductor_modecompile_standalonecopyrP   r  rq   test_configsuse_libtorchaot_inductorcross_target_platformpackage_constants_in_sor   )r=  r@  rB  rG  rL  rM  s         rW   maybe_aoti_standalone_configrN    sk   "	&	58	HK			
3&
358
3HK
3	
3 (++.  33
 $((*N^%DdK^%GNAu}}GXGXCX	
 	I<	
 	(,,	

 	>+I5Q*..,11
 -00.33
 	).E]
 	

 r{   c                   t         j                  j                  r(t         j                  j                  dk(  rt	        d      t         j                  j                  r0t         j                  j
                  dk(  rt	        d      d}d}||fS t         j                  j                  dk(  rd}d}||fS | dk  ryd}t        j                          }||fS )	a  
    Decide whether we should mmap weights, and whether to store the weights with .so.

    If force_mmap_weights or package_constants_on_disk_format == "binary_blob" configs are set, respect the config.

    Returns tuple (use_external_weights, use_mmap_weights).
    binary_blobzconfig.aot_inductor.package_constants_on_disk_format = binary_blob and config.aot_inductor.force_mmap_weights cannot both be True.rD  zKwhen cross_target_platform is windows, use_mmap_weights should not be true.TFi 5w)FF)ri   rK  force_mmap_weights package_constants_on_disk_formatr   rL  r|  )consts_sizeuse_mmap_weightsuse_external_weightss      rW   determine_aoti_mmap_flagsrV    s     	..@@MQJ
 	

 --44	A]   $#%555;;}L# #%555m# !++--!111r{   c                     ddl m}  | j                  j                  }|yt	        |t
              st        d      |dk(  ryt        j                  d|      st        d      y)zL
    Validates if a model name is suitable for use in code generation.

    r   rh   Tz4Invalid AOTI model name: Model name must be a stringr  z^[a-zA-Z_][a-zA-Z0-9_]*$zVInvalid AOTI model name: Model name can only contain letters, numbers, and underscores)	r"  ri   rK  model_name_for_generated_filesr}   r  r9  r   r   )ri   
model_names     rW   is_valid_aoti_model_namerZ  8  sh    
 '$$CCJj#&OPPR 88/<d
 	
 r{   c                2    |rt        |       S t        |       S r   )r)   r(   )rT   unbacked_onlys     rW   get_free_symbolsr]  S  s    $Q''Ar{   c                    i t         j                  dt         j                  j                  dt         j                  j	                  t
        j                              i} t        j                         rt        j                  d      | d<   | S )zA
    Get a base environment for running Python subprocesses.
    
PYTHONPATHTORCH_CUSTOM_PYTHONPATHr"  
PYTHONHOME)r  r  r@  r
  r  r  r  ri   r|  	sysconfigget_path)envs    rW   python_subprocess_envre  Z  sl    

** 	bjjnn%rzzsxx'@
	C  %..v6LJr{   c                  &    e Zd ZU dZded<   ded<   y)CUDAGraphWrapperMetadataz
    Metadata for Customized CUDAGraphWrapper.

    Currently assumes there is 1 dynamo graph and will extend to
    multiple graphs in the future.
    r   num_partitionspartition_indexNr   r   r{   rW   rg  rg  u  s      r{   rg  .c                      e Zd ZU dZded<   y)CUDAGraphWrapperNzOptional[CUDAGraphWrapperType]r  )r   r   r   r  r   r   r{   rW   rk  rk    s    .2G+2r{   rk  c                    | t         _        y r   )r  r  )r  s    rW   !set_customized_partition_wrappersrm    s    5<*2r{   c                8   | j                   j                  }| j                   j                  g || j                   j                  | j                   j                        }| j                   j                  }t        j                  ||f      \  }}dd}|D cg c]7  } ||      r+t        j                  j                  j                  |d      n|9 }}dddfd}|D cg c]
  } ||       }}t        j                  ||      \  }}||fS c c}w c c}w )	Nc                    t        | t        j                  j                  j                        xr/ t        | t        j                  j                  j
                         S r   )r}   rP   r  r  r@   GeneratorStater  s    rW   _is_tensor_irz(snode_args_kwargs.<locals>._is_tensor_ir  sH    !U__//667 

u!!00A
 =
 	
r{   F)guard_shapec                2    t        j                  | ||      S )Nr   )rP   r   )r  r   r   s      rW   _tensorz"snode_args_kwargs.<locals>._tensor  s    {{4uV<<r{   c                    t        | t        j                        s| S  | j                         | j                  | j
                        }|S r   )r}   rP   r  r  r   r   )r   r,  rt  s     rW   to_real_tensorz)snode_args_kwargs.<locals>.to_real_tensor  s7    !U\\*Haffh2
r{   r  )r   r  )r   r   r   r   )rd  rI  fill_non_provided_argsconstant_argsr  pytreer$   rP   r  r  ir_node_to_tensortree_unflatten)	r  r   r  	flat_argsflat_args_pytree_specrq  r!  rv  rt  s	           @rW   snode_args_kwargsr~    s   ::D::,,*$*))*

D ZZF'-':':D&>'J$I$
 	   	,,QE,B	I = -66q"6I6((4IJLD&<%  7s   <D$Dc                    ddl m} | j                  }|j                  j                  r(|j	                  |j                  j                  dz         }|j                  d      S )Nr7   rj  r   )primals_r  fwd_rng_stater  r  )rm  rk  r   rn  removeprefixr  )r  rk  dep_names      rW   is_nonfreeable_buffersr    sN    xxH 	ww||(();<I r{   c                p    t        ||  dz        5 }|j                         cddd       S # 1 sw Y   yxY w)z,Load a template file and return its content.z	.py.jinjaN)openread)r   template_dirr  s      rW   load_templater    s6    	lvY//	0 Avvx  s   ,5c                v   | j                   }t        |t        j                  j                  t        j                  j
                  f      sJ dt        |              t        j                  syt        t        j                  j                  j                  j                  t        j                  j                  j                  j                  g      }||v ryt        t        j                  j                  j                   g      }t        |t        j                  j
                        r||v S t#        |        S )zLDecide whether fallback for a node. This is only used in inductor lite mode.z6Expected OpOverload or HigherOrderOperator, but found F)r  r}   rP   r  r  r  r  ri   fallback_by_defaultr#   r	  r  _assert_scalarr  lift_fresh_copyhigher_order triton_kernel_wrapper_functionalr!   )rd  r  "skip_fallback_due_to_dynamic_shapefallback_hopss       rW   should_fallback_by_defaultr    s    [[F&&

(F(FG O	?V~NO  %% *4IINN))11IINN**22	
*& 33 				@	@AM &%**889&&&t,,,r{   )	z-torch.ops._c10d_functional.all_reduce.defaultz.torch.ops._c10d_functional.all_reduce_.defaultz9torch.ops._c10d_functional.all_gather_into_tensor.defaultz8torch.ops._c10d_functional.reduce_scatter_tensor.defaultz4torch.ops._c10d_functional.all_to_all_single.defaultz6torch.ops._c10d_functional_autograd.all_reduce.defaultzBtorch.ops._c10d_functional_autograd.all_gather_into_tensor.defaultzAtorch.ops._c10d_functional_autograd.reduce_scatter_tensor.defaultz=torch.ops._c10d_functional_autograd.all_to_all_single.defaultc                    | t         v S )z0Check if an operation is a collective operation.)COLLECTIVE_OPS)rb  s    rW   is_collective_opr    s    n$$r{   c                 b    t        j                         r		 ddlm}  | S g S # t        $ r g cY S w xY w)Nr   tlx_only_cuda_options)ri   r|  )torch._inductor.fb.tlx_templates.registryr  r
  r  s    rW   r  r    s9    	W(( 		  	I	s     ..c                    | |z   dz
  |z  |z  S )z(Round x up to the nearest multiple of y.r7   r   )rT   ys     rW   	_round_upr  "  s    UQY1!!r{   c                   ddl m}m}  ||d      r|j                  |j                  fS t        |      dk\  r  ||d   | d         r ||d   d      s ||d   d      r' ||d   | d         r|j                  |j                  fS  ||d   | d         r ||d   t        | d   d            s( ||d   | d         r1 ||d   t        | d   d            r|j                  |j                  fS  ||d   t        | d   d            r1 ||d   t        | d   d            r|j                  |j                  fS |t        j                  k(  rdnd}|t        j                  k(  r|t        j                  k(  rt        | d   d      t        t        || d   z  d      d      z  }	t        | d   d      t        t        || d   z  d      d      z  }
 |||	      s	 |||
      r|j                  |j                  fS |t        j                   k(  rt        j"                  j$                  st        | d   d      t        t        || d   z  d      d      z  }	t        | d   d      t        t        || d   z  d      d      z  }
 |||	      s	 |||
      r|j&                  |j                  fS y	t        | d   d      |z  | d   z  }	t        || d   z  d      | d   z  }
 |||	      s	 |||
      r|j&                  |j                  fS y	)
z:
    Core implementation for scale/swizzle inference.
    r   )r6   SwizzleTyper7   rn  rs   rr   r-  r  NN)torch.nn.functionalr6   r  
TensorWise
NO_SWIZZLErR   RowWiserj   BlockWise1x128BlockWise128x128rP   r@  r<  r  BlockWise1x16SWIZZLE_32_4_4r>  r  rq   BlockWise1x32)mat_size
scale_sizescale_numel	mat_dtypescale_dtypeeq_fnr6   r  K_multiplierexpected_numel_aexpected_numel_bs              rW   _infer_scale_swizzle_implr  '  sD    = [!%%{'='=== :!*Q-!-%
1q2I*Q-#jmXa[(I&&(>(>>> *Q-!-jmWXa[#%>?*Q-!-jmWXa[#%>?--{/E/EEE AS 9:uqM78A;4@
 //1G1GGG "U%;%;;1L E***{e>Q>Q/Q$Xa[#6L8A;.3Q:
 
 %Xa[#6L8A;.3Q:
 
 ./5FV3W,,k.H.HHH e***}}  (!c:Yx{2B7>    )!c:Yx{2B7>   ["23u-8 #00+2L2LLL   'x{B7,FRST&|hqk'A2FRST["23u-8 #00+2H2HHHr{   c                    t        | j                  d   | j                  d   ft        |j                        |j                         | j                  |j                  d       S )a  
    Infer the scaling type and swizzle mode from matrix and scale tensor shapes/dtypes.

    This function determines how scale factors are laid out relative to the matrix:
    - TensorWise: Single scale for entire tensor
    - RowWise: One scale per row
    - BlockWise1x128/128x128: Block-scaled with float32 scales
    - BlockWise1x32: MXFP8 with float8_e8m0fnu scales (swizzled on NVIDIA)
    - BlockWise1x16: NVFP4 with float8_e4m3fn scales (swizzled)

    Args:
        mat: The matrix tensor (FP8 or FP4)
        scale: The scale factor tensor

    Returns:
        Tuple of (ScalingType, SwizzleType) or (None, None) if unrecognized
    r   r7   c                    | |k(  S r   r   r  s     rW   r  z%infer_scale_swizzle.<locals>.<lambda>  s
    16 r{   r  r  r  r  r  r  )r  r,  r  numelr   )matscales     rW   infer_scale_swizzler  w  sO    ( %))A,		!-%KKM))KK! r{   c                \   ddl m | j                         }|j                         }|r
|d   |d   f}|r%t        j                  t
        j                  |d      nd}dfd}t        t        |      dk\  r
|d   |d   fn|d   dft        |      || j                  |j                  |      S )z
    Infer the scaling type and swizzle mode for IR nodes (used during graph lowering).

    This is the IR-compatible version of infer_scale_swizzle, using symbolic
    size comparisons via V.graph.sizevars.statically_known_equals.
    r   rj  r7   c                P    j                   j                  j                  | |      S )z5Compare values using symbolic equality when possible.)rn  ro  r  )r!  r"  rk  s     rW   symbolic_eqz+infer_scale_swizzle_ir.<locals>.symbolic_eq  s     ww771==r{   rn  r  )r!  r   r"  r   r   r2  )rf  rk  r  r  r  r  r  r  rR   r  r   )r  r  	transposer  r  r  r  rk  s          @rW   infer_scale_swizzle_irr    s     .||~H!J QK!- DN)""8<<Q?STK> %/28}/A(1+x{+QRUVGW$))KK r{   r}  )ry   r   r   r   )r   r   r   r2  )   d   )r   Callable[[], Any]r   r   r   r   r   ri  )r  r  F)
r   r  r   r   r   r   r   r2  r   ri  r  )r   z"Union[Optional[torch.device], str]r   torch.device)r  zIterable[sympy.Expr]r   r   )r$  r  r%  r  r   r   )r  zIterable[_T]r   zValuesView[_T])r/  r  r0  r  r   r  )r[  r  r   r  )ra  z"Iterable[Union[int, torch.SymInt]]r   zlist[sympy.Expr])rg  Union[int, torch.SymInt]r   r  )r   r  r   r  )ra  z Iterable[Union[int, sympy.Expr]]r   zlist[Union[int, torch.SymInt]])r}  torch._ops.OpOverloadr   r2  )r  r5   r  z'Callable[[torch._ops.OpOverload], bool]r   r2  )r  r   r   r  r  rE  r   z&tuple[GraphModule, list[torch.Tensor]])rH   )r   r  r   r  )r7   rH   )
r  Callable[..., Any]r  Sequence[Any]r   r   r   r  r   ri  )r   r  r  g      ?rH   )r  r  r  r  r   r   r  r   r  ri  r   r  r   ri  )r  r   r  r  r   r  )r  r   r  r   r   r  )r!  r   r"  r   r   r   )rT   zUnion[int, Sequence[int]]r  r   r   Sequence[int])rT   ztuple[_T, ...]r   zlist[_T])r   z!Callable[Concatenate[Any, P], RV]r   zCachedMethod[P, RV])r  r  r   z*Callable[[FN_TYPE[P, RV]], FN_TYPE[P, RV]])r  0Union[Sequence[BaseSchedulerNode], ExternKernel]r   zOrderedSet[Node])r  Sequence[BaseSchedulerNode]r  z8Literal[True, 'torch', 'original_aten', 'inductor_node']r   r  )r  r  r  r:   r   tuple[str, str]r   )rY  zIterable[torch.fx.Node]rZ  zOptional[Callable[[Any], bool]]r   OrderedSet[torch.fx.Node])r   zSequence[IRNode]r  zdict[str, IRNode]r   r  r}  )r  r   r   zValueRanges[Any])r  r  r   r2  )r  re   rK  r   r   r  )r  r2  r   r2  )r   r  r   r  )re  r   r  zdict[sympy.Expr, Any]r   r   )r!  r   r   z,TypeGuard[Union[torch.SymInt, torch.Tensor]])r   r   r   r2  )r  torch.fx.GraphModuler   zOptional[torch.fx.Node])r  r  r   r5   )r  r  r   zOrderedSet[torch.device]r~  )r  r   r   r   )r[  r  r   r  r   r]  )NNT)r  zOptional[dict[str, Any]]r  r  r  r2  r   r]  )r  r  r	  r2  r   	list[int])rp  r+   r  z.Sequence[Union[int, torch.SymInt, sympy.Expr]]r	  r2  r   r  )r   r  r   r   r\  r|  )r  zUnion[int, torch.device]r   r2  r  )r  r   r   r  r  Optional[int]r   r8   )r(  rA   r  zlist[torch.dtype]r   r2  )r  r  r   r2  )
r(  rA   r  r2  r  r2  r  r2  r   r2  )r  r@   r  r  r  r2  r   r2  )r  r@   r  rA   r  r2  r   r2  )r(  r6   r)  r6   r*  zlist[ScalingType]r   r2  )r@  r   rA  r   r(  rA   rB  r2  rC  r2  rD  Optional[Any]rE  r  rF  r  r   r2  )
r(  rA   r  r   r  r   r  r   r   r2  r  )r(  rA   r  rS  r  rS  r  rS  r@  r@   rA  r@   rD  zOptional[IRNode]r  zOptional[_IntLike]r   r2  )rb  r  r   r2  r   )
r  rS  r  rS  r  rS  rn  r   r   r2  )r  rS  r  rS  r  rS  r   r2  )r  rS  r  rS  r  rS  r   r  )r   r  r   r  )r   zQtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]])r(  rA   r   r2  )r(  rA   r  zUnion[ReinterpretView, Buffer]r  r@   r   r2  )FTFN)r(  rA   r  r@   r  r@   r  r2  r  r2  r  r2  r  r  r   r2  )r   Callable[P, _T]r   r  r  r  r   ztuple[_T, list[str]])r   r  r   ztuple[Any, list[str]])r   r  r   r  r  r  r   r   )r   r  r   r  r  r  r   r  )r   r  r   r  r  r  r   ztuple[Any, list[GraphLowering]])r%  r  r&  r  r   r]  )r.  r  r-  zOptional[Callable[..., Any]]r   r   )r5  r  r   r  )r   r  )rC  r  r   r2  )rI  zSequence[torch.Tensor]r   r2  )rg  r   r   r  )rN  r2  r   r   r  r   r   zIterator[Any])r   r  r   ri  )r~  r  r   r2  )r~  r  r   r   )r  zIterable[Any]r   r2  )
r{  r  r  r4   r  r  r5  r  r   r  )r  z"Optional[Union[Buffer, Operation]]r   r2  )rd  z Optional[Union[Node, Operation]]r}  z!Optional[torch._ops.OperatorBase]r   r2  )rd  z"Optional[Union[IRNode, Operation]]r   r2  )r  rF   r  z-Optional[Callable[[BaseSchedulerNode], bool]]r   r2  )r  rF   r   r2  )rd  zOptional[Operation]r}  z?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]r   r2  )r  r  r  rE  r  rE  r   r   )r  rF   r  zMutableSet[BaseSchedulerNode]r  zdict[str, SchedulerBuffer]r  zdict[str, BaseSchedulerNode]r  zCallable[[Any], bool]r   r  )r  r   r  r   r   r   )r  r  r   r   )r  r   r   r]  )r   r  r   r  )r   r  r   r2  )r   r  r   r2  )r   r  r   r2  )r  r  r~  r  r  r  r  r  r  r  r  r2  r   r2  )r  r  r   r  )r   r  r   r2  )r0  r  r   r2  )r   r  )r   r  r   r  r  r  r   ztuple[_T, str])rI  Sequence[InputType]r   zOptional[ShapeEnv])r  Callable[[list[InputType]], _T]rR  r  rS  zOrderedSet[int]r   r  )rT   r  r   r  )rO  rT  r_  r  r`  zOptional[OrderedSet[int]]r   z-tuple[list[torch.Tensor], list[torch.Tensor]])rI  r  rd  r  r   r  )r   r   r   r2  )r  r  rv  rE   r   r  )r   r  r   r  )r   r  r   r  )r"  r  r   r  r   r2  )r   ztuple[str, ...])r   r  r  r3   r  r  r   r  )r   r  r   r  )r   r  r   r  )r   r  r   r  )r   zOptional[type[Any]]r   r2  r   r   )r   zOptional[list[int]])r   r  )r}  ztorch._ops.OperatorBaser   r  )r  torch.fx.Noder   r2  )rd  rB   r   r2  )r  r:   r   r2  )r  r   r   r  )r   r  r   r2  )r*  zSequence[Sequence[T]]r+  zSequence[T]r   r  )
r1  r  r2  r  r3  ValType | Noner4  r  r   zEGenerator[tuple[KeyType, ValType | None, ValType | None], None, None])r=  rE  r   rE  )rS  r   r   ztuple[bool, bool])rT   r*   r\  r2  r   zOrderedSet[sympy.Symbol])r   zdict[str, str])r  CUDAGraphWrapperTyper   r  )r  rF   r   z tuple[list[Any], dict[str, Any]])r  r;   r   r2  )r   r  r  r/   r   r  )rd  r  r   r2  )r   r   )rT   r   r  r   r   r   )r  ztuple[Any, Any]r  ztuple[Any, ...]r  r   r  r  r  r  r  zCallable[[Any, Any], bool]r   #tuple[Optional[Any], Optional[Any]])r  r  r  r  r   r  r  )r  r>   r  r>   r  r2  r   r  (  
__future__r   r<  r^  r  enumr  r0  rf  r  re  r<  rk  r  r  r  r   r   r   r  rb  r  rm  r  r0  collections.abcr   r   r   r   r   r	   r
   r   r   r   typingr   r   r   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r~   rP   torch.utils._pytreer  _pytreery  $torch._inductor.analysis.device_infor   torch._inductor.runtime.hintsr    !torch.fx.passes.regional_inductorr!   torch.utils._dtype_abbrsr"   torch.utils._ordered_setr#   r$   r%   OPTIMUS_EXCLUDE_POST_GRADr  r(   r)   r*   r+   r,   r-   r.   pathlibr/   r0   r1   r2   torch._prims_commonr3   torch.fxr4   torch.fx.noder5   r  r6   r  r8   r  r:   dependenciesr;   rn  r=   r  r>   r?   r@   rA   rB   rC   output_coderE   r  rF   rG   rN   rL   r   rX   torch._dynamo.device_interfacerY   torch._dynamo.utilsrZ   torch.autogradr[   torch.autograd.profiler_utilr\   (torch.fx.passes.graph_transform_observerr]   torch.fx.passes.shape_propr^   torch.utils._sympy.functionsr_   r`   ra   rb   rc   torch.utils._sympy.symbolrd   re   torch.utils._sympy.value_rangesrf   rg   r  ri   runtime.runtime_utilsrj   r.  _IS_WINDOWS	getLoggerr   r   rl   r  r,  	VarRangesr  r   	InputTypegetenvXPU_KERNEL_FORMATGPU_KERNEL_BIN_EXTSr,  r^  r  r  rS  rH  rT  rJ  rU  rL  rN  r   rB  rD  rF  r<  r=  float8_e4m3fnuzfloat8_e5m2fnuzrt   r   rx   rz   r   Functionr   r  r   r   r   r   r  r  r  r&  r)  r^  rb  rh  rr  rt  r~  r  r  r   r  r  r  r  r  r  r  r  r  FN_TYPEr  r  r  r  r  r  rW  r]  rk  r|  r  r  r  r  r  r  r  r  	frozensetr  r  r  r  r  r  r  r  r  r_  r  r  clear_on_fresh_inductor_cacheclear_inductor_cachesfresh_inductor_cacher  r  r#  r%  r)  r,  r  r  r8  r  r  r  r  r  r  r  r  r  r  r  r   r&  r+  r3  r7  r:  rG  rR  r^  rd  rS  ro  rs  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r(  r1  r6  r:  rB  rF  rJ  rL  rO  r  rY  rm  rq  rv  rz  r  r  r  r  r  r>  Enumr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r
  r  r  r  r)  r-  r1  r9  rG  rJ  rU  r[  rM  rf  rm  rw  r  r  r  rC  r  compiler  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  SUPPORTED_MKLDNN_DEVICESr  r"  r/  r8  rN  rV  rZ  r]  re  rg  PartitionFnTyper  rk  r  rm  r~  r  r  r  r  r  r  r  r  r  r  )r  r   s   00rW   <module>r     sg   "        	     	  	   
              C B    $ $ ? : E 0 / ; ($ 
  >>//C$"/,5!$TT,= +	CL
   D 0 % 2 K 0  8 D  = llg%g! T]UZZ'(	U5<<ell:;<	 Eibii(I7S 
  !"  	 
 2<

2 . ( {Q'A-+2B XDX XB5
LENN  d#  $"GX #(	 
 !	
 4 #(	___ 
_ !	_
 _D  ;@
+*"*+A**#AL+	+++	"/	)/#/G @OI	I<I 
I0 *8+0' 	!  	
 ( %'!  	
    )'#$  cNTT"
;sAv&*
+E8WQU^ E:)++/+\C,4).4)O4) 	4)nW2CW2!W2 W2x 48*0 (G
G$5GG:,^%	DU	>2- $ $'& 
< !# I "	 " "( .27+7	7 7 	7 7v !5 $ "  49 ( 	$$	7$ 	$
 $N Q7 7*  , , ,
S' S'l
 
 @ @ @?' ?  8 J J ) )I #'   	(+<	  #  	
  
< :>RW||&6|KO|	|@ BG&,:>	 BGVV&,V:>V	V OOO %O 
	O Q	  	 Q	  	 Q  >>> > 	>
 > > >  > 
>BP " EEE E 	E
 E E E E 
EPJ CO,) , EF!)?B	 (  . 5( 5(p @ @ R R:"JH&8@F	: ""&"&==
= = 	=
  = =  = 
=@'C C"&&& & 	&$ &2:/(V		 &	2:		## &#2:#$#* ...@.. .$ IMFF)EFF*	B&&   D D '6 '6T  Q	.0(#K(*$)) * 

 
"- 
4A 
HK 
	 
F1	" -1#
*#)# 
#L( @D	Q	Q<	Q 
	Q#J
JGJ 
JLL .LDRLL *=

5
 ,
 5	

 '
 

< *=5 , 5	
 ' 
:T 2     ,!)
.2$&$!$ $ 	$
 $ $ 
$NHBL'  &2:2:*" ( %	0	: 37$$$$ 0$ 3	$<$ $$3N!3B	:&/ '#)* $%
  +?*D*D*FG$!QAG  "**Y'H	  & 1 1 1
 68 2 7
8 1 
	 /9l O :)# # )

)
-" 01 -"` D)t  *499  4 42Y5!H"J4
1 * 		& "&!%	 
$ 
$ 
  
 	 

 K 
FRj&2R66 d#  $ 38$./@ 3 3 *:); &= F
"-L 
%
  "
MMM M 	M
 M &M )M`	*(B "	"" " )	"{ Hs   5v>