
    inI                      U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dlmZmZ d dlmZmZmZmZmZmZmZ d dlmZ d dlmZ dd	l m!Z! erd d
l"m#Z#m$Z$m%Z% d dl&m'Z' d dl(Z(d dl)Z)d dl*Z)d dl+m,c m-Z. d dl/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z; d dl<m=Z= d dl>m?Z?m@Z@mAZA d dlBmCZC ddlDmEZEmFZFmGZGmHZHm Z mIZI ddlJmKZK ddlLmMZMmNZNmOZO ddlPmQZQmRZR ddlHmSZSmTZTmUZUmVZV ddlWmXZXmYZY ddlZm[Z[ ddl m\Z\m]Z]m^Z^m_Z_m`Z`maZa ddlbmcZc ddldmeZemfZf ddlgmhZh ddlimjZjmkZk dd llmmZm dd!l,mnZnmoZompZpmqZqmrZrmsZsmtZtmuZumvZvmwZwmxZxmyZymzZzm{Z{m|Z|m}Z}m~Z~mZ dd"lmZ  ej                  e      Ze)j
                  j                  ed#      Ze)j
                  j                  ed$      Ze)j
                  j                  ed%      Ze)j
                  j                  ed&      Zed'   Zd(ed)<    ed*      Z ed+      Zej                    G d, d-             Zej                    G d. d/             Z G d0 d1      Zej                    G d2 d3             Zej                    G d4 d5e             Z G d6 d'      Zej.                  dcd7       Zddd8Zded9Zdfd:Z ej                   d;<       G d= d>             Zdgd?Z G d@ dA      Z	 	 	 	 	 	 	 	 dhdBZ G dC dDe      Z G dE dFe      Z G dG dHe      Z	 	 	 	 didIZ	 	 	 	 	 	 	 	 djdKZ G dL dMe      Z G dN dOe      Z G dP dQe      Z G dR dSe      Z	 dk	 	 	 	 	 	 	 dldTZ	 	 	 	 	 	 dmdUZdndVZej                    G dW dX             Z ejZ                         ZdodYZdpdZZ	 	 	 	 dqd[Zdrd\Zdrd]Zdrd^Zdrd_Z G d` dJ      Z G da db      Zy)s    )annotationsN)Counterdefaultdict)as_completedFuture)AnyGenericOptionalTYPE_CHECKING	TypeAliasTypeVarUnion)	ParamSpec
OrderedSet   )ComputedBuffer)CallableIteratorSequence)
ModuleType)countersdynamo_timed)use_pipelined_autotuning)LambdaFuturePyCodeCache)TritonTemplateCallerBase)get_metric_tableis_metric_table_enabled)free_symbols)free_symbol_is_typesymbol_is_typeSymT)
has_triton)commsconfigconfig_commsdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime/estimate_nccl_collective_runtime_nccl_estimator)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)count_flops_fx)assign_origin_nodeget_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout
NoneLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)ReductionHint)
green_textred_text)SimplifyIndexing)&_unstable_customized_partition_wrappercache_on_selfcmpdevice_need_guardget_current_backendget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsget_op_namesGraphPartitionMapIndentedBufferis_collectiveis_cudagraph_unsafe_opis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitsympy_product)Vfusionloop_orderingcompute_dependencies
cudagraphsBaseSchedulerNoder   PartitionType_T_Pc                  l    e Zd ZU dZded<   dZded<   dZded<   d Zedd	       Z	e	 d	 	 	 dd
       Z
y)FusionResultNzOptional[bool]should_fusezOptional[Callable[[], bool]]callable_fnOptional[LambdaFuture]futurec                L    | j                   d u| j                  d uz  sJ d       y )NzLFusion result should contain either fusion decision or callable_fn, not both)rb   rc   selfs    j/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/torch/_inductor/scheduler.py__post_init__zFusionResult.__post_init__n   s0      ,1A1A1MN 	
Z	
N    c                    t        |      S )N)rb   ra   )clsrb   s     ri   fusezFusionResult.fuses   s    44rk   c                    t        ||      S )Nrc   re   rm   )rn   rc   re   s      ri   from_callablezFusionResult.from_callablew   s     FCCrk   )rb   boolN)rc   Callable[[], bool]re   rd   )__name__
__module____qualname__rb   __annotations__rc   re   rj   classmethodro   rr    rk   ri   ra   ra   h   sf    "&K&04K-4%)F")

 5 5 OSD,D6LD Drk   ra   c                  B    e Zd ZU ded<   ded<   ded<   dZded<   d
d	Zy)PendingFusionru   rc   r\   node1node2Nrd   re   c                2    | j                   | j                  fS rt   r~   r   rg   s    ri   get_fusion_nodeszPendingFusion.get_fusion_nodes   s    

DJJ''rk   )return+tuple[BaseSchedulerNode, BaseSchedulerNode])rv   rw   rx   ry   re   r   r{   rk   ri   r}   r}   ~   s$    ##%)F")(rk   r}   c                  $   e Zd ZdZedd       Zedd       Ze	 	 	 	 	 	 dd       Zedd       Z	e	 	 	 	 	 	 dd       Z
e	 	 	 	 	 	 dd       Zedd       Ze	 	 	 	 	 	 dd	       Zedd
       Ze	 	 	 	 	 	 dd       Zedd       Zedd       Zy)MixOrderReductionz
    This class contains utility functions to decide if we should fuse reductions
    reducing across different dimensions of the same input tensor.
    c                f    | j                         xr  t        d | j                         D              S )Nc              3     K   | ]V  }t        |t              rD|j                         r4t        |j                  t              r|j                  j
                  d u X y wrt   )
isinstanceSchedulerNodeis_reductionnoder   _split_size.0subnodes     ri   	<genexpr>z7MixOrderReduction.is_split_reduction.<locals>.<genexpr>   sK      +
'=1$$&7<<8	 LL$$D0+
s   AA)r   all	get_nodesr   s    ri   is_split_reductionz$MixOrderReduction.is_split_reduction   s3      " 
s +
>>++
 (
 	
rk   c                `   | j                  |      rd }d }|j                         D ]n  }t        |t              r*|j	                         rt        |j
                  t              s?|j
                  j                  J t        j                  j                  j                  t        |j
                  j                              }|j
                  j                  J t        j                  j                  j                  t        |j
                  j                              }||}|}t        j                  j                  j                  ||      sJ | d|        t        j                  j                  j                  ||      reJ | d|         |J ||fS |j                  d   S )N v.s. r   )r   r   r   r   r   r   r   _original_rangesrW   graphsizevarssimplifyrV   _original_reduction_rangesstatically_known_equalsgroup)rn   r   xnumelrnumelr   	curxnumel	currnumels          ri   get_numel_rnumelz"MixOrderReduction.get_numel_rnumel   s   !!$'FF>>+ 4w6,,."7<<@||44@@@GG,,55!',,"?"?@	 ||>>JJJGG,,55!',,"I"IJ	 >&F&F77++CC	 4 	{34  77++CC	 4 	{34 148 %%%F##::a= rk   c                    | j                  |      }| j                  |      }t        |      dk7  st        |      dk7  s||k(  ryt        |      t        t        |            k(  S )N   F)r   lentuplereversed)rn   r~   r   g1g2s        ri   has_mix_reduction_ordersz*MixOrderReduction.has_mix_reduction_orders   sZ     !!%(!!%(r7a<3r7a<28RyE(2,///rk   c                R   d}|j                   j                  D ]&  }t        |t              s|j                  |k(  s$|} n |sy|j
                  }|j                   j                  }|sDt        |t              sJ t        |              |j                  d   j                   j                  }|sJ t        |      t        |j                        z
  syt        j                  j                  j                  t!        |j"                        t!        |j%                                     ryy)z@
        The access to 'buf' is not a broadcast access.
        NFr   T)read_writesreadsr   r2   nameindex
var_rangesFusedSchedulerNodetypesnodesr   r    rW   r   r   r   rV   sizevalues)rn   bufr   	found_depdepr   r   s          ri   _is_full_accessz!MixOrderReduction._is_full_access   s    
 	##)) 	C#y)chh#o		
 %%00
d$67HDJ<H7Q33>>Jz:&E4F4F)GG
 7733)..)=9J9J9L+M
 rk   c                    g }|j                         |j                         z  }|D ]9  }| j                  ||      s| j                  ||      s)|j                  |       ; |S rt   )used_buffer_namesr   append)rn   r~   r   outcommon_readsr   s         ri   get_common_readz!MixOrderReduction.get_common_read   se     ..053J3J3LL 	 C""3.33F3FsE3R

3	  
rk   c                >    t        | j                  ||            dkD  S Nr   )r   r   rn   r~   r   s      ri   has_common_readz!MixOrderReduction.has_common_read   s!     3&&ue4599rk   c                    | j                  |      }t        j                  j                  j	                  |d   |d   z  d      S )Nr   r   fallback)r   rW   r   r   optimization_hint)rn   r   r   s      ri   	get_numelzMixOrderReduction.get_numel   s>    !!$'ww11"Q%"Q%-!1LLrk   c                $    | j                  |      S rt   )r   r   s      ri   get_fusion_scorez"MixOrderReduction.get_fusion_score  s    
 }}U##rk   c                L   t         j                  j                  syt        j                  j
                  ry|j                         r|j                         sy|j                         j                  }|dvst        |      dk7  ry|j                         r|j                         sy|j                  |j                         z  s|j                  |j                         z  ry| j                  ||      syt        j                  ||      }t!        |      dk(  ry| j#                  |      r||}}n| j#                  |      r||}}ny| j%                  |      }|\  }}	t         j                  j&                  sd}
t        j                  j(                  j+                  t-        j.                  ||	z  |
            syt        j                  j(                  j+                  t-        j.                  ||	dz              syt        j                  j(                  j+                  t-        j.                  |d            syt1        d |j3                         D              ryt        j                  j(                  j5                  |	d	      syt7        d
 |j3                         D              }|S )zP
        Check whether we can fuse two reductions with mix loop orders.
        F)cudaxputritonr   i  P r   i   c              3     K   | ]T  }|j                         rB|j                  j                  j                  t        j
                  t        j                  fv V y wrt   )r   r   datareduction_hintrA   INNERDEFAULTr   s     ri   r   z-MixOrderReduction.can_fuse.<locals>.<genexpr>V  sR      
 ##% LL,,##%%
s   AAi @  c              3  t   K   | ]0  }|j                         r|j                  j                         d v  2 yw)>   sumprodN)r   r   get_reduction_typer   s     ri   r   z-MixOrderReduction.can_fuse.<locals>.<genexpr>i  s=      
 ##% LL++-
s   68)r&   r   mix_order_reductionrW   r   cpp_wrapperrR   
get_devicer   rI   r   	ancestorsget_operation_namesr   r   r   r   is_contiguous_noder   #mix_order_reduction_non_strict_moder   guard_or_truesympyGeanyr   statically_known_leqr   )rn   r~   r   device_typer   contiguous_node
other_noder   nrowncol
size_thresr   s               ri   can_fusezMixOrderReduction.can_fuse  s9   
 }}00 77||~U\\^&&(--.";/8;!!#5+=+=+?OOe7799OOe7799  ++E59 )88F|!!!%(*/ZO##E**/ZO!!/2
d }}@@ #J
 77##11%((4$;
2ST
 77##11%((42JK
 77##11%((42FG  
 +446
 
 
 ww44T9E  
 &//1
 
 
rk   c                &    | j                  ||      S rt   )r   r   s      ri   are_mix_order_reductionsz*MixOrderReduction.are_mix_order_reductionst  s     ||E5))rk   c                \     t         fdj                  j                  D              syy)Nc              3  V   K   | ]   }j                  |j                         " y wrt   )is_contiguous_loadr   )r   r   rn   r   s     ri   r   z7MixOrderReduction.is_contiguous_node.<locals>.<genexpr>|  s'      
7:C""388T2
   &)FT)r   r   r   )rn   r   s   ``ri   r   z$MixOrderReduction.is_contiguous_nodez  s,     
>B>N>N>T>T
 
 rk   c                   ddl m} |j                         D ]  }t        |t              sJ |j
                  }|j                  |j                     }|D cg c]  }|j                  |k(  s|j                    }}t        |      dk(  rr|D ]u  }	|j                  |	   }
|j                  }t        |j                               }t        j                   j"                  j%                  |
||      }|d   dk(  rk|d   dk(  rt  y  yc c}w )Nr   )MemoryUsageTyper   FT)torch._inductor.loop_bodyr   r   r   r   _bodymemory_usageLOADbuffer_name
index_namer   indexing_exprsr   listkeysrW   r   r   stride_vars)rn   r   parent_noder   r   	loop_bodyentrieseindex_namesr   
index_exprr   var_symbolsr   s                 ri   r   z$MixOrderReduction.is_contiguous_load  s   =))+ 	!DdM222

I,,_-A-ABG18QAAMMS<P1<<QKQ;1$ * !
&55jA
&11
 #:??#45gg..:: $B1,B10D !	!2 + Rs   D*DNr   r\   r   rs   )r   r\   r   ztuple[sympy.Expr, sympy.Expr]r~   r\   r   r\   r   rs   )r   strr   r\   r   rs   )r~   r\   r   r\   r   	list[str])r   r\   r   intr~   r\   r   r\   r   r
  )r   r  r   r\   r   rs   )rv   rw   rx   __doc__staticmethodr   rz   r   r   r   r   r   r   r   r   r   r   r   r{   rk   ri   r   r      sv   
 
 
 #! #!J 	0%	0.?	0		0 	0  B 	%	.?			 	 :%:.?:	: :
 M M $%$.?$	$ $ e eN *%*.?*	* *
    rk   r   c                      e Zd ZU ded<   ded<   ded<    ej
                  e      Zded	<    ej
                  e      Z	d
ed<   ddZ
ddZddZddZddZddZddZddZddZddZy)SchedulerBuffer	Scheduler	schedulerz	ir.Bufferr   Optional[BaseSchedulerNode]defining_op)default_factorylist[NodeUser]usersr?   
mpi_bufferc                B    | j                   }|J |j                         S rt   )r  get_name)rh   ops     ri   defining_op_namez SchedulerBuffer.defining_op_name  s#    ~~{{}rk   c                @    t        | j                  j                        S rt   )hashr   r   rg   s    ri   __hash__zSchedulerBuffer.__hash__  s    DIINN##rk   c                v   t               }| j                         }|j                  | dt        | j                        j
                          |j                  | d| j                  j                          | j                         r-|j                  | dt        | j                                       | j                         r-|j                  | dt        | j                                       t        | j                        dk  r0|j                  | d| j                          |j                         S |j                  | d       |j                  d      5  | j                  D ]  }|j                  | d        	 d d d        |j                  d	       |j                         S # 1 sw Y   *xY w)
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])rO   r  	writeliner   r   rv   layoutget_aliasespformatget_mutationsr   r  indentgetrawvalue)rh   resultr   users       ri   	debug_strzSchedulerBuffer.debug_str  s   !}}D6DO$<$<#=>?D6DII,<,<+=>?v[9I9I9K1L0MNOv]74;M;M;O3P2QRStzz?avYtzzl;< !!## vZ01q! 1 JJ 1D$$vQZ011 S!!!##	1 1s   &F//F8c                6    | j                   j                         S rt   r   r  rg   s    ri   r  zSchedulerBuffer.get_name      yy!!##rk   c                   | j                   J | j                   j                         sy | j                   j                         sL| j                   j                         s2t	        | j                   j                         t        j                        r4t        j                  j                  j                  | j                          y t        t        j                  d      r| j                         t        j                  j                  v rt        j                  j                  | j                            }|| j                   j"                  v r$| j                   j"                  |   j                   }n#| j                   j$                  |   j                   }t        j                  j                  j'                  || j                          y t        j                  j                  j                  | j                          y )Nargs)r   should_allocateget_inputs_that_alias_outputget_mutation_namesr   get_output_specr)   CommBufferLayoutrW   r   wrapper_codecodegen_allocationhasattrkernelr  inplace_update_buffersr  name_to_donated_buffername_to_bufcodegen_inplace_reuse)rh   input_buffer_nameinput_buffers      ri   allocatezSchedulerBuffer.allocate  sV   yy$$$yy((* II224yy++-$))335r7J7JKGG  33DII> AHHf%188#B#BB !" ? ? P DNN$I$II#~~DD% $   $~~99:KLQQGG  66		
 GG  33DII>rk   c                   | j                   J t        | j                   j                  t        j                        st        | j                         ry| j                  D ]  }t        |j                   t              s y yNFT)r   r   r$  r)   r=   rS   r  
OutputNode)rh   uses     ri   can_freezSchedulerBuffer.can_free  sg    yy$$$dii&&6:SII;
 :: 	C#((J/	 rk   c                ,   i }|D ]o  }t        |j                        |v r>|j                  |t        |j                                 |t        |j                        <   X||t        |j                        <   q t        |j	                               | _        y rt   )idr   merger   r   r  )rh   r  r*  rE  s       ri   	set_userszSchedulerBuffer.set_users  st    &( 	+C#((|v%'*yy3881E'Fr#((|$'*r#((|$		+
 &--/*
rk   c                R    | j                   J | j                   j                         S rt   )r   r3  rg   s    ri   r%  zSchedulerBuffer.get_aliases  s%    yy$$$yy5577rk   c                R    | j                   J | j                   j                         S rt   )r   r4  rg   s    ri   r'  zSchedulerBuffer.get_mutations  %    yy$$$yy++--rk   c                R    | j                   j                         j                         S rt   )r   r5  r   rg   s    ri   r   zSchedulerBuffer.get_device
  s    yy((*5577rk   Nr   r  r   r
  r   Noner   rs   )r  r  r   rR  r   zSequence[str]r   Optional[torch.device])rv   rw   rx   ry   dataclassesfieldr   r  r?   r  r  r  r,  r  rA  rF  rJ  r%  r'  r   r{   rk   ri   r  r    sz    
O,,-K--dCE>C.?k.?.?3/J+ 
$$($?B
+8.8rk   r  c                      e Zd ZU dZded<   y)SchedulerDonatedBufferNr  r  )rv   rw   rx   r  ry   r{   rk   ri   rZ  rZ    s    /3K,3rk   rZ  c                      e Zd ZU ded<   ded<   ded<   ded<   ded<   d	ed
<   ded<   dZded<   ded<   ded<   dZded<   ded<   ded<   dZded<   dRdZdSdZdTdZ	dTd Z
dTd!ZdUd"ZdTd#ZdVd$Z	 	 	 	 	 	 dWd%ZdXd&ZdYd'ZdZd(Zd[d)Z	 	 	 	 	 	 d\d*ZdVd+Zd]d,Zd]d-ZdVd.ZdVd/Z	 	 	 	 d^d0ZdTd1ZdTd2Zed]d3       Zed]d4       ZedZd5       Z edZd6       Z!d_d7Z"d`d8Z#dad9Z$dbd:Z%dZd;Z&dZd<Z'dZd=Z(dZd>Z)dZd?Z*dZd@Z+dZdAZ,dZdBZ-dcdCZ.dZdDZ/dVdEZ0	 dd	 	 	 	 	 dedFZ1edfdG       Z2edfdH       Z3edfdI       Z4	 	 	 	 	 	 dgdJZ5	 	 	 	 	 	 dhdKZ6edidL       Z7djdMZ8edjdN       Z9dkdOZ:dldPZ;e<	 	 	 	 dmdQ       Z=y)nr\   OrderedSet[str]r   z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]r   
last_usager
  	min_order	max_orderr@   mpi_nodedict[str, str]mutation_renamesNzOptional[ir.Operation]r   list[SchedulerBuffer]outputsdict[str, SchedulerBuffer]outputs_by_nameOptional[float]override_estimated_runtimedependencies.ReadWritesr   OrderedSet[Dep]unmet_dependenciesFrs   writtenc                "    || _         d | _        y )Nc                     g S rt   r{   )r1  kwargss     ri   <lambda>z,BaseSchedulerNode.__init__.<locals>.<lambda>+  s    B rk   )r  debug_device_str)rh   r  s     ri   __init__zBaseSchedulerNode.__init__(  s    $-& 	rk   c                Z   || _         t               | _        t        t                  | _        d| _        |j                         D cg c]  }t        | j                  ||        c}| _	        | j                  D ci c]  }|j                         | c}| _        i | _        y c c}w c c}w )NF)r  r   r  )r   r   r   r  r]  rl  get_outputsr  r  rd  r  rf  rb  )rh   r   outputr   s       ri   _init_from_nodez!BaseSchedulerNode._init_from_node.  s    	#$
   **,
  .. 
 @D||L 3L !#
  Ms   B#;B(c                T    t        |       j                   d| j                         dS )Nz(name=)r   rv   r  rg   s    ri   __repr__zBaseSchedulerNode.__repr__F  s'    t*%%&fT]]_,?qAArk   c                H   | j                         }t               }|j                  | dt        |       j                   dt        t        | dd            j                   d| dt        | j                  j                         d| dt        | j                         d| d	t        | j                  j                  | j                  z
         d| d
       |j                         5  | j                         D ]!  }|j                  |j                                # 	 ddd       |j                  d       	 |j                  | j                                |j'                         j)                         S # 1 sw Y   XxY w# t         $ r t"        j%                  dd       Y Lw xY w)#Longer form printout for trace logsr   (r   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        r"  Ignoring error in debug_str()Texc_info)r  rO   splicer   rv   getattrr&  r   writesrk  r   r(  rt  r,  r#  debug_str_extra	Exceptionlogwarningr)  rstrip)rh   r   r   r   s       ri   r,  zBaseSchedulerNode.debug_strI  s   }}

bd		QtGD&$$?@IIJ Kj))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 		
 ZZ\ 	,'') ,

3==?+,	, 	c	HJJt++-.  ''))	, 	,  	HKK7$KG	Hs   %5E25E> 2E;> F! F!c                     y)N r{   rg   s    ri   r  z!BaseSchedulerNode.debug_str_extrab      rk   c                $    | j                  |       S rt   )rq  rg   s    ri   _debug_str_for_devicez'BaseSchedulerNode._debug_str_for_devicee  s    $$T**rk   c                   t        | j                  dd       }d}t        |t        j                  j
                  j                        r'd|j                  |j                         gdd      z   }nct        |t        j                  j
                  j                        r5d|j                  |j                         |j                         gdd      z   }|  | S )Nr   r  , F)shorten	multiline)r  r   r   torch	_inductorr)   	Pointwise
str_helperget_size	Reductionget_reduction_sizer   )rh   
maybe_datadata_strs      ri   debug_str_shortz!BaseSchedulerNode.debug_str_shorth  s    TYY5
j%//"4"4">">?j33$$&'% 4  H 
EOO$6$6$@$@Aj33..0*2O2O2QR 4  H
 z""rk   c                p    t         j                  d| | j                  | j                  j                         y )Nz(%s: unmet_dependencies = %s, writes = %s)r  infork  r   r  rg   s    ri   log_detailszBaseSchedulerNode.log_detailsw  s,    6####		
rk   c                     yNFr{   )rh   self_dep	other_deps      ri   reorder_loops_by_dep_pairz+BaseSchedulerNode.reorder_loops_by_dep_pair       rk   c                    d | j                   j                         D        D ci c]  }||v r|||    c}| _        | j                  | j                   j	                  | j                               y c c}w )Nc              3  4   K   | ]  }|j                     y wrt   r   r   r   s     ri   r   z9BaseSchedulerNode.update_mutated_names.<locals>.<genexpr>  s     QcQ   )r   reads_and_writesrb  set_read_writesrename)rh   renamesr   s      ri   update_mutated_namesz&BaseSchedulerNode.update_mutated_names  sp     RT-=-=-N-N-PQ!
w '$-!

 	T--44T5J5JKL!
s   A2c                X    | j                  | j                  j                  |             y rt   )r  r   	with_readrh   r   s     ri   add_fake_depzBaseSchedulerNode.add_fake_dep  s!    T--77<=rk   c                B    t        d | j                         D              S )Nc              3  `   K   | ]&  }|j                         xs |j                          ( y wrt   )r%  r'  )r   r   s     ri   r   z=BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s-      
9<COO4!2!2!44
s   ,.)r   rt  rg   s    ri   has_aliasing_or_mutationz*BaseSchedulerNode.has_aliasing_or_mutation  s%     
@D@P@P@R
 
 	
rk   c                h    || _         | j                   j                  | _        | j                          y rt   )r   r   rk  
prune_deps)rh   rws     ri   r  z!BaseSchedulerNode.set_read_writes  s(    "&"2"2"8"8rk   c                b    | j                         }t        fd|D              }||z
  | _        y )Nc              3  B   K   | ]  }j                  ||        y wrt   )get)r   kmutation_real_names     ri   r   z3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>  s     !U1"4"8"8A">!U   )used_or_aliased_buffer_namesr   r]  )rh   future_used_buffersr  used_bufferss     ` ri   set_last_usagez BaseSchedulerNode.set_last_usage  s0     88:!!U!UU&)<<rk   c                F    | j                   D ]  }|j                           y rt   )rd  rA  )rh   r   s     ri   mark_runzBaseSchedulerNode.mark_run  s    << 	CLLN	rk   c                    t        d t        j                  | j                  j                  | j                  j
                        D              S )Nc              3  4   K   | ]  }|j                     y wrt   r  r  s     ri   r   z6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>  s      
 HH
r  )r   	itertoolschainr   r   r  rg   s    ri   r   z#BaseSchedulerNode.used_buffer_names  s?     
 t'7'7'='=t?O?O?V?VW
 
 	
rk   c                \   t               t        j                  | j                  j                  | j                  j
                        D cg c]*  }t        |t              r|j                  s|j                  , }}t        |      dkD  r|j                         }j                  |       t        j                  j                  j!                  |      rC|j#                  fdt        j                  j                  |   j%                         D               t        |      dkD  rS c c}w )z
        Returns buffer names used by this node, including aliases.

        Note: is_fake WeakDeps are excluded since they are purely for ordering
        and should not affect buffer lifetime.
        r   c              3  *   K   | ]
  }|vr|  y wrt   r{   )r   alias
used_namess     ri   r   zABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>  s#       J.	 s   )r   r  r  r   r   r  r   r4   is_faker   r   popaddrW   r   name_to_bufferr  extendr3  )rh   r   depsr  s      @ri   r  z.BaseSchedulerNode.used_or_aliased_buffer_names  s     '1l
 !t'7'7'='=t?O?O?V?VW
sG, HH
 

 $i!m((*CNN3ww%%))#. !"!7!7"224	 	 $i!m !
s   /D)c                L     t         fd j                  D               _        y )Nc              3  f   K   | ](  }|j                   j                  j                  vr| * y wrt   )r   r  available_buffer_namesr   r   rh   s     ri   r   z/BaseSchedulerNode.prune_deps.<locals>.<genexpr>  s/      -
xxt~~DDD -
s   .1r   rk  rg   s   `ri   r  zBaseSchedulerNode.prune_deps  s#    ", -
..-
 #
rk   c                     d fdt        fd j                  j                  D              } j                   j                  j	                  |             y )Nc                   t        | t              sy| j                  j                  j                  vryj                  j                  | j                     j                         }|t        j                  j                  v S r  )	r   r4   r   r  r=  r  rW   r   removed_operations)r   op_namerh   s     ri   should_prunez7BaseSchedulerNode.prune_weak_deps.<locals>.should_prune  s_    c7+xxt~~999nn00:KKMGagg8888rk   c              3  4   K   | ]  } |      s|  y wrt   r{   r   r   r  s     ri   r   z4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>  s      
\#5FC
   r   r1   r   rs   )r   r   r   r  remove_reads)rh   	to_remover  s   ` @ri   prune_weak_depsz!BaseSchedulerNode.prune_weak_deps  sN    	9  
++11
 
	 	T--::9EFrk   c                F    t        | || j                  j                         y rt   )_prune_redundant_depsr  r=  )rh   name_to_fused_nodes     ri   prune_redundant_depsz&BaseSchedulerNode.prune_redundant_deps  s     	d$68R8RSrk   c                R    | j                   J | j                   j                         S rt   )r   get_operation_namerg   s    ri   r  zBaseSchedulerNode.get_name  rM  rk   c                "    | j                         S rt   r  rg   s    ri   get_first_namez BaseSchedulerNode.get_first_name  s    }}rk   c                B    t        d | j                         D              S )Nc              3  <   K   | ]  }|j                           y wrt   r  r   r   s     ri   r   z8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>  s     Gd$--/G   )r   r   rg   s    ri   r   z%BaseSchedulerNode.get_operation_names  s    Gdnn6FGGGrk   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrt   r  r   r   s     ri   r   z5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>  s     AS#,,.Ar  )r   rd  rg   s    ri   get_buffer_namesz"BaseSchedulerNode.get_buffer_names  s    ADLLAAArk   c                B    t        d | j                         D              S )Nc              3  Z   K   | ]#  }t        |t              xr t        |d        % yw)T)disallow_fp32_opsNr   r   r+   r   ns     ri   r   zABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>  s7      
  q-( G+AFG
s   )+r   r   rg   s    ri   can_codegen_in_low_precisionz.BaseSchedulerNode.can_codegen_in_low_precision  s%     
 ^^%
 
 	
rk   c                B    t        d | j                         D              S )Nc              3  V   K   | ]!  }t        |t              xr t        |       # y wrt   r  r  s     ri   r   z@BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>  s-      
 q-(K-H-KK
s   ')r   rg   s    ri   r+   z-BaseSchedulerNode.can_codegen_without_upcasts  s#     
^^%
 
 	
rk   c                    | gS rt   r{   rg   s    ri   r   zBaseSchedulerNode.get_nodes  s	    vrk   c                    | j                   S rt   )rd  rg   s    ri   rt  zBaseSchedulerNode.get_outputs  s    ||rk   c                     | j                   |   S rt   )rf  )rh   buf_names     ri   
get_outputzBaseSchedulerNode.get_output  s    ##H--rk   c                R    | j                   J | j                   j                         S rt   )r   r   rg   s    ri   r   zBaseSchedulerNode.get_device  s%    yy$$$yy##%%rk   c                L    | j                         }|d uxr |j                  dk(  S Ncpu)r   r   rh   devices     ri   is_cpuzBaseSchedulerNode.is_cpu  s'    "T!:fkkU&::rk   c                X    | j                         }|d uxr t        |j                        S rt   )r   rR   r   r  s     ri   rR   zBaseSchedulerNode.is_gpu  s'    "T!9fV[[&99rk   c                     yr  r{   rg   s    ri   r   zBaseSchedulerNode.is_reduction      rk   c                     yr  r{   rg   s    ri   is_native_matmulz"BaseSchedulerNode.is_native_matmul  r  rk   c                     yr  r{   rg   s    ri   is_split_scanzBaseSchedulerNode.is_split_scan  r  rk   c                     yr  r{   rg   s    ri   is_templatezBaseSchedulerNode.is_template  r  rk   c                     yr  r{   rg   s    ri   	is_externzBaseSchedulerNode.is_extern   r  rk   c                     yr  r{   rg   s    ri   
is_foreachzBaseSchedulerNode.is_foreach#  r  rk   c                     yr  r{   rh   read_deps     ri   can_inplacezBaseSchedulerNode.can_inplace&  r  rk   c                     yr  r{   rg   s    ri   has_side_effectsz"BaseSchedulerNode.has_side_effects)  r  rk   c                \
    ddl m} t         t              rt        j
                  rt        j                  j                   j                         t        j                        r{t        t        j                  t        j                  j                  j                   j"                        rt%        t        j                  dd      t'        t        j                  d      sy j(                  t        j                  j*                  z   j,                  j.                  z  }d fd} j1                         D ]  }|j2                  }|J |j5                         rJ|j7                         s:|j9                         s*|j;                         t        j                  j<                  v ro j>                  j@                  D ]h  }|jB                   j,                  jD                  v r$ j,                  jD                  |jB                     }n/ j,                  jF                  jI                  |jB                        }|s|t        j                  jJ                  jM                  |       st        |jN                  tP              r|jR                  J |jR                  D cg c]   }|j2                  j;                         |vr|" }	}tU        |	      dk(  s|	d   jV                  s&|	d   j2                   u s9|j2                  Gt        |j2                  jY                         tZ        j\                  tZ        j^                  tZ        j`                  f      r|jN                  rft        |jN                  j2                  tZ        jb                  tZ        jd                  f      r(tU        |j2                  j7                               dkD  r ||j2                  |j2                        s+ ||      s5t        j                  jf                  ji                  |j;                         |j;                                t        t        j                  t        j                  j                  j                   j"                        rnt        j                  jj                  jm                  |j;                                t        j                  jj                  jm                  |j;                                |j;                         t        j                  jn                  |j;                         <      yc c}w )	z~
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        r   )can_match_buffer_size	mutationsNr1  c                   | j                   j                        }| j                         t               }| j                  D ]  }|j
                  }t        |t              s |j                         | j                   j                  vs| j                   j                  |      |urd|fd|j                  j                         D        z  }t        |      dkD  s y y)Nc              3  @   K   | ]  }|j                   k(  r|  y wrt   r  )r   or  s     ri   r   z^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>_  s%      vv)    r   FT)r  get_fused_noder  r   r  r   r   r\   r  r  r   r  r   )buf_to_be_inplaced
fused_noder  r+  	user_noder  rh   s        @ri   single_index_in_fused_nodezKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_nodeG  s    
 ,55DDTJJ)224H %/LD*00 ! II	!)->? ,,.-77JJK)33BB9M%&  &22CCE 
 t9q= '!* rk   r   )r+  r  r   rs   )8codegen.wrapperr$  r   r   r&   inplace_buffersrW   r   has_featurer   r,   INPLACE_BUFFERSr:  r  r  codegensimd
SIMDKernelr  r9  r   r  r  completed_operationsrt  r   r2  r3  r4  r  removed_buffersr   r   r   r<  r=  r  r7  	can_reuser  NopKernelSchedulerNoder  r   r   r5  r)   r=   r<   MutationLayoutSHOULDREMOVEFallbackKernelr;   r1  make_inplacer%  r  r;  )
rh   r$  inconsequential_nodesr.  r   buf_noderead	input_bufxremaining_usess
   `         ri   decide_inplace_updatez'BaseSchedulerNode.decide_inplace_update,  s   
 	; t]+&&##DOO$5~7U7UVqxx)@)@)E)E)P)PQ188[$7C &) NNgg(()nn112 	 	D ##% C	CxxH''',,.88:..0<<>QWW%<%<<((.. 899 E EE $ E Edii PI $ : : > >tyy II ,,66y$G&y'<'<>TU$??666 "+&66??,4II &N & N+q0*1-99*1-22d:%NN6 *%NN::< " " 4 4 " = =! &11 * ) 5 5 : :!#!2!2BNN C! !$INN$O$O$Q RUV V1)..#((K6yA
 2293E3E3GX%HHeoo&=&=&B&B&M&M HH..2293E3E3GHHH..223<<>B &..0 77G q8C	0&s   %T)c                R   t         j                  sy |r| j                  ry | j                  J | j                  j	                         }g }|D ]0  }|j
                  dk(  r|j                  d       |j                  d       d|j
                   d|j                   }d|j                  v r|d|j                  d    z   }|j                  |       d|j                  v s|j                  d    }|j                  d	d
      d   }|j                  d|j                  dd      j                  dd      j                  dd      j                  dd      z          |j                  d       |j                  d       3 t        |      dk(  ry |j                  |       d| _        y )Nru  r  z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|r   )maxsplitr   {z{{}z}}r  \z\\z#pragma CMT END ORIGINr   T)r&   comment_originrl  r   get_originsr  r   targetmetarsplitreplacer   
writelines)	rh   buffer	only_onceorigins	out_linesr(  op_info_strrG  stack_trace_last_lines	            ri   codegen_originating_infoz*BaseSchedulerNode.codegen_originating_info  s    $$yy$$$))'')	 	%AttxR 23(az:K166!)hqvvh7G6H,II[)&!"!6 7(3(:(:3(:(KB(O%  "+33C>WS$'WT4(Wf	   !9:  $3	%6 y>Q 	)$rk   c                (    | j                  dd      S )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implrg   s    ri   get_read_write_buffers_sizesz.BaseSchedulerNode.get_read_write_buffers_sizes  s    55t 6 
 	
rk   c                (    | j                  dd      S )NTFr\  r_  rg   s    ri   get_read_buffer_sizesz'BaseSchedulerNode.get_read_buffer_sizes  s    55u 6 
 	
rk   c                (    | j                  dd      S )NFTr\  r_  rg   s    ri   get_write_buffer_sizesz(BaseSchedulerNode.get_write_buffer_sizes  s    55 6 
 	
rk   c                Z    t        | j                  ||      j                         d      S )Nr\  r   )start)r   get_read_write_buffer_accessesr   )rh   r]  r^  s      ri   r`  z3BaseSchedulerNode.get_read_write_buffers_sizes_impl  s3     //+N 0 fh	
 	
rk   c                    t         t              ri S t         t              rt         j                  t              ri S t         t              r`t         j                  t
        j                        r< j                  j                  t        j                  j                  j                  u ri S ddt         t              r@ t         j                         d         t         j                         d         z        nt        d      t!        j"                  t$              }|r9 j&                  j(                  D ]   }||j*                     j-                  |       " |r9 j&                  j.                  D ]   }||j*                     j-                  |       " |r&t1        d  j&                  j(                  D              n	t1               }|r&t1        d  j&                  j.                  D              n	t1               }d fdt         t2              rt1         fd|D              }||z
  }||z
  }i }||z  D ]  }	t5        fd	||	   D              |	t6        j8                  j:                  v rt6        j8                  j:                  |	   }
n;|	t6        j8                  j<                  v rt6        j8                  j<                  |	   }
n	 	 	 	 d fd
 |
      }|	|vr|||	<   ||	xx   |z  cc<    |S )az  
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size

        Returns memory accesses per buffer.
        c                X    t         j                  j                  j                  | d      S )Nr   r   )rW   r   r   r   )ss    ri   try_size_hintzGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hint#  s"    77##55a!5DDrk   r   r       eAc              3  4   K   | ]  }|j                     y wrt   r  r  s     ri   r   zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>8  s     BCsxxBr  c              3  4   K   | ]  }|j                     y wrt   r  r  s     ri   r   zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>=       CCsxxCr  c                    j                   j                  |    j                  }t        d |D              }t	        |t        |      z
        dkD  S )Nc              3  4   K   | ]  }|j                     y wrt   r   )r   r+  s     ri   r   z\BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>D  s     !>$))!>r  r   )r  r=  r  r   r   )r   r   r  buf_usesrh   s       ri   is_materializedzIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materializedB  sG    NN..s399E!!>!>>Hx*V"44599rk   c              3  J   K   | ]  } |j                         r|  y wrt   r   )r   r   rt  rh   s     ri   r   zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>H  s#      )_S$++-N)s   ##c              3  "   K   | ]  }  y wrt   r{   )r   r   
node_numels     ri   r   zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>Q  s     $RCZ$Rs   c                B   | syt        | t        j                        r| j                         S t        | j                  t
              r͉j                  j                  | j                            j                  }d}|D ]  }t        |j                  t              rt        |j                  t              sJ t        |j                  j                  t              r5|j                  j                         D ]  }| |j                        z  }  y |S t        | j                  t        j                        r"t!        fd| j#                         D              S  	t%        | j'                                     }t)        | j+                               t-        |      z  S )Nr   c              3  h   K   | ])  } t         j                  j                  |             + y wrt   )rW   r   
get_buffer)r   mut_nameget_buf_bytess     ri   r   zZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>u  s-      $ &agg&8&8&BCs   /2)r   r)   TorchBindObjectr}  r$  r<   r  r=  r  r  r   rD  r\   r;   rt  r=   r   r4  rV   r  rK   	get_dtypemin)
r   r  totr+  	sched_buf	buf_elemsbuf_accessed_elemsr}  rh   rl  s
         ri   r}  zGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytesZ  sT    c2#5#56,,..

,=> !NN66s||~FLLEC % %%dii<$)$))5FGGG%diinnkB-1YY-B-B-D E	 #}Y^^'D DE $%% J

BMM: (+(>(>(@  
 !.mCLLN.K LI)#--/:S*I>  rk   )rk  z
sympy.Exprr   r
  )r   r  r   Sequence[BaseSchedulerNode]r   rs   )r   z<Optional[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]r   r
  )r   r9  ExternKernelSchedulerNoder   r;   r)   r;  op_overloadr  _prims	rng_primsgraphsafe_run_with_rng_stater   rV   
get_rangesr
  collectionsr   r   r   r   r   r   r  r   r   r   rW   r   r  graph_inputs)rh   r]  r^  buf_accessesr   r   r  r7  buf_byte_accessesr  r   	buf_bytesr  r}  rt  rx  rl  s   `           @@@@@ri   rh  z0BaseSchedulerNode.get_read_write_buffer_accesses  s   6 d23Id56:II{<
 It67499b&7&78		%%||%%BBC I	E dM*&doo/23 1! 456J
 SJ"..t4''-- 3SXX&--c23 ''.. 3SXX&--c23
  B4+;+;+A+ABB 	  C4+;+;+B+BCC 		:
 d./( )%) O o-FO+E,. 3	9H!$$R<;Q$R!R177111gg,,X6QWW111gg**84#Q## #J &c*I00.7!(+!(+y8+g3	9j ! rk   c                T   | j                   y | j                   j                         }|y t        |      }|y t        |t        j
                        r|j                   j                  }t        j                  j                  j                  |d      }t        d   dxx   |z  cc<   |S )Nr   r   inductor
flop_count)r   get_origin_noder7   r   r  SymIntexprrW   r   r   r   r   )rh   fx_nodeflopsresolved_flopss       ri   estimate_flopsz BaseSchedulerNode.estimate_flops  s    99))++-?w'=eU\\*JJOOE));;EA;N\*n<*rk   c                R    | j                   | j                   S | j                         S rt   )rh  _get_estimated_runtimerg   s    ri   get_estimated_runtimez'BaseSchedulerNode.get_estimated_runtime  s)    **6222**,,rk   c                   | j                         d   j                         d   }|j                  j                         }t	        t        |            syt        | j                        rt        | j                  t        j                        sJ 	 t        j                  rst        |       }t               }|j                  |      }|t        |t              sJ |S t!        |       }|t#        | j                        }|j%                  ||       |S t#        | j                        S t/        | j                        ryt1        |       }||S |j                  j3                         }		 t5               }
t7        |	      dz  }|
dk  rt9        d|
       |dk  rt9        d|       	 | j=                         }|dk(  s|| j?                         |
z  }|dz  }|S d}| j?                         }|dn|}||z  |z  d	z  }||
z  }tA        ||      }|dz  }|S # t&        $ r}t(        j+                  |       Y d}~yd}~wt,        $ r}t(        j+                  |       Y d}~yd}~ww xY w# t:        $ r Y yw xY w)
zC
        Returns estimated op runtime in milliseconds (ms)
        r   Nvaluel    J)z-gpu_memory_bandwidth cannot be <= 0, but got z"gpu_flops cannot be <= 0, but got g    .Ag      ?rm  )!r   rt  r   r5  rR   r9   rP   r   r)   IRNoder'   ,runtime_estimations_use_nccl_lib_estimations)get_estimate_runtime_cache_key_from_snodeget_estimate_runtime_cachelookupfloatr0   r/   	set_value
ValueErrorr  r  	TypeErrorrU    maybe_estimate_runtime_benchmarkmaybe_get_dtyperL   rJ   AssertionErrorr  r  ra  max)rh   r   r$  	cache_keycache	cache_valmsr  retdtypegpu_memory_bandwidth	gpu_flops	flops_estnsfactorcounted_bytescompute_timetransfer_times                     ri   r  z(BaseSchedulerNode._get_estimated_runtime  sw   
 nnq!--/2))+of-. #dii333LL I$ OI68E %Y 7I ,))U;;;((HNBz=diiHOOIRO8I7		BB TYY
 .t4?J((*	#4#6 )%069I $q($CDXCYZ  A~$'I)%UVV 
 '')	>Y.2247KKBcBI 99;*2*Y6#=%(<< }-#X	o    :  		sC   AH 6H H (>I$ 	I!H66I!II!$	I0/I0c                     y rt   r{   rg   s    ri   get_template_nodez#BaseSchedulerNode.get_template_node      rk   c                .    | j                         }|J |S rt   r  )rh   templates     ri   get_template_node_or_throwz,BaseSchedulerNode.get_template_node_or_throw  s!    ))+###rk   c                f    t        d t        |       D              }| d| }| |   }| |dz   d }|||fS )zQ
        For the list of nodes, get the prologue, template, and epilogue
        c              3  H   K   | ]  \  }}|j                         s|  y wrt   r  )r   ir  s      ri   r   zCBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>  s     PDAqaPs   ""Nr   )next	enumerate)nodestemplate_indexprologuetemplate_nodeepilogues        ri   get_prologue_template_epiloguez0BaseSchedulerNode.get_prologue_template_epilogue   sN     PIe,<PP.)n-!+-.00rk   )r  r  r   rR  )r   ir.Operationr   rR  rO  )r   r	  rQ  r  r2   r  r2   r   rs   r  ra  r   rR  )r   r1   r   rR  rS  )r  ri  r   rR  r  r\  r  ra  r   rR  r   r\  r  dict[str, BaseSchedulerNode]r   rR  r   r  )r   zSequence[SchedulerBuffer])r  r  r   r  rU  r  zdependencies.Depr   rs   T)rT  rO   rU  rs   r   rR  rP  )r]  rs   r^  rs   r   r
  )r]  rs   r^  rs   r   zdict[str, int]r   z
int | Noner   r  r   zOptional[ir.TemplateBuffer])r   zir.TemplateBuffer)r  list[BaseSchedulerNode]r   zJtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]])>rv   rw   rx   ry   r   rh  rl  rr  rv  rz  r,  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  rF   r   r  r  r+   r   rt  r  r   r  rR   r   r  r  r  r  r  r   r"  rC  rZ  ra  rc  re  r`  rh  r  r  r  r  r  r  r  r{   rk   ri   r\   r\     s   BB NN''$$#'D
 '""//266((''GT
#0B*2+#
!.7	
M>


=#2=HV=	=
6
G T">T	T
. H H B B 
 
 
 
.&;:@F 9=-$-15-	-^ 
 

 
 

 
 


!
37
	
L!!L!37L!	L!\  $- U Un
 1&1	S1 1rk   c                 R    t         j                  j                  j                         S rt   )r  r  	codecache
LocalCacher{   rk   ri   r  r    s    ??$$//11rk   c                   t        | j                  dd      }| j                  j                  }| j                  j                  g || j                  j                  | j                  j
                        }| j                  j
                  }t        j                  ||f      \  }}ddt        |ft        fd|D              z         }|S )Npython_kernel_namer  c                p    t        | t        j                        xr t        | t        j                         S rt   )r   r)   r  GeneratorStaterA  s    ri   _is_tensor_irz@get_estimate_runtime_cache_key_from_snode.<locals>._is_tensor_ir  s(    !RYY'P
1b>O>O0P,PPrk   c              3  d   K   | ]'  } |      rt        |j                               nd  ) y wrt   )r   r  )r   ar  s     ri   r   z<get_estimate_runtime_cache_key_from_snode.<locals>.<genexpr>#  s(     Ua}Q'7ajjl#TAUs   -0rS  )
r  r   inputsfill_non_provided_argsconstant_argsro  pytreetree_flattenr  r   )snoder  r1  ro  	flat_argsflat_args_pytree_specr  r  s          @ri   r  r    s     -A2F::D::,,*$*))*

D ZZF'-':':D&>'J$I$Q 	
U9U
U	VI rk   c                   t        | t              sy t        j                  j                  j
                  t        j                  j                  j                  t        j                  j                  j                  d}t        | j                  dd      }||vry t        | j                  t        j                        sy ||   S )N)zextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmr  r  )r   r  r  opsatenmmbmmaddmmr  r   r)   ExternKernel)r  mms_fnsr  s      ri   _get_mm_like_fnr  (  s    e67"YY^^..#iinn00 %		 4 4G
 !-A2F(ejj"//2%&&rk   c                Z    d }d }t         j                  rt               }|y |} fd}ny t               }t	               }|j                  |      }|t        |t              sJ |S ddlm	  |       \  }}ddl
m}	 |	j                  |||ddd      }
|j                  ||
	       |
S )
Nc                             S rt   r{   )r  snode_args_kwargss   ri   rp  z2maybe_estimate_runtime_benchmark.<locals>.<lambda>A  s    !25!9 rk   r   )r  r   )benchmarker   
   )memory_warmup_itersbenchmark_itersmax_benchmark_durationr  )r&   !runtime_estimations_mms_benchmarkr  r  r  r  r   r  utilsr  $torch._inductor.runtime.benchmarkingr  	benchmarkr  )r  bench_fnargs_kwargs_fnmm_fnr  r  r  r1  ro  r  r  r  s   `          @ri   r  r  8  s    HN//&=99%@I&(EY'I)U+++(!#LD&@			! 
 
B 
OOIRO(Irk   T)slotsc                  N    e Zd ZU ded<   ded<   ded<   ded<   ddZddZdd	Zy
)	WhyNoFuser  name1name2reasonztuple[Any, ...]r1  c                X    |j                         | _        |j                         | _        y rt   )r  r  r  rh   r~   r   s      ri   rr  zWhyNoFuse.__init__e  s    ^^%
^^%
rk   c                J    || _         || _        t        j                  |        y rt   )r  r1  
fusion_logdebug)rh   r  r1  s      ri   __call__zWhyNoFuse.__call__i  s    	rk   c                p    d| j                    d| j                   d| j                  | j                  z  z   S )Nzcannot fuse z with r   )r  r  r  r1  rg   s    ri   __str__zWhyNoFuse.__str__n  s6    djj\

|2>KK$))#
 	
rk   Nr~   r\   r   r\   r   rR  )r  r  r1  r   r   rR  rO  )rv   rw   rx   ry   rr  r  r  r{   rk   ri   r  r  ^  s&    JJK
&

rk   r  c                    t        | t        t        f      rt        | t              } t        j                  | d      }d|v rdt        j                  |d       S |S )Nkey   )r(  r      )	r   r   setsortedr  pprintr&  textwrapr(  )objr*  s     ri   r&  r&  t  sR    #
C()Sc"^^C*Fv~HOOFG4566Mrk   c                  0    e Zd ZddZddZddZd	dZeZy)
rD  c                &    t        |g      | _        y rt   r  r  s     ri   rr  zOutputNode.__init__  s    ",cU"3rk   c                     yr  r{   rg   s    ri   r   zOutputNode.is_reduction  r  rk   c                     y)Nr{   r{   rg   s    ri   r3  z'OutputNode.get_inputs_that_alias_output  r  rk   c                     y)NOUTPUTr{   rg   s    ri   r  zOutputNode.get_name  s    rk   N)r   r3   r   rR  rS  rT  rO  )rv   rw   rx   rr  r   r3  r  rz  r{   rk   ri   rD  rD  ~  s    4 Hrk   rD  c                    t        j                          j                  D ]N  }t        |t              r|j
                     j                         }|   j                         xx   dz  cc<   P d fdt        fd j                  D              }|r? j                  |z
   _         j                   j                  j                  |             yy)am  
    Prunes weakdeps intended for mutation ordering
    on an upstream fused node if after fusion there is another dependency
    on the fused upstream node, making the weakdep redundant

    In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
    be incrementally removed, enabling other fusions, ensuring they are fused in order.
    r   c                    t        | t              rf| j                     j                         }|   j	                            dkD  xr  j
                  j                  | |         }|   k(  }|xs |S y)Nr   F)r   r4   r   r  r  r  fusable_weak_dep)r   r  is_redundantis_self_depr=  name_to_dep_countr  r   s       ri   r  z+_prune_redundant_deps.<locals>.should_prune  s    c7#!#((+<<>G,"7+446 nn55'0$  -W5=K.;.rk   c              3  4   K   | ]  } |      s|  y wrt   r{   r  s     ri   r   z(_prune_redundant_deps.<locals>.<genexpr>  s      ,s2Cr  Nr  )r  r   rk  r   r4   r   r  r  r   r  r   r  )r   r  r=  r   r  deps_to_pruner-  r  s   ```   @@ri   r  r    s     '2&9&9&;&& K#w'!#((+<<>G09BBDEJEK
    .. M "&"9"9M"IT--::=IJ rk   c                  8     e Zd Zd fdZddZddZddZ xZS )r  c                    t         |   |       | j                  |       | j                  |j	                                y rt   superrr  rv  r  get_read_writesrh   r  r   	__class__s      ri   rr  z"ExternKernelSchedulerNode.__init__  5    #T"T1134rk   c                V    | j                          dt        | j                  dd        S )Nz.node.kernel = r  )r  r  r   rg   s    ri   r  z)ExternKernelSchedulerNode.debug_str_extra  s*    --/"/'$))EY[_2`1abbrk   c                     yNTr{   rg   s    ri   r  z#ExternKernelSchedulerNode.is_extern  r  rk   c                    | j                   J t        | j                   d      xr | j                   j                         S )Nr"  )r   r9  r"  rg   s    ri   r"  z*ExternKernelSchedulerNode.has_side_effects  s6    yy$$$tyy"45V$)):T:T:VVrk   r  r  r   r  r   rR  rO  rS  )rv   rw   rx   rr  r  r  r"  __classcell__r6  s   @ri   r  r    s    5
cWrk   r  c                        e Zd Zd fdZ xZS )r9  c                    t         |   |       | j                  |       | j                  |j	                                y rt   r2  r5  s      ri   rr  zNopKernelSchedulerNode.__init__  r7  rk   r<  )rv   rw   rx   rr  r=  r>  s   @ri   r9  r9    s    5 5rk   r9  c                      e Zd ZU dZded<   ded<   	 	 	 	 	 	 d! fdZ	 	 d"	 	 	 	 	 d#dZ	 	 d"	 	 	 	 	 d$dZ	 	 	 	 	 	 d%d	Zd&d
Z	d'dZ
d(dZd'dZ	 	 	 	 	 	 d)dZd'dZ	 	 	 	 	 	 d*dZd+dZd,dZd-dZd-dZd-dZd-dZd.dZd/dZ	 	 	 	 d0dZd1dZ	 d2	 	 	 d3dZed4d       Zed4d       Zd5dZed6d       Zed- fd        Z  xZ!S )7r   zu
    A SchedulerNode is a node for scheduling that encapsulates either
    a ComputedBuffer or a TemplateBuffer.
    z tuple[Sequence[sympy.Expr], ...]_sizesr>   r   c                f    t         |   |       | j                  |       | j                          y rt   )r3  rr  rv  _compute_attrsr5  s      ri   rr  zSchedulerNode.__init__  s,    
 	#T"rk   c                   t        | j                  t        j                  t        j                  f      sJ | j                  j                  ||      \  | _        }|| _        | j                  j                         }| j                  j                  |      j                  }| || j                        f| _        t        j                   xs t        |j                          }t        | j                  t        j                        r,| j#                  | j                  j%                  |             y | j#                  t'        j$                  | j                  g| j                  d|i       y )Nextra_indexing_constraintsrecompute_sizes_body_func)	normalizerI  )r   r   r)   r   TemplateBuffersimplify_and_reorderrB  r   get_device_or_errorr  get_backendgroup_fnr   r&   loop_ordering_after_fusionrR   r   r  extract_read_writesr(   )rh   rG  rH  bodyr  rN  should_normalizes          ri   rD  zSchedulerNode._compute_attrs  s7   
 $))b&7&79J9J%KLLL II::'A&? ; 
T 
..0>>--f5>>ht{{34
  &@@@ 
KKI
 E
 dii!2!23  		--8H-I   00JJ!%8Hrk   c                *    | j                  ||       y )NrF  )rD  )rh   rG  rH  s      ri   recompute_size_and_bodyz%SchedulerNode.recompute_size_and_body  s    
 	'A&? 	 	
rk   c                   t        d | j                  j                  D              }| j                  t	        j
                  | j                  g| j                  d|ij                  |      j                  | j                               | j                  j                  |        |r!ddlm} |j                  j!                          y y )Nc              3  N   K   | ]  }t        |t        t        f      s|  y wrt   )r   r4   r3   r  s     ri   r   z5SchedulerNode.refresh_dependencies.<locals>.<genexpr>  s#      0
ZgwEW5XC0
s   %%rI  r   SIMDScheduling)r   r   r   r  r(   rP  r   rB  r  r  rb  pointwise_read_writesclear_cachecodegen.simdrX  candidate_tilingscache_clear)rh   rI  need_clear_tiling_cache	fake_depsrX  s        ri   refresh_dependenciesz"SchedulerNode.refresh_dependencies  s    
 &0 0
++110
 &
	 	,,

![[4= Yy!VD))*	
 	""..t4"4 ,,88: #rk   c                    | j                   j                  |      | _         | j                   j                  | _        | j	                  dd       y )NFTrI  r^  )r   reorder_iter_loopssizesrB  r`  )rh   	new_orders     ri   apply_new_loop_orderz"SchedulerNode.apply_new_loop_order.  sA    ZZ22

 jj&&!!E4!Prk   c                   | j                   j                         }t        | j                   j                        |z
  }t	        t        |            }t	        t        |||z               }| j                  ||z          t        | j                  d         dk(  sJ | j                  d   | j                  d   d   | j                  d   d   ff| _        y )Nr   r   r   )r   get_original_num_rdimsr   	iter_varsr   rangerf  r   )rh   	num_rdims
num_pwdimspwdimsrdimss        ri   swap_pw_red_dimensionz#SchedulerNode.swap_pw_red_dimension6  s    JJ557	--.:
uZ()eJ
Y(>?@!!%&.14::a=!Q&&&ZZ]TZZ]1%5tzz!}Q7G$HH
rk   c                D    | j                   j                         | _         | S rt   )r   extract_pw_from_reductionrg   s    ri   rq  z'SchedulerNode.extract_pw_from_reduction@  s    ZZ99;
rk   c                    t         j                  |       sy t        | j                  t        j
                        sJ | j                  j                         5  | j                          d d d        y # 1 sw Y   y xY wrt   )r   r   r   r   r)   r   with_original_inner_fnrD  rg   s    ri   cancel_reduction_splitz$SchedulerNode.cancel_reduction_splitD  s^     33D9$))R%6%6777YY--/ 	"!	" 	" 	"s   A11A:c                   t        | j                  t        j                  t        j                  f      sJ | j
                  j                  ||      | _        | j
                  j                  | _        | j                  j                         }| j                  j                  |      j                  }| || j                        f| _        | j                  dd       y )NTrb  )r   r   r)   r   rJ  r   #expand_dimension_for_pointwise_noderd  rB  rL  r  rM  rN  r   r`  )rh   	dimension	new_ranger  rN  s        ri   rv  z1SchedulerNode.expand_dimension_for_pointwise_nodeK  s     $))b&7&79J9J%KLLLZZCCy

 jj&&..0>>--f5>>ht{{34
 	!!D$!Ork   c                    | j                   j                         | _         | j                   j                  | _        | j	                  dd       y )NTFrb  )r   merge_loopsrd  rB  r`  rg   s    ri   rz  zSchedulerNode.merge_loops\  s<    ZZ++-
jj&& 	!!D%!Prk   c                   d }| j                   d   }t        |      |j                  cxk(  r|j                  k(  rn n|j                  |      }|rPt        xj
                  dz  c_        t        j                  d| j                         |       | j                  |       yt        j                  d| j                                y)Nr   r   z"Reorder loops for %s with order %sTzEDon't reordering %s because we can not decide the suitable loop orderF)
rB  r   num_varsdecide_loop_order_to_matchr*   num_loop_reorderingloop_ordering_logr  r  rf  )rh   r  r  re  
self_sizess        ri   r  z'SchedulerNode.reorder_loops_by_dep_pairh  s     	[[^
z?h//E93E3EE ;;IFI''1,'##4dmmoy %%i0##W rk   c                $   | j                         }| d| j                  d    | d| j                  d    | d| j                   g}| j                  j	                         D ]  }t        |t              r|j                  }t        j                  j                  |      }t        |t        j                        rZ|j                  | dt        |j                                 t        | j                   t"              rR|j                  d| d       |j                  t%        j&                  | j                   j)                         d	             | j*                  J |j-                  | j/                                d
j1                  |      S )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:r  r  )r  r   rB  r   r  r   r4   r   rW   r   r{  r)   r~  r   r&  r$  r   r>   r   r(  r,  r   r  r  join)rh   r   linesr   r  r   s         ri   r  zSchedulerNode.debug_str_extra  sK   }}f$TZZ]O4f'

17fIdkk]+

 ##446 	OCc7+88gg((2!#r'9'9:LLH:Z

8K7L!MN	O djj(+LL6${34LL)=)=)?HIyy$$$T//12yyrk   c                    | j                   S rt   )rB  rg   s    ri   r  zSchedulerNode.get_ranges      {{rk   c                <   t        | j                  t        j                  t        j                  f      sJ dt        | j                               t        | j                  j                               xr' | j                  d u xs | j                  j                   S Ntype(self.node)=)
r   r   r)   r   rJ  r   rs   r   r   has_partial_accumulaterg   s    ri   r   zSchedulerNode.is_reduction  s    $))b&7&79J9J%KL 	
tDII !	
L DII0023 
JJ$Gdjj&G&G"G	
rk   c                    t        | j                  t        j                        sJ dt	        | j                               | j                  j                         dk(  S )Nr  dot)r   r   r)   r   r   r   rg   s    ri   r  zSchedulerNode.is_native_matmul  sJ    $))R%6%67N<LDO;M9NN7yy++-66rk   c                L   t        | j                  t        j                  t        j                  f      sJ dt        | j                               t        | j                  t        j                        xr. t        | j                  j                  t        j                        S r  )r   r   r)   r   rJ  r   r   	SplitScanrg   s    ri   r  zSchedulerNode.is_split_scan  sy    $))b&7&79J9J%KL 	
tDII !	
L $))R%6%67 
JIINNBLL=
 	
rk   c                J    t        | j                  t        j                        S rt   r   r   r)   rJ  rg   s    ri   r  zSchedulerNode.is_template  s    $))R%6%677rk   c                f    t        | j                  t        j                        r| j                  S d S rt   r  rg   s    ri   r  zSchedulerNode.get_template_node  s$    &tyy"2C2CDtyyN$Nrk   c                f    | j                          | j                          | j                  |       y rt   )rC  r  r3  )rh   
index_varss     ri   runzSchedulerNode.run  s#    ""$Z rk   c                &   | j                   }t        t        t        |            t        t        t        |            k(  sJ t	        t        t        j                  j                  |      t        j                  j                  |                  }|S rt   )	rB  r   mapr   dictzipr  r  from_iterable)rh   r  rd  r   s       ri   ranges_from_index_varsz$SchedulerNode.ranges_from_index_vars  sp     3sE?#s3sJ+?'@@@@--j9--e4

 rk   c                   | j                  |      }	 t        j                  t        t        j                         |            5  t        j
                  j                  |       5   | j                  |  ddd       ddd       y# 1 sw Y   xY w# 1 sw Y   yxY w# t        $ r" t        j                  d| j                          w xY w)a  
        Generate code for this node using the provided index variables.

        This method sets up the appropriate context for code generation, including
        simplifying indexing expressions based on the variable ranges, and then
        calls the node's body function with the index variables.

        Args:
            index_vars: A sequence of sequences of sympy expressions representing
                        the index variables for each dimension of the computation.
        NzError in codegen for %s)r  rW   set_ops_handlerrD   get_ops_handlerr:  set_current_noder   r  r  fatalr   )rh   r  r   s      ri   r3  zSchedulerNode.codegen  s     00<
	!!"213D3D3F
"ST())$/( 

J'	( ( ( ( ( (
  	II/;	sA   1B  B$B4B<B B	
BBB B +Cc                    |r| j                   nt        | j                         \  }}t        j                  | j                  |t
        j                  j                  gt        |      z  g      S )z\
        Get the memory dependencies in either the pointwise or the reduction axes.
        )hidden_args)	rB  r   r(   rP  r   r   SZeror   )rh   	pointwise
keep_sizesignore_sizess       ri   "pointwise_or_reduction_read_writesz0SchedulerNode.pointwise_or_reduction_read_writes  sT     3<4;;$++AV 
L//JJ
%'',,#lBS1S0T
 	
rk   c                &    | j                  d      S )zH
        Get the memory dependencies in the non-reduction axes.
        Tr  r  rg   s    ri   rY  z#SchedulerNode.pointwise_read_writes  s    
 666FFrk   c                &    | j                  d      S )zD
        Get the memory dependencies in the reduction axes.
        Fr  r  rg   s    ri   reduction_read_writesz#SchedulerNode.reduction_read_writes  s    
 666GGrk   c                   | j                         ryt        d | j                         D              ryt        | j                  j
                        dk(  rt        |t        j                        rt        t        | j                  j
                              }t        |t        j                        sJ dt        |             |j                  |j                  k(  xr |j                  |j                  k(  S y)NFc              3  <   K   | ]  }|j                           y wrt   )r%  r  s     ri   r   z,SchedulerNode.can_inplace.<locals>.<genexpr>  s     ?Ss ?r  r   ztype(write_dep)=)r  r   rt  r   r   r  r   r(   r2   r  iterr   r   r   )rh   r  	write_deps      ri   r   zSchedulerNode.can_inplace  s    ?D,<,<,>??t&&'1,l,,2
 T$"2"2"9"9:;Ii)?)?@WEUT)_DVBWW@>>Y__4X)..9XXrk   c                   t               }t        | j                  t              r| j                  j	                         D ]  }|j
                  dk(  s|j                  dk(  s#d|j                  v r|j                  d   dk(  s,t        |j                        dk(  s\|j                  d   dk(  so|j                  d|j                  v r|j                  d   n(t        |j                        dk\  r|j                  d	   nd
        |S )Ncall_methodstoremode
atomic_addr  r  r   r   r   r  )r   r   r   r>   r   r  rO  ro  r   r1  r  )rh   buffers_store_as_atomic_addr   s      ri   _get_atomic_add_buffersz%SchedulerNode._get_atomic_add_buffers  s    7A|#djj(+

,,. GG},w.4;;.4;;v3F,3V		Na/DIIaLL4P 033!T[[0 F+.1$))n.Adiilr +*rk   c                p    | j                   | j                   j                  d      ryt        |          S )Ndevice_assert_asyncT)r   has_opr3  r"  rh   r6  s    ri   r"  zSchedulerNode.has_side_effects  s2     ::!djj&7&78M&Nw'))rk   )r  r  r   z+Union[ir.ComputedBuffer, ir.TemplateBuffer]r   rR  NN)rG  *Optional[tuple[dict[Any, Any], list[Any]]]rH  zOptional[Callable[_P, _T]]r   rR  )rG  r  rH  zOptional[Callable[..., Any]]r   rR  )rI  rs   r^  rs   r   rR  )re  Sequence[int]r   rR  rQ  r   r\   )rw  r
  rx  r
  r   rR  r  rO  )r   Sequence[Sequence[sympy.Expr]]rS  r  )r  Sequence[sympy.Expr]r   rR  )r  r  r   zdict[sympy.Expr, sympy.Expr])r  r  r   rR  r  )r  rs   r   ri  )r   ri  r  r  )"rv   rw   rx   r  ry   rr  rD  rT  r`  rf  ro  rq  rt  rv  rz  r  r  r  r   r  r  r  r  r  r  r3  r  rF   rY  r  r   r  r"  r=  r>  s   @ri   r   r     s   
 -,O : 
	 RV@D$N $> 
	F RVBF
$N
 $@
 
	
;;8<;	;<QI"PP),P	P"
Q!.7	. ,
7
8O!
8	%0 !%	
	
	 	
 G G H H + +& * *rk   r   c           	     n     j                   } j                  t        j                  j	                  |D cg c]  }|j
                   c}             t         fdt        j                  |D cg c]  }|j                   c} D               j
                  j                  z
   _        y c c}w c c}w )Nc              3  Z   K   | ]"  }|j                   j                         vr| $ y wrt   r   r  )r   r   group_snodes     ri   r   z2refresh_group_node_dependencies.<locals>.<genexpr>+  s.      
xx{;;== 
   (+)
r   r  r(   
ReadWrites
merge_listr   r   unionrk  r  )r  r   rA  s   `  ri   refresh_group_node_dependenciesr  "  s     F**6+JaAMM+JK
 	 
!'')O1!*>*>)OP
 	

 
!
!
(
(	) " ,K *Ps   B-0B2r  c                   t        | t        t        f      sJ || _        || _        d | _        t        j                  |D cg c]  }|j                  |j                   c} | _        t        |        t        d | j                  D              | _        t        d | j                  D              | _        | j                         D ci c]  }|j                         | c}| _        y c c}w c c}w )Nc              3  4   K   | ]  }|j                     y wrt   r^  r   rA  s     ri   r   z"init_group_node.<locals>.<genexpr>C       HHr  c              3  4   K   | ]  }|j                     y wrt   )r_  r  s     ri   r   z"init_group_node.<locals>.<genexpr>D  r  r  )r   r   GroupedSchedulerNoder   r  r   r   r  r   r  r  r^  r  r_  rt  r  rf  )r  r  r   rA  r   s        ri   init_group_noder  4  s    
 k$68L#MNNNK%KK&,,%	A!)@!++	AK $K0H[5G5GHHKH[5G5GHHK'2'>'>'@# ##K 
B#s   C*C*	C/c                      e Zd ZU dZded<   e	 	 	 	 	 	 d!d       Zd"dZd#dZe	d$d       Z
	 	 	 	 	 	 d%dZd& fd	Ze	d'd
       Zd'dZe	d(d       Zd)dZd'dZd'dZ	 	 	 	 	 	 d* fdZe	d(d       Ze	d(d       Zd+dZd'dZe	d,d       Ze	d,d       Ze	d,d       Ze	d,d       Ze	d-d       Zd.dZe	d,d       Zd/dZd0dZ d1dZ!d'dZ"e	d, fd        Z# xZ$S )2r   z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    r  r   c           	        |j                   |j                   u sJ t        |t        t        f      sJ |j	                         rt        |t
              rt        |j                  t              sJ t        |j                  j                        dk(  sJ t        t        t        |j                  j                              t              sJ t        t        |j                  j                              j                  }|j                         D cg c]  }|j	                         s| }}t        |      dk(  sJ |d   }t        |j                  j                        dk(  sJ t        t        |j                  j                              }t        |t               sJ t#        t!        ||j$                  |j&                  |j(                  |j*                        g      |j                  _
        nt        |t        t        f      sJ t-        t/        j0                  |j                         |j                                     } | |j                   |      S c c}w )Nr   r   )r  r   r   r   r  r  r   r;   r   r   r  r  r  r3   r   r   r2   r   r   	var_namesr   r  r   r  r  )	rn   r~   r   r   r   template_nodesr  writer  s	            ri   ro   zFusedSchedulerNode.fuseS  s    %//111%-1C!DEEE:e5N#O ejj+666u((//0A555d4(9(9(@(@#ABGLLLU..5567<<D/4/@WtDDTDTDVdWNW~&!+++*1-M}00778A===m77>>?@EeY///'1ekk5??EJJ

(E$ em5G%HIIIY__U__%68IJK5??E**! Xs   I'Ic                    | j                   D ]6  }t        |t              sJ |j                         sJ |j	                          8 | S rt   )r   r   r   r   rq  rh   r   s     ri   rq  z,FusedSchedulerNode.extract_pw_from_reductionu  sJ    {{ 	0Gg}555'')))--/	0 rk   c                j    | j                   D ]$  }t        |t              sJ |j                          & y rt   )r   r   r   ro  r  s     ri   ro  z(FusedSchedulerNode.swap_pw_red_dimension|  s1    {{ 	,Gg}555))+	,rk   c                    t        t        d d | j                         D                    }t        |      dk(  ry t	        |      }|S )Nc              3  |   K   | ]4  }|j                         s|j                         r|j                          6 y wrt   r  r  r  r  s     ri   r   z4FusedSchedulerNode.estimate_flops.<locals>.<genexpr>  6      '')T^^-= '')   :<r   r   filterr   r   r   rh   fpsr  s      ri   r  z!FusedSchedulerNode.estimate_flops  K      $ 0	
 s8q=#h
rk   c                   | j                         ryd}| j                  D ]`  }t        |t              sJ |;t	        |      t	        |j
                  d         k7  rt        j                  d        y|j
                  d   }b d}|J t        |      |j                  cxk(  r|j                  k(  rn n|j                  |      }|s%t        j                  d| j                                yt        xj                  dz  c_        t        j                  d| j                         |       | j                  D ]%  }t        |t              sJ |j                  |       ' t        |        y)	z@
        Return true if a loop reordering is performed.
        FNr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %sT)r  r   r   r   r   rB  r  r  r   r|  r}  r  r*   r~  rf  r  )rh   r  r  r  r  re  s         ri   r  z,FusedSchedulerNode.reorder_loops_by_dep_pair  sL    
[[ 	)Ee]333%%
*;uU\\RS_?U*U!''G aJ	) 	%%%z?h//E93E3EE ;;IFI##a ##q(#;T]]_i	
 [[ 	2Ee]333&&y1	2 	(-rk   c                    t         |   |       t        | ||       g | _        t	        |d       j
                  | _        y )Nc                4    t        | j                               S rt   )r
  r   r  s    ri   rp  z-FusedSchedulerNode.__init__.<locals>.<lambda>  s    s1>>3C/D rk   r  )r3  rr  r  r  r  r   )rh   r  r   r6  s      ri   rr  zFusedSchedulerNode.__init__  s8    #i0%'
%DEKK
rk   c                z    dj                  | j                  D cg c]  }|j                          c}      S c c}w N_r  r   r  rh   rA  s     ri   r  zFusedSchedulerNode.get_name  )    xxt{{;!;<<;   8c                <    | j                   d   j                         S r   r   r  rg   s    ri   r  z!FusedSchedulerNode.get_first_name      {{1~&&((rk   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w rt   r   r  r   r  r  s     ri   r  z#FusedSchedulerNode.get_buffer_names  .    !L1!"4"4"6!LMM!L   9c                j    g }| j                   D ]!  }|j                  |j                                # |S rt   r   r  rt  rh   r*  r   s      ri   rt  zFusedSchedulerNode.get_outputs  4    (*KK 	.DMM$**,-	.rk   c           
     ~   t        | j                        D cg c]+  \  }}| j                          d| d|j                          - }}}| j                  d   j                  }||j                  | j                                t        j                  dj                  |      j                         d      S c c}}w )Nz.snodes[z] =
r   r  r  )r  r   r  r,  r   r  r  r   r(  r  r  )rh   r  r   r  s       ri   r  z"FusedSchedulerNode.debug_str_extra  s     %T[[1
4 }}xs%0@/AB
 
 {{1~""LL3356tyy/668&AA
s   0B9c                h    | j                   D cg c]  }|j                          }}|  d| S c c}w )Nz
, snodes: )r   r  )rh   r   
snodes_strs      ri   r  z"FusedSchedulerNode.debug_str_short  s9    9=Ed**,E
Ez*.. Fs   /c                    t         |   ||       t               }t        | j                        D ]/  }|j                  ||       |j                  |j                         1 y rt   )r3  r  r   r   r   updater]  )rh   r  r  r   r6  s       ri   r  z!FusedSchedulerNode.set_last_usage  s\    
 	24FG 0:|T[[) 	8D 35GH&&t7	8rk   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w rt   )r   r  r   r   r  s     ri   r   z$FusedSchedulerNode.used_buffer_names  s.    !MA!"5"5"7!MNN!Mr  c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w rt   )r   r  r   r  r  s     ri   r  z/FusedSchedulerNode.used_or_aliased_buffer_names  s3    8<D1a,,.D
 	
Dr  c                    | j                   S rt   rv  rg   s    ri   r   zFusedSchedulerNode.get_nodes  r  rk   c                T    t        |       j                   d| j                          dS )Nz(nodes=rx  ry  rg   s    ri   rz  zFusedSchedulerNode.__repr__  s'    t*%%&gdmmo->a@@rk   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrt   )r   r  s     ri   r   z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>  s     91>>#9r  r   r   rg   s    ri   r   zFusedSchedulerNode.is_reduction   s    9T[[999rk   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrt   )r  r  s     ri   r   z6FusedSchedulerNode.is_native_matmul.<locals>.<genexpr>  s     =A1%%'=r  r  rg   s    ri   r  z#FusedSchedulerNode.is_native_matmul  s    ====rk   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrt   )r  r  s     ri   r   z3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>
  s     :1??$:r  r  rg   s    ri   r  z FusedSchedulerNode.is_split_scan  s    :dkk:::rk   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrt   r  r  s     ri   r   z1FusedSchedulerNode.is_template.<locals>.<genexpr>  s     8q1==?8r  r  rg   s    ri   r  zFusedSchedulerNode.is_template  s    8DKK888rk   c                j    | j                   D ]$  }|j                         s|j                         c S  y rt   )r   r  r  rh   r   s     ri   r  z$FusedSchedulerNode.get_template_node  s5    KK 	0D!--//	0 rk   c                     | j                   d   S r   )r   rg   s    ri   r   zFusedSchedulerNode.get_device  s    zz!}rk   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrt   )r  r  s     ri   r   z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s     EA1--/Er  r  rg   s    ri   r  z+FusedSchedulerNode.has_aliasing_or_mutation  s    EEEErk   c                    t         rt   NotImplementedError)rh   r  s     ri   r  z'FusedSchedulerNode.update_mutated_names       !!rk   c                    t         rt   r  )rh   r   s     ri   r  zFusedSchedulerNode.add_fake_dep#  r  rk   c                    t         rt   r  r  s     ri   r   zFusedSchedulerNode.can_inplace&  r  rk   c                P   | j                         }dj                  d | j                  D              }t               }|j	                  | dt        |       j                   d| d| dt        | j                  j                         d| dt        | j                         d| d	t        | j                  j                  | j                  z
         d| d
       |j                         5  | j                         D ]!  }|j	                  |j                                # 	 ddd       |j                  d       	 |j	                  | j!                                |j)                         j+                         S # 1 sw Y   XxY w# t"        $ r t$        j'                  dd       Y Lw xY w)r|  r!  c              3  F   K   | ]  }t        |      j                    y wrt   )r   rv   r  s     ri   r   z/FusedSchedulerNode.debug_str.<locals>.<genexpr>,  s     FQQ 0 0Fs   !r   r}  r~  r  r  r  r  z.outputs = [
            Nr"  r  Tr  )r  r  r   rO   r  r   rv   r&  r   r  rk  r   r(  rt  r,  r#  r  r  r  r  r)  r  )rh   r   node_typestrr   r   s        ri   r,  zFusedSchedulerNode.debug_str)  s   }}xxF$++FF

bd		Q|n -j))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 	
 ZZ\ 	,'') ,

3==?+,	, 	c	HJJt++-.  ''))	, 	,  	HKK7$KG	Hs   )5E69F 6E? F%$F%c                p    | j                   t        d | j                   D              S t        |          S )Nc              3  <   K   | ]  }|j                           y wrt   )r"  r  s     ri   r   z6FusedSchedulerNode.has_side_effects.<locals>.<genexpr>F  s     G4t,,.Gr  )r   r   r3  r"  r  s    ri   r"  z#FusedSchedulerNode.has_side_effectsC  s0    ;;"G4;;GGGw'))rk   r~   r\   r   r\   r   r   r  rQ  r  r  )r  r  r   r  r   rR  rO  r  r   rc  r  r  rS  r  )r   torch.devicer  )r   r1   r   rR  r  )%rv   rw   rx   r  ry   rz   ro   rq  ro  rF   r  r  rr  r  r  r  rt  r  r  r  r   r  r   rz  r   r  r  r  r  r   r  r  r  r   r,  r"  r=  r>  s   @ri   r   r   J  s    $#+%+.?+	+ +B,
  "(!(.7(	(TL = =) N N	B/8#28HV8	8 O O 
 

A : : > > ; ; 9 9   F F
"""*4 * *rk   r   c                  D     e Zd Zd fdZ	 	 	 	 	 	 ddZddZddZ xZS )FusedMixOrderReductionsc                `   t         j                  |      st         j                  |      sJ ||}}|| _        || _        t        |   |j                  t        |j                               t        |j                               z          t         j                  | j                        | _
        y rt   )r   r   r~   r   r3  rr  r  r   r   r   numel)rh   r~   r   r6  s      ri   rr  z FusedMixOrderReductions.__init__K  s     33E:$77>>> %5E

OOT%//"34tEOO<M7NN	
 '00<
rk   c                   t        |t              rJ t        |t              rJ | j                  j                  ||d      syt        j                  |      rt        j                  |      sydd}	 	 	 	 dd}|r' |||f       ||      z  s ||       |||f      z  ry|j                          xsC t        j                  t        | j                  j                  ||d            | j                  k\  S )a  
        node1 is from the current mix order reduction; node2 is another node we want to fuse in.

        other_nodes are passed in to check if fusion will introduce producer/consumer relationship
        between the inner and outer reduction. If yes, we don't fuse.
        Fallow_mix_order_reductionc                B    t               } |j                  d | D         S )Nc              3  4   K   | ]  }|j                     y wrt   )r   r  s     ri   r   zTFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestors.<locals>.<genexpr>u  s     :qq{{:r  r   r  r  r   s     ri   _get_ancestorszAFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestorss  s     ,C399:E:;;rk   c                B    t               } |j                  d | D         S )Nc              3  <   K   | ]  }|j                           y wrt   )r   r  s     ri   r   zZFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_names.<locals>.<genexpr>{  s     F1q446Fr  r'  r(  s     ri   _get_operation_nameszGFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_namesw  s"     ,C399FFGGrk   )count_bytes)r  tuple[BaseSchedulerNode, ...]r   r\  )r   r  r  r   r   r   r   typingcastr
  score_fusion_memoryr!  )rh   r~   r   other_nodesr)  r,  s         ri   sub_node_can_fusez)FusedMixOrderReductions.sub_node_can_fuseW  s    e%<===e%<===
 ~~&&ueu&U //
#66u=	<	H0	H	H u~.1Ek1RR{+.BE5>.RR ""$$ {{T^^77uRW7X zz	
rk   c                   t        |t              sR| j                  | j                  || j                  f      xs( | j                  | j                  || j                  f      S | j                  | j                  |j                  | j                  |j                  f      xr/ | j                  | j                  |j                  t                     S rt   )r   r  r3  r~   r   r   rh   others     ri   can_fuse_withz%FusedMixOrderReductions.can_fuse_with  s    %!89))

EDJJ= J''

EDJJ=IJ ))

EKK$**ekk)B K((U[[%'JKrk   c                T   | j                   j                         }| j                  j                  |      }t	        |t
              rX|j                  | j                   |j                         }|j                  | j                  |j                        }t        ||      S | j                  | j                   || j                  f      r2|j                  | j                   |      }t        || j                        S |j                  | j                  |      }t        | j                   |      S rt   )	r~   r   r  rM  r   r  ro   r   r3  )rh   r6  r  backendfused_node1fused_node2r,  s          ri   	fuse_withz!FusedMixOrderReductions.fuse_with  s    &&(..,,V4e45!,,tzz5;;?K!,,tzz5;;?K*;DD%%djj%$**G$\\$**e<
.z4::FF$\\$**e<
.tzz:FFrk   r  )r~   r\   r   r\   r2  r.  )r6  r\   )rv   rw   rx   rr  r3  r7  r<  r=  r>  s   @ri   r  r  J  s6    
=2
 2
 !2
 3	2
h
KGrk   r  c                  N    e Zd ZU dZ	 	 	 	 ddZ	 	 	 	 ddZedd       Ze	 	 	 	 	 	 dd       Z	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	e	 	 	 	 dd       Z
e	 	 	 	 dd       ZeZd	ed
<   e	 	 	 	 dd       Ze	 	 	 	 dd       ZddZddZddZddZd dZd!dZ	 	 	 	 d"dZ xZS )#ForeachKernelSchedulerNodez
    This is a schedular node that consists of a set of scheduler nodes that
    has no data dependencies among them and can be executed in parallel.
    c                    |j                         D ]=  }|j                         | j                  v s | j                  |j                            c S  y rt   )rt  r  read_to_node)rh   producerr   s      ri   get_consumer_subnode_forz3ForeachKernelSchedulerNode.get_consumer_subnode_for  sL     '') 	9C||~!2!22((88	9 rk   c                   t        t                  }|j                  j                  D ]  }|j                  | j
                  j                  vr&| j
                  j                  |j                     j                         }|| j                  v sf|j                  | j                  |           t        |      dk(  rt        t        |            S y Nr   )r   r\   r   r   r   r  r=  r  name_to_noder  r   r  r  )rh   consumer	producersrd	node_names        ri   get_producer_subnode_forz3ForeachKernelSchedulerNode.get_producer_subnode_for  s     013	&&,, 	<Bwwdnn88822277;LLNID---d//	:;	< y>QY((rk   c                   t        |      }j                         r|j                         rt        j                  t              t        j                  t        |      }t        j                        t        |j                        k(  }|s |d       |xr2 t        fdt        j                  |j                        D              S |j                         rkj                         r	 |d       yt        j                  t        |      }|j                        }||j                  j                  |      S  |d       yj                         rk|j                         r	 |d       yt        j                  t              j                  |      }|j                  j                  ||      S  |d       yt        d      )	Nzforeach do not have same lengthc              3  \   K   | ]#  \  }}j                   j                  ||       % y wrt   )r  r   )r   lrrA  s      ri   r   z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>  s0      )Aq ""++Aq1)s   ),zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r  r  r/  r0  r>  r   r   r   r  r   rB  r  r   rJ  r  )rn   rA  rF  whyforeach_matchconsumer_subnodeproducer_subnodes    `     ri   r   z#ForeachKernelSchedulerNode.can_fuse  s   (+ X%8%8%:{{#=xHH{{#=xHH0C4HHM 56  S )A) &    "$$&n {{#=xHH'@@J+))228=MNNGH  "$$&n {{#=xHH'@@J+))223CXNNGHf
 	
rk   c                
   |j                         s|j                         sJ |j                         r3t        j                  t        |      }|j                  }|j
                  }n2t        j                  t        |      }|j                  }|j
                  }d }d }|j                         r|j                         r|t        j                  t        |      }t        j                  t        |      }t        |j                  |j                        D cg c]  \  }}t        j                  ||       }	}}n/|j                         rt        j                  t        |      }|j                  |      }
g }	|}d }|j                  D ]A  }||
u r*t        j                  ||      }|}|	j                  |       1|	j                  |       C n|j                         rt        j                  t        |      }|j                  |      }g }	|}d }|j                  D ]A  }||u r*t        j                  ||      }|}|	j                  |       1|	j                  |       C nt        d       | |j                  |	||||      S c c}}w )NzTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)r  r/  r0  r>  rT  rW  r  r   r   ro   rJ  r   rB  r  r  )rn   rA  rF  rT  rW  rU  rV  rM  rN  fused_nodesrR  r   new_noderQ  s                 ri   ro   zForeachKernelSchedulerNode.fuse  sZ    ""$(;(;(=== {{#=xHH(0(J(J%&66O{{#=xHH(0(J(J%&66O X%8%8%:{{#=xHH{{#=xHH  AAq #''1-K    "{{#=xHH'@@JK"KK  -++166tXFH"*K&&x0&&t,-   "{{#=xHH'@@JK"KK  -++166xFH"*K&&x0&&t,- !f  &?##+
 	
Ks    I?c                    i  _         i  _        ||qt           ||       |D ]Z  }|j                  j
                  D ]  }| j                   |j                  <    |j                         D ]  }	| j                  |	<    \ n| _        | _	        d  _
        g  _         j                  t        j                  j                  |j                  |j                  g             t!         fdt!        j"                  |j$                  |j$                        D               j                  j&                  z
   _        t)        |j*                  |j*                  g       _        t-        |j.                  |j.                  g       _        |j1                         rt3        |t4              sJ ||}}
nt3        |t4              sJ ||}}
|
j6                   _         j6                  j9                  |j6                         |
j                   _        |j                         D ]  }	| j                  |	<     j                  D ci c]'  }|j:                  j=                         D ]  \  }}||
 ) c}}} _        | _        |d   jA                         }|sJ |tC        jD                  d      fff _#        t!        tH        jJ                  jL                             _'        | _(        y c c}}}w )Nc              3  Z   K   | ]"  }|j                   j                         vr| $ y wrt   r  r  s     ri   r   z6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>\	  s0       xxt'<'<'>>	 r  r   combo_kernel))r@  rE  r3  rr  r   r   r   r   r  r   r   r  r  r(   r  r  r   r  rk  r  r  r^  r  r_  r  r   r>  r   r  rf  itemsrT  r   r   Exprr   r  fxNoderV  rW  )rh   r  r   rT  rU  rV  rW  r   r?  r   foreach_noder   r  r  vr  r6  s   `               ri   rr  z#ForeachKernelSchedulerNode.__init__:	  s    +"5GY/ 3 ,,22 8D37D%%dii08 !446 3D.2D%%d+3	3 'DN DKDI)+DJ  ''22 ,,k.E.EF  )//#668V8V   ""))* # !+"7"79N9N!OPDN +"7"79N9N!OPDN%%'!+/IJJJ+6j!+/IJJJ+6j)33DNNN!!*"6"67 , 9 9D"668 5*4!!$'5 #'++@ @%:O:O:U:U:W@26!Q1@@D  *C&%%'v

> :<>?
!%((--02.@s   ,K&c           
        |D cg c]  }t        |t              s| }}|rSt        j                  dt	        |      |D cg c])  }|j
                  |j
                  j                         + c}       |D cg c]  }t        |t              s| }}|rt        j                  dt	        |             |D cg c]  }t        |t              s| }}|rt        j                  dt	        |             |D cg c]$  }t        |t        t        t        t        f      s|& }}|D cg c]  }t        |t              s| }}|rt        j                  dt	        |             |D cg c]  }t        |t              r| }}|D cg c]  }|j                         s| }	}|	r t        j                  dt	        |	      |	       |D cg c]	  }||	vs| }}t        j                  ra|D cg c]  }|j                         s| }
}|
rt        j                  dt	        |
             |D cg c]  }|j                         r| }}|S c c}w c c}w c c}w c c}w c c}w c c}w c c}w c c}w c c}w c c}w c c}w )Nz/ComboKernels: %d external nodes are filtered %sz+ComboKernels: %d grouped nodes are filteredz;ComboKernels: %d FusedMixOrderReductions nodes are filteredz+ComboKernels: %d foreach nodes are filteredz0ComboKernels: %d template nodes are filtered: %szCComboKernels: %d reduction nodes are filtered (pointwise_only mode))r   r  r  r  r   r   rN  r  r  r9  r>  r  r&   combo_kernels_pointwise_onlyr   )rn   r  rA  externr   grouped	mix_orderfiltered_nodesforeach_nodesr  reduction_nodess              ri   combinable_nodesz+ForeachKernelSchedulerNode.combinable_nodes	  sd    #Oj4M&N!OOIIAF5;UTtyy?T&&(U
 $Kz!5I'J1KKII=G !&P1A7N)OQP	PIIMI 
*-(+	 
 
 &
A7Q)RA
 
 IICSEWX%
Z;U-VA
 
 &4Gq}}!GGIIBN#
 &4Oq7N!OO ..*8MQANN<LqMOM		Y( *8PAq~~?OaPNPy P
 VK Q



 H P N Qs|   IIII:II;I I <)I%+I*I*,I/I/I4"I4	I9I92I>I>3J	Jc                `   | j                         }g }d}t        |D cg c]0  }|D ])  }t        |t              r|j	                         D ]  }| + 2 c}}}      }|D ]  }t        t              }	|D ][  }|j                         }
|
r|
j                  dk(  s|
j                  dk(  r4|j                         |z  rH|	|
   j                  |       ] |	j                         D ];  }|j                  t        dt        |      |      D cg c]
  }||||z     c}       =  |S c c}}}w c c}w )zS
        Returns a list of lists of nodes that are to be grouped together.
           mpsr  r   )_topological_sort_nodesr   r   r  r  r   r   r   r   r   r   r   r  rj  r   )r  sorted_nodesgrouped_nodesmax_num_nodesr   r   r  excluded_buffer_namesr  device_groupsr  device_nodesr  s                ri   &_default_group_nodes_for_combo_kernelszAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels	  sq    !88:1; * ! d$;< $ 5 5 7
 	 2
 " 	E D!   3*v{{e3v{{e7K ))+.CCf%,,T23 !. 4 4 6 $$ "'q#l*;]!K %Q]):;!	. ?4s   5D$D+4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelsc                    | t         _        y rt   r>  rx  )custom_group_algorithms    ri   %set_group_algorithm_for_combo_kernelsz@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernels	  s    
 # 	#Drk   c                ,    t         j                  |       S rt   rz  r  s    ri   group_nodes_for_combo_kernelsz8ForeachKernelSchedulerNode.group_nodes_for_combo_kernels	  s     *KKIVVrk   c                    t         rt   r  rg   s    ri   r  z#ForeachKernelSchedulerNode.mark_run
  r  rk   c                    t         rt   r  rg   s    ri   r3  z"ForeachKernelSchedulerNode.codegen
  r  rk   c                     yr:  r{   rg   s    ri   r  z%ForeachKernelSchedulerNode.is_foreach	
  r  rk   c                ,    t        | j                        S )zeReturns a list of nodes which comprise the combo kernel.
        These nodes may be vertically fused.)r   r   rg   s    ri   get_subkernel_nodesz.ForeachKernelSchedulerNode.get_subkernel_nodes
  s     DKK  rk   c                t    t        t        j                  j                  d | j                  D                    S )zqReturns all nodes contained in this kernel, unpacking fused nodes
        into their constituent scheduler nodes.c              3  <   K   | ]  }|j                           y wrt   )r   r  s     ri   r   z7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>
  s     1UA!++-1Ur  )r   r  r  r  r   rg   s    ri   r   z$ForeachKernelSchedulerNode.get_nodes
  s(     IOO111U1UUVVrk   c                <    | j                   d   j                         S r   )r   r  rg   s    ri   r  z)ForeachKernelSchedulerNode.get_first_name
  s    {{1~,,..rk   c                    t        | || j                  j                         | j                  D ]  }|j	                  |        y rt   )r  r  r=  r   r  )rh   r  r   s      ri   r  z/ForeachKernelSchedulerNode.prune_redundant_deps
  s=     	d$68R8RSKK 	:D%%&89	:rk   )rA  r\   r   r  )rF  r\   r   r  rA  r\   rF  r\   r   rs   )rA  r\   rF  r\   r   r>  )NNF)r  r  r   r  rT  rs   rU  r  rV  r  rW  rs   r   rR  r  r  r   r  )r  r  r   list[list[BaseSchedulerNode]])r{  rw  r   rR  rQ  rS  r   r  r  rO  r  )rv   rw   rx   r  rB  rJ  rz   r   ro   rr  rk  r  rv  rx  ry   r|  r  r  r3  r  r  r   r  r  r=  r>  s   @ri   r>  r>    s   
)	$)	$& ,
 ,
\ >
(>
4E>
	#>
 >
J 4837 %F/F/ (F/ $(	F/
 1F/ 1F/ F/ 
F/P ?+?	 ? ?B **	&* *\ 	/ & ( / 
 T
	
 
 WW	&W W
""!
W
/:">:	:rk   r>  c                       e Zd ZU dZded<   edd       Z	 d	 	 	 	 	 	 	 d fdZddZddZ	e
dd       Zdd	Ze
dd
       ZddZe
dd       ZddZddZedd       Z xZS )r  aC  
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be *grouped* together (it does not allow another node to be scheduled
    in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
    The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
    Fusion will still happen among the nodes within each GroupedSchedulerNode.
    At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
    r  r   c                    |d   j                   t        fd|D              sJ  | |      }|D ]  }|j                  |j                         <   ! |j                  |j                         <   |S )Nr   c              3  :   K   | ]  }|j                   u   y wrt   r~  )r   r   r  s     ri   r   z.GroupedSchedulerNode.create.<locals>.<genexpr>1
  s     B44>>Y.B   )r  r   r  r  )rn   r   grouped_snoder  r  s       @ri   createzGroupedSchedulerNode.create.
  sy    1I''	B6BBBBIv. 	KE=JI(()9:	KAN	$$]%;%;%=>rk   c                L    t         |   |       t        | ||       || _        y rt   )r3  rr  r  temp_grouping)rh   r  r   r  r6  s       ri   rr  zGroupedSchedulerNode.__init__8
  s(     	#i0 +rk   c                6   | j                   r| j                  S | j                  D ])  }|| j                  j                  |j	                         <   + | j                  j                  | j	                         = | j                  j                  | j                        S )z
        Do fusion among nodes within this GroupedSchedulerNode,
        and then unpack this GroupedSchedulerNode into regular nodes.
        )r  r   r  r  r  
fuse_nodes)rh   r  s     ri   unpackzGroupedSchedulerNode.unpackG
  sx    
 ;;[[ 	HEBGDNN--enn.>?	HNN--dmmo>~~((55rk   c                    | j                  | j                  j                  |             | j                  j	                  |       y rt   )r  r   r  rk  r  )rh   fake_deps     ri   r  z!GroupedSchedulerNode.add_fake_depT
  s5    T--77AB##H-rk   c                z    dj                  | j                  D cg c]  }|j                          c}      S c c}w r  r  r  s     ri   r  zGroupedSchedulerNode.get_nameX
  r  r  c                <    | j                   d   j                         S r   r  rg   s    ri   r  z#GroupedSchedulerNode.get_first_name\
  r  rk   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w rt   r  r  s     ri   r  z%GroupedSchedulerNode.get_buffer_names_
  r  r  c                j    g }| j                   D ]!  }|j                  |j                                # |S rt   r  r  s      ri   rt  z GroupedSchedulerNode.get_outputsc
  r  rk   c                    t        t        d d | j                         D                    }t        |      dk(  ry t	        |      }|S )Nc              3  |   K   | ]4  }|j                         s|j                         r|j                          6 y wrt   r  r  s     ri   r   z6GroupedSchedulerNode.estimate_flops.<locals>.<genexpr>o
  r  r  r   r  r  s      ri   r  z#GroupedSchedulerNode.estimate_flopsi
  r  rk   c                    | j                   S rt   rv  rg   s    ri   r   zGroupedSchedulerNode.get_nodes{
  r  rk   c                X    | j                   r| j                   d   j                         S d S r   )r   r   rg   s    ri   r   zGroupedSchedulerNode.get_device~
  s$    .2kkt{{1~((*CtCrk   c                     yr  r{   )rn   rA  rF  s      ri   r   zGroupedSchedulerNode.can_fuse
  r  rk   )r   r  r   r  )F)r  r  r   r  r  rs   r   rR  r  )r  r1   r   rR  rO  r  r  r  r  rU  r  )rv   rw   rx   r  ry   rz   r  rr  r  r  rF   r  r  r  rt  r  r   r   r   r=  r>  s   @ri   r  r  "
  s     $#  $	++ (+ 	+
 
+6. = =) N N  "D  rk   r  c           
          t         j                  d fd       }t        t        t	        t         d                           }t        |      dkD  r|D cg c]  } |   	 c} t        j                  r|j                  |       |S c c}w )z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    c                t   |    dk(  s|   dk(  rt        |    dk(  |   dk(        S D cg c]  }t        ||           }}D cg c]  }t        ||          }}t        d t        ||      D              }t        d t        ||      D              }||kD  ry||kD  ryt        ||       S c c}w c c}w )Nr   c              3  :   K   | ]  \  }}|d k(  xs ||k    ywr   Nr{   r   sl_asl_bs      ri   r   z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>
  )      
)3tDAI$$
   c              3  :   K   | ]  \  }}|d k(  xs ||k    ywr  r{   r  s      ri   r   z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>
  r  r  r   )rG   absr   r  )	r  bslstride_len_astride_len_ba_firstb_firstrd  stride_lengthss	          ri   	index_cmpz"pick_loop_order.<locals>.index_cmp
  s    8q=E!HMuQx1}eAh!m44 .<<rBqE
<<-;<rBqE
<<  
7:<7V
 
  
7:<7V
 
 WW 1ay# =<s   B0	B5r   r  )r  r
  r  r
  r   r
  )		functools
cmp_to_keyr   r   rj  r   r&   pick_loop_orderssort)r  rd  priority_idxr  orderpis   ``    ri   pick_loop_orderr  
  s      4 %N1$5 6789E
<17CD.,D

y
!L Es   Bc                   |j                         }| j                         }t        |t              rt        |t              sJ |j                         }| j                         }t        |t              rt        |t              sJ t        j
                  j                  |= ||_        t        j
                  j                  |= ||_	        t        j
                  j                  j                  |       }t        j
                  j                  j                  |       |t        j
                  j                  |<   |t        j
                  j                  |<   t        j
                  j                  j                  |       }t        j
                  j                  j                  |       |t        j
                  j                  |<   |t        j
                  j                  |<   y rt   )r  r   r  r  rW   r   r  r   
name_to_opoperation_namebuffersr   remove
operations)	orig_noderY  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigs          ri   _replace_operation_bufferr  
  sU    !))+&&(MmS)j9JC.PPP224//1LlC(Z8H#-NNN	01!HM	+,*H77??  +DGGOO8$$AGGOOD,4AGG=)77##I.DGGh''AGGt'/AGG|$rk   c                p    |j                         }| j                         }||z
  }||z  }|d|z   z  }||z  S rD  )rc  re  )r~   r   epilogue_runtimetotal_read_bytestemplate_write_bytesextra_bytesextra_bytes_ratioextra_memory_ratios           ri    _estimate_fused_epilogue_runtimer  
  sX     224 779"%99K#&:: +a2C.CD 000rk   c                  T    e Zd ZU ded<   dZded<   dZded<   ddZddZdd	Zdd
Z	y)NodeUser$Union[BaseSchedulerNode, OutputNode]r   Frs   r   is_weakc                v    t        | j                  j                         | j                  | j                  f      S rt   )r  r   r  r   r  rg   s    ri   r  zNodeUser.__hash__
  s+    TYY'')4+;+;T\\JKKrk   c                    t        |t              xrW | j                         |j                         k(  xr4 | j                  |j                  k(  xr | j                  |j                  k(  S rt   )r   r  r  r   r  r5  s     ri   __eq__zNodeUser.__eq__
  s[    uh' .5>>#33.  E$5$55. -		
rk   c                6    | j                   j                         S rt   r.  rg   s    ri   r  zNodeUser.get_name
  r/  rk   c                    | j                   |j                   u sJ t        | j                   | j                  xr |j                  | j                  xr |j                        S rt   )r   r  r   r  r5  s     ri   rI  zNodeUser.merge
  sP    yyEJJ&&&II2!2!2LL*U]]
 	
rk   NrP  )r6  objectr   rs   rO  )r6  r  r   r  )
rv   rw   rx   ry   r   r  r  r  r  rI  r{   rk   ri   r  r  
  s3    
..K GTL
$
rk   r  c                 "    t         j                  S rt   )r&   r  r{   rk   ri   *used_non_deterministic_runtime_estimationsr    s    333rk   c                   t               }| j                         }t        |t        j                        r|j                  t        |j                        t        |j                        z  t        |j                        z         t        |t        j                        r$|j                  t        |j                               |S |
J d|        |S )z=Get free symbols from a node's layout (size, stride, offset).z*Expect layout to be None but found layout=)r   maybe_get_layoutr   r)   Layoutr  r    r   strideoffsetr:  get_layout_symintsrO  )r   free_symbol_usesr$  s      ri   r  r    s    1;""$F&"))$%6==)*6==)*	

 fb;;<##$6v}}$EF  ~T!KF8TT~rk   c                "   t        | t              r( t               j                  d | j                  D         S | j
                  J | j
                  j                         } |j                  d | j
                  j                         D          |S )z
    Gets symbols used in a scheduler node, including free symbols from
    the node's operations and layout symints from outputs.
    c              3  2   K   | ]  }t        |        y wrt   get_scheduler_node_symbol_uses)r   r  s     ri   r   z1get_scheduler_node_symbol_uses.<locals>.<genexpr>   s     M,U3M   c              3  2   K   | ]  }t        |        y wrt   )r  )r   ir_nodes     ri   r   z1get_scheduler_node_symbol_uses.<locals>.<genexpr>%  s     	M'
W
%	Mr  )	r   r   r   r  r   r   get_free_symbol_usesr  rt  )r   r  s     ri   r  r    s     $*+!z|!!MM
 	
 99   yy557	MTYY5J5J5L	M rk   c                l    | j                         xr# t        j                  xr |j                          S rt   )r  r&   epilogue_fusionr   s     ri   is_epilogue_fusionr  *  -    U6#9#9U%BSBSBU>UUrk   c                l    |j                         xr# t        j                  xr | j                          S rt   )r  r&   prologue_fusionr   s     ri   is_prologue_fusionr  .  r  rk   c                6    t        | |      xs t        | |      S rt   )r  r  r   s     ri   is_template_fusionr  2  s    eU+O/A%/OOrk   c                "    t        | |      r|S | S rt   )r  r   s     ri   template_fusion_pw_noder  6  s    &ue45?%?rk   c                  \    e Zd ZdZd\dZd\ fdZd]dZed^d       Zej                  d_d       Zd`dZ
dadZdbd	Zd`d
Zd`dZd`dZd`dZdcdZ	 	 	 	 dddZdedZdfdZd`dZd`dZdddZd`dZ	 	 	 	 dgdZ	 dh	 	 	 	 	 	 	 didZ	 	 	 	 	 	 djdZ	 	 	 	 dkdZd`dZ	 	 	 	 	 	 	 	 	 	 dldZdmdZ	 dh	 	 	 	 	 dndZ 	 	 	 	 	 	 dodZ!dpdZ"	 	 	 	 	 	 	 	 dqd Z#	 	 	 	 	 	 	 	 drd!Z$	 	 	 	 	 	 dsd"Z%	 	 	 	 	 	 	 	 	 	 dtd#Z&	 	 	 	 dud$Z'	 	 	 	 dvd%Z(	 	 	 	 	 	 dwd&Z)dhdxd'Z*dyd(Z+	 	 	 	 	 	 dzd)Z,	 	 	 	 	 	 d{d*Z-	 	 	 	 	 	 d{d+Z.	 	 	 	 	 	 	 	 d|d,Z/	 	 	 	 	 	 d{d-Z0	 	 	 	 	 	 	 	 d}d.Z1	 	 	 	 	 	 d~d/Z2	 	 	 	 	 	 d~d0Z3dd1Z4	 	 	 	 	 	 	 	 dd2Z5	 	 	 	 	 	 dd3Z6	 	 d	 	 	 	 	 	 	 	 	 dd4Z7	 	 	 	 	 	 d{d5Z8	 	 	 	 	 	 	 	 dd6Z9dd7Z:ddd8Z;	 	 	 d	 	 	 	 	 	 	 	 	 	 	 dd9Z<	 	 	 	 dd:Z=	 	 	 	 dd;Z>d`d<Z?d`d=Z@d`d>ZAdd?ZBdd@ZCddAZDddBZE	 	 	 	 	 	 ddCZFddDZGeHddE       ZI	 	 	 	 ddFZJ	 	 ddGZK	 	 	 	 ddHZL	 	 	 	 	 	 ddIZM	 	 	 	 	 	 ddJZN	 	 	 	 ddKZO	 	 	 	 dddLZP	 	 	 	 dddMZQ	 	 	 	 dddNZR	 	 ddOZS	 	 	 	 	 	 ddPZTddQZUd`dRZV	 	 	 	 	 	 ddSZW	 	 	 	 	 	 ddTZX	 	 	 	 	 	 ddUZYd`dVZZdydWZ[	 	 	 	 ddXZ\ddYZ]ddZZ^d`d[Z_ xZ`S )r  z
    A Scheduler is a graph of BaseSchedulerNodes. It is responsible for
    optimizations such as fusion, reorder, and graph partition.
    c                f    t        d      5  | j                  |       d d d        y # 1 sw Y   y xY w)NzScheduler.__init__)r   _initrh   r  s     ri   rr  zScheduler.__init__@  s,    ./ 	JJu	 	 	s   '0c           
         t                     t        j                  _        i  _        t        t               _        t        j                          _        t                _        t        g t        j                  j                  j                         t        j                  j                   j                         t        j                  j"                  j                                _        |D cg c]  } j'                  |       c} _        d  _        d  _         j/                           j$                  j1                  t        j                  j                   j                                 j(                  D ]  }|j3                           d  _         j7                          _         j(                  D ci c]  }|j;                         | c} _         j(                  D ci c](  }|j?                         D ]  }|j;                         | * c}} _          j<                  jC                          _"        i  _#        i  _$        t                _%        tM        jN                   j(                   j@                   jD                         _         jQ                           jS                   j(                         _         jU                           j(                  D ci c]  }|j;                         | c} _"         jW                          tX        xjZ                  t]         j(                        z  c_-        ddl/m0}m1}  | j(                         t]         j(                         _2         jg                           jS                   j(                         _        t        th        tj        tj        f              _6        tn        jp                  $to        jp                   j(                         _        tn        jr                  r'ddl:m;} |jy                           jW                           j{                   j(                         _        tn        j|                  $to        j|                   j(                         _         j                           j                          tn        j                  stn        j                  r<t               r2t        j                  j                  j                  j                          tn        j                  r)t        ddd      5   j                  d        d d d        tn        j                  rdd	lMmL}  | j(                   j@                   jD                  t        t        j                  j                  j                               t        t        j                  j                                      _        tn        j                  stn        j                  rtn        j                  s#dd
lMmQ}	  |	 j(                   j@                         t               rvt        j                  rftn        j                  st        j                  rFd}
 j(                  D ]  }t        |j                        sd}
 n |
rddl&mY}  | j(                         t        j                  rddl[m\}  |dd  fd       tM        j                   j(                         _         j                          tn        j                  rttn        j                  j                  rZtn        j                  j                  r@ j                   j(                         _         j                   j(                         _         j                          t        j                  jn                  j                  j                  r j                           | j(                         t        j                  j                   j(                          j                          t                _l        i  _m        t        d      j                   fd       t                _p        y c c}w c c}w c c}}w c c}w # 1 sw Y   ExY w)Nr   )log_ir_post_fusionlog_ir_pre_fusionr   )distributed_autotunez#Scheduler.create_combo_kernel_nodesTlog_pt2_compile_eventlog_waitcounter)num_ck_nodes)reorder_for_peak_memory)1assign_memory_planning_info_for_scheduler_buffersF)6align_runtime_estimations_across_all_distributed_ranks)trace_structuredartifactc                     dddS )N#scheduler_nodes_before_comm_overlapstring)r   encodingr{   r{   rk   ri   rp  z!Scheduler._init.<locals>.<lambda>  s     E$,) rk   c            
         dj                  t        j                        D  cg c]0  \  } }d|  d|j                         z   d|j	                          z   2 c}}       S c c}} w )Nz

zsnode[r"  z buffer_names:)r  r  r  r,  r  )r  r  rh   s     ri   rp  z!Scheduler._init.<locals>.<lambda>  sl    v{{
 )2$**(=	 !%1 %QCqMkkm, .q/A/A/C.DEF( s   5A"
)metadata_fn
payload_fngraph_statsc                 ^     j                    j                  t         j                        dS )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesr   r  rg   s   ri   rp  z!Scheduler._init.<locals>.<lambda>  s'     33+/+>+>*-djj/ rk   )qr3  rr  rW   r   r  backendsr  _post_grad_graph_counterr  r  count_graph_partition_counterr   r6  r  r   	constantstorchbind_constantsr  create_scheduler_noder  previous_nodecurrent_nodeupdate_zero_dim_cpu_tensorr  r  default_device_contextget_donated_buffersr<  r  rE  rt  r=  copyr  r  rb  seen_template_fusionsr%   decide_global_ordering_of_commsrZ   topological_sort_scheduledead_node_eliminationcompute_ancestorsr*   ir_nodes_pre_fusionr   torch._inductor.debugr  r  r  create_foreach_nodesr   r  logged_slow_fusionr&   _pre_fusion_custom_passdistributed_max_autotune_gemmr  r   scheduler  _post_fusion_custom_passrz  finalize_multi_template_buffersmax_autotune_gemmmax_autotuner   r  r  select_algorithmPrecompileThreadPoolshutdown_instancecombo_kernelsr   create_combo_kernel_nodesr  memoryget_output_namesdeterministic reorder_for_compute_comm_overlapr  r  r'   6runtime_estimations_align_across_all_distributed_ranksr  r  rP   r   r  reorder_sink_verbose_loggingtorch._loggingr  $reorder_compute_and_comm_for_overlapprocess_grouped_nodesgraph_partitionr   r[   %reorder_for_reducing_graph_partitions&maybe_reorder_for_minimizing_partition,reorder_for_partition_with_simple_dependencycompute_last_usagetest_configstrack_memory_lifecycleinsert_memory_check_nodesr  graph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_indexr   add_rowremoved_ops)rh   r  r  r   r   r  r  r   r  r  has_collectivesr  r  r6  s   `            ri   r  zScheduler._initD  sS    <>"&'?"@(1(9%5?\!&0%%**,""'') ,,113'
# >CCd003C
:>9='')##**177+<+<+A+A+CDJJ 	DOO	 ?C# $$& 	# &*ZZ;
 !AJJL!O;

 -1JJ8
$($BRBRBT8
;>CLLNC8
8
 AE@Q@Q@V@V@X 35 13 L 	"
 ::JJ##

 	!!#33DJJ?
""$<@JJ"Gq1::<?"G  	##s4::6#O$**%!$**o!!#33DJJ?
",U38_"="?))577

CDJ//. ))$/""$__TZZ0
**688DDJ,,.$$(;(;&(OO,,AASSU5&* $ B
 ..D.AB ))70

  ''177//44671773356DJ ##(O(O11UAJJ 0 0
 ;< WW<<#PP #( JJ D$TYY/*. # K4::V 88; !  CCDJJODJ""$ ""((CCDDTZZPDJJJ4::VDJ!??!!..EE**,4::&	djj) 6@\! :<'//	
 -7LM D;
8
H #HBB Bs$   5a%?a*.-a/(a5$a::bc                   i }t         j                  j                  D ]d  }t        t         j                  j                  |   t        j
                        s9t        | t         j                  j                  |   d       ||<   f |S )N)r  )rW   r   graph_inputs_originalr   r)   DonatedBufferrZ  )rh   name_to_donated_bufr   s      ri   r#  zScheduler.get_donated_buffers  sp     GG11 	D!''77=r?O?OP,BGG11$7 $-#D)	 #"rk   c                6    t         j                  j                  S rt   rW   r   current_devicerg   s    ri   rX  zScheduler.current_device&  s    ww%%%rk   c                .    |t         j                  _        y rt   rW  r  s     ri   rX  zScheduler.current_device*  s    !'rk   c                    t         j                  j                  dd      dk(  rddlm}  || j
                  d       yy)z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr  r  r]  r  )rh   r]  s     ri   rL  zScheduler.debug_draw_graph.  s1    ::>>:DASH+6 Irk   c                    t         j                  t        j                        r8t         j	                  d|       | j
                  D ]  }|j                           y y )Nz%s:)r  isEnabledForloggingINFOr  r  r  )rh   labelr   s      ri   debug_print_nodeszScheduler.debug_print_nodes5  sF    GLL)HHUE"

 #  "# *rk   c                6   |j                         J d       |j                         rt        | |      S t        |t        j
                  t        j                  f      rt        | |      S t        |t        j                        rt        | |      S t        |      )Nz2All nodes passed to scheduling must have an origin)rN  is_no_opr9  r   r)   r   rJ  r   r  r  r  r  s     ri   r  zScheduler.create_scheduler_node;  s    !- 	
@	
- ==?)$55r00"2C2CDE t,,boo.,T488%d++rk   c                   t               }g }| j                  j                         }t        j                  j
                  j                         D ]  }|D cg c]%  }||v rt        | j                  |   t              s|' }}|s6|j                  |       |D cg c]  }| j                  |    }}t        j                  dkD  }t        | |d|      }|j                  |       |D ]  }|| j                  |<     | j                  D 	cg c]  }	|	j!                         |vs|	 c}	t#        |      z   | _        y c c}w c c}w c c}	w )Nr   FrT  rW  )r   r  r   rW   r   listsr   r   rE  r9  r  r&   combo_kernels_autotuner>  r   r  r  r   )
rh   removed_node_namesfe_nodeskept_node_namesnamesr   r   rW  fe_noder   s
             ri   r,  zScheduler.create_foreach_nodesH  sN   .8l11668WW]]))+ 	8E "?*"4#4#4T#:<RS E  %%e,:?@$d''-@F@$;;a?O0*/ /	G OOG$ 807''-81	88 "ZZ
4==?BT+TD
N
5 A
s   *D<EE#Ec                *   '()  G 'fddt         t                 't        j                  '      ( j                  D ]  }|j                         D ]  }|j                         }t        |j                  j                  t        j                        rt        |j                               dkD  r^|j                         D ]J  }|(v r/|(v r+(|   }(|   }||z   }(D ]  }(|   |u s(|   |u s|(|<    6|(v r	(|   (|<   C(|   (|<   L   d) fd)	 	 d	 	 	 	 	 	 	 	 	 d()fd}	i }
t        j                  j                   j#                         D ]  }t        |t$        j&                        r|j(                  D ]  }d|
|<   	 4t        |t        j*                        sO|j-                         D cg c]  }t        |t$        j&                        s|! }}|D ]  }|j(                  D ]  }d|
|<   	   d} j                  D ]s  }|j                  J t/        |j                  j1                         d 	      }|D ]8  }t        |t$        j2                        sJ d
}||
vs&|j                         |
|<   : u  j                  D ]@  }t4        j7                  d|j                         |r|j                  J t/        |j                  j9                  d
      d 	      }|D ]d  }||
v sJ | d|
        |
|   x} j:                  |   j                         D ]*  }|j=                  t?        |j                                      , f t        |j@                  jB                        dk(  rGtE        tG        |j@                  jB                              x}rt        |tH              r|jJ                  }nd}|j                         D ]7  }t        |jM                               dk  sJ |jM                         D ]  } )|      } |	||       |j=                  t?        ||             (|   jN                  D ]  }|j                         |j                         k(  r%t        |j                  tP              sJ |j                  j                         D ]c  }|j                         } )|      }||j                         v }|j=                  tS        ||j                         |               |	||d
       e   : t        j                  jT                  |j                            D ]8  } |	||d
       |j=                  tS        ||j                         d
             : t        j                  jV                  |j                            D ]'  } |	||d       |j=                  t?        |             ) |j@                  jX                  D ]6  }t        |tR              r |	|jZ                  ||j]                  |             8 |j_                   j`                         |j                         D ]  }|jM                         D ]y  }|j                          j`                   )|      <   |j                          j`                  |<    jb                  je                  ||       jb                  |j                         <   {  C t        j                  jg                         D ]3  }t4        j7                  d|        |	|ti        t?        |                   5 |rt        j                  jj                  D ]  }|j9                  d
      D ]|  }||
v sJ | d|
jm                                 |
|   x}s) j:                  |   jo                         D ]4  }t4        j7                  d||        |	|ti        t?        |                   6 ~   j`                  D ]  }|t        j                  j                   v rE |	|ti        t?        |                   t        j                  jp                  js                  |       d|t        j                  jt                  v s |	|ti        t?        |                    tw        t        j                  j                   jm                               D  ci c]  \  } }|| 
 }!} }t        j                  jp                  D cg c]  }|!|   	 c}t        j                  _<         j                  D ]C  }|j                         D ].  }|j{                  (|j                            jN                         0 E  j|                  D ]-  } j|                  |   j{                  (|   jN                         / t               }"|"j                  d       (jO                         D ]]  \  }}#|"j                         5  |#jN                  D $cg c]  }$|$j                          }%}$|"j                  d| d|% d       ddd       _ |"j                  d       |"j                         j                         }&t        j7                  d       t        j7                  d|&       yc c}w c c}} w c c}w c c}$w # 1 sw Y   xY w)zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        c                  >    e Zd ZdZ	 	 d	 	 	 	 	 ddZddZd	 fdZy)
1Scheduler.compute_dependencies.<locals>.DedupListan  
            This data structure behaves like a list except it makes sure the
            elements remain unique.
            Normally one could use a OrderedSet/dict for this purpose however
            the list in question gets elements appended as it is being
            iterated over which means that we need to keep the list
            semantics.
            Nc                @    |xs g | _         |xs
 t               | _        y rt   )r]  r   
membership)rh   r]  rv  s      ri   rr  z:Scheduler.compute_dependencies.<locals>.DedupList.__init__|  s    
 #[b
","<
rk   c                    || j                   v ry | j                  j                  |       | j                   j                  |       y rt   )rv  r]  r   r  )rh   	node_users     ri   r   z8Scheduler.compute_dependencies.<locals>.DedupList.append  s5    /

!!),##I.rk   c                    t        j                  | j                  |j                        }| j                  |j                  D cg c]  }|| j                  vs| c}z   } ||      S c c}w rt   )r   r  rv  r]  )rh   r6  new_membershiprA  	new_items	DedupLists        ri   __add__z9Scheduler.compute_dependencies.<locals>.DedupList.__add__  sc    !+!1!1$//5CSCS!T JJ${{*at.FA* 	 !N;;*s   A+A+r  )r]  zOptional[list[_T]]rv  zOptional[OrderedSet[_T]]r   rR  )rx  r^   r   rR  )r6  DedupList[_T]r   r~  )rv   rw   rx   r  rr  r   r}  )r|  s   ri   r|  rt  r  s;     -17;=)= 5= 	=/<rk   r|  r   c                N    | j                   v r j                   |          S | S rt   )rb  )r  r  rh   s    ri   r  z.Scheduler.compute_dependencies.<locals>.rename  s,    D)))d33A677Hrk   Fc                P     |          j                  t        |||             y rt   )r   r  )used_by_namer-  r   r  name_to_usersr  s       ri   add_userz0Scheduler.compute_dependencies.<locals>.add_user  s)     &./66K9rk   Nc                    | j                   S rt   r  r  s    ri   rp  z0Scheduler.compute_dependencies.<locals>.<lambda>  s
    AFF rk   r  Tzscheduling %s)unbacked_onlyc                    | j                   S rt   r  r  s    ri   rp  z0Scheduler.compute_dependencies.<locals>.<lambda>  s
    !&& rk   z not in )r  )mutating_bufr  )r  )r  zscheduling output %sz+scheduling output %s for unbacked symint %srJ  'z': r!  rK  zBUFFER USER LIST
z===== AFTER SCHEDULING =====
%s)r  r  r   r  )FF)
r  r  r-  r  r   rs   r  rs   r   rR  )Er	   r^   r  r   r  rt  r  r   r   r$  r)   r=   r   r%  rW   r   r  r   r   r^  r    	TensorBoxr  r  get_unbacked_symbol_defsSymbolr  r  r  rE  r  r3   r   r  r  r  r2   r  r'  r]  r\   r4   additional_buffer_depsadditional_star_depsr   r   r   r  rb  r  r  r;  rD  graph_outputsr   r  mutated_inputsr  r  r  mutated_input_idxsrJ  r<  rO   r  r(  r)  r  compute_dependencies_log)*rh   r   buf1	buf1_name	buf2_namelist1list2combinedr  r  unbacked_symbol_to_origin_nodevalfsrk  sym_sizehas_non_input_unbacked_defsunbacked_symbol_defsunbacked_symbol_usesrN  r   r   	node_modealt_namer+  out_buf
other_nameis_aliasadd_depr?  r  r   r   r   	inp_nameslogbufr  rb  r  r  r|  r  r  s*   `                                      @@@ri   rZ   zScheduler.compute_dependenciesl  sB
   	< 	<@ @K?V?V@
 JJ 	LD((* L MMO	 tyy//?D,,./!3!%!1!1!3 LI M1i=6P -i 8 -i 8#(5=#0 >C -c 2e ;#0#5#>5=c 2> #m33@3Ki03@3Ki0LL	L<	 !&!				 <		 			
 		 		 MO&
 77''..0 
	BC#uzz*** >B9=226>C. (+||~S!Auzz9RASS! BAnn B=A6r:BB
	B ',#JJ 	HD99((( $*		224:J$  * H!!U\\222 /3+::8<215H	H  JJ V	DIIotyy1*yy,,,'-II222F(($
 . GA >> #X&D%EF> <A>>K#'#4#4Q#7#C#C#E GC --gclln.EFGG D$$++,1 d&6&6&=&=!>??S?sI.HH	 	 '') E3,,./1444 # 1 1 3 EH%h/HXt,%%ghY&GH -h 7 = = E==?dmmo=$)$))5FGGG'+yy'<'<'> EG)0)9)9);J)/
);J (073F3F3H'HH -- '$.1408L!" %ZtD%EEEEB 7799$--/J S$5 !!''4==?D"QR	S 7777H 4$6!!''"234
 ((.. F!$0TYYd.>.>t.DEF %%d&;&;< '')  # 1 1 3 H>AllnD))&*:;69llnD))(3//33HhG ++CLLN;aV	r 002 	>HII,h7Xz'(*;<=	>
 'ww,, N111E NA >> #X&D&I&I&K%LM> ;1==q=(,(9(9!(<(M(M(O NHII M ( !
 %Xz'(:K/LMNNN )) 	:Dqww+++z'$-89&&**40***z'$-89	: ,5QWW5I5I5N5N5P+Q
'E4D%K
	 
 )*(>(>&
 $IdO&
"
 JJ 	CD'') CmCLLN;AABC	C // 	SD''-77d8K8Q8QR	S  !c'--/ 	4JC 4/4{{;!;;#c%234 4	4 	c  "))+ &&';< &&'I3OK TX
&
" <4 4s6   5i4i42i9i?j	j5j	j		j	c           
         ddl m}m}m}m} t        t        j                  j                  j                               } | j                  |      }t        j                  j                  j                  s | j                   j                         t        t        j                  j!                               } | j                  ||      \  }}	}	t#        t%         j                              D 	cg c]  }	g g f c}	|D ]}  }
|
j&                  dk(  r|
j(                  dk(  r"|
j*                  j-                         }|
j.                     d   j1                  |       |
j2                     d   j1                  |        ddlm}  |        	 	 	 	 	 	 d fd}g }t9         j                        D ]H  \  }}|j1                  |       |j1                   |||t%         j                        dz
  k(               J | _
        y c c}	w )Nr   )r  compute_memory_timelineFreeableInputBufferget_freeable_input_bufr   )register_check_mem_opc                X   |    d   }|    d   }|||g}t        j                  t        t        j                  d            t        j
                  j                  j                  j                  g |d       }dj                  |    j                          |_        t        |      S )Nr   r   r  )r  c                $    | |d   |d   |d   dfS )Nr   r   r   )alivedeadis_final_stepr{   )tensor_argsr  s     ri   rp  zWScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node.<locals>.<lambda>  s*    !.q!1 -a 0)6q)9C rk   )r$  r:  r  nontensor_argsunflatten_args
mem_check_)r)   MemoryCheckKernelr=   r  r  r  _inductor_debugcheck_memory_stepdefaultr  r  r  r  )step_idxr  expected_newly_aliveexpected_newly_deadr  r   rh   step_allocs_deallocss         ri   construct_mem_check_nodezEScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node  s     $8#A!#D "6x"@"C24GWN''!e)<=yy00BBJJ- D %/tzz(/C/L/L/N.O"PD,T488rk   )r  )r  r
  r  rs   r   r  )r:  r  r  r  r  r   rW   r   r  r   r  r  r  r&   r  r=  r;  rj  r   
size_alloc	size_freerT  r  
start_stepr   end_step#torch._inductor.runtime.debug_utilsr  r  )rh   r  r  r  r  r  name_to_freeable_input_bufr  buf_info_listr  buf_infor  r  r  	new_nodesr  r   r  s   `                @ri   rJ  z#Scheduler.insert_memory_check_nodes{  s   	
 	
 )31773G3G3L3L3N(O"4::|< 	# %%===

D,, *4AGG4L4L4N)O5JJ&
q! $C

O4C
RHC
 & 	HH""a'H,>,>!,C//1H !4!45a8??I !2!23A6==hG	H 	N	9	9*.	9&	92 	 , 	GAtT"(1DJJRS@S;SU	 
eC
s   3Hc                |  	 t         j                  syg }t        | j                        D ]  }dd	d}|j	                         D ]  }t        	fd|j                  D              }|r\t        j                  d|j                                t        j                  j                  j                  |j                                d} |j                          xr | }|s|j                  |       t        j                  d|j                                t        j                  j                   j                  |j                                |j"                  j$                  D ]  }|j&                  | j(                  v s| j(                  |j&                     j                  }|D cg c]0  }|j*                  j                         |j                         k7  s/|2 c}| j(                  |j&                     _          t-        t        |            | _        | j                  D ]  }|j/                           yc c}w )	z0
        Remove any nodes without users
        Nc                r    | j                   xs* | j                         t        j                  j                  v S rt   )r  r  rW   r   r  )r+  s    ri   can_eliminate_userz;Scheduler.dead_node_elimination.<locals>.can_eliminate_user  s&    ||Tt}}!'':T:T'TTrk   Fc              3  .   K   | ]  } |        y wrt   r{   )r   ur  s     ri   r   z2Scheduler.dead_node_elimination.<locals>.<genexpr>  s     #Ma$6q$9#M   zremoved dead buffer: %sTzremoved dead operation: %s)r+  r  r   rs   )r&   use_dcer   r  rt  r   r  r  r  r  rW   r   r7  r  r"  r   r  r   r   r   r=  r   r   r  )
rh   updated_nodesr   active_buffersr   can_eliminater?  r  r  r  s
            @ri   r(  zScheduler.dead_node_elimination  s    ~~
 TZZ( 	DU #N'') * ##M399#M M II7HGG++//?%)N* !% 5 5 77N<NM $$T* 		6H**..t}}? ,,22 DyyD$4$44 $ 0 0 ; A A',="#0AT]]_0TA=((39-	8 (=12
 JJ 	#D  "	#=s   %0H9H9c                
    |duS )z:Check if store mode requires cross-thread synchronization.Nr{   )rh   r  s     ri   mode_requires_synchronizationz'Scheduler.mode_requires_synchronization  s    4rk   c                    t        t                  t               g dfd|D ]  }|j                         D ]  }||<   	  |D ]
  } |        S )z?
        Ensure nodes is in topologically sorted order
        c                    | vrdj                  |        t        | j                  d       D ]&  }|j                  vr |j                            ( j	                  |        y y )Nc                    | j                   S rt   r  )ds    ri   rp  zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>  s
    aff rk   r  )r  r  rk  r   r   )r  r   rE  r*  seenvisits     ri   r  z2Scheduler.topological_sort_schedule.<locals>.visit  se    }!!"6"6<LM 2Cxx|3 ,sxx01	2
 a  rk   )r  r\   r   rR  )r   r\   r  r  )rh   r  r   r   rE  r*  r  r  s       @@@@ri   r'  z#Scheduler.topological_sort_schedule  sy     +,.59V*,	! 	!  	*D--/ *%)T"*	*  	D$K	rk   c                <    t               }t        |t        t        t        t
        t        f      r-|j                  D ]  }|j                  |j                          nt        dt        |       d       fd|D        }t        t         fd|D                    S )Nz+get_unmet_dep_nodes is not implemented for .c              3  X   K   | ]!  }j                   |   j                          # y wrt   )r=  r  r  s     ri   r   z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>&  s%     Xc))#.??AXs   '*c              3  <   K   | ]  }j                   |     y wrt   r  )r   r  rh   s     ri   r   z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>'  s     Qat66q9Qs   )r   r   r   r  r9  r   r  rk  r  r   RuntimeErrorr   r   )rh   r  
unmet_depsr   unmet_dep_opss   `    ri   _get_unmet_dep_nodeszScheduler._get_unmet_dep_nodes  s    &0l
)&"$	
 // )sxx() =d5k]!L  YZXJQ=QQRRrk   c                z   g }t         j                  | j                  d      }i }| j                  D ]P  }| j                  |      }t	        |      ||<   |D ]*  }|j                  |g       }|j                  |       |||<   , R |j                         D 	cg c]  \  }}	|	dk(  s| }
}}	|
rx|j                  |
       |
D ]7  }|j                  |g       D ]  }||xx   dz  cc<    |j                  |       9 |j                         D 	cg c]  \  }}	|	dk(  s| }
}}	|
rx|rJ d       |S c c}	}w c c}	}w )zU
        Sort nodes by their topological order, return a list of node lists.
        r   r   zTopological sort failed!)	r  fromkeysr  r  r   r  r   r]  r  )rh   r  r  childrenr   r  r   cr  rb  zero_deg_nodesr+  s               ri   ro  z!Scheduler._topological_sort_nodes)  sF    djj!,#%JJ 	"D,,T2Dd)E$K "LLb) !"	" ).@1a!@@LL(# $LLB/ %D$K1$K%		! -2KKMDDAqQ!VaDND  444y A Es   D1%D1D7D7c                j   i }| j                   D ]w  }t               }|j                  D ]B  }| j                  |j                     j                         }|j                  |       |||   z  }D |||j                         <   ||_        y t        | j                         D ]  \  }}||_
        ||_         y)z.
        Populate each node.ancestors
        N)r  r   rk  r=  r   r  r  r  r   r  r^  r_  )rh   name_to_ancestorsr   r   r   dep_node_namer  s          ri   r)  zScheduler.compute_ancestorsC  s    
 9;JJ 	'D)3I.. > $ 0 0 : K K Mm,.}==	> 2;dmmo.&DN	' %TZZ0 	#KE4"DN"DN	#rk   c                H   t         j                  sy | j                  D ]  }t        |t        t
        f      r#|j                         st         j                  dk7  r=|j                         D ]3  }t        |t              r|j                         r$|j                          5  y )Nhalide)r&   rO  r  r   r   r   rR   cpu_backendr   r  rz  )rh   r   r  s      ri   rz  zScheduler.merge_loopsV  s    00JJ 	$D d]4F$GHKKMf&8&8H&D) $!%75;L;L;N!!#$	$rk   c                   t        ddd      5  t        d      D ]  }t        |      }t        j	                  d|dz   |       | j                  |d      }t        |      }t        j	                  d	|dz   ||       ||k(  s|dk(  slt        j	                  d
|dz           n t        j                  st        j                  r| j                  |d      }|cddd       S # 1 sw Y   yxY w)zB
        Combine eligible nodes into FusedSchedulerNodes.
        zScheduler.fused_nodesTr  r  z/===== attempting fusion (%d/10): %d nodes =====r   F)is_reorder_roundz=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)	r   rj  r   r  r  fuse_nodes_oncer&   rO  loop_index_inversion_in_fusion)rh   r  r  old_lennew_lens        ri   r  zScheduler.fuse_nodesq  s     #4QU
 	 2Y e*  EE
 ,,UU,Ke*  TE	 g%A$$Eq1u ', 1188,,UT,J;	 	 	s   A7C!AC!!C*c                    g }| j                   D ]4  }|j                  t        |t              r|j	                         n|g       6 || _         y)zA
        Unpack GroupedSchedulerNode into regular nodes.
        N)r  r  r   r  r  )rh   r  r   s      ri   rB  zScheduler.process_grouped_nodes  sJ     .0	JJ 	D!+D2F!GdV	 
rk   c                    t        |      dkD  sJ |d   j                         }|| _        | j                  |      }t	        ddd      5  |j                  |      cddd       S # 1 sw Y   yxY w)
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        r   benchmark_fused_nodesTcompile_time_autotune_time_us)r  dynamo_compile_column_usN)r   r   rX  rM  r   r  )rh   r  r  r9  s       ri   r  zScheduler.benchmark_fused_nodes  st     5zA~~q$$&$""6*#"&%D
 	8
 007	8 	8 	8s   
A%%A.c                    t        |      dkD  sJ |d   j                         }|| _        | j                  |      }t	        d      5  |j                  |||      cddd       S # 1 sw Y   yxY w)r  r   generate_kernel_code_from_nodeshint_overrideN)r   r   rX  rM  r   r  )rh   r  benchmark_kernelr  r  r9  s         ri   r  z)Scheduler.generate_kernel_code_from_nodes  sw     5zA~~q$$&$""6*;< 	::'} ; 	 	 	s   A%%A.c                    || _         | j                  |      }t        d      5  |j                  |      cddd       S # 1 sw Y   yxY w)r  benchmark_codegened_moduleN)rX  rM  r   r  )rh   moduler  r9  s       ri   r  z$Scheduler.benchmark_codegened_module  sH     %""6*67 	>55f=	> 	> 	>s	   ?Ac                   t         j                  j                  }|syt        j	                  d||       |j
                  D ]  }|j                         }t        |dd      r||vr%|j                  }||   }t        |t        j                        r'|j                  |j                         |j                  }t        |t        j                        s||k7  st        j                  d|||        y y)z
        Check if selecting a Triton template would cause layout conflicts.
        Returns True if there's a conflict and we should fall back to ATen.
        FzNode %s has constraints %sr$  NzOLayout conflict detected for %s: template expects %s but layout is frozen to %sT)rW   r   buffer_layout_constraintsr  r  r  r  r  r$  r   r)   FlexibleLayout freeze_layout_with_exact_stridesr  FixedLayoutr  )rh   
multi_nodeconstraintsinpinp_namer$  expected_layouts          ri   !_has_layout_conflict_for_templatez+Scheduler._has_layout_conflict_for_template  s     gg77		.
KH$$ 	C||~H3$/8;3NZZF)(3O&""3"34 44_5K5KL&"..1o6Oe#	 -	0 rk   c           
        t        | j                        D ]&  \  }}t        |t              st        |j                  t
        j                        s=|j                  }t        j                  j                  s|j                         \  }}n t        d |j                         D              }t        |t        j                  j
                  j                        r| j!                  |      r||j                         D ]4  }t        |t        j                  j"                  j$                        s2|} n t        t        j                  j"                  j$                        sJ d       t        |t        j                  j
                  j                        rt        j&                  ri }||d<   t        j&                  D ]k  }|j                  |      }	|	j)                         D 
ci c]  \  }
}t        |
t              r|
| }}
}t+        |j)                         d       d   }|||<   m |j                  j-                  |       n|j                  j/                  |       Nt
        j0                  j3                  |j4                        5  |j7                         }ddd       j8                  }t        |t
        j:                        sJ |j8                  }t        |t
        j<                        sJ |j>                  rtA        ||j>                         |jB                  |_!        | jE                  ||||       ) yc c}}
w # 1 sw Y   xY w)a  
        Finalize a backing choice for MultiTemplateBuffers which did not already have a
        choice finalized through fusion. In the case of an extern choice, this will result
        in replacing the SchedulerNode.

        If a MultiTemplateBuffer did not have any fusion opportunities, finalizing a choice
        will force completion of compilation and benchmarking.
        c              3  |   K   | ]4  }t        |t        j                  j                  j                        r| 6 y wrt   )r   r  r  r5  ExternKernelCaller)r   timings     ri   r   z<Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>	  s6       &) & % @ @ S S  #r  zZNo extern kernel detected to fallback to when layout constraints fail for Triton templatesNr  c                    | d   S rD  r{   r  s    ri   rp  z;Scheduler.finalize_multi_template_buffers.<locals>.<lambda>9  s    qQRt rk   r  r   )#r  r  r   r   r   r)   MultiTemplateBufferr&   rH  %force_extern_kernel_in_multi_templateget_min_choicer  choice_timingsr  r  r   r  r5  r  multi_kernel_hintsr]  r  finalize_as_triton_callersfinalize_as_triton_callerr  current_originsrV  output_noder   
StorageBoxOperationBufferorigin_noder8   r$  _replace_node)rh   r  r   r  min_node_unfusedr  choicecallershinttimingsr  rb  triton_timingsout_tensorboxout_storage
out_buffers                   ri   r2  z)Scheduler.finalize_multi_template_buffers  s    !, L	DGAt$.:		2114 "YY
**PP*4*C*C*E'$a'+*4*C*C*E	($ $OO&&??
 ==jI&0&?&?&A &F) & % @ @ S S  4: 0 %&  *"EOO$D$D$W$W   y   $OO&&??
 00QS(8 %+$=$= 3D&0&?&?d&?&SG -4MMO.$(Aq#-a1I#J !"1.N .
 &))=)=)?^%TUV%WF,2GDM3 		<<WE		;;<LMYY..z/A/AB C$4$@$@$BMC+00!+r}}===(--
!*b.@.@AAA))&}j6L6LM$.$5$5
!"":z1dCYL	Dh.C Cs   %M
MM	c                   t        ||       | j                  |      }|| j                  |<   || j                  |j	                         <   || j
                  |j	                         <   i t        j                  |j                  j                  |j                        D ]:  }| j                  j                  |j                  d       x}s,|j                  |<   < dfd} ||j                        |_
         ||j                  j                        |j                  _	        t        |j                         |j                               D ]3  \  }	}
|	| j                   |
j	                         <   |
j"                  |	_        5 |j$                  |_        |j&                  |_        |j(                  |_        |j*                  |_        y )Nc                ,    t        fd| D              S )Nc              3  @   K   | ]  }|j                          y wrt   )r  )r   r   rb  s     ri   r   z?Scheduler._replace_node.<locals>.rename_deps.<locals>.<genexpr>c  s     Kscjj)9:Kr)  r   )r  rb  s    ri   rename_depsz,Scheduler._replace_node.<locals>.rename_depsb  s    KdKKKrk   )r  rj  r   rj  )r  r  r  rE  r  r  r  r  r   r   rk  r  r  r   r  rt  r=  r  r^  r_  r   r]  )rh   r&  r  r  r   new_scheduler_noder   	real_namer*  new_outold_outrb  s              @ri   r  zScheduler._replace_nodeN  s    	"*j9!77
C*

1-?$--/*3E0 ??4#3#3#9#94;R;RS 	7C 3377$GGyG.1hh +	7	L 1<111
- 0;**000
&&, !$**,d.>.>.@!
 	*GW 4;DW--/0#MMGM		* (,~~$'+~~$'+~~$(,%rk   c                &    t        d |D              S )Nc              3     K   | ]q  }t        |j                  d       xrU |j                  duxrE t        |j                  j                  d      xr# |j                  j                  j                  dk(   s yw)r   Nscatter_moder  )r9  r   r   r1  r  s     ri   r   z,Scheduler._any_atomic_add.<locals>.<genexpr>x  so      

 	 AFFF# 9d"9^49 ((L89
s   A7A9)r   )rh   	node_lists     ri   _any_atomic_addzScheduler._any_atomic_addw  s     

 
 
 	
rk   c                "   | j                  |d|      }t        j                  |      }t        j                  j
                  j                         }|j                         sd }||fS |j                  d|      }t        |t              sJ ||fS )NT)r  r  triton_)kernel_namesource_code)r  r   loadr  r  async_compileAsyncCompileuse_process_poolr   r   r   )rh   r  r  src_codemodr9  futs          ri   compile_kernelzScheduler.compile_kernel  s     77D 8 
 x(55BBD--/C
 Sz  &&9(&SCc<000Szrk   c                >    !"#$%&'()* t        d fD              }t        j                  s|st        j	                  d      S j                         r(t        j                         t        j                        r j                         sj                         rt        j	                  d      S j                         }|d   j                         sJ j                  dk(  r(t        j                  dk7  rt        j	                  d      S j                         }t        t!        j"                  ||            } j%                  |      rt        j	                  d      S ddlm t+              *|d   j                         J dfd$|rt        d	 fD              rj                         durj                         nj                         )t        )t        j,                        sJ  j/                  )      rt        j	                  d
      S i #g !t        j0                  D ]A  })j3                  |      t5        j7                         d       D ]~  \  }}	t        |t8        j:                  j<                  j>                        s5)jA                  |      5  !jC                  |g jE                  ||jF                               ddd        tI        d      }
d}i }!D ]V  \  }}}	 ||jK                          )jA                  |      5   j[                  |      \  }}|||<   ||
k  r|}
|}ddd       X |)j\                  |<   t        |t^              sJ |#|<   D t        j`                  tc        d )jd                  D              }tg               xr  xr |t        jh                  k  "tI        d      tI        d      c&'d%"sR)j3                         )jk                         \  %&t5        j7                         tm        jn                  d            }n)jd                  D cg c]  }|df }}r(r jq                  |      n jq                  |      \  '}n4st        j	                  d
      S js                         'tu        '      (g !d}|D ]  \  }}t        |t^              ss&tw        |d      r|jx                  )jx                  k7  r?r
|&'z   k\  r nZ|dz  }|t        jh                  kD  r n@)jA                  |      5  !jC                  |g jE                  |             ddd        t{        !      dk(  rt        j	                  d
      S d!"#$%&'() fd}t        j}                  |!d   d         S  jE                  |       jE                  |        jE                  |      d $ *fd}t        j}                  |d         S # 1 sw Y   xY w# tL        $ rR}tN        jQ                  tR        jT                        r$tN        jW                  dsdndtY        |             Y d}~rd}~ww xY w# 1 sw Y   xY wc c}w # 1 sw Y   xY w)
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        c              3     K   | ]>  }|j                         xr( t        |j                         t        j                         @ y wrt   )r  r   r  r)   r  r  s     ri   r   z.Scheduler.speedup_by_fusion.<locals>.<genexpr>  sE       
  MMO J1..0"2H2HIJ 
s   AATr   r  r   CompilationErrorNc           
     t   t         j                  t        j                        r| ||z   k  rFt         j	                  dj                         j                         t        ||z   | z  d             y t         j	                  dj                         j                         t        | ||z   z  d             y y )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r  rb  rc  DEBUGr  r  rB   rC   )ms_fusedms1ms2r~   r   s      ri   
log_fusionz/Scheduler.speedup_by_fusion.<locals>.log_fusion  s    &&w}}5cCi'$$S..0..0"sSyH&<S%AC	 $$W..0..0 Hc	$:3#?A	 6rk   c              3  @   K   | ]  }|j                         d u  y wrt   r  r  s     ri   r   z.Scheduler.speedup_by_fusion.<locals>.<genexpr>  s#      %
23A!-%
s   Fc                    | d   S rD  r{   r  s    ri   rp  z-Scheduler.speedup_by_fusion.<locals>.<lambda>  s    aPQd rk   r  r  infException in compiling %s: %sr  r  c              3  <   K   | ]  }t        |t                y wrt   )r   r   )r   r  s     ri   r   z.Scheduler.speedup_by_fusion.<locals>.<genexpr>  s      %<=
167%r  r   allowed_prologue_inpsc            	        t        d      } d }i }rQrt        t        j                        sJ j	                         j                         \  t        fd      D ]  \  }}}	 ||j                         }n!s|j                  }|j                          nd }r>j#                  |      5  j%                  |      \  }}	|||<   || k  r|} |}d d d        ||k(  xs z   |   z   kD  }
|st'        |j(                        dk(  s|j(                  d   j*                  d	k  s|
s|} n r
 |        r| z   k  rJ|Ht,        j.                  r|d <   j1                         nj3                  |       |j4                  d <   y
y# t        $ rR}t        j                  t        j                        r$t        j                  dsdndt!        |             Y d }~d }~ww xY w# 1 sw Y   xY w)NrN  c                    | d      S r   r{   )rA  r  s    ri   rp  zKScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>f  s    nQqT&: rk   r  rO  r  r  r   r   rm  TF)r  r   r)   r  r  r  r  r*  r5  
precompiler  r  rb  rc  rG  r  r  swap_as_triton_callerr  r   	launchersn_spillsr&   r  r  r  _choice_timings)min_ms_fusedms_fused_choicenew_timingsr  re   	mod_fusedresr  rH  pathfusible_choicebench_epiloguer  r  r  future_choicesget_choice_timings_async hint_override_best_fusion_choicerK  
min_choicerI  rJ  	ms2_fusedr  rh   s              ri   benchmark_when_readyz9Scheduler.speedup_by_fusion.<locals>.benchmark_when_readyY  s5   $U|"& +%*ZAWAW*XXX%/%>%>%@N&0&?&?&AOJ%+&:&N
 2@ 0"-FFI!!-"(--/C!/"+"3"3CNN,"&C &'==fE 	9-1-L-L ) &.NHd
 3;K/',6/728	9 	9 '&0 N"Sy>&+AI+MM '   #CMM 2a 7 #a 0 9 9Q > ..4O!a0"d "|S#6 ',#)*D%100AP8>"==<
 #<<_M 8CJ..t4 u % !%227==A&,, ?2A
z #A
 !!	9 	9s%   .4F5$G7	G4"AG//G47H	c                    ddl m}  	 d   d   d   fD ]  }||j                           j                  d   
      \  t	        j
                        r	 d       yj                  d   
      \  t	        j
                        r	 d       yj                  d   
      \  t	        j
                        r	 d       y        t        d      rWz   k\  rOfj                  vr?j                  j                  f       t        d      j                  fd	       z   k  S # | $ r Y y	$ r}d
t        |      v rY d }~y d }~ww xY w)Nr   )NoTritonConfigsErrorr   z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelslow_fusionc            	     $      z   z  dS )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratior{   )rI  rJ  rH  path1path2
path_fuseds   ri   rp  zKScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>  s(    053605365?8@3;sSy3I% rk   Loop-carried variableT))torch._inductor.runtime.triton_heuristicsrh  r*  r  mathisinfr   r-  r  r   rO  r  )rh  r>  r  rI  rJ  rH  rr  rs  rt  rD  r  future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2rK  rh   rO  s      @@@@@@ri   rf  z9Scheduler.speedup_by_fusion.<locals>.benchmark_when_ready  s   A *!,)!,/2  )
 ?JJL) "&!@!@)!,"JC
 zz#CD$!%!@!@)!,"JC
 zz#DE$+/+J+J/2,(Hj
 zz(+CD$xc2 0>$c	1"EN$2I2II//33UENC(7?? 
 $cCi//+ ! ' .#a&8#s<   E AE +5E !5E A3E E.E.E)(E))E.rq   )rH  r  rI  r  rJ  r  r   rR  rS  )?r   r&   benchmark_fusionra   ro   r  r   r  r)   TritonTemplateBufferr  r   r   r   r  r   r  r  r3  triton.compiler.errorsrD  r  r  r  r  r  r  r]  r  r  r5  TritonTemplateCallerrU  r   r?  r  r  r*  r  r  rb  rc  rG  r  r  r  rX  r   benchmark_epilogue_fusionr   choicesr    max_epilogue_benchmarked_choicesr  operator
itemgetterr  r  r  r9  rQ  r   rr   )+rh   r~   r   is_multi_templatenode_list_1node_list_2node_list_fusedr  r  r  rY  rZ  r[  re   r\  r  rH  r^  num_triton_callerschoice_timings_iterr  rs  triton_choicesunfused_timerf  rD  r`  r  r  r  ry  rz  r{  ra  rb  rc  rK  rd  rI  rJ  re  r  rO  s+   ```                      @@@@@@@@@@@@@@@@@@ri   speedup_by_fusionzScheduler.speedup_by_fusion  s       
 U^ 
 

 &&/@$$T** u668":Q:QR!!  $$T**oo'Q**,v ;;%F$6$6($B$$T**oo'y{KHI
 0$$T**;u% #..0!!!	"  %
8=u~%
 "
 $557tCO # ''),,. 
 j"*@*@AAA55jA#((//  - TVN!'!:!: *R!+!:!:=!I!'(<(<(>N!S IFA% @ @ U U !#99&A &-- &!%!4!4$36CWCW "5 ""   %U|FJ 1? 5-FFI
!!-"MMO $99&A 5)-)H)H%v*$ /7F+#l2+3L.4O5 55( =H
**=9!/3KLLLBQ0?U*RX $==N!$ %AKASAS% "
 )* R&&R&&*Q*QQ % U|U5\HC15J+!+!:!:!<",";";"=
C&,"((*0C0CA0F'# 8B7I7I&J!1v&J#&J ' ..{;33K@ U '',,U33224<UE3O	 TVNN(; $!&*BC ((?@44
8X8XX!lcCi&?!#!F$K$KK55f= "))G$"5"5o"FG /8 >"a'#((//V! V! V!p  --$nQ&7&:  !% 3 3K @ $ 3 3K @&*&9&9/&J#F FP  --09PQR9S .  q " % !%227==A&,, ?2A
z #A
 !!5 5F 'KT sC   :1XX"1$Z Z:%ZX"	Y=+AY88Y= Z
Z	c                <    | j                   |j                            S )z0Look up the node in Scheduler name_to_fused_node)r  r  r  s     ri   r*  zScheduler.get_fused_node  s    &&t':':'<==rk   c                   t         j                  d|j                         |j                                |j                         }|j                         |k(  sJ | j	                  |      j                  ||      }|j                  |       |j                  |       |j                  |       | j                  j                  |j                         D ci c]  }|j                         | c}       |S c c}w )Nzfusing %s with %s)r  r  r  r   rM  ro   r  r  r  r  r   )rh   r~   r   rX  r  node3r  s          ri   fuse_two_nodeszScheduler.fuse_two_nodes  s     	,enn.>@PQ!!#!V+++  (--eU;5!5!&&U__EV'W

e(;'WX (Xs   C5c                    | j                  ||      r-| j                  ||      s |       r| j                  |||       yyNTF)r   will_fusion_create_cycler  )rh   r~   r   
speedup_fnrX  s        ri   fuse_if_speedupzScheduler.fuse_if_speedup  s?     MM%'11%?uk:rk   c                   |rg }i }t               }|D ]  }||v rt        ||         dk\  sJ ||   j                  d      }t        ||         dk(  r|j                  |       |j	                         \  }}	|	|k(  rt        ||	      sJ |}
n||k(  sJ t        ||	      sJ |	}
| j                  |
      |
ur|j                  r3|j                  j                  }|J |j                  |       ||f||<   | j                  ||	|j                  |      s|j                  |        t        |      D ]l  }||   \  }}| j                  | j                  |j                        | j                  |j                        |j                  |      s\|j                  |       n |D ]  }|j                  |        |ryy)z
        Evaluate pending template fusions for a set of fusion candidate nodes.
        The fusion candidate nodes are pointwise nodes as potential epilogue
        or prologue fusions
        r   r   N)r   r   r  r  r   r  r  r*  re   r   r  rc   r   r~   r   )rh   template_fusion_candidatesrX  template_futuresfuture_to_pending_fusionfusions_to_remove	candidatepending_fusionr~   r   r  fcands                ri   "_evaluate_pending_template_fusionsz,Scheduler._evaluate_pending_template_fusions-  s    )-/  % @J|7 %9	!;;6yABaGH "<I!F!J!J1!M1)<=B%)))4->>@uI%-eU;;;$)M I----eU;;;$)M &&}5]J!((&--44A=(=$++A.3A92M,Q/ ++un&@&@+ *--i8K%9P ""23 0'?'B$''''(<(<=''(<(<="..	 &))$/0 ' 2*..q12q )rk   c                    	 	 	 	 	 	 d fd}|D ]H  \  }} |||        j                  |      } j                  |      }t        ||      r||f j                  v rO j                  |||      sc j	                  ||      rv j                  ||      }	|	j                  t        |	j                  |||	j                        }
t        ||      rY||f j                  vsJ  j                  j                  ||f       t        ||      }||vrg ||<   ||   j                  |
       n
|
|<   |
|<   (|	j                  s6 j                  ||       K y )Nc                b   j                  |       v sj                  |      v rj                  j                  |       j                  j                  |                  }|J |j                         \  }}|j                  }j	                  |d        j	                  |d        j                  |      |u sJ j                  |      |u sJ  |       rj                  | |      rj                  ||       j                  |       v rj                  |      v ry y rt   )r*  r  r   rc   r  r  r  )	r~   r   r  	node_key1	node_key2
is_speeduprX  pending_fusionsrh   s	         ri   resolve_pending_fusionsz<Scheduler._try_fusion_pairs.<locals>.resolve_pending_fusions{  s1   
 ##E*o=&&u-@!0!4!4''.#''(;(;E(BC" &111'5'F'F'H$	9+77
##It4##It4**95BBB**95BBB!|t'D'DUE'R##Iy+F+ ##E*o=&&u-@rk   )rc   r~   r   re   r  )r*  r  r%  r   r  r  rc   r}   re   r  r  r   rb   r  )rh   possible_fusion_pairsr  template_fusion_nodesrX  r  r  r~   r   
fusion_resr  template_pw_nodes   ` ` `       ri   _try_fusion_pairszScheduler._try_fusion_pairss  s   	G$	G$	G 	G8 2 +	?LE5 $E51''.E''.E #5%0ENd&@&@@}}u.33E5A!33E5A
))5%2$.$:$:##)00	&N *%7 %u~T5O5OOOO2266u~F+B5%+P(+3HHFH12BC-.>?FF~V1?.1?.!--##E5+>W+	?rk   c                @   t               }|j                         D ]  }|j                         \  }}|j                  }||v st	        ||      r3|j                  |       | j                  |      |u sJ | j                  |      |u sJ | j                  ||||        y rt   )r   r   r   rc   r  r  r*  r  )rh   rX  r  seen_pair_speedup_fnr  r  r  is_speedup_fns           ri   _finish_pending_fusionsz!Scheduler._finish_pending_fusions  s    
 @J| .446 	SN#1#B#B#D Iy*66M 448J99  $$]3&&y1Y>>>&&y1Y>>>  I}kR	Srk   c           
         t        |D cg c]  \  }}t        ||      s| c}}      }g }|D ]<  \  }}t        ||      r||v r|j                  ||f       *|j                  ||f       > |}y c c}}w rt   )r   r  r  r   )rh   possible_fusionsdeferred_prologue_fusionsn1n2epilogue_template_nodesnew_possible_fusionss          ri   _handle_template_overlapz"Scheduler._handle_template_overlap  s     #-.MFB2DR2LRM#
  "& 	6FB!"b)b4K.K)00"b:$++RH5		6 0 Ns
   A2
A2
c                   | j                  |       t        |      }t        j                  t        j
                        r@t        j                  d       |D ]&  }t        j                  d|j                                ( i }i }g }| j                  ||      }t        j                  st        j                  r2t        j                  r"t        j                  r| j                  ||       | j                  |||||       | j!                  ||       | j#                  ||       |j%                          |r'| j                  |||||       | j#                  ||       t'        |d       }| j)                  |      }|S )a  
        Combine eligible nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuse(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        zfuse_nodes_once, candidates:z  %sc                    | j                   S rt   r  r  s    ri   rp  z+Scheduler.fuse_nodes_once.<locals>.<lambda>3  s
    !++ rk   r  )r  r   r  rb  rc  rG  r  r  get_possible_fusionsr&   r3  r4  r  r  r  r  r  r  clearr  r'  )	rh   r  r  rX  r   r  r  r  r  s	            ri   r  zScheduler.fuse_nodes_once  sv    	!!%( '""7==1;<# A  )=)=)?@A  	
 OQ  	"  44
 %%)<)<&&&&))*:<UV!	
 	$$[/B//0E{S##%$"")%  334I;W{(=>..u5rk   c                   t        | j                        }d}t        | j                        }t        j	                  d|       t        t        j                  |             D ]  \  }}t        j                  |      }t        |      dk  r+|||kD  r n| j                  |      st        j	                  d|       \|dz  }t        j                  dkD  }t        |d   j                  |d|      }t        j                  d	t        |      |       |D ]  }	|j                  |	        |j                  |       | j                   j#                  |j%                         D 
ci c]  }
|
j'                         | c}
       ! t)        |d
       | _        | j+                  | j                        | _        t        j                  d||t        | j                               | j-                  | j                         yc c}
w )z'
        Groups parallel nodes
        r   z2ComboKernels: Generating with num_ck_nodes = %s...r   Nz)ComboKernels: Not speeding up %d-th groupr   Trj  z0ComboKernels: Combining %d nodes for %d-th groupc                    | j                   S rt   r  r  s    ri   rp  z5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>]  s
    q{{ rk   r  zDGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodes)r   r  r   r  r  r  r>  r  rk  speedup_by_combo_kernelr&   rl  r  r  r  r  r  r  r   r  r  r'  r  )rh   r  rX  r  num_nodes_orignumr2  rW  r  r   r  s              ri   r9  z#Scheduler.create_combo_kernel_nodes7  s    !,TZZ		FU'&DDTJ
 	NC 3CCINI9~!'EL,@//	:		EsKQJE$;;a?O4!&&*. /	K HHBI
 " )""4()OOK(##**4?4I4I4KLq{*L7	< K-BC
33DJJ?
R

O		
 	!!$**- Ms   !G=
c                H    |D ]  }|j                  | j                          y rt   )r  r  )rh   r  r   s      ri   r  zScheduler.prune_redundant_depsg  s%     	?D%%d&=&=>	?rk   c                   
 g 
t        t        t        t        f             d
 fd}t        j                  t
              }|D ]=  } j                  |      r|j                         D ]  }||   j                  |        ? |j                         D ]
  } ||        t        j                  rat        j                  t
              }|D ]&  }t        |dd      }	|	s||	   j                  |       ( |j                         D ]
  } ||         j                  
      

j                   j                  d       t         j#                  dt%        
             
S )z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        c                |   t        |       D ]  \  }}| |dz   |dz   t        j                  z    D ]  }||f}|v rj                  |       j	                  ||      rj                  |       B|j                         s|j                         scj	                  ||      swj                  ||f         y rD  )r  r&   )max_fusion_buffer_group_pairwise_attemptsr  r   r   r  r  )	r  node1_indexr~   r   r  r  r  r  rh   s	        ri   check_all_pairsz7Scheduler.get_possible_fusions.<locals>.check_all_pairsv  s    &/&6 @"U"!Ok'FF'G @E
 !%.Cd{ HHSM}}UE3CD(//4++-1A1A1Cu&6J )//?!@@rk   r   NT)r  reversezfound %d possible fusionsr  r  r   rR  )r   r   r\   r  r   r   unfusable_noder   r   r   r&   aggressive_fusionr  *get_possible_fusions_with_highest_priorityr  score_fusion_keyr  r  r   )rh   r  r  r  buffer_names_groupingr   r   node_groupinggroup_groupingr   r  r  s   ` `       @@ri   r  zScheduler.get_possible_fusionsk  sn    % 13D DEFH	@ 	@( !, 7 7 = 	8D""4(--/ 8%c*11$78	8
 399; 	+MM*	+ ##(44T:N 7gt4"5)0067 "0!6!6!8 /./  JJ
 	$"7"7F4c:J6KLrk   c                    t        t                  d fd|j                         j                  j	                         |j                         j                  j	                         z  |j
                  j                  j	                         |j
                  j                  j	                         z  z
  t         fdD              }|r t        ||      d       |S )z~
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.
        c                   t        | t              rq| vrmj                  |        | j                         j	                        ryt        | j                  z        xs" t        fd| j                  z
  D              S y)NFc              3  H   K   | ]  } j                   |           y wrt   r  r   r  
found_pathrh   s     ri   r   zIScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>  s+      H #4#:#:1#=>H   ")r   r   r  r   issubsetrs   r   r   )r   combined_ancestorscombined_namesr  rh   visiteds    ri   r  z6Scheduler.will_fusion_create_cycle.<locals>.found_path  s    $ 23G8KD!++-667IJ !   ?@ C H!%2D!DH E  rk   c              3  H   K   | ]  } j                   |           y wrt   r  r  s     ri   r   z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>  s!     WqJt66q9:Wr  zwill create cycler  )r   r   r   _dictr   r   r   r  )rh   r~   r   cycler  r  r  r  s   `   @@@@ri   r  z"Scheduler.will_fusion_create_cycle  s     /02	 	2 %%'--224'')//4467 	
 OO!!&&(5??+@+@+E+E+GG WDVWW#IeU#$78rk   c                    ddl m 	 	 	 	 d fd} ||      } ||      }t        fd|D              }t        fd|D              }|j                  |      }d}	|D ]  }
	 |	t	        |
d         z  }	  j                  ||      }t        j                  j                  j                  |	d	|z        ry
y# t
        $ r Y  yw xY w)a  
        Return true if fusing the two nodes can potentially increasing peak memory.

        The implementation is more like a heuristic since we don't really know if we are at peak
        or not when trying to fuse these two nodes. The order of nodes may change later which makes the
        peak memory estimation hard.

        Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
        1. find all buffers read by each node with a single user. These buffers are supposed to
           be reused if we don't fuses these 2 nodes
        2. find the intersection of these buffers for the two node and sum the total buffer size.
           If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
           Note that the extra memory allocation is not necessarily causing peak memory increase.
           This is just a heuristic.

        We return true only if the saving for fusion can not trade off the extra memory allocation.
        r   )buffer_reuse_keyc                0   g }| j                   j                  D ]y  }j                  j                  |j                        }|s+t        |j                        dk(  sD|j                  j                         s_|j                  |j                         { |S rD  )
r   r   r=  r  r   r   r  r   has_tensor_outputr   )r   ru  rH  r   rh   s       ri   _find_single_user_inputszKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputs  sw     F&&,, ,&&**27733syy>Q.3883M3M3OMM#((+, Mrk   c              3  .   K   | ]  } |        y wrt   r{   r   r   r  s     ri   r   z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>       #Sc$4S$9#Sr  c              3  .   K   | ]  } |        y wrt   r{   r  s     ri   r   z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>  r  r  r   r   F    T)r   r\   r   zlist[ir.Buffer])r/  r  r   intersectionr
  r  r1  rW   r   r   statically_known_gt)rh   r~   r   r  lhs_dep_nodesrhs_dep_nodeslhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadr  	bw_savingr  s   `           @ri   can_fusion_increase_peak_memoryz)Scheduler.can_fusion_increase_peak_memory  s    * 	6	#		 1707##S]#SS##S]#SS*77G$ 	C3s1v;.	 ,,UE:	 77//iP  s   $B88	CCc                   t        |j                         D cg c]  }|j                          c}|j                         D cg c]  }|j                          c}z         }t        d |j                  j                  D              }t        d |j                  j
                  D              }||z  }t               }	|j                  j                  D ]:  }
| j                  |
j                  |      s |	j                  |
j                         < t        d |j                  j
                  D              t        d |j                  j
                  D              z  }t        d |j                  j                  D              t        d |j                  j                  D              z  }||z
  }||	z
  }||z  }t        |      |kD  S c c}w c c}w )Nc              3  4   K   | ]  }|j                     y wrt   r  r  s     ri   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>  s     &TCsxx&Tr  c              3  4   K   | ]  }|j                     y wrt   r  r  s     ri   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>  s     %R3chh%Rr  c              3  4   K   | ]  }|j                     y wrt   r  r  s     ri   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>&  s      $
CHH$
r  c              3  4   K   | ]  }|j                     y wrt   r  r  s     ri   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>(  rp  r  c              3  4   K   | ]  }|j                     y wrt   r  r  s     ri   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>+  s      %
CHH%
r  c              3  4   K   | ]  }|j                     y wrt   r  r  s     ri   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>-  s     DCsxxDr  )
r   r   r  r   r  r   $can_buffer_be_removed_through_fusionr   r  r   )rh   r~   r   	thresholdr   fused_node_namesnode1_write_namesnode2_read_namesreads_removed_through_fusionwrites_removed_through_fusionr  all_read_namesall_write_namesunique_readsunique_writesunique_io_bufferss                   ri   (fusion_prevent_too_many_reads_and_writesz2Scheduler.fusion_prevent_too_many_reads_and_writes
  s    &).):;T]]_;+0??+<=4t}}=>
 '&T5;L;L;S;S&TT%%R%:K:K:Q:Q%RR'7:K'K$ :D%**11 	BI88 0 .11)..A		B $ $
 % 1 1 7 7$
 
C5+<+<+B+BCCD
 % %
 % 1 1 8 8%
 
D5+<+<+C+CDDE
 &(DD (*GG )=8$%	11M <=s   GG
c                    t        t        |j                  |j                  z
        t        |j                  |j                  z
              }|dkD  S )aA  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heuristic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )r  r  r^  r_  )rh   r~   r   proximity_scores       ri   are_long_distant_nodesz Scheduler.are_long_distant_nodes:  sE    * %//12%//12
 ##rk   c                   i }|j                   j                         D ci c]  }|j                  | }}|j                   j                         D ci c]  }|j                  | }}|D ]}  }t        j                  j                  |      }	||   }
||   }t        |
t              rt        |t              sdt        |
       dt        |       ||<   k|
j                         |j                         k7  r(d|
j                          d|j                          ||<   t        |
j                        t        |j                        k7  rd||<   |
j                         }|j                         }||k7  rd| d| ||<   |
j                         |j                         k(  rd|
 d| ||<   Ed}t        |	t        j                        sd|	j                    }d	|
 d| d
| ||<    t#        |      S c c}w c c}w )z}
        Try to decide reasons why fusion fail due to no shared memory even though
        there are common buffers.
        znot MemoryDep: r   zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: r  zLayout: zUnknown reason: z. )r   r  r   rW   r   r{  r   r2   r   r   rV   r   
get_offsetnormalize_with_stride_orderr)   r~  r$  r  )rh   r~   r   common_buf_namesreasonsr   node1_name2depnode2_name2depr  r   lhs_deprhs_deplhs_offrhs_off
layout_strs                  ri   decide_fusion_fail_reasonz#Scheduler.decide_fusion_fail_reasonU  s    383D3D3U3U3WXC#((C-XX383D3D3U3U3WXC#((C-XX( ,	H''$$X.C$X.G$X.Ggy1GY9W%d7m_F4=/J !   "g&7&7&99'(9(9(;'<F7CTCTCVBWX !  W\\*mGLL.II$/!((*G((*G'! '9	y$Q! 3356689 '=WIVG9$U! Jc2#5#56'

|4
"7)6'"ZLI HU,	\ 7|c YXs   G5G:c                   t         j                  syt        d ||fD              ry|j                  j	                         }|j                  j	                         }||z  }|syt        d |j                  D              }||z
  ryt        |      dkD  ryt        |j                  j                        dkD  s"t        |j                  j                        dkD  ryt        t        |j                  j                              }t        t        |j                  j                              }t        |t              rt        |t              sy|j                  j                  D 	ci c]  }	|	j                  |	 }
}	|j                  |
vry|
|j                     }t        |t              sy|j                         }|j                   |j                   k7  r|j"                  |j"                  k7  ry|j"                  |j"                  k7  st        |j$                        dk7  ryt        |j&                  j(                        dk7  ry|j&                  j*                  ryd|j&                  j(                  v rd|j&                  j(                  v sJ t        d |j&                  j-                         D              }t        |      dk7  ryt        t        |            }||j&                  j(                  d   k(  rd}d}n"||j&                  j(                  d   k(  sJ d}d}d	d
lm} |j&                  j2                  d	   }t        |      dk7  ryg }t4        j6                  j9                  |      D ]:  }|j;                  t<        j>                  j@                  jC                  |             < tE        |      } |||d	         }|y|j&                  j(                  |   |j&                  j(                  |<   ||j&                  j(                  |<   |jG                  dd       | jI                  ||      }t        |tJ              sJ tL        jO                  d|       |S c c}	w )aW  
        Attempts to enable fusion between two nodes by inverting indexing patterns.

        This optimization targets cases where node1 has a contiguous write and
        node2 has a contiguous write but discontiguous read. By inverting the
        indexing in node2's read and write operations, we can make them compatible
        with node1 for potential fusion.

        Args:
            node1: First scheduler node (source)
            node2: Second scheduler node (target for inversion)

        Returns:
            int: Fusion score if successful, 0 if optimization not applicable
        r   c              3  <   K   | ]  }|j                           y wrt   r  r  s     ri   r   zAScheduler.shared_data_after_inverting_indexing.<locals>.<genexpr>  s     2aqxxz2r  c              3  4   K   | ]  }|j                     y wrt   r  r  s     ri   r   zAScheduler.shared_data_after_inverting_indexing.<locals>.<genexpr>  s      .
CHH.
r  r   r   index0index1c              3      K   | ]  }|  y wrt   r{   )r   r  s     ri   r   zAScheduler.shared_data_after_inverting_indexing.<locals>.<genexpr>  s     %Ttd%Ts   r   )generate_inverse_formulaTFz!Shared memory after inversion: %d)(r&   r  r   r   buffer_namesr   rk  r   r   r  r  r  r   r2   r   rI  r   r   r  r   r   	subblocksget_read_exprs$torch._inductor.invert_expr_analysisr  varsr   Add	make_argsr   rW   r   r   combine_modular_indexing_pairsr   r`  r1  r
  r  r  )rh   r~   r   node1_buffer_namesnode2_buffer_namescommon_buffer_namesnode2_unmet_dependencies
node2_readnode2_writer   node1_writesnode1_writenode2_read_exprs	read_exprread_expr_indexwrite_expr_indexr  r  simplified_termstermsimplified_read_exprinverse_formulascores                          ri   $shared_data_after_inverting_indexingz.Scheduler.shared_data_after_inverting_indexing  s   & 442E5>22 #..;;="..;;=03EE" $. .
 % 8 8.
 $
  $&88'(1, u  &&'!+s53D3D3K3K/Lq/P$u006678
4 1 1 8 89:*i0
9
 161B1B1I1IJ##JJ??,.":??3+y1 "++- !2!22  K$4$44??k...#j6J6J2Kq2P u{{))*a/ ;;   222EKK666	
7
 &%Tu{{7Q7Q7S%TT A%./0	 228<<&O' : :8 DDDD&O'Q[[%%a(
z?aII''	2 	D##  ??E	  ##3423GTUW "
 7<kk6P6P7
""?3 8G""#34 	""4/((6%%%%;UCm Ks   "Qc                t   t         j                  rt        d ||fD              ry|j                         s|j                         ry|j                  j                         }|j                  j                         }||z  }|sy|j                  j                         D ci c]  }|j                  | }}|j                  j                         D ci c]  }|j                  | }}g }	|D ]y  }
||
   }||
   }|j                         |j                         k(  s/|	j                  t        j                  j                  j                  |j                         d      ||f       { t        |	      dk(  ryt!        |	t#        j$                  d            \  }}}t'        |t(              rt'        |t(              sy|j*                  |j*                  k7  r3|j-                         |j-                         k(  r| j/                  |      S yd}|j1                         s|j3                  ||      }nV|j1                         s|j3                  ||      }n3t4        j7                  d|j9                         |j9                                |r*t;        j<                  t>        | jA                  ||            S dS c c}w c c}w )a  
        Right now just greedily reorder the loop of node1 to be compatible with node2,
        but ideally we should have some heuristics to reorder the loop for node2
        to be compatible with node1 if that's more efficient.

        Return the amount of shared data re-computed in this method.
        If no such recomputation happens, return -1 (not return 0 since 0 is a valid
        amount of shared data).

        c              3  <   K   | ]  }|j                           y wrt   r  r  s     ri   r   z>Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>2  s      8
AHHJ8
r  r   r   r   r  Fz?Don't reorder loops since both nodes are reductions: %s v.s. %s)!r&   rO  r   r  r   r   r  r   r  r   rW   r   r   	size_hintr   r   r  r  r  r   r2   r|  rI  dep_size_hintr   r  r  r  r  r/  r0  r
  r1  )rh   r~   r   r(  r)  r*  r   r  r  
candidatesr   r  r  _numel	reordereds                  ri   !shared_data_after_reordering_loopz+Scheduler.shared_data_after_reordering_loop"  s     00C 8
!&8
 5
 
 %"3"3"5"..;;="..;;=03EE"383D3D3U3U3WXC#((C-XX383D3D3U3U3WXC#((C-XX 
. 	K$[1G$[1G3356689 !!((2273D3D3FQR2S	 z?a $'zx7J7J17M#N '9-Z5Sw///
   "g&7&7&99))'22	!!#77II##%77II##Q    KKT55eUCD	
 	
g YXs   J0J5c                    t        |t        t        f      xr) |j                          xr t	        |j
                         S )z>
        Is this node unfusable under any conditions.
        )r   r  r9  r  rT   r   r  s     ri   r  zScheduler.unfusable_node}  sD    
 t79OPQ C$$&&C7		BB	
rk   c                   |j                         t        j                  j                  k  ry|j	                         }|j                         }d}|||z  kD  r	 |d       yt        d |j                         D              }|t        j                  j                  j                  j                  fk(  r	 |d       yd	d} ||j                         j                        r|j                         s	 |d       yy)
zT
        Heuristics to avoid benchmarking predictably slow prologue fusions
        T皙?z@prologue fusion will not increase amount of bytes read in kernelFc              3     K   | ]J  }|j                   <|j                   j                         D ]  }|j                  dk(  r|j                   ! L y w)Ncall_function)r   rN  r  rO  )r   r  r  s      ri   r   zEScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>  sT      
vv!VV'')	
 tt&	 HH

s   AAz\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsc                <    | j                   dk  xr | j                  S )Nr   )itemsizeis_floating_point)r  s    ri   low_prec_fpzGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fp  s    >>Q&B5+B+BBrk   zVprologue fusion that must be upcast to fp32 not profitable for low precision templates)r  ztorch.dtyper   rs   )r   rW   r   invoke_quant_opsrc  re  r   r   r  r  r  constant_pad_ndr  r  r  r  )	rh   prologue_noder  rO  
read_byteswrite_bytesBYTES_THRESHOLD_MULTIPLIERrV  rJ  s	            ri   (check_prologue_fusion_heuristics_fusablez2Scheduler.check_prologue_fusion_heuristics_fusable  s     ,,.!''2J2JJ"88:
#::< &)"'AABRS  
",,.
 
 uyy~~55==??n 	C @@BHHI!>>@h rk   c                <    t        |t              rt        |t              syt        |j                  t        j                        r$t        |j                  t        j                        sy|j                         s|j                         ryt        j                  dk(  ry|j                  |j                  }}|\  }}|\  }}|j                         s,|j                         s||k7  st        |      t        |      k7  ryt        |j                  j                        dkD  s"t        |j                  j                        dkD  ry j                  t        t        |j                  j                                    }	 j                  t        t        |j                  j                                    }
t!        |	|
      t        j"                  kD  ryd fd} ||      s ||      ryg }t%        t'        ||            D ]  \  }\  }}||k7  s|j)                  |       ! t        |      dk7  ry|d   }||   ||   }}t*        j,                  j.                  j1                  ||      r|||fS t*        j,                  j.                  j1                  ||      r|||fS y)ao  
        Fusing two small pointwise nodes significantly reduces kernel overhead
        and launch overhead. However, slightly different sizes would prevent fusion.
        Here, we decide if expanding sizes of one node is profitible by allowing
        fusion, and returns the dimension to expand, node with smaller sizes,
        and new size after expand.
        Nr  r   c                ~   | j                   j                  D ]  }|j                  j                  v rj                  |j                     }n%j                  j                  |j                        }|s]t        j                  j                  j                  ||       st        |j                  t              r y yr  )r   r   r   r<  r=  r  rW   r   r7  r8  r   r  r9  )r   r?  r@  rh   s      ri   has_reusable_bufferzIScheduler.get_expand_dim_for_pointwise_nodes.<locals>.has_reusable_buffer  s    ((..  99 ; ;; $ ; ;DII FI $ 0 0 4 4TYY ?I ,,66y$G&y'<'<>TU  rk   r   r  )r   r   r   r)   r   r  r&   r  rB  r   r   r   r  r=  r  r  r  small_memory_access_thresholdr  r  r   rW   r   r   statically_known_lt)rh   r~   r   n1_sizesn2_sizesn1_iter_sizesn1_reduce_sizesn2_iter_sizesn2_reduce_sizesnode1_write_memorynode2_write_memoryrT  mismatch_dimensionsidxn1_sizen2_sizemismatch_dimmismatch_size1mismatch_size2s   `                  ri   "get_expand_dim_for_pointwise_nodesz,Scheduler.get_expand_dim_for_pointwise_nodes  s]    %/z%7W uzz2#4#455::r'8'89 ))+u/M/M/O ) #\\5<<()1&)1& !!#/1=!S%77 u  ''(1,E4E4E4L4L0MPQ0Q "//T%:K:K:R:R5S0TU!//T%:K:K:R:R5S0TU"$67223 	  u%)<U)C !'0]M1R'S 	0#C#'7'!#**3/	0 "#q(*1-,',' ' 77//O66WW11..Q66rk   c                   ||u ryt        |t              r|j                  |      S t        |t              ryt        ||      }|j	                         r0| j                  |j                               j                  ||      ryt        |t              st        |t              r	 |d       yt        |t        t        f      r|j	                         s	 |d       yt        |t        t        f      r|j	                         s	 |d       y|j                         |j                  z  r	 |d       y|j	                         r!t        j                  s	 |d       y|j                         s|j	                         r	 |d       y|j!                         }t        |t"        j$                        s	 |d	       y|j'                         }t)        d
 |j*                  D              |z
  }|j-                         |z  r	 |d       y|j/                         s|j/                         r	 |d       y|j1                         dd D ]B  }	|	j3                         }
|
D ]+  }t5        fd|j6                  D              r" |d         y D t        |t8              s|gn*|j:                  D cg c]  }|j	                         s| c}}t=        |      dk(  sJ |d   }t=        d   j>                        dk(  rSt=        d   j>                  d   j6                        dk(  r+d   j>                  d   j6                  d   j@                  |u s	 |d       y| jC                  |||      sy|j	                         r9|j/                         s |j                         st        jD                  s	 |d       y|j-                         tF        jH                  jJ                  z  s+|j-                         tF        jH                  jJ                  z  r	 |d       y|j                         }|j                         }||k7  r |d||       y~| jM                  |||      }t        |tN              sJ |r<|t        jP                  k  r)t        jR                  r| jU                  ||      }|dk\  r|}t        jV                  rP| jY                  ||      x}r<|\  }}}|j[                  ||       | jM                  ||      }t        |tN              sJ t        j\                  r,|t        jP                  k  r| j_                  ||      }|dk\  r|}t`        jc                  td        jf                        r4t`        ji                  d|jk                         |jk                         |       tF        jl                  jo                  | |||      sy|j                         |j                  z  rY| jq                  ||      xrE tF        jl                  jq                  | |||      xr! | j                  |      jq                  ||      S tF        jl                  js                  | |||      xr! | j                  |      js                  ||      S c c}w )zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        FTz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz2prologue fusion only supported for TritonTemplatesc              3  <   K   | ]  }|j                           y wrt   r  )r   r  s     ri   r   z%Scheduler.can_fuse.<locals>.<genexpr>g  s     Ec3<<>Er  z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesNr   c              3  :   K   | ]  }|j                   v   y wrt   r   )r   r+  prologue_nodess     ri   r   z%Scheduler.can_fuse.<locals>.<genexpr>w  s     QttyyN:Qr  z7template prologue can only fuse nodes with a single user   r   zEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)r#  z%s and %s has %s shared data):r   r  r7  r  r  rM  r   can_fuse_multi_outputs_templater  r  r9  r   r   r&   r  r   r  r)   r}  get_allowed_prologue_inpsr   r  r  r  r   rt  r   r  r   r   r   rd  r   rQ  r  rW   r   no_fuse_buffer_namesr1  r
  score_fusion_memory_thresholdrO  rA  $expand_dimension_for_pointwise_nodesrf  rv  r  r9  r  rb  rc  rG  r  r  r  r   can_fuse_verticalcan_fuse_horizontal)rh   r~   r   can_reorderr$  rO  r  rQ  unsupported_prologue_argsr   	node_outsr   r  template_snodestemplate_snoder  device2shared_data_scorenew_shared_data_scoreexpand_analysis
expand_dimsmaller_nodeexpand_sizerj  s                          @ri   r   zScheduler.can_fuse$  s    E>e45&&u--e45 u%4#3#3$

)
)%
7$8 e12j'7
 ABu8:PQR%%'()u8:PQR%%'()$$&8,-))01!!#u'8'8':HI779Hh(?(?@HI$,$F$F$H! EX__EE'( &
 %%'*CCQR--/53Q3Q3SPQ"__.N&s+ % ,,.	$ %CQsyyQQUV$%% "%);< !&AAaA 
 '1,,,,Q/N N2&../14r*2215;;<A"2&..q177:??>Q[ @@sS**,!!#))12""$qww'C'CC""$qww'C'CC56!!#""$W,fg> 4454M 5 
 +S111 !F$H$HH11$($J$J5RW$X!$)$9!66#FFueTTOT6E3Z{<<ZU $ 8 8 F/555 11!F$H$HH$($M$Mu%! %)$9!))'--8##.  !	 yy!!$u6GH$$&8 &&ue4 MII//eUDUVM$$V,>>ueL 9900eU$5 M""6*>>ueLMs Bs   'Y2=Y2c                   |j                         }t        ||      }t        t              }|j                  D ]j  }| j
                  j                  |j                  |j                        }t        |t              r| j                  |||      rW||   j                  |       l |j                  j                  D ]  }t        |t              s|j                  | j
                  j                  |j                  |j                              }	|	sV|	D ]&  }
| j                  |
|      s|	j!                  |
       (  t#        d t$        j&                  j)                  |j+                               D              }||z  r	 |d       y|j-                         }|D ]E  }| j.                  |   j1                         }|| j2                  |   j4                  z  s= |d        y y)a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.
        c              3  4   K   | ]  }|j                     y wrt   r  r  s     ri   r   z.Scheduler.can_fuse_vertical.<locals>.<genexpr>  s      $
 HH$
r  zmemory deps did not matchFz(intermediate nodes between node1 & node2T)r  r  r   r   rk  rb  r  r   r   r4   r*  r   r   r  r2   fusable_read_and_writer  r   r  r  r  r   r   r=  r  r  r   )rh   r~   r   node1_buf_namesrO  remaining_deps_by_namer   r   cd	remainingrH  remaining_depsnode1_op_namesr  s                 ri   rp  zScheduler.can_fuse_vertical  s     002u%7B47H++ 	5C((,,SXXsxx@D#w'D,A,A#ue,T"4(//4		5 ##** 		-Bb),.22%%))"''277;I # -B222r:!((,-		- $ $
 445K5R5R5TU$
 

 O+
 +,224" 	D&&t,==?G 7 7 @ J JJ>?		 rk   c                   |j                   |j                         vry|j                  j                  D cg c]  }|j                   |j                  k(  r| }}t        |      dk7  ry|d   t        t              ryt        t              sJ t        j                  t        j                        ry| j                  |j                     }|g}t        |t              r|j                  }d}|D ]R  }	|	j                  j                   D 
cg c]  }
|
j                   |k(  r|
 }}
|s8|dz  }t#        fd|D              rR y |dk  S c c}w c c}
w )NFr   r   c              3     K   | ]q  }t        |t              xr[ t        |j                  t        j
                         xr4 |j                  j                  k(  xr |j                  j                  k(   s y wrt   )r   r2   r!   r   r#   TMPr   )r   r?  r  s     ri   r   z-Scheduler.fusable_weak_dep.<locals>.<genexpr>5  sm      
 	 4+ ,+DJJAA,JJ%++-, II+,s   A7A:)r   r  r   r  r  r   r   r3   r2   r!   r   r#   r  r  r>  r   r   r   )rh   weak_depr~   r   r  mutating_writesr,  relevant_reading_nodesnum_concurrent_readsreading_noder?  relevant_readss       `       ri   r*  zScheduler.fusable_weak_dep  sn    == 6 6 88 **11
zzX222 
 

 1$"eW%%+++u{{DHH5++H,A,AB	"'e78%*\\" 2 	L )44::99	) N 
 " A%  
 +  !	" $q((K
*s   "EEc                $   t        |t              rd| j                  j                  |j                  |j                        }||j                  k7  sHt        |j                  t        j                        s$t        |j                  t        j                        ryt        j                  r9|j                  |j                  k7  r |j                         }|j                         }| j                  |j                        ry|j                  |j                  k(  xr\ t        |j                         t        |j                         k\  xr/ |j                   d t        |j                          |j                   k(  S t        |t"              r| j                  j                  |j                  |j                        }| j                  j                  |j                  |j                        }|j                  |j                  k(  r|j                  ||k(  ryyrC  )r   r2   rb  r  r   r!   r   r#   r  r&   rO  r|  rI  r  r  r   r   r3   )rh   r?  r  	read_name
write_names        ri   r  z Scheduler.fusable_read_and_writeC  sw   dI&--11$))TYYGI UZZ'&tzz488<&u{{DHH=00T]]enn5T ~~') 11%**= 

ekk) ?		Nc%**o5?II/EJJ0EJJ>
 g&--11$))TYYGI..225::uzzJJ		UZZ'JJ*+rk   c                B    t         j                  j                  ||      S rt   )rW   r   get_dep_size_hint)rh   r   r-  s      ri   r=  zScheduler.dep_size_hintj  s    ww((k::rk   c                    fd}|r5t         j                  ||      rt         j                  ||      } ||d      S t        |j                  j
                        t        |j                  j                        z   }t        |j                  j
                        t        |j                  j                        z   }	t        ||	      dz  t        ||	      k  r||	kD  r||}}|j                  j
                  |j                  j                  z  D 
cg c]4  }
|
|j                  j
                  v s|
|j                  j                  v r|
6 }}
 |t         fd|D              d      S |j                  j
                  |j                  j                  z  |j                  j
                  |j                  j                  z  z  } |t         fd|D              d      S c c}
w )zn
        The first term in our fusion score that estimates number of saved
        memory operations.
        c                    r| |fS | S rt   r{   )r8  is_mix_order_reductionreturn_is_mix_order_reductions     ri   _construct_return_valuez>Scheduler.score_fusion_memory.<locals>._construct_return_valuez  s"     1 ./ rk   Tr  c              3  B   K   | ]  }j                  |        y wrt   r=  )r   r   r-  rh   s     ri   r   z0Scheduler.score_fusion_memory.<locals>.<genexpr>  s     ISD&&sK8Ir  Fc              3  @   K   | ]  }j                  |        y wrt   r  r  s     ri   r   z0Scheduler.score_fusion_memory.<locals>.<genexpr>  s     FC""3'Fr)  )
r   r   r   r   r   r   r  r  r  r   )rh   r~   r   r-  r  r$  r  r8  node1_dep_lennode2_dep_lenr   r  common_memory_depss   `  ``        ri   r1  zScheduler.score_fusion_memorym  s   	 %):)C)CE5)Q
 &66ueDE*5$77E--334s5;L;L;S;S7TTE--334s5;L;L;S;S7TT }m,q03}m3TT},$eu !,,22U5F5F5M5MM%++111SE<M<M<T<T5T D  +IDII5  $//558I8I8P8PP##e&7&7&>&>>
 'F3EFF
 	
s   9Gc                   t        |      dk(  r|S i }|D ]  \  }}|j                         |j                         k(  sJ |j                         }t        | j                  |      j	                  ||            }||vr	||fg||<   p||   j                  ||f        t        |j                         t        j                  d            d   }t        |      dkD  sJ |S )Nr   r  r   )
r   r   r
  rM  get_fusion_pair_priorityr   r  r]  r  r  )rh   r  "possible_fusions_group_by_priorityr~   r   r  fusion_pair_priority&possible_fusions_with_highest_prioritys           ri   r  z4Scheduler.get_possible_fusions_with_highest_priority  s   
  A%##  	+ - 	LE5##%)9)9);;;;%%'F#&  (AA%O$  $+MMENL23GH 33GHOOEN	 25.446H<O<OPQ<R2

2. 9:Q>>>55rk   c                B    t        j                  j                  | g| S )z-
        Shim for list.sort(key=...)
        )rW   r  score_fusionr  s     ri   r  zScheduler.score_fusion_key  s     yy%%d3U33rk   c                    t        t        j                  j                               }t	        | j
                        D ]9  }|j                  || j                         |j                  |j                         ; y)zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)
r   rW   r   r;  r   r  r  r  r  r]  )rh   r  r   s      ri   rG  zScheduler.compute_last_usage  s]    
 ))A)A)CDTZZ( 	8D 3T5L5LM&&t7	8rk   c                   t        | j                  t        j                  j                  z
  t        j                  j
                  j                  z
        D ]i  }|| j                  v rT| j                  |   }|j                         s2t        j                  j
                  j                  |j                         f|t        j                  j                  v st        j                  j                  |   }t        |t        j                        r*t        j                  j
                  j                  |       t        |t        j                        r|j                   }t        |t        j"                        r|j%                         sJ t        j                  j
                  j                  |j                          l | j                  j'                          y)z*Free any buffers that are no longer neededN)r  rM  rW   r   r7  r7  freedr=  rF  codegen_freer   r  r   r)   r~  r  r   r  is_input_bufferr  )rh   r   r   r  storages        ri   free_bufferszScheduler.free_buffers  sK   %%gg%%&gg""(()
 	DD
 t'''&&t,<<>GG((55chh?---gg**40c2#5#56GG((55c:R%6%67!hhG"7BMM:w?V?V?XXGG((55gllC)	D, 	!!'')rk   c                    | j                   j                         D ]  }|j                           | j                          y rt   )r  r   flushr  )rh   r9  s     ri   r  zScheduler.flush  s3    }}++- 	GMMO	rk   c                   t        |t              sJ t        d   dxx   dz  cc<   t        j                  t        d            5  |j                          |j                          d d d        |j                  }t        |t        j                        sJ dt        |             |j                  t        j                  j                         | j                          y # 1 sw Y   |xY w)Nr  extern_callsr   F)increase_kernel_countztype(node)=)r   r  r   rW   set_kernel_handlerr.   rC  r  r   r)   r  r   r3  r   r7  r  )rh   scheduler_noder   s      ri   codegen_extern_callzScheduler.codegen_extern_call  s    .*CDDD
 	^,1,!!&u"EF 	&002##%	& ""$0B[T$ZM2BB0QWW))*	& 	&s   !C""C+c                P   t        |j                        r|j                  
J | d       t        j                  j                  |       t        |j                        }|t        d|j                         t               s|j                  dk(  rLt        j                  j                  |      x}j                  dk  rt        |t        j                               t        |j                        r,|j                  dk(  st!        t        j                                ||       S )Nz( should have been normalized in loweringzUnsupported device type: r      rn  )rR   r   r   rW   r   add_device_infor-   r  r$   r  r   get_device_propertiesmajorr5   inspectcurrentframer6   )rh   r  device_schedulingdevice_propss       ri   create_backendzScheduler.create_backend  s    &++&&,,*B 	
h>?	
B 	
'5fkkB$!:6;;-HII|v%%*ZZ%E%Ef%MM\TTWXX(w7K7K7MNN$V[[E-A#G$8$8$:;; &&rk   c                    |J || j                   vr| j                  |      | j                   |<   | j                   |   S rt   )r  r  r  s     ri   rM  zScheduler.get_backend  sB    !!!&$($7$7$?DMM&!}}V$$rk   c                    d fd}|j                         D ci c]8  }|j                  *|j                  j                         D ]  } ||      |fd  : }}}t        |j	                               }|rMt        |t        j                  d            \  }}t        j                  j                  j                  |       y y c c}}w )Nc                    | j                   vrLj                   j                  t        | j                  j                        D  ci c]  \  }} | |
 c} }       j                       S c c} }w rt   )rN  r  r  r   r  )r  r  rh   s     ri   	get_orderz*Scheduler.enter_context.<locals>.get_order  s\    ,,,$$++i>V,WdaQT,WX''** -Xs   A+
r   r  )r  ztorch.fx.Noder   r
  )r   r   rN  r   r   r  r  r  rW   r   r7  enter_context)rh   r   r  r  r  rV  r  lasts   `       ri   r  zScheduler.enter_context  s    	+ ^^%
vv!VV'')	
  q\1t#

 
 w||~&'x':':1'=>GAtGG  ..t4 
s   =Cc                    	 | j                   |   j                  }t        fd|D              xr || j                  vxr || j
                  vS # t        $ r Y yw xY w)NFc              3  ^   K   | ]$  }|j                   xs |j                         v  & y wrt   )r  r  )r   r+  r  s     ri   r   zAScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>8  s)     VC3C CCVs   *-)r=  r  KeyErrorr   rb  r  )rh   r   r  r  s     ` ri   r  z.Scheduler.can_buffer_be_removed_through_fusion0  sn    	$$T*00E VPUVV 4D1114D333	
  		s   A 	AAc                   |j                   }t        |t        j                  j                  j
                        rk|j                  x}r]t        |      \  }}|t        j                  v s|t        j                  v r+t        |t        j                  j                        sJ d| S t        j                  j                  j                  j                  st        j                  yt        |t               r)|j"                  D ]  }| j%                  |      }|s|c S  y|j                   J |j'                         s|j)                          dS t        |j                   t        j*                        ryt        |j                   t        j,                        ryt/        |j                   dd      ryt1        |j                         ry	| j3                  |      x}r|S t        j                  j4                  rt7        |      ry
y)z
        Return the reason why we should partition the inductor graph on this node,
        or None if the node is cudagraphable.
        zcustom partition op: Nz6partition includes all ops when cudagraphs is disabledz opszDeviceCopy opszConditional opsunbacked_bindingszunbacked binding opszCUDAGraph-unsafe custom opszdynamic shape ops)r   r   r  r  r)   r;  r  rM   r&   custom_should_partition_ops_ops
OpOverloadr   r[   rE   wrapperr   r   should_partitionrR   r   
DeviceCopyConditionalr  rQ   &_uses_cudagraph_unsafe_unbacked_symintcudagraph_skip_dynamic_graphsr  )rh   r   r  r  op_overload_packet_nameop_overload_namer  r  s           ri   r  zScheduler.should_partition=  s    ))gu11@@A%%%B%8DR8H5#%5'6+M+MM#v'I'II!"ejj&;&;<<<./?.@AA &&--886>>FKd./ "..u5!M" yy$$${{}oo'(--dii/#dii0$499148)!$)),0@@FF6FM ==66-d3*rk   c                T   t               }t        j                  s|S | j                  D ]  }|j                  }|t        |t        j                  j                  j                        sA|j                  }|Pt        |      \  }}|t        j                  vr|t        j                  vr|j                         D ]g  }t        j                  j                  j!                  |      }t#        |t$        j&                  t$        j(                  f      sW|j+                  |       i  |S )zc
        Collect output unbacked symints from ops in config.cudagraph_unsafe_unbacked_ops.
        )r   r&   cudagraph_unsafe_unbacked_opsr  r   r   r  r  r)   r;  r  rM   r  rW   r   r   r   r"   r#   UNBACKED_INTUNBACKED_FLOATr  )rh   unsafe_symintsr   r  r  r  r  syms           ri   &_get_cudagraph_unsafe_unbacked_symintsz0Scheduler._get_cudagraph_unsafe_unbacked_symints|  s   
 4><33!!JJ 	,DiiGgu'9'9'H'HI$$Bz8DR8H5#%5'v/S/SS$F,P,PP779 ,gg&&//4!#(9(94;N;N'OP"&&s+,'	,0 rk   c                    | j                         }|sy t        |      }|D ]I  }t        j                  j                  j                  |      }|j                  D ]  }||v sd| c c S  K y )Nz'uses cudagraph-unsafe unbacked symint: )r  r  rW   r   r   r   r    )rh   r   r  node_symbolsr  simplified_symfree_syms          ri   r  z0Scheduler._uses_cudagraph_unsafe_unbacked_symint  s~     DDF5d; 	PCWW--66s;N*77 P~-DXJOOP	P rk   c                    i }|j                  t        j                  j                         | j                  D ]3  }|j
                  j                         D ]  \  }}|j                  ||<    5 |S )z~
        Return a mapping from name strings to the corresponding graph inputs or
        base scheduler node outputs.
        )r  rW   r   r  r  rf  r]  r   )rh   rE  r   r   scheduler_buffers        ri   get_name_to_nodeszScheduler.get_name_to_nodes  sr     UWAGG001JJ 	;D*.*>*>*D*D*F ;&&%5%:%:T";	; rk   c           	        t        t        j                  j                        D ci c]  \  }}||
 }}}t        t        j                  j	                               D ci c]  \  }}||
 }}}g t        j                  _        t        |      D ]  \  }}|j                  rg }|j                  D ]"  }|j                  |j                  |             $ g }	|j                  D ]0  }
|	j                  |j                  |
j                                      2 t        j                  j
                  j                  t        |||	|j                                yc c}}w c c}}w )z
        computes a mapping from partition input/output indices to graph input/output
        indices for each partition.
        N)r  rW   r   r  r;  partition_mapsskip_cudagraphinput_nodesr   r  output_nodesr  rN   constant_names)rh   
signaturesr`  r   name_to_graph_input_indexname_to_graph_output_indexpartition_id	signatureinput_mappingoutput_mappingr   s              ri   compute_graph_partition_mapsz&Scheduler.compute_graph_partition_maps  sT    (11E1E'F%
##tD#I%
! %
 (11I1I1K'L&
##tD#I&
" &
 "$'0'< 	#L)''
 M!-- J$$%>%B%B4%HIJ  N!.. W%%&@&D&DT]]_&UVW GG""))! !",,	!	%
&
s   E!E c                   	 	 	 	 dd	 	 	 	 dd} t               j                  d |D         } |j                  fd|j                         D           ||      }t               }|D ]F  }t        j
                  j                  j                  |      }|j                  |j                         H t        t        |t        j                  d                  S )	ai  
        Returns all symbol inputs which are required to be in scope to successfully
        perform codegen for this graph partition, including:
        - free symbols used in partition nodes
        - free symbols in partition input/node shapes, strides, and offsets. This is needed
          for recording cudagraphs for tensors with dynamic shapes.
        c                    t        | t        j                        r
t               S t        | t        j                        rt        |       S t        dt        |              )zW
            Gets symbols used in input node shapes, strides, and offsets.
            zUnsupported input node type: )r   r)   r~  r   r  r  r  r   r   s    ri   get_input_node_symbolszKScheduler.get_graph_partition_symbol_inputs.<locals>.get_input_node_symbols  sN     $ 2 23!|#D")),)$// *,I$t**VWWrk   c                &    t        d | D              S )z
            Filters a set of symbols that are required for codegen. Skip symbols
            that are always internal to kernels, such as SymT.TMP, SymT.INDEX,
            and SymT.R0_INDEX.
            c              3     K   | ]N  }t        |t        j                  t        j                  t        j                  t        j
                  f      r| P y wrt   )r"   r#   SIZEFLOATr  r  r   rk  s     ri   r   zVScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols.<locals>.<genexpr>  sH      !		

))++	 s   AAr   )symbolss    ri   filter_symbolszCScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols  s         rk   c              3  2   K   | ]  }t        |        y wrt   r  r  s     ri   r   z>Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>  s     It,T2Ir  c              3  4   K   | ]  \  }} |        y wrt   r{   )r   r  r   r  s      ri   r   z>Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>!  s     Nwq$$T*Ns   r   r  )r   z0Union[ir.IRNode, sympy.Expr, ir.TorchBindObject]r   OrderedSet[sympy.Symbol])r  r  r   r  )r   r  r  r]  rW   r   r   r   r    r  r  
attrgetter)	rh   	partitionr  r  candidate_symbolsr]  rk  symplified_sr  s	           @ri   !get_graph_partition_symbol_inputsz+Scheduler.get_graph_partition_symbol_inputs  s    	XB	X%	X 	-	%	, 7Ijl6H6HIyI7
 	!  N+:K:K:MN	
 ++<=(2" 	2A77++44Q7LJJ|001	2
 &(*=*=f*EFGGrk   c           
         g }t        t        j                  j                               } j	                         }d fdt        t        |      t        |            D ]2  \  }}t               }|D ]+  }	|j                  |	j                  j                                - |j                  |      }
t        j                  j                  |D 	cg c]  }	|	j                   c}	      }t        |j                  |j                   z  D cg c]  }t#        |t$              s|j&                    c}      |z
  }t         fd|D              }t               }|D ]  }	|j                  |	j(                          ||z
  D cg c]  }||v r|
 }}|j                  |       |D ci c]  }||v r|||    }}|D ci c]  }||v r|||v  }}|D cg c]  }||v r||vr| }}|
j                  |       t         fd|
D              }
|
D cg c]  } |      s||    }}|D cg c]!  }|t        j                  j*                  v s |# }} j-                  ||      }t/        ||||||      }|j1                  |       |j3                  ||
z
        }5 |ddd   S c c}	w c c}w c c}w c c}w c c}w c c}w c c}w c c}w )z
        Gets signature for each graph partition, including input nodes, output nodes, and
        whether deallocating an input within graph partition.
        c                    j                   j                  | d      }|yt        |j                  j                  t
              r'j                  j                  | d      x}r |      S yy)z
            Checks if buf_name resolves to a NoneLayout buffer (following mutation_real_name).
            Buffers with NoneLayout are not allocated so graph partition should not
            take them as inputs or outputs.
            NFT)r=  r  r   r   r$  r=   r  )r  r   r,  is_unallocated_bufferrh   s      ri   r  zFScheduler.get_graph_partition_signature.<locals>.is_unallocated_buffer:  sh     ""&&x6C{#((//:6 !% 7 7 ; ;Hd KK9K0;;rk   c              3  V   K   | ]   }j                   j                  ||       " y wrt   r  r  r   r   rh   s     ri   r   z:Scheduler.get_graph_partition_signature.<locals>.<genexpr>m  ,      / ''++D$7/r   c              3  V   K   | ]   }j                   j                  ||       " y wrt   r  r	  s     ri   r   z:Scheduler.get_graph_partition_signature.<locals>.<genexpr>  r
  r   Nr   )r  r  r   rs   )r   rW   r   r;  r  r  r   r  rf  r   r  r(   r  r  r   r   r  r   r4   r   r]  r  r  r:   r   r  )rh   
partitionsskip_cudagraphsr  unmet_output_namesrE  r   r  output_namesr   returned_output_namesr   rA  partition_input_namesrM  r   extra_input_namesr  input_deallocationextra_output_namesr  r  symbol_inputspartition_signaturer  s   `                       @ri   get_graph_partition_signaturez'Scheduler.get_graph_partition_signature.  s^    
'(@(@(BC--/	, *-Z (?";*
 g	%I~ -7LL! A##D$8$8$=$=$?@A %1$=$=>P$Q! '11<<.78d!!8K  "-!2!2[5G5G!G)!W5   " %/ /1/ %!
 5?L ! =$++DOO<= 2L@!<' ! !
 "(():; 2<' l4((K  2"<' d222" " 2"<'D8L,L " " "(();<$. /1/ %! 2,T2 T"L  "7$!''BSBS:SN  !BB;M #:"# 12!6!<!<"%::"Kg	R $B$y 9*!
""s6   J

#J$
>J)"J.9J3J8J=%!KKc                   |j                   j                         D ci c]$  \  }}|t        j                  j                  vr||& }}}|j
                  j                         D ci c]$  \  }}|t        j                  j                  vr||& }}}|j                  D cg c].  }|j                         t        j                  j                  vr|0 }}|j                  D cg c]   }|t        j                  j                  vr|" }	}t        |j                  ||||j                  |	      S c c}}w c c}}w c c}w c c}w )z
        Updates the partition signature by removing buffers specified in
        V.graph.removed_buffers. See [Note: Removed Graph Partition Arguments]
        )r  r]  rW   r   r7  r  r  maybe_get_namer  r:   r  r  )
rh   r  r   rT  r  r  r  r   r  r  s
             ri   .clean_removed_buffer_from_partition_signaturesz8Scheduler.clean_removed_buffer_from_partition_signatures  sK    !* 5 5 ; ; =
f177222 &L
 
 '99??A
c177222 #I
 
 "..
""$AGG,C,CC 
 
 "00
177222 
 

 '##$$
 	
)






s   )D/')D5!3D;$%E c                p   	
 ddl 	t               g g t        |      D ci c]  \  }}||
 c}}d	 fd
d
fd}|D ]5  }t        |j                  j
                        |<   |   dk(  s. 
|       7 g }d}|t        |      k  rsr}r0	j                        \  }}|j                  |        ||       r0r0	j                        \  }}|j                  |        ||       r0|dz  }|t        |      k  rrzr}|t        |      kD  rt        d      |S c c}}w )a  
        Reorder nodes to minimize the number of partitions via a bfs
        topological sort. This is the optimal reordering such that the
        number of partitions cannot be reduced further. This may be
        sub-optimal for other metrics such as peak memory. This does not
        change relative orders of two cudagraphable nodes, nor the
        relative order of two non_cudagraphable nodes.
        r   Nc                    |    | f}j                  |       rj                  |       y j                  |       y rt   )r  heappush)r   node_with_indexcudagraphable_nodesheapqnode_to_indexnon_cudagraphable_nodesrh   s     ri   insert_pending_nodeszHScheduler.reorder_for_minimizing_partition.<locals>.insert_pending_nodes  s>    ,T2D9O$$T*6H2ODrk   c                    | j                   j                  D ]*  }|   dkD  sJ |xx   dz  cc<   |   dk(  s# |       , y )Nr   r   )r`  
succ_nodes)r   	succ_noder#  node_to_indegrees     ri   update_indegreezCScheduler.reorder_for_minimizing_partition.<locals>.update_indegree  sT    !]]55 4	'	2Q666 +q0+#I.!3(3	4rk   r   z
                Failed to schedule, while loop ran too long when
                reordering for minimizing the num of partitions
                r   r\   r   rR  )	r   r  r  r   r`  
pred_nodesheappopr   r  )rh   r  r`  r   r(  r0  	num_itersr  r  r   r#  r'  r!  r"  s   `       @@@@@@ri    reorder_for_minimizing_partitionz*Scheduler.reorder_for_minimizing_partition  sU    	9=CEGI4=e4DEysDsE	E 	E	4  	+D%()A)A%BT"%*$T*	+
 -/	#e*$#':)--(?@4%% *
 &--(;<4%% &
 NI #e*$#': s5z!  ] Fs   D2c           	     X   ddl m}m} t        t        j
                  j                               } ||| j                  | j                  t        t        j
                  j                  j                               |      \  }}| j                  |      } ||||      \  }}	||dz  k  r|S |S )zx
        Reorder nodes to minimize the number of partitions if this only slightly
        increase peak memory.
        r   )estimate_peak_memoryprepare_planning_inforD  )r:  r/  r0  r   rW   r   r;  r=  r  r  r   r-  )
rh   r  r/  r0  r  default_peak_memoryr  reordered_nodesreorder_peak_memoryr  s
             ri   rE  z0Scheduler.maybe_reorder_for_minimizing_partition   s     	H"177#;#;#=>:O##qww++0023;
77 ??F!57"
Q
 !4s!::""rk   c                   g }g }g }dd}|D ]n  }| j                  |      du}|r*t        |j                        dk(  r|j                  |       B|r ||      r|j                  |       ^|j                  |       p ||z   |z   S )a  
        Reorder a node if it should be partitioned and has simple dependency:
        1. move a partitioned node to the front if it has no dependency
        2. move a partitioned node to the back if it is only used by OutputNode
        3. otherwise do not reorder
        c                    | j                         D ]0  }|j                  D ]  }t        |j                  t              r  y 2 yrC  )rt  r  r   r   rD  )r   r   rE  s      ri   only_output_userzPScheduler.reorder_for_partition_with_simple_dependency.<locals>.only_output_userM  sC    '') %99 %C%chh
;$%% rk   Nr   r  )r  r   rk  r   )rh   r  frontmiddlebackr6  r   r  s           ri   rF  z6Scheduler.reorder_for_partition_with_simple_dependency?  s     *,*,(*	  	$D#44T:$FC(?(?$@A$ET"!&6t&<D!d#	$ v~$$rk   c                   g }d}g }g }| j                   D ]S  }| j                  |      du}|r)||k7  r$|j                  |       |j                  |       g }|}|j                  |       U |r"|j                  |       |j                  |       | j                  ||      }| j	                  |       | j                  ||       ||fS )z
        Given a list of BaseSchedulerNodes, split into a list of
        graph partitions and compute partition input/output signatures.
        TN)r  r  )r  r  r   r  r  _log_graph_partitions)rh   r  r  cur_partitionr  r   node_should_partitionr  s           ri   rC  zScheduler.graph_partition_  s     +-
')JJ 	'D$($9$9$$?t$K!3H!H!!-0&&~6 "2N  &	' m,"">277!? 8 

 	))*5"":z::%%rk   c                `   t         j                  t        j                        sy t	        d t
        j                  j                  D              }|sy t        d |D              }t        |      |z
  }t         j                  dt        |      ||       t        t        ||            D ]  \  }\  }}t         j                  d|t        |      |j                  rdndt        |j                        t        |j                               |j                  sm|D ]  }	| j!                  |	         y )Nc              3  2   K   | ]  }t        |        y wrt   )rR   )r   r  s     ri   r   z2Scheduler._log_graph_partitions.<locals>.<genexpr>  s     OVF^Or  c              3  :   K   | ]  }|j                   rd   yw)r   N)r  r  s     ri   r   z2Scheduler._log_graph_partitions.<locals>.<genexpr>  s     !Pq?O?O!!Ps   zCCreated %d graph partitions: %d cudagraphable, %d non-cudagraphablez3  Partition %d: %d nodes, %s, inputs=%d, outputs=%dznon-cudagraphablecudagraphable)cudagraphs_logrb  rc  rG  r   rW   r   device_typesr   r   r  r  r  r  r  r  _log_non_cudagraphable_node)
rh   r  r  has_gpu_devicecudagraphable_countnon_cudagraphable_countr  r   r  r   s
             ri   r;  zScheduler._log_graph_partitions  s   
 **7==9 O!'':N:NOO!!PZ!PP"%j/4G"GQ
O#		
 *33z:3N)O 	;%A%	9  EI'0'?'?#_I))*I**+ ''% ;D44T:;	;rk   c                   | j                  |      }|sy|j                         }|j                  |j                  j                         nd}d| g}t	        |j                        j
                  }|j                  d|        |F|j                   ddj                  d |j                  D               d}|j                  d|        t        j                  d	|dj                  |             |Z|j                  j                  d
d      }|r;|j                         j                  d      D ]  }	t        j                  d|	        yyy)z)Log details for a non-cudagraphable node.Nzreason=zir=r}  r  c              3  2   K   | ]  }t        |        y wrt   )r  )r   r  s     ri   r   z8Scheduler._log_non_cudagraphable_node.<locals>.<genexpr>  s     2Pa3q62Pr  rx  zfx=z
    %s: %srG  r  z         %s)r  r  r   r  r   rv   r   rO  r  r1  rB  r  rP  r  stripsplit)
rh   r   r  rI  r  partsir_typefx_strrG  lines
             ri   rD  z%Scheduler._log_non_cudagraphable_node  s1   &&t,MMO	151F$))++-D6(#$tyy/**s7)_%'q2P7<<2P)P(QQRSFLL3vh(\9dii6FG !,,**=$?K'--/55d; >D"((=>  rk   c                    t        d      5  t        j                  j                  j                  r| j                         n| j                  | j                        	 cd d d        S # 1 sw Y   y xY w)NzScheduler.codegen)r   r  r  r&   rC  _codegen_partitions_codegenr  rg   s    ri   r3  zScheduler.codegen  sX    -. 	 ??))99 ((*]]4::.	 	 	s   AA&&A/c                ^   ddl m} t        j                  j                  }t        | j                        }t        j                  j                         5  t        j                  j                  dd| ||       | j                  |       t        t        j                  j                  |      sJ | j                  |      }|t        j                  j                  _        t        j                  j                  j                          t        j                  j                  }t        j                  j                  j                  t        j                  j                         \  }}ddd       t        j                  j                  j#                         t        j                  j                  j%                  ||       t        j                  j                  j&                  j)                  |j*                  D 	cg c]  }	|	j-                          c}	       y# 1 sw Y   xY wc c}	w )z,Codegen a partition given its inputs/outputsr   )SubgraphPythonWrapperCodegenT
partition_)is_subgraphsubgraph_nameparent_wrapper_codepartition_signaturesN)r/  rT  rW   r   r7  r  r  set_current_wrapper_codeinit_wrapper_coderR  r   r  rY  write_prefixr   generateis_inferencedefine_subgraph_launcher_fncodegen_partition_call	allocatedr  r  r  )
rh   r   r  rT  rX  graph_partition_id
graph_namepartition_coder  r   s
             ri   _codegen_partition_wrapperz$Scheduler._codegen_partition_wrapper  s    	Bgg22!$"?"?@WW--/ 	TGG%%  *+=*>?$7%.	 &  MM)$ agg224PQQQKKIVI8AAGG  5GG  --/J ! 4 4 = =agg>R>R SNA/	T2 	
88^T	334F	R	&&--)2)?)?@T]]_@	
9	T 	T: As   DH?H*H'c                L     t         j                  d fd       } |       S )Nc               3    K   j                          j                  ryt        j                  j                        rZj                  j                  J d       t
        j                  j                  j                  j                  j                         	 d  j                  rGt        j                  j                        r(t
        j                  j                  j                          d _        y # j                  rGt        j                  j                        r(t
        j                  j                  j                          d _        w xY ww)Ndevice should have an index)
%update_graph_partition_default_devicer"  rH   r   r   rW   r   r7  codegen_device_guard_entercodegen_device_guard_exit)r  rh   r  s   ri   ctxz1Scheduler.use_default_device_context.<locals>.ctx  s    66z:N**/@++000 2288D 1D $$??//553..3D//444 GG((BBD.2+	 ..3D//444 GG((BBD.2+s    BEC;  AE;AEE)r   zIterator[None])
contextlibcontextmanager)rh   r  r  rl  s   ``` ri   use_default_device_contextz$Scheduler.use_default_device_context  s&     
	"	"	3 
#	3* urk   c                    t        |      dk(  r|d   j                  sy dd}	 	 	 	 	 	 dd}d }t        ||      D ]  \  }}|j                  r ||      } n |y t        ||      D ]  \  }}|j                  s |||      r y  || _        y )Nr   r   c                4    | d   j                         }|J |S r   r   )r   partition_devices     ri   get_cudagraph_partition_devicezWScheduler.update_graph_partition_default_device.<locals>.get_cudagraph_partition_device  s'    (|668#///##rk   c                @    | D ]  }|j                         }||k7  s y yrC  rr  )r   target_devicer   r  s       ri   all_on_target_devicezMScheduler.update_graph_partition_default_device.<locals>.all_on_target_device  s/     " !*]* ! rk   )r   r]   r   r  )r   r]   rv  r  r   rs   )r   r  r  r"  )rh   r  r  rt  rw  cudagraph_partition_devicer   r  s           ri   ri  z/Scheduler.update_graph_partition_default_device  s     z?a
1(D(D 	$
	$	5A		 &*"$'
J$? 	 Iy++-KI-V*	 &-$'
J$? 	 Iy''0D51 		 'A#rk   c                   | j                         \  }}t        |      dkD  rt        d   dxx   t        |      z  cc<   | j                  ||      5  t	        ||      D ]V  \  }}t        |      dk\  sJ dt        |              |j
                  r| j                  |       E| j                  ||       X 	 ddd       t        | j                        }t        j                  j                  j                  |       |dkD  rqt        j                  j                  J |t        t        j                  j                        k(  s.J d| dt        t        j                  j                                yy# 1 sw Y   xY w)	z
        Split nodes into partitions and codegen each partition into separate functions.
        This allows further applying different optimizations (e.g., cudagraph) to
        each function.
        r   r  cudagraph_partitionsz5Each partition must have at least one node but found Nr   zExpect z partition maps but got )rC  r   r   ro  r  r  rR  re  r  r  rW   r   r7  set_all_partition_namesr  )rh   r  r  r   r  num_partitionss         ri   rQ  zScheduler._codegen_partitions:  sh    "&!5!5!7
Jz?QZ !78C
OK8,,ZD 		J(+J
(C J$	99~* KCPYNK[\* ++MM),33IyIJ		J d;;<	44^D A77))555!S)?)?%@@ .))A#aggF\F\B]A^_@ 		J 		Js   A&E55E>c                   t         j                  rdd l}t        j                         }t               }t        |      D ]  }|j                  dk(  r/|j                  |j                  j                  j                  k(  r nQ|j                  |j                  f}||vs"J d|j                   d|j                   d       |j                  |        | j                  | _        | j                   J | j                  rBt         j"                  j$                  r(t&        j(                  j*                  j-                          |D ]p  }t.        j1                  t2        j4                        r4	 t.        j7                  d|j9                         |j;                                | j?                  |       |jA                         x}r|| j                  k7  s |jC                         s|jE                         r| jG                          || j                  k7  r| j                  rGtI        | j                  jJ                        r(t&        j(                  j*                  jM                          || _        tI        |jJ                        rF|jN                  J d       t&        j(                  j*                  jQ                  |jN                         || _)        | jT                  jW                  |jX                         |jE                         rP|j[                  t]        |j_                                     \  }	}
}| ja                  |      jc                  |
||	       nH|jC                         r-te        jf                  th        |      }| jk                  |       n|jm                         rqte        jf                  tn        |      }| ja                  |      }d	d
l8m9} d	dl:m;} ty        |||f      r|}nt{        dtK        |             |j}                  |       nty        |t~              r!| ja                  |      j                  |       nYty        |t        t        f      r!| ja                  |      j                  |       n"ty        |t              sJ |j                          t         j"                  j                  r| ja                  |      j                          | j                  jW                  |j                                | j                  jW                  |j                                ty        |t              sP|jA                         }|>|jJ                  dk7  r/| ja                  |      j                         r| jG                          t        d |j_                         D              r	|| _        jd | _        s | j                  | j                  k7  rU| j                  J tI        | j                  jJ                        r(t&        j(                  j*                  jM                          d | _        | jG                          y # t<        $ r( t.        j7                  d|j9                                Y w xY w)Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0rh  r   )CUDACombinedSchedulingrW  ztype(self)=rP  c              3  <   K   | ]  }t        |t                y wrt   )r   r   r  s     ri   r   z%Scheduler._codegen.<locals>.<genexpr>  s     JA:a/Jr  )Nr&   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr   r   r   filename_dynamoconvert_frame__file__linenor  r"  rX  r  r   autotune_at_compile_timerW   r   r7  write_get_raw_stream_headerr  rb  rc  rG  r  r  r  r  r  r   r  r  r  rH   r   rk  r   rj  r   rM  r  r]  r  r   r   rM  codegen_templater/  r0  r  r  r  r>   codegen.cuda_combined_schedulingr  r[  rX  r   r  codegen_combo_kernelr  codegen_mix_order_reductionr   r   codegen_noder9  r  debug_sync_kernelcodegen_syncr  r  r6  r   ready_to_flushr   )rh   r  r  stackr  framer  r   r  r  r  r  backend_r  rX  r9  s                   ri   rR  zScheduler._codegenZ  s    44.++-E7A|D!%  JJ"22%--*E*E*N*NN~~u||4$ ,U^^,<Aell^ LJ J
  #99!!))) &&6==+Q+QGG  <<> Y	*D.
IIO224 t$**v*d111~~''')JJLT000**/@++000 ,,FFH*0D'(5%||7V9VV7,,GGU $D%%,,T__=!484W4W)*51-   (99!8X !{{#<dC((."{{#=tD++F3T8h9O(PQ&G(KDJ=)9::,,T2D"9:  (DDTJD#5}"EF  (55d;!$(>??? }}..  (557''..t/D/D/FG%%,,T-E-E-GHd$:;*&v-((0??AJJLJ9IJJ%)"%)"sY	*v $"="== &&222 !4!4!9!9: $$>>@!

} ! IIPs   '3X99-Y*)Y*c                    |d   j                         }| t        j                  _        || _        |J | j                  |      }|j                  ||      S )r  r   )r   rW   r   r  rX  rM  benchmark_combo_kernel)rh   r2  node_benchmark_resultsr  r9  s        ri   r  z Scheduler.benchmark_combo_kernel  sZ     1((* $!!!""6*--i9OPPrk   c                   |}|d   j                         t        fd|D              sJ d       t        j                  syddlm} dg }}i }t        |      D ]  \  }}|j                         }	| j                  |	      rt        j                  d       	 | j                  |	      \  }
}|
|f||<   t        j                  |
      rt        j                  d|        y		 ||
z  }|j                  |        	 | j                  ||      \  }}}||z
  dk  xs |dk  }t        j!                  t"        j$                        rP||kD  s|r%t        j                  dt'        ||z  d             n$t        j                  dt)        ||z  d             ||z
  |k  xs |S # |$ r.}d
t        |      v rt        j                  d       Y d}~ y d}~ww xY w# |$ r-}d
t        |      v rt        j                  d       Y d}~y d}~ww xY w)rA  r   c              3  D   K   | ]  }|j                         k(    y wrt   rr  )r   r   r  s     ri   r   z4Scheduler.speedup_by_combo_kernel.<locals>.<genexpr>  s     K44??$.Ks    z<All nodes in a combo kernel group must be on the same deviceTrC  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFru  zCComboKernel benchmark: return True because of loop-carried variableNg333333?z/can fuse (benchmark): fusing causes %sx speeduprF  z3cannot fuse (benchmark): fusing causes %sx slowdown)r   r   r&   r  r~  rD  r  r   r3  r  r  r  rw  rx  r  r   rb  rc  rG  rB   rC   )rh   r  subkernel_nodesrD  rI  
path1_listr  r  r  r2  r  r^  r  rJ  	ms2_clone_path2_listsmall_kernelr  s                    @ri   r  z!Scheduler.speedup_by_combo_kernel  s(      #..0K?KK 	
J	
K ,,;rZ!#!/2 	$HAu)I ##I.  R55i@D13T
&u-::b>$$U ! " 2ICd#9	$<	*.*E*E!7+'CK Y,9c	""7==1SyL  E#)C2
   Ic	#0
 Y$44Q $ *c!f4$$]      	&#a&0  Y 	s=   AF9G G""G
GGH"H ?H  Hc                p    | j                   |   }|j                  J |j                  j                         S rt   )r=  r   
get_layout)rh   r  r   s      ri   get_buffer_layoutzScheduler.get_buffer_layout<  s5    x(xx###xx""$$rk   c                   | j                   D ]  }|j                         s|j                  j                  D ]  }t        j
                  j                  j                  |j                        }|s9t        |      dk(  sHt        |j                  t        t        f      ri|j                         g k(  s}t        j
                  j                  j!                  |j                           y r  )r  rR   r   r   rW   r   r  r  r   r9   r   r$  r=   r<   r  zero_dim_cpu_tensor_listr  )rh   r   r?  rT  s       ri   r!  z$Scheduler.update_zero_dim_cpu_tensorA  s    JJ 	HD{{} ,,22 
HDWW3377		BF+F3u< *"MMJ8I+J! #OO-388<<TYYG
H	Hrk   )r  zlist[ir.Operation]r   rR  )r   z!dict[str, SchedulerDonatedBuffer]rU  )r  rV  r   rR  rQ  )re  r  r   rR  )r   r  r   r\   )r  Optional[str]r   rs   r  )r  r\   r   r  )r   r  r  r  r   tuple[float, str]rt   r  r  r  rs   r  Optional[int]r   r  )r   r   r  r  r   r  )r  ir.MultiTemplateBufferr   rs   )
r&  ir.OperationBufferr  r  r  r
  r   r   r   rR  )r2  r  r   rs   )r  r  r  r  r   z)tuple[Optional[LambdaFuture], ModuleType])r~   r\   r   r\   r   ra   )r   r\   r   r\   )r~   r\   r   r\   rX  OrderedSet[BaseSchedulerNode]r   r\   )r~   r\   r   r\   r  ru   rX  r  )r  ,dict[BaseSchedulerNode, list[PendingFusion]]rX  r  r   rR  )
r  1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]r  &dict[BaseSchedulerNode, PendingFusion]r  r  rX  r  r  rs   )rX  r  r  r  )r  r  r  r  )r  r  r  rs   r   r  )r  r  r   rR  r  )r  r  r  rs   r   r  r  )r~   r\   r   r\   r  r
  r   rs   )r~   r\   r   r\   r  z'Union[tuple[str, ...], OrderedSet[str]]r   r  r  r  )rM  r\   r  r\   rO  r  r   rs   )r~   r\   r   r\   r   z/Optional[tuple[int, SchedulerNode, sympy.Expr]])FT)
r~   r\   r   r\   rr  rs   r$  rs   r   rs   )r  r4   r~   r\   r   r\   r   rs   )r?  r1   r  r2   r   rs   r  )r   r1   r-  rs   r   r
  )TFT)r~   r\   r   r\   r-  rs   r  rs   r$  rs   r   zint | tuple[int, bool])r  r  r   r  )r  r   r   r   )r  r  r   rR  )r  r  r   BaseScheduling)r  rV  r   r  r)  )r   r  r  r\  r   rs   )r   r\   r   r  )r   r  )r   ;dict[str, Union[ir.IRNode, ir.TorchBindObject, sympy.Expr]])r  list[GraphPartitionSignature]r   rR  )r   r]   r  r  r   r  )r  list[PartitionType]r  z
list[bool]r   r  )r  r:   r   r:   )r   z9tuple[list[PartitionType], list[GraphPartitionSignature]])r  r  r  r  r   rR  )r   r]   r  r:   r   rR  )r  r  r  r  r   z'contextlib.AbstractContextManager[None]r2  r  r   z(tuple[float, float, list[Optional[str]]])r  r  r   rs   )r  r  r   z	ir.Layout)arv   rw   rx   r  rr  r  r#  propertyrX  setterrL  rf  r  r,  rZ   rJ  r(  r  r'  r  ro  r)  rz  r  rB  r  r  r  r  r2  r  r3  r?  r  r*  r  r  r  r  r  r  r  r9  r  r  r  r  r  r	  r  r9  rA  r  rQ  rf  r   rp  r*  r  r=  r1  r  r  rG  r  r  r  r  rM  r  r  r  rF   r  r  r  r  r  r  r  r-  rE  rF  rC  r;  rD  r3  re  ro  ri  rQ  rR  r  r  r  r!  r=  r>  s   @ri   r  r  :  sR   
U9n	# & & ( (7#,"HMP^KZ+#Z ,	 6S*4#&$6!F	808	8, (,	*  %	
 
&
> 
>*6
>	
>$0$	$LVDp'8&'8 +'8 	'8
 '8 
'8R
 RV0AN	2 u&u/@u	un>  ! 3	
 
"  ! '	
 3"D2$PD2 3D2 
	D2LO?PO? @O?  L	O?
 3O? O?bS2S @S00K0 $U0(E&E E 
!	EN..`?6 &6  6  
;	6 p,&,/@,	,\7&7/@7	7r.2&.2/@.2MP.2	.2`$&$/@$	$6< < !< B	<
 
<|M&M/@M	M^Y
&Y
/@Y
	Y
v
9(9 )9 	9
 
9v`&`/@`	8`L "*.uM uM !uM 	uM
 $(uM 
uMn3&3/@3	3j-)-)(9-)BS-)	-)f%N; !.3*.3
 3
 !3
 	3

 (,3
 $(3
 
 3
j6 Q6	:6@4@4	4	8*4
'*%5$

+:
	
=~ ! !F%	"	D '1' 
'RBH BH QBH 
"	BHHK -K @JK 	&K Z"
0"
	 "
H?&? 
!?B& 
!>%,%	 %@ &	B &D";'"; 2"; 
	";H>0)
 )
 +)
 
	)
V-;X	06-A--A;X-A	-A^@BHQ4Q	1QN5`%
Hrk   c                  <    e Zd Zd fdZddZddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ		 	 	 	 ddZ
	 	 	 	 	 	 	 	 dd	Z	 d	 	 	 	 	 	 	 dd
ZddZddZddZd dZddZ	 	 	 	 d!dZd"dZ	 	 	 	 	 	 d#dZ	 	 	 	 d$dZ	 d	 	 	 	 	 d%dZ xZS )&r  c                0    t         |           || _        y rt   )r3  rr  r  )rh   r  r6  s     ri   rr  zBaseScheduling.__init__R  s    "rk   c                R    | j                   r| j                   j                          y y rt   )r  r  rg   s    ri   free_buffers_in_schedulerz(BaseScheduling.free_buffers_in_schedulerV  s    >>NN'') rk   c                    t               S )z0Return a set of .codegen.common.BackendFeature()r   r  s     ri   get_backend_featuresz#BaseScheduling.get_backend_featuresZ  s
    |rk   c                    t         )zO
        Check whether node1 and node2 can be vertically fused or not.
        r  r  s      ri   rp  z BaseScheduling.can_fuse_vertical^  
     "!rk   c                    t         )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        r  r  s      ri   rq  z"BaseScheduling.can_fuse_horizontalf  r  rk   c                     y)au  
        A Multi-Output Template (referenced in #144012) is a template node
        with MultiOutputLayout, and its output buffers are instances of MultiOutput.
        In this context, we verify whether node1 represents the Multi-Output Template
        and node2 corresponds to one of its outputs. If so, we further check if
        backend supports this fusion.
        Fr{   r  s      ri   rk  z.BaseScheduling.can_fuse_multi_outputs_templaten  s     rk   c                    |j                         s|j                         rt        j                  ||      S t        j	                  ||      rt        ||      S t        |t
              r|j                  |      S t        j                  ||      S )z 
        Fuse two nodes
        )	r  r>  ro   r   r   r  r   r<  r   r  s      ri   ro   zBaseScheduling.fusez  sx     !1!1!3-225%@@77uE*5%8867??5))%**5%88rk   c                    t         )z[
        Process the iteration sizes in case a transformation needs to be applied.
        r  )rh   rd  s     ri   rN  zBaseScheduling.group_fn  r  rk   c                    t         )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        r  )rh   r  epilogue_nodesrj  s       ri   r  zBaseScheduling.codegen_template  s
     "!rk   c                    t         zD
        Generate a kernel given a list of pre-fused nodes.
        r  )rh   r  r  r  s       ri   r  z.BaseScheduling.generate_kernel_code_from_nodes  s
     "!rk   c                    t         r  r  r  s     ri   r  zBaseScheduling.codegen_node  
     "!rk   c                    t         rt   r  r  s     ri   r  z*BaseScheduling.codegen_mix_order_reduction  r  rk   c                    t         )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        r  rg   s    ri   r  zBaseScheduling.codegen_sync  r  rk   c                     y)z
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        Fr{   rg   s    ri   r  zBaseScheduling.ready_to_flush  s    
 rk   c                    t         )z]
        Flush the generated kernel and python wrapper code to the source code file.
        r  rg   s    ri   r  zBaseScheduling.flush  r  rk   c                    t         )r  r  r  s     ri   r  z$BaseScheduling.benchmark_fused_nodes  
     "!rk   c                    t         )z
        Benchmark a compiled module and return the execution time
        in milliseconds on randomly generated inputs.
        r  )rh   r   s     ri   r  z)BaseScheduling.benchmark_codegened_module  s
    
 "!rk   c                     y)z
        Return an unsigned integer which represents the priority of this fusion pair.
        The smaller is with higher priority.
        r   r{   r  s      ri   r  z'BaseScheduling.get_fusion_pair_priority  s     rk   c                    t         )z
        Benchmark the list of nodes to combine and return the execution time
        and memory copy time in milliseconds on randomly generated inputs.
        r  )rh   r2  r  s      ri   r  z%BaseScheduling.benchmark_combo_kernel  r  rk   c                |    |r:ddl m}  |||      }t        j                  j                  j                  ||       y y )Nr   )'set_kernel_post_grad_provenance_tracing)r+  r  rW   r   r7  write_provenance_debug_handle)rh   node_scheduler6  r  debug_handles        ri   codegen_commentzBaseScheduling.codegen_comment  s>    
 UBL GG  >>\ rk   )r  zOptional[Scheduler]rQ  )r  r  r   zOrderedSet[BackendFeature]r  r  )rd  r  r   z"tuple[tuple[sympy.Expr, ...], ...])r  r\   r  r  rj  r  r   r  rt   r  )r   z(Union[FusedSchedulerNode, SchedulerNode]r   rR  )r   r  r   rR  rS  r  )r   r   r   r  r  r  )r  r  r6  r  r   rR  )rv   rw   rx   rr  r  r  rp  rq  rk  ro   rN  r  r  r  r  r  r  r  r  r  r  r  r  r=  r>  s   @ri   r  r  Q  s   #*"&"/@"	""&"/@"	"
&
/@
	
9&9/@9	9"3"	+""(" 4" 4	"
 
"$ (,		"*	" 	" %		"
 
	""""""0"	""&/@	"4"	1" &*2 # 
	rk   r  )r   z$torch._inductor.codecache.LocalCache)r  r\   r   r  )r  r\   r   zOptional[Callable[[Any], Any]])r  r\   r   rg  )r!  r   r   r  )r   r\   r  r  r=  re  r   rR  )r  /Union[FusedSchedulerNode, GroupedSchedulerNode]r   rR  )r  r  r  r  r   r  r   rR  )r{   )r  zlist[list[int]]rd  r  r  r  r   z	list[int])r  r  rY  r  r   rR  r  rS  )r   z	ir.IRNoder   r  )r   r\   r   r  )r~   r\   r   r\   )
__future__r   r  rm  rW  r  r  r  rc  rw  r  r_  r  r   r  r/  r   r   concurrent.futuresr   r   r   r	   r
   r   r   r   r   typing_extensionsr   torch.utils._ordered_setr   r)   r   collections.abcr   r   r   typesr   r   r  torch._inductor.async_compiletorch.utils._pytreer  _pytreer  torch._dynamo.utilsr   r    torch._inductor.autotune_processr   torch._inductor.codecacher   r   torch._inductor.irr   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr    torch.utils._sympy.symbolr!   r"   r#   torch.utils._tritonr$   r  r%   r&   r'   r(   r*   analyze_preserves_zero_maskr+   codegen.commonr,   r-   r.   comm_analysisr/   r0   r1   r2   r3   r4   excr5   r6   fx_utilsr7   r8   r9   r:   r;   r<   r=   r   r>   r:  r?   r@   runtime.hintsrA   runtime.runtime_utilsrB   rC   r   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   virtualizedrW   	getLoggerrv   r  _logginggetArtifactLoggerr  r  r  rB  r   r]   ry   r^   r_   	dataclassra   r}   r   r  rZ  r\   r  r  r  r  r  r  r&  rD  r  r  r9  r   r  r  r   r  r>  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r{   rk   ri   <module>r     s    "          	     , 3 S S S ' /  <<    $ $ $ 6 E ? 7 M > O O * D D D M M ; : 2 $    J ( 7 &    (  g!^^--hA
NN44XO  >>;;$  11(LI 34y 4T]t_ D D D* ( ( (V Vr h8 h8 h8V 4_ 4 4y1 y1x 2 2(' #L T"
 
 #
*  *K
*K4*K ,*K 
	*KZW 1 W"5. 5L*% L*^
@	$@ $ 
	,}** }*@[G0 [G|w:!3 w:tb, bP #%+#++  + 	+\0%01C0	08
1 
 
 
> +9??, 4$
&VVP@T@H T@Hn@e erk   