
    i                      U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dlmZmZmZmZmZmZmZ d dlmZ d dlmZ ddlmZ erd d	lm Z m!Z!m"Z" d d
l#m$Z$ d dl%Z%d dl&Z&d dl'Z&d dl(m)c m*Z+ d dl,m-Z-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z;m<Z< d dl=m>Z> ddl?m@Z@mAZAmBZBmCZCmZmDZD ddlEmFZF ddlGmHZHmIZImJZJ ddlKmLZLmMZM ddlCmNZNmOZOmPZPmQZQ ddlRmSZSmTZT ddlUmVZV ddlmWZWmXZXmYZYmZZZm[Z[m\Z\ ddl]m^Z^ ddl_m`Z`maZa ddlbmcZc ddldmeZemfZf ddlgmhZh ddl)miZimjZjmkZkmlZlmmZmmnZnmoZompZpmqZqmrZrmsZsmtZtmuZumvZvmwZwmxZxmyZymzZz dd l{m|Z|  ej                  e~      Ze&j                   j                  e~d!      Ze&j                   j                  e~d"      Ze&j                   j                  e~d#      Zed$   Zd%ed&<    ed'      Z ed(      Z G d) d*      Zej                   G d+ d,             Zej                   G d- d.e             Z G d/ d$      Zej                  dWd0       ZdXd1ZdYd2ZdZd3Z ej                  d45       G d6 d7             Zd[d8Z G d9 d:      Z	 	 	 	 	 	 	 	 d\d;Z G d< d=e      Z G d> d?e      Z G d@ dAe      Z	 	 	 	 d]dBZ	 	 	 	 	 	 	 	 d^dDZ G dE dFe      Z G dG dHe      Z G dI dJe      Z G dK dLe      Z	 d_	 	 	 	 	 	 	 d`dMZ	 	 	 	 	 	 dadNZej                   G dO dP             Z ejH                         ZdbdQZdcdRZ	 	 	 	 dddSZ G dT dC      Z G dU dV      Zy)e    )annotationsN)Counterdefaultdict)AnyGenericOptionalTYPE_CHECKING	TypeAliasTypeVarUnion)	ParamSpec
OrderedSet   )ComputedBuffer)CallableIteratorSequence)
ModuleType)countersdynamo_timed)LambdaFuturePyCodeCache)TritonTemplateCallerBase)get_metric_tableis_metric_table_enabled)free_symbols)free_symbol_is_typesymbol_is_typeSymT)
has_triton)commsconfigconfig_commsdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime/estimate_nccl_collective_runtime_nccl_estimator)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)count_flops_fx)assign_origin_nodeget_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout
NoneLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)ReductionHint)
green_textred_text)SimplifyIndexing)&_unstable_customized_partition_wrappercache_on_selfcmpdevice_need_guardget_current_backendget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsGraphPartitionMapIndentedBufferis_collectiveis_cudagraph_unsafe_opis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitmaybe_log_cudagraph_partitionsympy_product)Vfusionloop_orderingcompute_dependenciesBaseSchedulerNoder
   PartitionType_T_Pc                     e Zd ZdZedd       Zedd       Ze	 	 	 	 	 	 dd       Zedd       Z	e	 	 	 	 	 	 dd       Z
e	 	 	 	 	 	 dd       Zedd       Ze	 	 	 	 	 	 dd	       Zedd
       Ze	 	 	 	 	 	 dd       Zedd       Zy)MixOrderReductionz
    This class contains utility functions to decide if we should fuse reductions
    reducing across different dimensions of the same input tensor.
    c                f    | j                         xr  t        d | j                         D              S )Nc              3     K   | ]V  }t        |t              rD|j                         r4t        |j                  t              r|j                  j
                  d u X y wN)
isinstanceSchedulerNodeis_reductionnoder   _split_size.0subnodes     S/var/www/html/engine/venv/lib/python3.12/site-packages/torch/_inductor/scheduler.py	<genexpr>z7MixOrderReduction.is_split_reduction.<locals>.<genexpr>m   sK      +
'=1$$&7<<8	 LL$$D0+
s   AA)rc   all	get_nodesrd   s    ri   is_split_reductionz$MixOrderReduction.is_split_reductionk   s3      " 
s +
>>++
 (
 	
    c                `   | j                  |      rd }d }|j                         D ]n  }t        |t              r*|j	                         rt        |j
                  t              s?|j
                  j                  J t        j                  j                  j                  t        |j
                  j                              }|j
                  j                  J t        j                  j                  j                  t        |j
                  j                              }||}|}t        j                  j                  j                  ||      sJ | d|        t        j                  j                  j                  ||      reJ | d|         |J ||fS |j                  d   S )N v.s. r   )rn   rl   ra   rb   rc   rd   r   _original_rangesrT   graphsizevarssimplifyrS   _original_reduction_rangesstatically_known_equalsgroup)clsrd   xnumelrnumelrh   	curxnumel	currnumels          ri   get_numel_rnumelz"MixOrderReduction.get_numel_rnumelu   s   !!$'FF>>+ 4w6,,."7<<@||44@@@GG,,55!',,"?"?@	 ||>>JJJGG,,55!',,"I"IJ	 >&F&F77++CC	 4 	{34  77++CC	 4 	{34 148 %%%F##::a= ro   c                    | j                  |      }| j                  |      }t        |      dk7  st        |      dk7  s||k(  ryt        |      t        t        |            k(  S )N   F)r~   lentuplereversed)ry   node1node2g1g2s        ri   has_mix_reduction_ordersz*MixOrderReduction.has_mix_reduction_orders   sZ     !!%(!!%(r7a<3r7a<28RyE(2,///ro   c                R   d}|j                   j                  D ]&  }t        |t              s|j                  |k(  s$|} n |sy|j
                  }|j                   j                  }|sDt        |t              sJ t        |              |j                  d   j                   j                  }|sJ t        |      t        |j                        z
  syt        j                  j                  j                  t!        |j"                        t!        |j%                                     ryy)z@
        The access to 'buf' is not a broadcast access.
        NFr   T)read_writesreadsra   r/   nameindex
var_rangesFusedSchedulerNodetypesnodesr   r   rT   rs   rt   rw   rS   sizevalues)ry   bufrd   	found_depdepr   r   s          ri   _is_full_accessz!MixOrderReduction._is_full_access   s    
 	##)) 	C#y)chh#o		
 %%00
d$67HDJ<H7Q33>>Jz:&E4F4F)GG
 7733)..)=9J9J9L+M
 ro   c                    g }|j                         |j                         z  }|D ]9  }| j                  ||      s| j                  ||      s)|j                  |       ; |S r`   )used_buffer_namesr   append)ry   r   r   outcommon_readsr   s         ri   get_common_readz!MixOrderReduction.get_common_read   se     ..053J3J3LL 	 C""3.33F3FsE3R

3	  
ro   c                >    t        | j                  ||            dkD  S Nr   )r   r   ry   r   r   s      ri   has_common_readz!MixOrderReduction.has_common_read   s!     3&&ue4599ro   c                    | j                  |      }t        j                  j                  j	                  |d   |d   z  d      S )Nr   r   fallback)r~   rT   rs   rt   	size_hint)ry   rd   r   s      ri   	get_numelzMixOrderReduction.get_numel   s>    !!$'ww))"Q%"Q%-!)DDro   c                $    | j                  |      S r`   )r   r   s      ri   get_fusion_scorez"MixOrderReduction.get_fusion_score   s    
 }}U##ro   c                    t         j                  j                  syt        j                  j
                  ry|j                         r|j                         sy|j                         j                  }|dvst        |      dk7  ry|j                         r|j                         sy|j                  |j                         z  s|j                  |j                         z  ry j                  ||      syt        j                  ||      }t!        |      dk(  ry j#                  |      }t%        j&                  |d   |d         }t%        j(                  |d   |d         }d}t        j                  j*                  j-                  t%        j.                  ||z  |            syt        j                  j*                  j-                  t%        j.                  ||dz              syt        j                  j*                  j-                  t%        j.                  |d            syt        j                  j*                  j-                  t%        j0                  |d   |            r||fn||f\  }	t3         fd	j4                  j6                  D              syt9        d
 j;                         D              ryt        j                  j*                  j=                  |d      syt3        d |	j;                         D              }
|
S )zP
        Check whether we can fuse two reductions with mix loop orders.
        F)cudaxputritonr   r   i  P r   i   c              3  V   K   | ]   }j                  |j                         " y wr`   )is_contiguous_loadr   )rg   r   ry   contiguous_nodes     ri   rj   z-MixOrderReduction.can_fuse.<locals>.<genexpr>9  s*      
 ""388_=
   &)c              3     K   | ]T  }|j                         rB|j                  j                  j                  t        j
                  t        j                  fv V y wr`   )rc   rd   datareduction_hintr>   INNERDEFAULTrf   s     ri   rj   z-MixOrderReduction.can_fuse.<locals>.<genexpr>@  sR      
 ##% LL,,##%%
s   AAi @  c              3  t   K   | ]0  }|j                         r|j                  j                         d v  2 yw)>   sumprodN)rc   rd   get_reduction_typerf   s     ri   rj   z-MixOrderReduction.can_fuse.<locals>.<genexpr>S  s=      
 ##% LL++-
s   68)r#   r   mix_order_reductionrT   rs   cpp_wrapperrN   
get_devicer   rF   rc   	ancestorsget_operation_namesr   r]   r   r   r~   sympyMaxMinrt   evaluate_exprGeEqrk   r   r   anyrl   statically_known_leq)ry   r   r   device_typer   r   nrowncol
size_thres
other_noder   r   s   `          @ri   can_fusezMixOrderReduction.can_fuse   s   
 }}00 77||~U\\^&&(--.";/8;!!#5+=+=+?OOe7799OOe7799  ++E59 )88F|!!!%(yyA1&yyA1&
 

 ww--ehhtd{J.OP
 ww--ehhtTAX.FG
 ww--ehhtT.BC ww--ehhr!ud.CD EN 	$   
&2288
 
   
 +446
 
 
 ww44T9E  
 &//1
 
 
ro   c                &    | j                  ||      S r`   )r   r   s      ri   are_mix_order_reductionsz*MixOrderReduction.are_mix_order_reductions^  s     ||E5))ro   c                   ddl m} |j                         D ]  }t        |t              sJ |j
                  }|j                  |j                     }|D cg c]  }|j                  |k(  s|j                    }}t        |      dk(  rr|D ]u  }	|j                  |	   }
|j                  }t        |j                               }t        j                   j"                  j%                  |
||      }|d   dk(  rk|d   dk(  rt  y  yc c}w )Nr   )MemoryUsageTyper   FT)torch._inductor.loop_bodyr   rl   ra   rb   _bodymemory_usageLOADbuffer_name
index_namer   indexing_exprsr   listkeysrT   rs   rt   stride_vars)ry   r   parent_noder   rd   	loop_bodyentrieseindex_namesr   
index_exprr   var_symbolsr   s                 ri   r   z$MixOrderReduction.is_contiguous_loadd  s   =))+ 	!DdM222

I,,_-A-ABG18QAAMMS<P1<<QKQ;1$ * !
&55jA
&11
 #:??#45gg..:: $B1,B10D !	!2 + Rs   D*DNrd   rX   returnbool)rd   rX   r   ztuple[sympy.Expr, sympy.Expr]r   rX   r   rX   r   r   )r   strrd   rX   r   r   )r   rX   r   rX   r   	list[str])rd   rX   r   intr   rX   r   rX   r   r   )r   r   r   rX   r   r   )__name__
__module____qualname____doc__staticmethodrn   classmethodr~   r   r   r   r   r   r   r   r   r    ro   ri   r]   r]   e   sb   
 
 
 #! #!J 	0%	0.?	0		0 	0  B 	%	.?			 	 :%:.?:	: :
 E E $%$.?$	$ $ s sj *%*.?*	* *
  ro   r]   c                      e Zd ZU ded<   ded<   ded<    ej
                  e      Zded	<    ej
                  e      Z	d
ed<   ddZ
ddZddZddZddZddZddZddZddZddZy)SchedulerBuffer	Scheduler	schedulerz	ir.Bufferrd   Optional[BaseSchedulerNode]defining_op)default_factorylist[NodeUser]usersr<   
mpi_bufferc                B    | j                   }|J |j                         S r`   )r   get_name)selfops     ri   defining_op_namez SchedulerBuffer.defining_op_name  s#    ~~{{}ro   c                @    t        | j                  j                        S r`   )hashrd   r   r   s    ri   __hash__zSchedulerBuffer.__hash__  s    DIINN##ro   c                v   t               }| j                         }|j                  | dt        | j                        j
                          |j                  | d| j                  j                          | j                         r-|j                  | dt        | j                                       | j                         r-|j                  | dt        | j                                       t        | j                        dk  r0|j                  | d| j                          |j                         S |j                  | d       |j                  d      5  | j                  D ]  }|j                  | d        	 d d d        |j                  d	       |j                         S # 1 sw Y   *xY w)
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])rK   r   	writeliner   rd   r   layoutget_aliasespformatget_mutationsr   r   indentgetrawvalue)r   resultr   users       ri   	debug_strzSchedulerBuffer.debug_str  s   !}}D6DO$<$<#=>?D6DII,<,<+=>?v[9I9I9K1L0MNOv]74;M;M;O3P2QRStzz?avYtzzl;< !!## vZ01q! 1 JJ 1D$$vQZ011 S!!!##	1 1s   &F//F8c                6    | j                   j                         S r`   rd   r   r  s    ri   r   zSchedulerBuffer.get_name      yy!!##ro   c                   | j                   J | j                   j                         sy | j                   j                         sL| j                   j                         s2t	        | j                   j                         t        j                        r4t        j                  j                  j                  | j                          y t        t        j                  d      r| j                         t        j                  j                  v rt        j                  j                  | j                            }|| j                   j"                  v r$| j                   j"                  |   j                   }n#| j                   j$                  |   j                   }t        j                  j                  j'                  || j                          y t        j                  j                  j                  | j                          y )Nargs)rd   should_allocateget_inputs_that_alias_outputget_mutation_namesra   get_output_specr&   CommBufferLayoutrT   rs   wrapper_codecodegen_allocationhasattrkernelr   inplace_update_buffersr   name_to_donated_buffername_to_bufcodegen_inplace_reuse)r   input_buffer_nameinput_buffers      ri   allocatezSchedulerBuffer.allocate  sV   yy$$$yy((* II224yy++-$))335r7J7JKGG  33DII> AHHf%188#B#BB !" ? ? P DNN$I$II#~~DD% $   $~~99:KLQQGG  66		
 GG  33DII>ro   c                   | j                   J t        | j                   j                  t        j                        st        | j                         ry| j                  D ]  }t        |j                   t              s y yNFT)rd   ra   r
  r&   r:   rO   r   
OutputNode)r   uses     ri   can_freezSchedulerBuffer.can_free  sg    yy$$$dii&&6:SII;
 :: 	C#((J/	 ro   c                ,   i }|D ]o  }t        |j                        |v r>|j                  |t        |j                                 |t        |j                        <   X||t        |j                        <   q t        |j	                               | _        y r`   )idrd   merger   r   r   )r   r   r  r+  s       ri   	set_userszSchedulerBuffer.set_users  st    &( 	+C#((|v%'*yy3881E'Fr#((|$'*r#((|$		+
 &--/*
ro   c                R    | j                   J | j                   j                         S r`   )rd   r  r  s    ri   r  zSchedulerBuffer.get_aliases  s%    yy$$$yy5577ro   c                R    | j                   J | j                   j                         S r`   )rd   r  r  s    ri   r  zSchedulerBuffer.get_mutations  %    yy$$$yy++--ro   c                R    | j                   j                         j                         S r`   )rd   r  r   r  s    ri   r   zSchedulerBuffer.get_device  s    yy((*5577ro   Nr   r   r   r   r   Noner   r   )r   r   r   r8  r   zSequence[str]r   Optional[torch.device])r   r   r   __annotations__dataclassesfieldr   r   r<   r   r   r  r  r   r'  r,  r0  r  r  r   r   ro   ri   r   r     sz    
O,,-K--dCE>C.?k.?.?3/J+ 
$$($?B
+8.8ro   r   c                      e Zd ZU dZded<   y)SchedulerDonatedBufferNr   r   )r   r   r   r   r=  r   ro   ri   rA  rA    s    /3K,3ro   rA  c                      e Zd ZU ded<   ded<   ded<   ded<   ded<   d	ed
<   ded<   dZded<   ded<   ded<   dZded<   ded<   ded<   dZded<   dRdZdSdZdTdZ	dTd Z
dTd!ZdUd"ZdTd#ZdVd$Z	 	 	 	 	 	 dWd%ZdXd&ZdYd'ZdZd(Zd[d)Z	 	 	 	 	 	 d\d*ZdVd+Zd]d,Zd]d-ZdVd.ZdVd/Z	 	 	 	 d^d0ZdTd1ZdTd2Zed]d3       Zed]d4       ZedZd5       Z edZd6       Z!d_d7Z"d`d8Z#dad9Z$dbd:Z%dZd;Z&dZd<Z'dZd=Z(dZd>Z)dZd?Z*dZd@Z+dZdAZ,dZdBZ-dcdCZ.dZdDZ/dVdEZ0	 dd	 	 	 	 	 dedFZ1edfdG       Z2edfdH       Z3edfdI       Z4	 	 	 	 	 	 dgdJZ5	 	 	 	 	 	 dhdKZ6edidL       Z7djdMZ8edjdN       Z9dkdOZ:dldPZ;e<	 	 	 	 dmdQ       Z=y)nrX   OrderedSet[str]r   z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]rx   
last_usager   	min_order	max_orderr=   mpi_nodedict[str, str]mutation_renamesNzOptional[ir.Operation]rd   list[SchedulerBuffer]outputsdict[str, SchedulerBuffer]outputs_by_nameOptional[float]override_estimated_runtimedependencies.ReadWritesr   OrderedSet[Dep]unmet_dependenciesFr   writtenc                "    || _         d | _        y )Nc                     g S r`   r   )r  kwargss     ri   <lambda>z,BaseSchedulerNode.__init__.<locals>.<lambda>  s    B ro   )r   debug_device_str)r   r   s     ri   __init__zBaseSchedulerNode.__init__
  s    $-& 	ro   c                Z   || _         t               | _        t        t                  | _        d| _        |j                         D cg c]  }t        | j                  ||        c}| _	        | j                  D ci c]  }|j                         | c}| _        i | _        y c c}w c c}w )NF)r   rd   r   )rd   r   r   r   rD  rS  get_outputsr   r   rK  r   rM  rI  )r   rd   outputr   s       ri   _init_from_nodez!BaseSchedulerNode._init_from_node  s    	#$
   **,
  .. 
 @D||L 3L !#
  Ms   B#;B(c                T    t        |       j                   d| j                         dS )Nz(name=)r   r   r   r  s    ri   __repr__zBaseSchedulerNode.__repr__(  s'    t*%%&fT]]_,?qAAro   c                H   | j                         }t               }|j                  | dt        |       j                   dt        t        | dd            j                   d| dt        | j                  j                         d| dt        | j                         d| d	t        | j                  j                  | j                  z
         d| d
       |j                         5  | j                         D ]!  }|j                  |j                                # 	 ddd       |j                  d       	 |j                  | j                                |j'                         j)                         S # 1 sw Y   XxY w# t         $ r t"        j%                  dd       Y Lw xY w)#Longer form printout for trace logsr  (rd   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        r  Ignoring error in debug_str()Texc_info)r   rK   splicer   r   getattrr  r   writesrR  r   r  r[  r  r	  debug_str_extra	Exceptionlogwarningr  rstrip)r   r   r   r   s       ri   r  zBaseSchedulerNode.debug_str+  s   }}

bd		QtGD&$$?@IIJ Kj))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 		
 ZZ\ 	,'') ,

3==?+,	, 	c	HJJt++-.  ''))	, 	,  	HKK7$KG	Hs   %5E25E> 2E;> F! F!c                     y)N r   r  s    ri   rp  z!BaseSchedulerNode.debug_str_extraD      ro   c                $    | j                  |       S r`   )rX  r  s    ri   _debug_str_for_devicez'BaseSchedulerNode._debug_str_for_deviceG  s    $$T**ro   c                   t        | j                  dd       }d}t        |t        j                  j
                  j                        r'd|j                  |j                         gdd      z   }nct        |t        j                  j
                  j                        r5d|j                  |j                         |j                         gdd      z   }|  | S )Nr   rv  z, F)shorten	multiline)rn  rd   ra   torch	_inductorr&   	Pointwise
str_helperget_size	Reductionget_reduction_sizer   )r   
maybe_datadata_strs      ri   debug_str_shortz!BaseSchedulerNode.debug_str_shortJ  s    TYY5
j%//"4"4">">?j33$$&'% 4  H 
EOO$6$6$@$@Aj33..0*2O2O2QR 4  H
 z""ro   c                p    t         j                  d| | j                  | j                  j                         y )Nz(%s: unmet_dependencies = %s, writes = %s)rr  inforR  r   ro  r  s    ri   log_detailszBaseSchedulerNode.log_detailsY  s,    6####		
ro   c                     yNFr   )r   self_dep	other_deps      ri   reorder_loops_by_dep_pairz+BaseSchedulerNode.reorder_loops_by_dep_paira       ro   c                    d | j                   j                         D        D ci c]  }||v r|||    c}| _        | j                  | j                   j	                  | j                               y c c}w )Nc              3  4   K   | ]  }|j                     y wr`   r   rg   r   s     ri   rj   z9BaseSchedulerNode.update_mutated_names.<locals>.<genexpr>i  s     QcQ   )r   reads_and_writesrI  set_read_writesrename)r   renamesr   s      ri   update_mutated_namesz&BaseSchedulerNode.update_mutated_namesf  sp     RT-=-=-N-N-PQ!
w '$-!

 	T--44T5J5JKL!
s   A2c                X    | j                  | j                  j                  |             y r`   )r  r   	with_readr   r   s     ri   add_fake_depzBaseSchedulerNode.add_fake_depn  s!    T--77<=ro   c                B    t        d | j                         D              S )Nc              3  `   K   | ]&  }|j                         xs |j                          ( y wr`   )r  r  )rg   r   s     ri   rj   z=BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>r  s-      
9<COO4!2!2!44
s   ,.)r   r[  r  s    ri   has_aliasing_or_mutationz*BaseSchedulerNode.has_aliasing_or_mutationq  s%     
@D@P@P@R
 
 	
ro   c                h    || _         | j                   j                  | _        | j                          y r`   )r   r   rR  
prune_deps)r   rws     ri   r  z!BaseSchedulerNode.set_read_writesv  s(    "&"2"2"8"8ro   c                b    | j                         }t        fd|D              }||z
  | _        y )Nc              3  B   K   | ]  }j                  ||        y wr`   )get)rg   kmutation_real_names     ri   rj   z3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>  s     !U1"4"8"8A">!U   )used_or_aliased_buffer_namesr   rD  )r   future_used_buffersr  used_bufferss     ` ri   set_last_usagez BaseSchedulerNode.set_last_usage{  s0     88:!!U!UU&)<<ro   c                F    | j                   D ]  }|j                           y r`   )rK  r'  )r   r   s     ri   mark_runzBaseSchedulerNode.mark_run  s    << 	CLLN	ro   c                    t        d t        j                  | j                  j                  | j                  j
                        D              S )Nc              3  4   K   | ]  }|j                     y wr`   r  r  s     ri   rj   z6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>  s      
 HH
r  )r   	itertoolschainr   r   ro  r  s    ri   r   z#BaseSchedulerNode.used_buffer_names  s?     
 t'7'7'='=t?O?O?V?VW
 
 	
ro   c                \   t               t        j                  | j                  j                  | j                  j
                        D cg c]*  }t        |t              r|j                  s|j                  , }}t        |      dkD  r|j                         }j                  |       t        j                  j                  j!                  |      rC|j#                  fdt        j                  j                  |   j%                         D               t        |      dkD  rS c c}w )z
        Returns buffer names used by this node, including aliases.

        Note: is_fake WeakDeps are excluded since they are purely for ordering
        and should not affect buffer lifetime.
        r   c              3  *   K   | ]
  }|vr|  y wr`   r   )rg   alias
used_namess     ri   rj   zABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>  s#       J.	 s   )r   r  r  r   r   ro  ra   r1   is_faker   r   popaddrT   rs   name_to_bufferr  extendr  )r   r   depsr  s      @ri   r  z.BaseSchedulerNode.used_or_aliased_buffer_names  s     '1l
 !t'7'7'='=t?O?O?V?VW
sG, HH
 

 $i!m((*CNN3ww%%))#. !"!7!7"224	 	 $i!m !
s   /D)c                L     t         fd j                  D               _        y )Nc              3  f   K   | ](  }|j                   j                  j                  vr| * y wr`   )r   r   available_buffer_namesrg   r   r   s     ri   rj   z/BaseSchedulerNode.prune_deps.<locals>.<genexpr>  s/      -
xxt~~DDD -
s   .1r   rR  r  s   `ri   r  zBaseSchedulerNode.prune_deps  s#    ", -
..-
 #
ro   c                     d fdt        fd j                  j                  D              } j                   j                  j	                  |             y )Nc                    t        | t              syj                  j                  | j                     j                         }|t        j                  j                  v S r  )	ra   r1   r   r#  r   r   rT   rs   removed_operations)r   op_namer   s     ri   should_prunez7BaseSchedulerNode.prune_weak_deps.<locals>.should_prune  sF    c7+nn00:KKMGagg8888ro   c              3  4   K   | ]  } |      s|  y wr`   r   rg   r   r  s     ri   rj   z4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>  s      
\#5FC
   r   r.   r   r   )r   r   r   r  remove_reads)r   	to_remover  s   ` @ri   prune_weak_depsz!BaseSchedulerNode.prune_weak_deps  sN    	9  
++11
 
	 	T--::9EFro   c                F    t        | || j                  j                         y r`   )_prune_redundant_depsr   r#  )r   name_to_fused_nodes     ri   prune_redundant_depsz&BaseSchedulerNode.prune_redundant_deps  s     	d$68R8RSro   c                R    | j                   J | j                   j                         S r`   )rd   get_operation_namer  s    ri   r   zBaseSchedulerNode.get_name  r3  ro   c                "    | j                         S r`   r   r  s    ri   get_first_namez BaseSchedulerNode.get_first_name  s    }}ro   c                B    t        d | j                         D              S )Nc              3  <   K   | ]  }|j                           y wr`   r  rg   rd   s     ri   rj   z8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>  s     Gd$--/G   )r   rl   r  s    ri   r   z%BaseSchedulerNode.get_operation_names  s    Gdnn6FGGGro   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wr`   r  rg   r   s     ri   rj   z5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>  s     AS#,,.Ar  )r   rK  r  s    ri   get_buffer_namesz"BaseSchedulerNode.get_buffer_names  s    ADLLAAAro   c                B    t        d | j                         D              S )Nc              3  Z   K   | ]#  }t        |t              xr t        |d        % yw)T)disallow_fp32_opsNra   rb   r(   rg   ns     ri   rj   zABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>  s7      
  q-( G+AFG
s   )+rk   rl   r  s    ri   can_codegen_in_low_precisionz.BaseSchedulerNode.can_codegen_in_low_precision  s%     
 ^^%
 
 	
ro   c                B    t        d | j                         D              S )Nc              3  V   K   | ]!  }t        |t              xr t        |       # y wr`   r  r  s     ri   rj   z@BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>  s-      
 q-(K-H-KK
s   ')r  r  s    ri   r(   z-BaseSchedulerNode.can_codegen_without_upcasts  s#     
^^%
 
 	
ro   c                    | gS r`   r   r  s    ri   rl   zBaseSchedulerNode.get_nodes  s	    vro   c                    | j                   S r`   )rK  r  s    ri   r[  zBaseSchedulerNode.get_outputs  s    ||ro   c                     | j                   |   S r`   )rM  )r   buf_names     ri   
get_outputzBaseSchedulerNode.get_output  s    ##H--ro   c                R    | j                   J | j                   j                         S r`   )rd   r   r  s    ri   r   zBaseSchedulerNode.get_device  s%    yy$$$yy##%%ro   c                L    | j                         }|d uxr |j                  dk(  S Ncpu)r   r   r   devices     ri   is_cpuzBaseSchedulerNode.is_cpu  s'    "T!:fkkU&::ro   c                X    | j                         }|d uxr t        |j                        S r`   )r   rN   r   r  s     ri   rN   zBaseSchedulerNode.is_gpu  s'    "T!9fV[[&99ro   c                     yr  r   r  s    ri   rc   zBaseSchedulerNode.is_reduction      ro   c                     yr  r   r  s    ri   is_native_matmulz"BaseSchedulerNode.is_native_matmul  r  ro   c                     yr  r   r  s    ri   is_split_scanzBaseSchedulerNode.is_split_scan  r  ro   c                     yr  r   r  s    ri   is_templatezBaseSchedulerNode.is_template  r  ro   c                     yr  r   r  s    ri   	is_externzBaseSchedulerNode.is_extern  r  ro   c                     yr  r   r  s    ri   
is_foreachzBaseSchedulerNode.is_foreach  r  ro   c                     yr  r   r   read_deps     ri   can_inplacezBaseSchedulerNode.can_inplace  r  ro   c                     yr  r   r  s    ri   has_side_effectsz"BaseSchedulerNode.has_side_effects  r  ro   c                \
    ddl m} t         t              rt        j
                  rt        j                  j                   j                         t        j                        r{t        t        j                  t        j                  j                  j                   j"                        rt%        t        j                  dd      t'        t        j                  d      sy j(                  t        j                  j*                  z   j,                  j.                  z  }d fd} j1                         D ]  }|j2                  }|J |j5                         rJ|j7                         s:|j9                         s*|j;                         t        j                  j<                  v ro j>                  j@                  D ]h  }|jB                   j,                  jD                  v r$ j,                  jD                  |jB                     }n/ j,                  jF                  jI                  |jB                        }|s|t        j                  jJ                  jM                  |       st        |jN                  tP              r|jR                  J |jR                  D cg c]   }|j2                  j;                         |vr|" }	}tU        |	      dk(  s|	d   jV                  s&|	d   j2                   u s9|j2                  Gt        |j2                  jY                         tZ        j\                  tZ        j^                  tZ        j`                  f      r|jN                  rft        |jN                  j2                  tZ        jb                  tZ        jd                  f      r(tU        |j2                  j7                               dkD  r ||j2                  |j2                        s+ ||      s5t        j                  jf                  ji                  |j;                         |j;                                t        t        j                  t        j                  j                  j                   j"                        rnt        j                  jj                  jm                  |j;                                t        j                  jj                  jm                  |j;                                |j;                         t        j                  jn                  |j;                         <      yc c}w )	z~
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        r   )can_match_buffer_size	mutationsNr  c                   | j                   j                        }| j                         t               }| j                  D ]  }|j
                  }t        |t              s |j                         | j                   j                  vs| j                   j                  |      |urd|fd|j                  j                         D        z  }t        |      dkD  s y y)Nc              3  @   K   | ]  }|j                   k(  r|  y wr`   r  )rg   or  s     ri   rj   z^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>>  s%      vv)    r   FT)r   get_fused_noder   r   r   rd   ra   rX   r  r  r   r  r   )buf_to_be_inplaced
fused_noder  r  	user_noder  r   s        @ri   single_index_in_fused_nodezKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node&  s    
 ,55DDTJJ)224H %/LD*00 ! II	!)->? ,,.-77JJK)33BB9M%&  &22CCE 
 t9q= '!* ro   r   )r  r   r   r   )8codegen.wrapperr
  ra   rb   r#   inplace_buffersrT   rs   has_featurer   r)   INPLACE_BUFFERSr   r}  r~  codegensimd
SIMDKernelrn  r  r   r  r   completed_operationsr[  rd   r  r  r  r   removed_buffersr   r   r   r"  r#  r  r  	can_reuser   NopKernelSchedulerNoder   r   r  r  r&   r:   r9   MutationLayoutSHOULDREMOVEFallbackKernelr8   r  make_inplacer  r  r!  )
r   r
  inconsequential_nodesr  r   buf_noderead	input_bufxremaining_usess
   `         ri   decide_inplace_updatez'BaseSchedulerNode.decide_inplace_update  s   
 	; t]+&&##DOO$5~7U7UVqxx)@)@)E)E)P)PQ188[$7C &) NNgg(()nn112 	 	D ##% C	CxxH''',,.88:..0<<>QWW%<%<<((.. 899 E EE $ E Edii PI $ : : > >tyy II ,,66y$G&y'<'<>TU$??666 "+&66??,4II &N & N+q0*1-99*1-22d:%NN6 *%NN::< " " 4 4 " = =! &11 * ) 5 5 : :!#!2!2BNN C! !$INN$O$O$Q RUV V1)..#((K6yA
 2293E3E3GX%HHeoo&=&=&B&B&M&M HH..2293E3E3GHHH..223<<>B &..0 77G q8C	0&s   %T)c                R   t         j                  sy |r| j                  ry | j                  J | j                  j	                         }g }|D ]0  }|j
                  dk(  r|j                  d       |j                  d       d|j
                   d|j                   }d|j                  v r|d|j                  d    z   }|j                  |       d|j                  v s|j                  d    }|j                  d	d
      d   }|j                  d|j                  dd      j                  dd      j                  dd      j                  dd      z          |j                  d       |j                  d       3 t        |      dk(  ry |j                  |       d| _        y )Nr\  rv  z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|r   )maxsplitr   {z{{}z}}rg  \z\\z#pragma CMT END ORIGINr   T)r#   comment_originrS  rd   get_originsr   r   targetmetarsplitreplacer   
writelines)	r   buffer	only_onceorigins	out_linesr  op_info_strr-  stack_trace_last_lines	            ri   codegen_originating_infoz*BaseSchedulerNode.codegen_originating_info  s    $$yy$$$))'')	 	%AttxR 23(az:K166!)hqvvh7G6H,II[)&!"!6 7(3(:(:3(:(KB(O%  "+33C>WS$'WT4(Wf	   !9:  $3	%6 y>Q 	)$ro   c                (    | j                  dd      S )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implr  s    ri   get_read_write_buffers_sizesz.BaseSchedulerNode.get_read_write_buffers_sizes  s    55t 6 
 	
ro   c                (    | j                  dd      S )NTFrB  rE  r  s    ri   get_read_buffer_sizesz'BaseSchedulerNode.get_read_buffer_sizes  s    55u 6 
 	
ro   c                (    | j                  dd      S )NFTrB  rE  r  s    ri   get_write_buffer_sizesz(BaseSchedulerNode.get_write_buffer_sizes  s    55 6 
 	
ro   c                Z    t        | j                  ||      j                         d      S )NrB  r   )start)r   get_read_write_buffer_accessesr   )r   rC  rD  s      ri   rF  z3BaseSchedulerNode.get_read_write_buffers_sizes_impl  s3     //+N 0 fh	
 	
ro   c                    t         t              ri S t         t              rt         j                  t              ri S t         t              r`t         j                  t
        j                        r< j                  j                  t        j                  j                  j                  u ri S ddt         t              r@ t         j                         d         t         j                         d         z        nt        d      t!        j"                  t$              }|r9 j&                  j(                  D ]   }||j*                     j-                  |       " |r9 j&                  j.                  D ]   }||j*                     j-                  |       " |r&t1        d  j&                  j(                  D              n	t1               }|r&t1        d  j&                  j.                  D              n	t1               }d fdt         t2              rt1         fd|D              }||z
  }||z
  }i }||z  D ]  }	t5        fd	||	   D              |	t6        j8                  j:                  v rt6        j8                  j:                  |	   }
n;|	t6        j8                  j<                  v rt6        j8                  j<                  |	   }
n	 	 	 	 d fd
 |
      }|	|vr|||	<   ||	xx   |z  cc<    |S )az  
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size

        Returns memory accesses per buffer.
        c                X    t         j                  j                  j                  | d      S )Nr   r   )rT   rs   rt   r   )ss    ri   try_size_hintzGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hint  s"    77##--a!-<<ro   r   r       eAc              3  4   K   | ]  }|j                     y wr`   r  r  s     ri   rj   zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s     BCsxxBr  c              3  4   K   | ]  }|j                     y wr`   r  r  s     ri   rj   zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>       CCsxxCr  c                    j                   j                  |    j                  }t        d |D              }t	        |t        |      z
        dkD  S )Nc              3  4   K   | ]  }|j                     y wr`   rm   )rg   r  s     ri   rj   z\BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>#  s     !>$))!>r  r   )r   r#  r   r   r   )r   r   r   buf_usesr   s       ri   is_materializedzIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized!  sG    NN..s399E!!>!>>Hx*V"44599ro   c              3  J   K   | ]  } |j                         r|  y wr`   r   )rg   r   rZ  r   s     ri   rj   zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>'  s#      )_S$++-N)s   ##c              3  "   K   | ]  }  y wr`   r   )rg   r   
node_numels     ri   rj   zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>0  s     $RCZ$Rs   c                   | syt        | t        j                        r| j                         S t        | j                  t
              rj                  j                  | j                            j                  }d}|D ]x  }t        |j                  t              sJ t        |j                  j                  t              r5|j                  j                         D ]  }| |j                        z  } x y |S t        | j                  t        j                        r"t        fd| j!                         D              S  	t#        | j%                                     }t'        | j)                               t+        |      z  S )Nr   c              3  h   K   | ])  } t         j                  j                  |             + y wr`   )rT   rs   
get_buffer)rg   mut_nameget_buf_bytess     ri   rj   zZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>R  s-      $ &agg&8&8&BCs   /2)ra   r&   TorchBindObjectrc  r
  r9   r   r#  r   r   rd   rX   r8   r[  r:   r   r  rS   r  rH   	get_dtypemin)
r   r   totr  	sched_buf	buf_elemsbuf_accessed_elemsrc  r   rR  s
         ri   rc  zGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes9  sC    c2#5#56,,..

,=> !NN66s||~FLLEC % 	%)$))5FGGG%diinnkB-1YY-B-B-D E	 #}Y^^'D DE $%	% J

BMM: (+(>(>(@  
 !.mCLLN.K LI)#--/:S*I>  ro   )rQ  z
sympy.Exprr   r   )r   r   r   Sequence[BaseSchedulerNode]r   r   )r   z<Optional[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]r   r   )ra   r  ExternKernelSchedulerNoderd   r8   r&   r!  op_overloadr}  _prims	rng_primsgraphsafe_run_with_rng_staterb   rS   
get_rangesr   collectionsr   r   r   r   r   r   ro  r   r   r   rT   rs   r  graph_inputs)r   rC  rD  buf_accessesr   r   ro  r  buf_byte_accessesr  r   	buf_bytesrj  rc  rZ  r^  rR  s   `           @@@@@ri   rN  z0BaseSchedulerNode.get_read_write_buffer_accesses  s   6 d23Id56:II{<
 It67499b&7&78		%%||%%BBC I	= dM*&doo/23 1! 456J
 SJ"..t4''-- 3SXX&--c23 ''.. 3SXX&--c23
  B4+;+;+A+ABB 	  C4+;+;+B+BCC 		:
 d./( )%) O o-FO+E,. 1	9H!$$R<;Q$R!R177111gg,,X6QWW111gg**84!Q!! !F &c*I00.7!(+!(+y8+c1	9f ! ro   c                    | j                   y | j                   j                         }|y t        |      }|y t        j                  j
                  j                  |d      }t        d   dxx   |z  cc<   |S )Nr   r   inductor
flop_count)rd   get_origin_noder4   rT   rs   rt   r   r   )r   fx_nodeflopsresolved_flopss       ri   estimate_flopsz BaseSchedulerNode.estimate_flopsd  su    99))++-?w'=))33EA3F\*n<*ro   c                R    | j                   | j                   S | j                         S r`   )rO  _get_estimated_runtimer  s    ri   get_estimated_runtimez'BaseSchedulerNode.get_estimated_runtimet  s)    **6222**,,ro   c                   | j                         d   j                         d   }|j                  j                         }t	        t        |            syt        | j                        rt        | j                  t        j                        sJ 	 t        j                  rst        |       }t               }|j                  |      }|t        |t              sJ |S t!        |       }|t#        | j                        }|j%                  ||       |S t#        | j                        S t/        | j                        ryt1        |       }||S |j                  j3                         }		 t5               }
t7        |	      dz  }|
dk  rt9        d|
       |dk  rt9        d|       	 | j=                         }|dk(  s|| j?                         |
z  }|dz  }|S d}| j?                         }|dn|}||z  |z  d	z  }||
z  }tA        ||      }|dz  }|S # t&        $ r}t(        j+                  |       Y d}~yd}~wt,        $ r}t(        j+                  |       Y d}~yd}~ww xY w# t:        $ r Y yw xY w)
zC
        Returns estimated op runtime in milliseconds (ms)
        r   Nvaluel    J)z-gpu_memory_bandwidth cannot be <= 0, but got z"gpu_flops cannot be <= 0, but got g    .Ag      ?rS  )!rl   r[  rd   r  rN   r6   rL   ra   r&   IRNoder$   ,runtime_estimations_use_nccl_lib_estimations)get_estimate_runtime_cache_key_from_snodeget_estimate_runtime_cachelookupfloatr-   r,   	set_value
ValueErrorrr  r  	TypeErrorrQ    maybe_estimate_runtime_benchmarkmaybe_get_dtyperI   rG   AssertionErrorrq  r~  rG  max)r   r   r
  	cache_keycache	cache_valmsr   retdtypegpu_memory_bandwidth	gpu_flops	flops_estnsfactorcounted_bytescompute_timetransfer_times                     ri   r  z(BaseSchedulerNode._get_estimated_runtimez  sw   
 nnq!--/2))+of-. #dii333LL I$ OI68E %Y 7I ,))U;;;((HNBz=diiHOOIRO8I7		BB TYY
 .t4?J((*	#4#6 )%069I $q($CDXCYZ  A~$'I)%UVV 
 '')	>Y.2247KKBcBI 99;*2*Y6#=%(<< }-#X	o    :  		sC   AH 6H H (>I$ 	I!H66I!II!$	I0/I0c                     y r`   r   r  s    ri   get_template_nodez#BaseSchedulerNode.get_template_node      ro   c                .    | j                         }|J |S r`   r  )r   templates     ri   get_template_node_or_throwz,BaseSchedulerNode.get_template_node_or_throw  s!    ))+###ro   c                f    t        d t        |       D              }| d| }| |   }| |dz   d }|||fS )zQ
        For the list of nodes, get the prologue, template, and epilogue
        c              3  H   K   | ]  \  }}|j                         s|  y wr`   r  )rg   ir  s      ri   rj   zCBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>  s     PDAqaPs   ""Nr   )next	enumerate)nodestemplate_indexprologuetemplate_nodeepilogues        ri   get_prologue_template_epiloguez0BaseSchedulerNode.get_prologue_template_epilogue  sN     PIe,<PP.)n-!+-.00ro   )r   r   r   r8  )rd   ir.Operationr   r8  r5  )r   r   r7  r  r/   r  r/   r   r   r  rH  r   r8  )r   r.   r   r8  r9  )r  rP  r   r8  r  rC  r  rH  r   r8  r   rC  r  dict[str, BaseSchedulerNode]r   r8  r   rk  )r   zSequence[SchedulerBuffer])r  r   r   r   r;  r  zdependencies.Depr   r   T)r:  rK   r;  r   r   r8  r6  )rC  r   rD  r   r   r   )rC  r   rD  r   r   zdict[str, int]r   z
int | None)r   r  r   zOptional[ir.TemplateBuffer])r   zir.TemplateBuffer)r  list[BaseSchedulerNode]r   zJtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]])>r   r   r   r=  rd   rO  rS  rY  r]  ra  r  rp  ry  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r   r  rC   r   r  r  r(   rl   r[  r  r   r  rN   rc   r  r  r  r   r  r  r  r)  r@  rG  rI  rK  rF  rN  r~  r  r  r  r  r   r  r   ro   ri   rX   rX     s   BB NN''$$#'D
 '""//266((''GT
#0B*2+#
!.7	
M>


=#2=HV=	=
6
GT">T	T
. H H B B 
 
 
 
.&;:@F 9=-$-15-	-^ 
 

 
 

 
 


!
37
	
J!!J!37J!	J!X  - U Un
 1&1	S1 1ro   c                 R    t         j                  j                  j                         S r`   )r}  r~  	codecache
LocalCacher   ro   ri   r  r    s    ??$$//11ro   c                   t        | j                  dd      }| j                  j                  }| j                  j                  g || j                  j                  | j                  j
                        }| j                  j
                  }t        j                  ||f      \  }}ddt        |ft        fd|D              z         }|S )Npython_kernel_namerv  c                p    t        | t        j                        xr t        | t        j                         S r`   )ra   r&   r  GeneratorStater'  s    ri   _is_tensor_irz@get_estimate_runtime_cache_key_from_snode.<locals>._is_tensor_ir  s(    !RYY'P
1b>O>O0P,PPro   c              3  d   K   | ]'  } |      rt        |j                               nd  ) y wr`   )r   r  )rg   ar  s     ri   rj   z<get_estimate_runtime_cache_key_from_snode.<locals>.<genexpr>  s(     Ua}Q'7ajjl#TAUs   -0r9  )
rn  rd   inputsfill_non_provided_argsconstant_argsrV  pytreetree_flattenr   r   )snoder  r  rV  	flat_argsflat_args_pytree_specr  r  s          @ri   r  r    s     -A2F::D::,,*$*))*

D ZZF'-':':D&>'J$I$Q 	
U9U
U	VI ro   c                   t        | t              sy t        j                  j                  j
                  t        j                  j                  j                  t        j                  j                  j                  d}t        | j                  dd      }||vry t        | j                  t        j                        sy ||   S )N)zextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmr  rv  )ra   rl  r}  opsatenmmbmmaddmmrn  rd   r&   ExternKernel)r  mms_fnsr  s      ri   _get_mm_like_fnr    s    e67"YY^^..#iinn00 %		 4 4G
 !-A2F(ejj"//2%&&ro   c                R    d }d }t         j                  rt               }|y |} fd}ny t               }t	               }|j                  |      }|t        |t              sJ |S ddlm	  |       \  }}ddl
m}	 |	j                  |||      }
|j                  ||
       |
S )Nc                             S r`   r   )r  snode_args_kwargss   ri   rW  z2maybe_estimate_runtime_benchmark.<locals>.<lambda>  s    !25!9 ro   r   )r  r   )benchmarkerr  )r#   !runtime_estimations_mms_benchmarkr  r  r  r  ra   r  utilsr  $torch._inductor.runtime.benchmarkingr  	benchmarkr  )r  bench_fnargs_kwargs_fnmm_fnr  r  r  r  rV  r  r  r  s   `          @ri   r  r    s    HN//&=99%@I&(EY'I)U+++(!#LD&@			xv	6B	OOIRO(Iro   T)slotsc                  N    e Zd ZU ded<   ded<   ded<   ded<   ddZddZdd	Zy
)	WhyNoFuser   name1name2reasonztuple[Any, ...]r  c                X    |j                         | _        |j                         | _        y r`   )r   r  r  r   r   r   s      ri   rY  zWhyNoFuse.__init__8  s    ^^%
^^%
ro   c                J    || _         || _        t        j                  |        y r`   )r  r  
fusion_logdebug)r   r  r  s      ri   __call__zWhyNoFuse.__call__<  s    	ro   c                p    d| j                    d| j                   d| j                  | j                  z  z   S )Nzcannot fuse z with r  )r  r  r  r  r  s    ri   __str__zWhyNoFuse.__str__A  s6    djj\

|2>KK$))#
 	
ro   Nr   rX   r   rX   r   r8  )r  r   r  r   r   r8  r5  )r   r   r   r=  rY  r  r  r   ro   ri   r  r  1  s&    JJK
&

ro   r  c                    t        | t        t        f      rt        | t              } t        j                  | d      }d|v rdt        j                  |d       S |S )Nkey   )r  rg      )	ra   r   setsortedr   pprintr  textwrapr  )objr  s     ri   r  r  G  sR    #
C()Sc"^^C*Fv~HOOFG4566Mro   c                  0    e Zd ZddZddZddZd	dZeZy)
r*  c                &    t        |g      | _        y r`   r  r  s     ri   rY  zOutputNode.__init__R  s    ",cU"3ro   c                     yr  r   r  s    ri   rc   zOutputNode.is_reductionU  r  ro   c                     y)Nr   r   r  s    ri   r  z'OutputNode.get_inputs_that_alias_outputX  rw  ro   c                     y)NOUTPUTr   r  s    ri   r   zOutputNode.get_name[  s    ro   N)r   r0   r   r8  r9  r:  r5  )r   r   r   rY  rc   r  r   ra  r   ro   ri   r*  r*  Q  s    4 Hro   r*  c                    t        j                          j                  D ]N  }t        |t              r|j
                     j                         }|   j                         xx   dz  cc<   P d fdt        fd j                  D              }|r? j                  |z
   _         j                   j                  j                  |             yy)am  
    Prunes weakdeps intended for mutation ordering
    on an upstream fused node if after fusion there is another dependency
    on the fused upstream node, making the weakdep redundant

    In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
    be incrementally removed, enabling other fusions, ensuring they are fused in order.
    r   c                    t        | t              rf| j                     j                         }|   j	                            dkD  xr  j
                  j                  | |         }|   k(  }|xs |S y)Nr   F)ra   r1   r   r   r   r   fusable_weak_dep)r   r  is_redundantis_self_depr#  name_to_dep_countr  rd   s       ri   r  z+_prune_redundant_deps.<locals>.should_pruneu  s    c7#!#((+<<>G,"7+446 nn55'0$  -W5=K.;.ro   c              3  4   K   | ]  } |      s|  y wr`   r   r  s     ri   rj   z(_prune_redundant_deps.<locals>.<genexpr>  s      ,s2Cr  Nr  )rr  r   rR  ra   r1   r   r   r   r   r  r   r  )rd   r  r#  r   r  deps_to_pruner  r  s   ```   @@ri   r  r  a  s     '2&9&9&;&& K#w'!#((+<<>G09BBDEJEK
    .. M "&"9"9M"IT--::=IJ ro   c                  8     e Zd Zd fdZddZddZddZ xZS )rl  c                    t         |   |       | j                  |       | j                  |j	                                y r`   superrY  r]  r  get_read_writesr   r   rd   	__class__s      ri   rY  z"ExternKernelSchedulerNode.__init__  5    #T"T1134ro   c                V    | j                          dt        | j                  dd        S )Nz.node.kernel = r  )r   rn  rd   r  s    ri   rp  z)ExternKernelSchedulerNode.debug_str_extra  s*    --/"/'$))EY[_2`1abbro   c                     yNTr   r  s    ri   r   z#ExternKernelSchedulerNode.is_extern  r  ro   c                    | j                   J t        | j                   d      xr | j                   j                         S )Nr  )rd   r  r  r  s    ri   r  z*ExternKernelSchedulerNode.has_side_effects  s6    yy$$$tyy"45V$)):T:T:VVro   r   r   rd   r  r   r8  r5  r9  )r   r   r   rY  rp  r   r  __classcell__r  s   @ri   rl  rl    s    5
cWro   rl  c                        e Zd Zd fdZ xZS )r  c                    t         |   |       | j                  |       | j                  |j	                                y r`   r  r  s      ri   rY  zNopKernelSchedulerNode.__init__  r  ro   r  )r   r   r   rY  r  r  s   @ri   r  r    s    5 5ro   r  c                      e Zd ZU dZded<   ded<   	 	 	 	 	 	 d! fdZ	 	 d"	 	 	 	 	 d#dZ	 	 d"	 	 	 	 	 d$dZ	 	 	 	 	 	 d%d	Zd&d
Z	d'dZ
d(dZd'dZ	 	 	 	 	 	 d)dZd'dZ	 	 	 	 	 	 d*dZd+dZd,dZd-dZd-dZd-dZd-dZd.dZd/dZ	 	 	 	 d0dZd1dZ	 d2	 	 	 d3dZed4d       Zed4d       Zd5dZed6d       Zed- fd        Z  xZ!S )7rb   zu
    A SchedulerNode is a node for scheduling that encapsulates either
    a ComputedBuffer or a TemplateBuffer.
    z tuple[Sequence[sympy.Expr], ...]_sizesr;   r   c                f    t         |   |       | j                  |       | j                          y r`   )r  rY  r]  _compute_attrsr  s      ri   rY  zSchedulerNode.__init__  s,    
 	#T"ro   c                   t        | j                  t        j                  t        j                  f      sJ | j                  j                  ||      \  | _        }|| _        | j                  j                         }| j                  j                  |      j                  }| || j                        f| _        t        j                   xs t        |j                          }t        | j                  t        j                        r,| j#                  | j                  j%                  |             y | j#                  t'        j$                  | j                  g| j                  d|i       y )Nextra_indexing_constraintsrecompute_sizes_body_func)	normalizer'  )ra   rd   r&   r   TemplateBuffersimplify_and_reorderr   r   get_device_or_errorr   get_backendgroup_fnrx   r#   loop_ordering_after_fusionrN   r   r  extract_read_writesr%   )r   r%  r&  bodyr  r,  should_normalizes          ri   r"  zSchedulerNode._compute_attrs  s7   
 $))b&7&79J9J%KLLL II::'A&? ; 
T 
..0>>--f5>>ht{{34
  &@@@ 
KKI
 E
 dii!2!23  		--8H-I   00JJ!%8Hro   c                *    | j                  ||       y )Nr$  )r"  )r   r%  r&  s      ri   recompute_size_and_bodyz%SchedulerNode.recompute_size_and_body  s    
 	'A&? 	 	
ro   c                   t        d | j                  j                  D              }| j                  t	        j
                  | j                  g| j                  d|ij                  |      j                  | j                               | j                  j                  |        |r!ddlm} |j                  j!                          y y )Nc              3  N   K   | ]  }t        |t        t        f      s|  y wr`   )ra   r1   r0   r  s     ri   rj   z5SchedulerNode.refresh_dependencies.<locals>.<genexpr>  s#      0
ZgwEW5XC0
s   %%r'  r   SIMDScheduling)r   r   r   r  r%   r.  r   r   r  r  rI  pointwise_read_writesclear_cachecodegen.simdr6  candidate_tilingscache_clear)r   r'  need_clear_tiling_cache	fake_depsr6  s        ri   refresh_dependenciesz"SchedulerNode.refresh_dependencies  s    
 &0 0
++110
 &
	 	,,

![[4= Yy!VD))*	
 	""..t4"4 ,,88: #ro   c                    | j                   j                  |      | _         | j                   j                  | _        | j	                  dd       y )NFTr'  r<  )r   reorder_iter_loopssizesr   r>  )r   	new_orders     ri   apply_new_loop_orderz"SchedulerNode.apply_new_loop_order  sA    ZZ22

 jj&&!!E4!Pro   c                   | j                   j                         }t        | j                   j                        |z
  }t	        t        |            }t	        t        |||z               }| j                  ||z          t        | j                  d         dk(  sJ | j                  d   | j                  d   d   | j                  d   d   ff| _        y )Nr   r   r   )r   get_original_num_rdimsr   	iter_varsr   rangerD  rx   )r   	num_rdims
num_pwdimspwdimsrdimss        ri   swap_pw_red_dimensionz#SchedulerNode.swap_pw_red_dimension	  s    JJ557	--.:
uZ()eJ
Y(>?@!!%&.14::a=!Q&&&ZZ]TZZ]1%5tzz!}Q7G$HH
ro   c                D    | j                   j                         | _         | S r`   )r   extract_pw_from_reductionr  s    ri   rO  z'SchedulerNode.extract_pw_from_reduction  s    ZZ99;
ro   c                    t         j                  |       sy t        | j                  t        j
                        sJ | j                  j                         5  | j                          d d d        y # 1 sw Y   y xY wr`   )r]   rn   ra   rd   r&   r   with_original_inner_fnr"  r  s    ri   cancel_reduction_splitz$SchedulerNode.cancel_reduction_split  s^     33D9$))R%6%6777YY--/ 	"!	" 	" 	"s   A11A:c                   t        | j                  t        j                  t        j                  f      sJ | j
                  j                  ||      | _        | j
                  j                  | _        | j                  j                         }| j                  j                  |      j                  }| || j                        f| _        | j                  dd       y )NTr@  )ra   rd   r&   r   r(  r   #expand_dimension_for_pointwise_noderB  r   r*  r   r+  r,  rx   r>  )r   	dimension	new_ranger  r,  s        ri   rT  z1SchedulerNode.expand_dimension_for_pointwise_node  s     $))b&7&79J9J%KLLLZZCCy

 jj&&..0>>--f5>>ht{{34
 	!!D$!Oro   c                    | j                   j                         | _         | j                   j                  | _        | j	                  dd       y )NTFr@  )r   merge_loopsrB  r   r>  r  s    ri   rX  zSchedulerNode.merge_loops/  s<    ZZ++-
jj&& 	!!D%!Pro   c                   d }| j                   d   }t        |      |j                  cxk(  r|j                  k(  rn n|j                  |      }|rPt        xj
                  dz  c_        t        j                  d| j                         |       | j                  |       yt        j                  d| j                                y)Nr   r   z"Reorder loops for %s with order %sTzEDon't reordering %s because we can not decide the suitable loop orderF)
r   r   num_varsdecide_loop_order_to_matchr'   num_loop_reorderingloop_ordering_logr  r   rD  )r   r  r  rC  
self_sizess        ri   r  z'SchedulerNode.reorder_loops_by_dep_pair;  s     	[[^
z?h//E93E3EE ;;IFI''1,'##4dmmoy %%i0##W ro   c                $   | j                         }| d| j                  d    | d| j                  d    | d| j                   g}| j                  j	                         D ]  }t        |t              r|j                  }t        j                  j                  |      }t        |t        j                        rZ|j                  | dt        |j                                 t        | j                   t"              rR|j                  d| d       |j                  t%        j&                  | j                   j)                         d	             | j*                  J |j-                  | j/                                d
j1                  |      S )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:r  rg  )r   rx   r   r   r  ra   r1   r   rT   rs   ra  r&   rd  r   r  r
  r   r;   r  r  r  rd   r  ry  join)r   r   linesr   r  r   s         ri   rp  zSchedulerNode.debug_str_extraR  sK   }}f$TZZ]O4f'

17fIdkk]+

 ##446 	OCc7+88gg((2!#r'9'9:LLH:Z

8K7L!MN	O djj(+LL6${34LL)=)=)?HIyy$$$T//12yyro   c                    | j                   S r`   )r   r  s    ri   rq  zSchedulerNode.get_rangesh      {{ro   c                <   t        | j                  t        j                  t        j                  f      sJ dt        | j                               t        | j                  j                               xr' | j                  d u xs | j                  j                   S Ntype(self.node)=)
ra   rd   r&   r   r(  r   r   r   r   has_partial_accumulater  s    ri   rc   zSchedulerNode.is_reductionk  s    $))b&7&79J9J%KL 	
tDII !	
L DII0023 
JJ$Gdjj&G&G"G	
ro   c                    t        | j                  t        j                        sJ dt	        | j                               | j                  j                         dk(  S )Nrf  dot)ra   rd   r&   r   r   r   r  s    ri   r  zSchedulerNode.is_native_matmulx  sJ    $))R%6%67N<LDO;M9NN7yy++-66ro   c                L   t        | j                  t        j                  t        j                  f      sJ dt        | j                               t        | j                  t        j                        xr. t        | j                  j                  t        j                        S re  )ra   rd   r&   r   r(  r   r   	SplitScanr  s    ri   r  zSchedulerNode.is_split_scan|  sy    $))b&7&79J9J%KL 	
tDII !	
L $))R%6%67 
JIINNBLL=
 	
ro   c                J    t        | j                  t        j                        S r`   ra   rd   r&   r(  r  s    ri   r  zSchedulerNode.is_template  s    $))R%6%677ro   c                f    t        | j                  t        j                        r| j                  S d S r`   rm  r  s    ri   r  zSchedulerNode.get_template_node  s$    &tyy"2C2CDtyyN$Nro   c                f    | j                          | j                          | j                  |       y r`   )r)  r  r  )r   
index_varss     ri   runzSchedulerNode.run  s#    ""$Z ro   c                &   | j                   }t        t        t        |            t        t        t        |            k(  sJ t	        t        t        j                  j                  |      t        j                  j                  |                  }|S r`   )	r   r   mapr   dictzipr  r  from_iterable)r   rp  rB  r   s       ri   ranges_from_index_varsz$SchedulerNode.ranges_from_index_vars  sp     3sE?#s3sJ+?'@@@@--j9--e4

 ro   c                   | j                  |      }	 t        j                  t        t        j                         |            5  t        j
                  j                  |       5   | j                  |  ddd       ddd       y# 1 sw Y   xY w# 1 sw Y   yxY w# t        $ r" t        j                  d| j                          w xY w)a  
        Generate code for this node using the provided index variables.

        This method sets up the appropriate context for code generation, including
        simplifying indexing expressions based on the variable ranges, and then
        calls the node's body function with the index variables.

        Args:
            index_vars: A sequence of sequences of sympy expressions representing
                        the index variables for each dimension of the computation.
        NzError in codegen for %s)rw  rT   set_ops_handlerrA   get_ops_handlerr   set_current_noder   rq  rr  fatalrd   )r   rp  r   s      ri   r  zSchedulerNode.codegen  s     00<
	!!"213D3D3F
"ST())$/( 

J'	( ( ( ( ( (
  	II/;	sA   1B  B$B4B<B B	
BBB B +Cc                    |r| j                   nt        | j                         \  }}t        j                  | j                  |t
        j                  j                  gt        |      z  g      S )z\
        Get the memory dependencies in either the pointwise or the reduction axes.
        )hidden_args)	r   r   r%   r.  r   r   SZeror   )r   	pointwise
keep_sizesignore_sizess       ri   "pointwise_or_reduction_read_writesz0SchedulerNode.pointwise_or_reduction_read_writes  sT     3<4;;$++AV 
L//JJ
%'',,#lBS1S0T
 	
ro   c                &    | j                  d      S )zH
        Get the memory dependencies in the non-reduction axes.
        Tr  r  r  s    ri   r7  z#SchedulerNode.pointwise_read_writes  s    
 666FFro   c                &    | j                  d      S )zD
        Get the memory dependencies in the reduction axes.
        Fr  r  r  s    ri   reduction_read_writesz#SchedulerNode.reduction_read_writes  s    
 666GGro   c                   | j                         ryt        d | j                         D              ryt        | j                  j
                        dk(  rt        |t        j                        rt        t        | j                  j
                              }t        |t        j                        sJ dt        |             |j                  |j                  k(  xr |j                  |j                  k(  S y)NFc              3  <   K   | ]  }|j                           y wr`   )r  r  s     ri   rj   z,SchedulerNode.can_inplace.<locals>.<genexpr>  s     ?Ss ?r  r   ztype(write_dep)=)r  r   r[  r   r   ro  ra   r%   r/   r  iterr   r   r   )r   r  	write_deps      ri   r  zSchedulerNode.can_inplace  s    ?D,<,<,>??t&&'1,l,,2
 T$"2"2"9"9:;Ii)?)?@WEUT)_DVBWW@>>Y__4X)..9XXro   c                   t               }t        | j                  t              r| j                  j	                         D ]  }|j
                  dk(  s|j                  dk(  s#d|j                  v r|j                  d   dk(  s,t        |j                        dk(  s\|j                  d   dk(  so|j                  d|j                  v r|j                  d   n(t        |j                        dk\  r|j                  d	   nd
        |S )Ncall_methodstoremode
atomic_add   r  r   r   r   rv  )r   ra   r   r;   rl   r   r5  rV  r   r  r  )r   buffers_store_as_atomic_addrd   s      ri   _get_atomic_add_buffersz%SchedulerNode._get_atomic_add_buffers  s    7A|#djj(+

,,. GG},w.4;;.4;;v3F,3V		Na/DIIaLL4P 033!T[[0 F+.1$))n.Adiilr +*ro   c                p    | j                   | j                   j                  d      ryt        |          S )Ndevice_assert_asyncT)r   has_opr  r  r   r  s    ri   r  zSchedulerNode.has_side_effects  s2     ::!djj&7&78M&Nw'))ro   )r   r   rd   z+Union[ir.ComputedBuffer, ir.TemplateBuffer]r   r8  NN)r%  *Optional[tuple[dict[Any, Any], list[Any]]]r&  zOptional[Callable[_P, _T]]r   r8  )r%  r  r&  zOptional[Callable[..., Any]]r   r8  )r'  r   r<  r   r   r8  )rC  Sequence[int]r   r8  r7  r   rX   )rU  r   rV  r   r   r8  r  r5  )r   Sequence[Sequence[sympy.Expr]]r9  r  )rp  Sequence[sympy.Expr]r   r8  )rp  r  r   zdict[sympy.Expr, sympy.Expr])rp  r  r   r8  r  )r  r   r   rP  )r   rP  r  r  )"r   r   r   r   r=  rY  r"  r2  r>  rD  rM  rO  rR  rT  rX  r  rp  rq  rc   r  r  r  r  rq  rw  r  r  rC   r7  r  r  r  r  r  r  s   @ri   rb   rb     s   
 -,O : 
	 RV@D$N $> 
	F RVBF
$N
 $@
 
	
;;8<;	;<QI"PP),P	P"
Q!.7	. ,
7
8O!
8	%0 !%	
	
	 	
 G G H H + +& * *ro   rb   c           	     n     j                   } j                  t        j                  j	                  |D cg c]  }|j
                   c}             t         fdt        j                  |D cg c]  }|j                   c} D               j
                  j                  z
   _        y c c}w c c}w )Nc              3  Z   K   | ]"  }|j                   j                         vr| $ y wr`   r   r  )rg   r   group_snodes     ri   rj   z2refresh_group_node_dependencies.<locals>.<genexpr>  s.      
xx{;;== 
   (+)
r   r  r%   
ReadWrites
merge_listr   r   unionrR  ro  )r  r   r'  s   `  ri   refresh_group_node_dependenciesr    s     F**6+JaAMM+JK
 	 
!'')O1!*>*>)OP
 	

 
!
!
(
(	) " ,K *Ps   B-0B2r   c                   t        | t        t        f      sJ || _        || _        d | _        t        j                  |D cg c]  }|j                  |j                   c} | _        t        |        t        d | j                  D              | _        t        d | j                  D              | _        | j                         D ci c]  }|j                         | c}| _        y c c}w c c}w )Nc              3  4   K   | ]  }|j                     y wr`   rE  rg   r'  s     ri   rj   z"init_group_node.<locals>.<genexpr>       HHr  c              3  4   K   | ]  }|j                     y wr`   )rF  r  s     ri   rj   z"init_group_node.<locals>.<genexpr>  r  r  )ra   r   GroupedSchedulerNoder   r   rd   r   r  r   r  rf  rE  r  rF  r[  r   rM  )r  r   r   r'  r   s        ri   init_group_noder    s    
 k$68L#MNNNK%KK&,,%	A!)@!++	AK $K0H[5G5GHHKH[5G5GHHK'2'>'>'@# ##K 
B#s   C*C*	C/c                      e Zd ZU dZded<   e	 	 	 	 	 	 d!d       Zd"dZd#dZe	d$d       Z
	 	 	 	 	 	 d%dZd& fd	Ze	d'd
       Zd'dZe	d(d       Zd)dZd'dZd'dZ	 	 	 	 	 	 d* fdZe	d(d       Ze	d(d       Zd+dZd'dZe	d,d       Ze	d,d       Ze	d,d       Ze	d,d       Ze	d-d       Zd.dZe	d,d       Zd/dZd0dZ d1dZ!d'dZ"e	d, fd        Z# xZ$S )2r   z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    r  r   c           	        |j                   |j                   u sJ t        |t        t        f      sJ |j	                         rt        |t
              rt        |j                  t              sJ t        |j                  j                        dk(  sJ t        t        t        |j                  j                              t              sJ t        t        |j                  j                              j                  }|j                         D cg c]  }|j	                         s| }}t        |      dk(  sJ |d   }t        |j                  j                        dk(  sJ t        t        |j                  j                              }t        |t               sJ t#        t!        ||j$                  |j&                  |j(                  |j*                        g      |j                  _
        nt        |t        t        f      sJ t-        t/        j0                  |j                         |j                                     } | |j                   |      S c c}w )Nr   r   )r   ra   rb   r   r  rl  rd   r8   r   r   ro  r  r  r0   r   rl   r/   r   r   	var_namesr   r  r   r  r  )	ry   r   r   r   rd   template_nodesr  writer  s	            ri   fusezFusedSchedulerNode.fuse&  s    %//111%-1C!DEEE:e5N#O ejj+666u((//0A555d4(9(9(@(@#ABGLLLU..5567<<D/4/@WtDDTDTDVdWNW~&!+++*1-M}00778A===m77>>?@EeY///'1ekk5??EJJ

(E$ em5G%HIIIY__U__%68IJK5??E**! Xs   I'Ic                    | j                   D ]6  }t        |t              sJ |j                         sJ |j	                          8 | S r`   )r   ra   rb   rc   rO  r   rh   s     ri   rO  z,FusedSchedulerNode.extract_pw_from_reductionH  sJ    {{ 	0Gg}555'')))--/	0 ro   c                j    | j                   D ]$  }t        |t              sJ |j                          & y r`   )r   ra   rb   rM  r  s     ri   rM  z(FusedSchedulerNode.swap_pw_red_dimensionO  s1    {{ 	,Gg}555))+	,ro   c                    t        t        d d | j                         D                    }t        |      dk(  ry t	        |      }|S )Nc              3  |   K   | ]4  }|j                         s|j                         r|j                          6 y wr`   r  r   r~  r  s     ri   rj   z4FusedSchedulerNode.estimate_flops.<locals>.<genexpr>Z  6      '')T^^-= '')   :<r   r   filterrl   r   r   r   fpsr  s      ri   r~  z!FusedSchedulerNode.estimate_flopsT  K      $ 0	
 s8q=#h
ro   c                   | j                         ryd}| j                  D ]`  }t        |t              sJ |;t	        |      t	        |j
                  d         k7  rt        j                  d        y|j
                  d   }b d}|J t        |      |j                  cxk(  r|j                  k(  rn n|j                  |      }|s%t        j                  d| j                                yt        xj                  dz  c_        t        j                  d| j                         |       | j                  D ]%  }t        |t              sJ |j                  |       ' t        |        y)	z@
        Return true if a loop reordering is performed.
        FNr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %sT)r  r   ra   rb   r   r   r]  r  r   rZ  r[  r   r'   r\  rD  r  )r   r  r  r^  r  rC  s         ri   r  z,FusedSchedulerNode.reorder_loops_by_dep_pairf  sL    
[[ 	)Ee]333%%
*;uU\\RS_?U*U!''G aJ	) 	%%%z?h//E93E3EE ;;IFI##a ##q(#;T]]_i	
 [[ 	2Ee]333&&y1	2 	(-ro   c                    t         |   |       t        | ||       g | _        t	        |d       j
                  | _        y )Nc                4    t        | j                               S r`   )r   rc   r  s    ri   rW  z-FusedSchedulerNode.__init__.<locals>.<lambda>  s    s1>>3C/D ro   r  )r  rY  r  r   r  rx   )r   r   r   r  s      ri   rY  zFusedSchedulerNode.__init__  s8    #i0%'
%DEKK
ro   c                z    dj                  | j                  D cg c]  }|j                          c}      S c c}w N_r`  r   r   r   r'  s     ri   r   zFusedSchedulerNode.get_name  )    xxt{{;!;<<;   8c                <    | j                   d   j                         S r   r   r   r  s    ri   r  z!FusedSchedulerNode.get_first_name      {{1~&&((ro   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w r`   r   r  r   r  r  s     ri   r  z#FusedSchedulerNode.get_buffer_names  .    !L1!"4"4"6!LMM!L   9c                j    g }| j                   D ]!  }|j                  |j                                # |S r`   r   r  r[  r   r  rd   s      ri   r[  zFusedSchedulerNode.get_outputs  4    (*KK 	.DMM$**,-	.ro   c           
     ~   t        | j                        D cg c]+  \  }}| j                          d| d|j                          - }}}| j                  d   j                  }||j                  | j                                t        j                  dj                  |      j                         d      S c c}}w )Nz.snodes[z] =
r   rg  r  )r  r   r   r  rd   r  ry  r  r  r`  rt  )r   r  rd   ra  s       ri   rp  z"FusedSchedulerNode.debug_str_extra  s     %T[[1
4 }}xs%0@/AB
 
 {{1~""LL3356tyy/668&AA
s   0B9c                h    | j                   D cg c]  }|j                          }}|  d| S c c}w )Nz
, snodes: )r   r  )r   rd   
snodes_strs      ri   r  z"FusedSchedulerNode.debug_str_short  s9    9=Ed**,E
Ez*.. Fs   /c                    t         |   ||       t               }t        | j                        D ]/  }|j                  ||       |j                  |j                         1 y r`   )r  r  r   r   r   updaterD  )r   r  r  rd   r  s       ri   r  z!FusedSchedulerNode.set_last_usage  s\    
 	24FG 0:|T[[) 	8D 35GH&&t7	8ro   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w r`   )r   r  r   r   r  s     ri   r   z$FusedSchedulerNode.used_buffer_names  s.    !MA!"5"5"7!MNN!Mr  c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w r`   )r   r  r   r  r  s     ri   r  z/FusedSchedulerNode.used_or_aliased_buffer_names  s3    8<D1a,,.D
 	
Dr  c                    | j                   S r`   r\  r  s    ri   rl   zFusedSchedulerNode.get_nodes  rc  ro   c                T    t        |       j                   d| j                          dS )Nz(nodes=r_  r`  r  s    ri   ra  zFusedSchedulerNode.__repr__  s'    t*%%&gdmmo->a@@ro   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wr`   )rc   r  s     ri   rj   z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>  s     91>>#9r  r   r   r  s    ri   rc   zFusedSchedulerNode.is_reduction  s    9T[[999ro   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wr`   )r  r  s     ri   rj   z6FusedSchedulerNode.is_native_matmul.<locals>.<genexpr>  s     =A1%%'=r  r  r  s    ri   r  z#FusedSchedulerNode.is_native_matmul  s    ====ro   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wr`   )r  r  s     ri   rj   z3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>  s     :1??$:r  r  r  s    ri   r  z FusedSchedulerNode.is_split_scan  s    :dkk:::ro   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wr`   r  r  s     ri   rj   z1FusedSchedulerNode.is_template.<locals>.<genexpr>  s     8q1==?8r  r  r  s    ri   r  zFusedSchedulerNode.is_template  s    8DKK888ro   c                j    | j                   D ]$  }|j                         s|j                         c S  y r`   )r   r  r  r   rd   s     ri   r  z$FusedSchedulerNode.get_template_node  s5    KK 	0D!--//	0 ro   c                     | j                   d   S r   )rx   r  s    ri   r   zFusedSchedulerNode.get_device  s    zz!}ro   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wr`   )r  r  s     ri   rj   z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s     EA1--/Er  r  r  s    ri   r  z+FusedSchedulerNode.has_aliasing_or_mutation  s    EEEEro   c                    t         r`   NotImplementedError)r   r  s     ri   r  z'FusedSchedulerNode.update_mutated_names      !!ro   c                    t         r`   r  )r   r   s     ri   r  zFusedSchedulerNode.add_fake_dep  r  ro   c                    t         r`   r  r  s     ri   r  zFusedSchedulerNode.can_inplace  r  ro   c                P   | j                         }dj                  d | j                  D              }t               }|j	                  | dt        |       j                   d| d| dt        | j                  j                         d| dt        | j                         d| d	t        | j                  j                  | j                  z
         d| d
       |j                         5  | j                         D ]!  }|j	                  |j                                # 	 ddd       |j                  d       	 |j	                  | j!                                |j)                         j+                         S # 1 sw Y   XxY w# t"        $ r t$        j'                  dd       Y Lw xY w)rc  r  c              3  F   K   | ]  }t        |      j                    y wr`   )r   r   r  s     ri   rj   z/FusedSchedulerNode.debug_str.<locals>.<genexpr>  s     FQQ 0 0Fs   !r  rd  re  rf  rg  rh  ri  z.outputs = [
            Nr  rj  Trk  )r   r`  r   rK   rm  r   r   r  r   ro  rR  r   r  r[  r  r	  rp  rq  rr  rs  r  rt  )r   r   node_typestrr   r   s        ri   r  zFusedSchedulerNode.debug_str  s   }}xxF$++FF

bd		Q|n -j))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 	
 ZZ\ 	,'') ,

3==?+,	, 	c	HJJt++-.  ''))	, 	,  	HKK7$KG	Hs   )5E69F 6E? F%$F%c                p    | j                   t        d | j                   D              S t        |          S )Nc              3  <   K   | ]  }|j                           y wr`   )r  r  s     ri   rj   z6FusedSchedulerNode.has_side_effects.<locals>.<genexpr>  s     G4t,,.Gr  )r   r   r  r  r  s    ri   r  z#FusedSchedulerNode.has_side_effects  s0    ;;"G4;;GGGw'))ro   r   rX   r   rX   r   r   r  r7  r  r  )r   r   r   r  r   r8  r5  r  r   rJ  r  r  r9  r  )r   torch.devicer  )r   r.   r   r8  r  )%r   r   r   r   r=  r   r  rO  rM  rC   r~  r  rY  r   r  r  r[  rp  r  r  r   r  rl   ra  rc   r  r  r  r  r   r  r  r  r  r  r  r  r  s   @ri   r   r     s    $#+%+.?+	+ +B,
  "(!(.7(	(TL = =) N N	B/8#28HV8	8 O O 
 

A : : > > ; ; 9 9   F F
"""*4 * *ro   r   c                  D     e Zd Zd fdZ	 	 	 	 	 	 ddZddZddZ xZS )FusedMixOrderReductionsc                    || _         || _        t        |   |j                  t        |j                               t        |j                               z          t        j                  | j                         | _	        y r`   )
r   r   r  rY  r   r   rl   r]   r   numel)r   r   r   r  s      ri   rY  z FusedMixOrderReductions.__init__  s\    

OOT%//"34tEOO<M7NN	
 '00<
ro   c                   t        |t              rJ t        |t              rJ | j                  j                  ||d      sydd}	 	 	 	 dd}|r' |||f       ||      z  s ||       |||f      z  ry|j	                          xsC t        j                  t        | j                  j                  ||d            | j                  k\  S )a  
        node1 is from the current mix order reduction; node2 is another node we want to fuse in.

        other_nodes are passed in to check if fusion will introduce producer/consumer relationship
        between the inner and outer reduction. If yes, we don't fuse.
        Fallow_mix_order_reductionc                B    t               } |j                  d | D         S )Nc              3  4   K   | ]  }|j                     y wr`   )r   r  s     ri   rj   zTFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestors.<locals>.<genexpr>=  s     :qq{{:r  r   r  r  r   s     ri   _get_ancestorszAFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestors;  s     ,C399:E:;;ro   c                B    t               } |j                  d | D         S )Nc              3  <   K   | ]  }|j                           y wr`   )r   r  s     ri   rj   zZFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_names.<locals>.<genexpr>C  s     F1q446Fr  r  r  s     ri   _get_operation_nameszGFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_names?  s"     ,C399FFGGro   )count_bytes)r  tuple[BaseSchedulerNode, ...]r   rC  )
ra   r  r   r   rc   typingcastr   score_fusion_memoryr  )r   r   r   other_nodesr	  r  s         ri   sub_node_can_fusez)FusedMixOrderReductions.sub_node_can_fuse&  s     e%<===e%<===
 ~~&&ueu&U	<	H0	H	H u~.1Ek1RR{+.BE5>.RR ""$$ {{T^^77uRW7X zz	
ro   c                   t        |t              sR| j                  | j                  || j                  f      xs( | j                  | j                  || j                  f      S | j                  | j                  |j                  | j                  |j                  f      xr/ | j                  | j                  |j                  t                     S r`   )ra   r  r  r   r   r   r   others     ri   can_fuse_withz%FusedMixOrderReductions.can_fuse_withS  s    %!89))

EDJJ= J''

EDJJ=IJ ))

EKK$**ekk)B K((U[[%'JKro   c                T   | j                   j                         }| j                  j                  |      }t	        |t
              rX|j                  | j                   |j                         }|j                  | j                  |j                        }t        ||      S | j                  | j                   || j                  f      r2|j                  | j                   |      }t        || j                        S |j                  | j                  |      }t        | j                   |      S r`   )	r   r   r   r+  ra   r  r  r   r  )r   r  r  backendfused_node1fused_node2r  s          ri   	fuse_withz!FusedMixOrderReductions.fuse_with_  s    &&(..,,V4e45!,,tzz5;;?K!,,tzz5;;?K*;DD%%djj%$**G$\\$**e<
.z4::FF$\\$**e<
.tzz:FFro   r  )r   rX   r   rX   r  r  )r  rX   )r   r   r   rY  r  r  r  r  r  s   @ri   r  r    s6    =+
 +
 !+
 3	+
Z
KGro   r  c                  N    e Zd ZU dZ	 	 	 	 ddZ	 	 	 	 ddZedd       Ze	 	 	 	 	 	 dd       Z	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	e	 	 	 	 dd       Z
e	 	 	 	 dd       ZeZd	ed
<   e	 	 	 	 dd       Ze	 	 	 	 dd       ZddZddZddZddZd dZd!dZ	 	 	 	 d"dZ xZS )#ForeachKernelSchedulerNodez
    This is a schedular node that consists of a set of scheduler nodes that
    has no data dependencies among them and can be executed in parallel.
    c                    |j                         D ]=  }|j                         | j                  v s | j                  |j                            c S  y r`   )r[  r   read_to_node)r   producerr   s      ri   get_consumer_subnode_forz3ForeachKernelSchedulerNode.get_consumer_subnode_forv  sL     '') 	9C||~!2!22((88	9 ro   c                   t        t                  }|j                  j                  D ]  }|j                  | j
                  j                  vr&| j
                  j                  |j                     j                         }|| j                  v sf|j                  | j                  |           t        |      dk(  rt        t        |            S y Nr   )r   rX   r   r   r   r   r#  r   name_to_noder  r   r  r  )r   consumer	producersrd	node_names        ri   get_producer_subnode_forz3ForeachKernelSchedulerNode.get_producer_subnode_for  s     013	&&,, 	<Bwwdnn88822277;LLNID---d//	:;	< y>QY((ro   c                   t        |      }j                         r|j                         rt        j                  t              t        j                  t        |      }t        j                        t        |j                        k(  }|s |d       |xr2 t        fdt        j                  |j                        D              S |j                         rkj                         r	 |d       yt        j                  t        |      }|j                        }||j                  j                  |      S  |d       yj                         rk|j                         r	 |d       yt        j                  t              j                  |      }|j                  j                  ||      S  |d       yt        d      )	Nzforeach do not have same lengthc              3  \   K   | ]#  \  }}j                   j                  ||       % y wr`   )r   r   )rg   lrr!  s      ri   rj   z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>  s0      )Aq ""++Aq1)s   ),zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r  r  r  r  r  r   r   rk   ru  rc   r"  r   r   r*  r  )ry   r!  r&  whyforeach_matchconsumer_subnodeproducer_subnodes    `     ri   r   z#ForeachKernelSchedulerNode.can_fuse  s   (+ X%8%8%:{{#=xHH{{#=xHH0C4HHM 56  S )A) &    "$$&n {{#=xHH'@@J+))228=MNNGH  "$$&n {{#=xHH'@@J+))223CXNNGHf
 	
ro   c                
   |j                         s|j                         sJ |j                         r3t        j                  t        |      }|j                  }|j
                  }n2t        j                  t        |      }|j                  }|j
                  }d }d }|j                         r|j                         r|t        j                  t        |      }t        j                  t        |      }t        |j                  |j                        D cg c]  \  }}t        j                  ||       }	}}n/|j                         rt        j                  t        |      }|j                  |      }
g }	|}d }|j                  D ]A  }||
u r*t        j                  ||      }|}|	j                  |       1|	j                  |       C n|j                         rt        j                  t        |      }|j                  |      }g }	|}d }|j                  D ]A  }||u r*t        j                  ||      }|}|	j                  |       1|	j                  |       C nt        d       | |j                  |	||||      S c c}}w )NzTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)r  r  r  r  r4  r7  ru  r   r   r  r*  r   r"  r  r   )ry   r!  r&  r4  r7  r5  r6  r-  r.  fused_nodesr2  rd   new_noder1  s                 ri   r  zForeachKernelSchedulerNode.fuse  sZ    ""$(;(;(=== {{#=xHH(0(J(J%&66O{{#=xHH(0(J(J%&66O X%8%8%:{{#=xHH{{#=xHH  AAq #''1-K    "{{#=xHH'@@JK"KK  -++166tXFH"*K&&x0&&t,-   "{{#=xHH'@@JK"KK  -++166xFH"*K&&x0&&t,- !f  &?##+
 	
Ks    I?c                    i  _         i  _        ||qt           ||       |D ]Z  }|j                  j
                  D ]  }| j                   |j                  <    |j                         D ]  }	| j                  |	<    \ n| _        | _	        d  _
        g  _         j                  t        j                  j                  |j                  |j                  g             t!         fdt!        j"                  |j$                  |j$                        D               j                  j&                  z
   _        t)        |j*                  |j*                  g       _        t-        |j.                  |j.                  g       _        |j1                         rt3        |t4              sJ ||}}
nt3        |t4              sJ ||}}
|
j6                   _         j6                  j9                  |j6                         |
j                   _        |j                         D ]  }	| j                  |	<     j                  D ci c]'  }|j:                  j=                         D ]  \  }}||
 ) c}}} _        | _        |d   jA                         }|sJ |tC        jD                  d      fff _#        t!        tH        jJ                  jL                             _'        | _(        y c c}}}w )Nc              3  Z   K   | ]"  }|j                   j                         vr| $ y wr`   r  r  s     ri   rj   z6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>$	  s0       xxt'<'<'>>	 r  r   combo_kernel))r   r%  r  rY  r   r   r   r   r   r   rd   r   r  r%   r  r  r   r  rR  ro  rf  rE  r  rF  r  ra   r  r   r  rM  itemsr4  r   r   Exprrx   r}  fxNoder<  r7  )r   r   r   r4  r5  r6  r7  rd   r%  r   foreach_noder   r  r  vr  r  s   `               ri   rY  z#ForeachKernelSchedulerNode.__init__	  s    +"5GY/ 3 ,,22 8D37D%%dii08 !446 3D.2D%%d+3	3 'DN DKDI)+DJ  ''22 ,,k.E.EF  )//#668V8V   ""))* # !+"7"79N9N!OPDN +"7"79N9N!OPDN%%'!+/IJJJ+6j!+/IJJJ+6j)33DNNN!!*"6"67 , 9 9D"668 5*4!!$'5 #'++@ @%:O:O:U:U:W@26!Q1@@D  *C&%%'v

> :<>?
!%((--02.@s   ,K&c           	        |D cg c]  }t        |t              s| }}|rSt        j                  dt	        |      |D cg c])  }|j
                  |j
                  j                         + c}       |D cg c]  }t        |t              s| }}|rt        j                  dt	        |             |D cg c]  }t        |t        t        t        f      s|! }}|D cg c]  }t        |t              s| }}|rt        j                  dt	        |             |D cg c]  }t        |t              r| }}|D cg c]  }|j                         s| }}|r t        j                  dt	        |      |       |D cg c]	  }||vs| }}|S c c}w c c}w c c}w c c}w c c}w c c}w c c}w c c}w )Nz/ComboKernels: %d external nodes are filtered %sz+ComboKernels: %d grouped nodes are filteredz+ComboKernels: %d foreach nodes are filteredz0ComboKernels: %d template nodes are filtered: %s)ra   rl  rr  r  r   rd   r4  r  r  r  r  )	ry   r  r'  externrd   groupedfiltered_nodesforeach_nodesr  s	            ri   combinable_nodesz+ForeachKernelSchedulerNode.combinable_nodesJ	  s    #Oj4M&N!OOIIAF5;UTtyy?T&&(U
 $Kz!5I'J1KKII=G 
*-( 
 
 &
A7Q)RA
 
 IICSEWX%
Z;U-VA
 
 &4Gq}}!GGIIBN#
 &4Oq7N!OOS P
 VK



 H PsX   FFFF:F$F$;$F)%F.;F.&F3<F3F8F8	F=F=c                   | j                         }g }d}|D ]  }t        t              }|D ]G  }|j                         }|r|j                  dk(  s|j                  dk(  r4||   j                  |       I |j                         D ];  }|j                  t        dt        |      |      D 	cg c]
  }	||	|	|z     c}	       =  |S c c}	w )zS
        Returns a list of lists of nodes that are to be grouped together.
           mpsr  r   )
_topological_sort_nodesr   r   r   r   r   r   r  rH  r   )
r   sorted_nodesgrouped_nodesmax_num_nodesr  device_groupsrd   r  device_nodesr  s
             ri   &_default_group_nodes_for_combo_kernelszAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernelsy	  s     !88:! 	E D!   3*v{{e3v{{e7Kf%,,T2	3 !. 4 4 6 $$ "'q#l*;]!K %Q]):;	( s   +C4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelsc                    | t         _        y r`   r  rT  )custom_group_algorithms    ri   %set_group_algorithm_for_combo_kernelsz@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernels	  s    
 # 	#Dro   c                ,    t         j                  |       S r`   rV  r   s    ri   group_nodes_for_combo_kernelsz8ForeachKernelSchedulerNode.group_nodes_for_combo_kernels	  s     *KKIVVro   c                    t         r`   r  r  s    ri   r  z#ForeachKernelSchedulerNode.mark_run	  r  ro   c                    t         r`   r  r  s    ri   r  z"ForeachKernelSchedulerNode.codegen	  r  ro   c                     yr  r   r  s    ri   r  z%ForeachKernelSchedulerNode.is_foreach	  r  ro   c                ,    t        | j                        S )zeReturns a list of nodes which comprise the combo kernel.
        These nodes may be vertically fused.)r   r   r  s    ri   get_subkernel_nodesz.ForeachKernelSchedulerNode.get_subkernel_nodes	  s     DKK  ro   c                t    t        t        j                  j                  d | j                  D                    S )zqReturns all nodes contained in this kernel, unpacking fused nodes
        into their constituent scheduler nodes.c              3  <   K   | ]  }|j                           y wr`   )rl   r  s     ri   rj   z7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>	  s     1UA!++-1Ur  )r   r  r  rv  r   r  s    ri   rl   z$ForeachKernelSchedulerNode.get_nodes	  s(     IOO111U1UUVVro   c                <    | j                   d   j                         S r   )r   r  r  s    ri   r  z)ForeachKernelSchedulerNode.get_first_name	  s    {{1~,,..ro   c                    t        | || j                  j                         | j                  D ]  }|j	                  |        y r`   )r  r   r#  r   r  )r   r  rd   s      ri   r  z/ForeachKernelSchedulerNode.prune_redundant_deps	  s=     	d$68R8RSKK 	:D%%&89	:ro   )r!  rX   r   r   )r&  rX   r   r   r!  rX   r&  rX   r   r   )r!  rX   r&  rX   r   r  )NNF)r   r   r   r  r4  r   r5  r   r6  r   r7  r   r   r8  r  r  r   r  )r   r   r   list[list[BaseSchedulerNode]])rW  rS  r   r8  r7  r9  r   r  r  r5  r  )r   r   r   r   r"  r*  r   r   r  rY  rH  r   rR  rT  r=  rX  r[  r  r  r  r`  rl   r  r  r  r  s   @ri   r  r  p  s   
)	$)	$& ,
 ,
\ >
(>
4E>
	#>
 >
J 4837 %F/F/ (F/ $(	F/
 1F/ 1F/ F/ 
F/P ,+,	 , ,\ 	& B 	/ & ( / 
 T
	
 
 WW	&W W
""!
W
/:">:	:ro   r  c                       e Zd ZU dZded<   edd       Z	 d	 	 	 	 	 	 	 d fdZddZddZ	e
dd       Zdd	Ze
dd
       ZddZe
dd       ZddZddZedd       Z xZS )r  aC  
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be *grouped* together (it does not allow another node to be scheduled
    in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
    The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
    Fusion will still happen among the nodes within each GroupedSchedulerNode.
    At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
    r  r   c                    |d   j                   t        fd|D              sJ  | |      }|D ]  }|j                  |j                         <   ! |j                  |j                         <   |S )Nr   c              3  :   K   | ]  }|j                   u   y wr`   rZ  )rg   rd   r   s     ri   rj   z.GroupedSchedulerNode.create.<locals>.<genexpr>	  s     B44>>Y.B   )r   rk   r  r   )ry   r   grouped_snoder  r   s       @ri   createzGroupedSchedulerNode.create	  sy    1I''	B6BBBBIv. 	KE=JI(()9:	KAN	$$]%;%;%=>ro   c                L    t         |   |       t        | ||       || _        y r`   )r  rY  r  temp_grouping)r   r   r   rp  r  s       ri   rY  zGroupedSchedulerNode.__init__	  s(     	#i0 +ro   c                6   | j                   r| j                  S | j                  D ])  }|| j                  j                  |j	                         <   + | j                  j                  | j	                         = | j                  j                  | j                        S )z
        Do fusion among nodes within this GroupedSchedulerNode,
        and then unpack this GroupedSchedulerNode into regular nodes.
        )rp  r   r   r  r   
fuse_nodes)r   r  s     ri   unpackzGroupedSchedulerNode.unpack	  sx    
 ;;[[ 	HEBGDNN--enn.>?	HNN--dmmo>~~((55ro   c                    | j                  | j                  j                  |             | j                  j	                  |       y r`   )r  r   r  rR  r  )r   fake_deps     ri   r  z!GroupedSchedulerNode.add_fake_dep	  s5    T--77AB##H-ro   c                z    dj                  | j                  D cg c]  }|j                          c}      S c c}w r  r  r  s     ri   r   zGroupedSchedulerNode.get_name 
  r  r  c                <    | j                   d   j                         S r   r  r  s    ri   r  z#GroupedSchedulerNode.get_first_name
  r  ro   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w r`   r  r  s     ri   r  z%GroupedSchedulerNode.get_buffer_names
  r  r  c                j    g }| j                   D ]!  }|j                  |j                                # |S r`   r  r  s      ri   r[  z GroupedSchedulerNode.get_outputs
  r  ro   c                    t        t        d d | j                         D                    }t        |      dk(  ry t	        |      }|S )Nc              3  |   K   | ]4  }|j                         s|j                         r|j                          6 y wr`   r  r  s     ri   rj   z6GroupedSchedulerNode.estimate_flops.<locals>.<genexpr>
  r  r  r   r  r  s      ri   r~  z#GroupedSchedulerNode.estimate_flops
  r  ro   c                    | j                   S r`   r\  r  s    ri   rl   zGroupedSchedulerNode.get_nodes#
  rc  ro   c                X    | j                   r| j                   d   j                         S d S r   )r   r   r  s    ri   r   zGroupedSchedulerNode.get_device&
  s$    .2kkt{{1~((*CtCro   c                     yr  r   )ry   r!  r&  s      ri   r   zGroupedSchedulerNode.can_fuse)
  r  ro   )r   r  r   r  F)r   r   r   r  rp  r   r   r8  rh  )ru  r.   r   r8  r5  r  r  r  r  r;  re  )r   r   r   r   r=  r   rn  rY  rs  r  rC   r   r  r  r[  r~  rl   r   r   r  r  s   @ri   r  r  	  s     $#  $	++ (+ 	+
 
+6. = =) N N  "D  ro   r  c           
          t         j                  d fd       }t        t        t	        t         d                           }t        |      dkD  r|D cg c]  } |   	 c} t        j                  r|j                  |       |S c c}w )z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    c                t   |    dk(  s|   dk(  rt        |    dk(  |   dk(        S D cg c]  }t        ||           }}D cg c]  }t        ||          }}t        d t        ||      D              }t        d t        ||      D              }||kD  ry||kD  ryt        ||       S c c}w c c}w )Nr   c              3  :   K   | ]  \  }}|d k(  xs ||k    ywr   Nr   rg   sl_asl_bs      ri   rj   z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>F
  )      
)3tDAI$$
   c              3  :   K   | ]  \  }}|d k(  xs ||k    ywr  r   r  s      ri   rj   z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>I
  r  r  r   )rD   absr   ru  )	r  bslstride_len_astride_len_ba_firstb_firstrB  stride_lengthss	          ri   	index_cmpz"pick_loop_order.<locals>.index_cmp9
  s    8q=E!HMuQx1}eAh!m44 .<<rBqE
<<-;<rBqE
<<  
7:<7V
 
  
7:<7V
 
 WW 1ay# =<s   B0	B5r   r  )r  r   r  r   r   r   )		functools
cmp_to_keyr   r   rH  r   r#   pick_loop_orderssort)r  rB  priority_idxr  orderpis   ``    ri   pick_loop_orderr  /
  s      4 %N1$5 6789E
<17CD.,D

y
!L Es   Bc                   |j                         }| j                         }t        |t              rt        |t              sJ |j                         }| j                         }t        |t              rt        |t              sJ t        j
                  j                  |= ||_        t        j
                  j                  |= ||_	        t        j
                  j                  j                  |       }t        j
                  j                  j                  |       |t        j
                  j                  |<   |t        j
                  j                  |<   t        j
                  j                  j                  |       }t        j
                  j                  j                  |       |t        j
                  j                  |<   |t        j
                  j                  |<   y r`   )r   ra   r   r  rT   rs   r  r   
name_to_opoperation_namebuffersr   remove
operations)	orig_noder9  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigs          ri   _replace_operation_bufferr  ]
  sU    !))+&&(MmS)j9JC.PPP224//1LlC(Z8H#-NNN	01!HM	+,*H77??  +DGGOO8$$AGGOOD,4AGG=)77##I.DGGh''AGGt'/AGG|$ro   c                  T    e Zd ZU ded<   dZded<   dZded<   ddZddZdd	Zdd
Z	y)NodeUser$Union[BaseSchedulerNode, OutputNode]rd   Fr   r  is_weakc                v    t        | j                  j                         | j                  | j                  f      S r`   )r  rd   r   r  r  r  s    ri   r  zNodeUser.__hash__
  s+    TYY'')4+;+;T\\JKKro   c                    t        |t              xrW | j                         |j                         k(  xr4 | j                  |j                  k(  xr | j                  |j                  k(  S r`   )ra   r  r   r  r  r  s     ri   __eq__zNodeUser.__eq__
  s[    uh' .5>>#33.  E$5$55. -		
ro   c                6    | j                   j                         S r`   r  r  s    ri   r   zNodeUser.get_name
  r  ro   c                    | j                   |j                   u sJ t        | j                   | j                  xr |j                  | j                  xr |j                        S r`   )rd   r  r  r  r  s     ri   r/  zNodeUser.merge
  sP    yyEJJ&&&II2!2!2LL*U]]
 	
ro   Nr6  )r  objectr   r   r5  )r  r  r   r  )
r   r   r   r=  r  r  r  r  r   r/  r   ro   ri   r  r  y
  s3    
..K GTL
$
ro   r  c                 "    t         j                  S r`   )r#   r  r   ro   ri   *used_non_deterministic_runtime_estimationsr  
  s    333ro   c                   t               }| j                         }t        |t        j                        r|j                  t        |j                        t        |j                        z  t        |j                        z         t        |t        j                        r$|j                  t        |j                               |S |
J d|        |S )z=Get free symbols from a node's layout (size, stride, offset).z*Expect layout to be None but found layout=)r   maybe_get_layoutra   r&   Layoutr  r   r   strideoffsetr   get_layout_symintsr5  )rd   free_symbol_usesr
  s      ri   r  r  
  s    1;""$F&"))$%6==)*6==)*	

 fb;;<##$6v}}$EF  ~T!KF8TT~ro   c                "   t        | t              r( t               j                  d | j                  D         S | j
                  J | j
                  j                         } |j                  d | j
                  j                         D          |S )z
    Gets symbols used in a scheduler node, including free symbols from
    the node's operations and layout symints from outputs.
    c              3  2   K   | ]  }t        |        y wr`   get_scheduler_node_symbol_uses)rg   r  s     ri   rj   z1get_scheduler_node_symbol_uses.<locals>.<genexpr>
  s     M,U3M   c              3  2   K   | ]  }t        |        y wr`   )r  )rg   ir_nodes     ri   rj   z1get_scheduler_node_symbol_uses.<locals>.<genexpr>
  s     	M'
W
%	Mr  )	ra   r   r   r  r   rd   get_free_symbol_usesr  r[  )rd   r  s     ri   r  r  
  s     $*+!z|!!MM
 	
 99   yy557	MTYY5J5J5L	M ro   c                  ~    e Zd ZdZdOdZdO fdZdPdZedQd       Zej                  dRd       ZdSdZ
dTdZdUd	ZdSd
ZdSdZdSdZdSdZ	 	 	 	 dVdZdWdZdXdZdSdZdSdZdVdZdSdZ	 	 	 	 dYdZ	 dZ	 	 	 	 	 	 	 d[dZ	 	 	 	 	 	 d\dZdSdZ	 	 	 	 	 	 	 	 	 	 d]dZd^dZ	 	 	 	 	 	 d_dZd`dZ	 	 	 	 	 	 dadZ dZdbdZ!dcdZ"	 	 	 	 	 	 ddd Z#	 	 	 	 	 	 ded!Z$	 	 	 	 	 	 ded"Z%	 	 	 	 	 	 	 	 dfd#Z&	 	 	 	 	 	 ded$Z'	 	 	 	 	 	 	 	 dgd%Z(	 	 	 	 	 	 dhd&Z)	 	 	 	 	 	 dhd'Z*did(Z+	 	 	 	 	 	 	 	 djd)Z,	 	 	 	 	 	 dkd*Z-	 	 dl	 	 	 	 	 	 	 	 	 dmd+Z.	 	 	 	 	 	 ded,Z/	 	 	 	 	 	 	 	 dnd-Z0dod.Z1dpdqd/Z2	 	 	 dr	 	 	 	 	 	 	 	 	 	 	 dsd0Z3	 	 	 	 dtd1Z4	 	 	 	 dud2Z5dSd3Z6dSd4Z7dSd5Z8dvd6Z9dwd7Z:dxd8Z;dyd9Z<	 	 	 	 	 	 dzd:Z=	 d{	 	 	 	 	 d|d;Z>	 	 d}d<Z?	 	 	 	 d~d=Z@	 	 	 	 	 	 dd>ZA	 	 	 	 	 	 dd?ZB	 	 	 	 dd@ZC	 	 	 	 dVdAZD	 	 	 	 dVdBZE	 	 	 	 dVdCZF	 	 ddDZGdSdEZH	 	 	 	 	 	 ddFZI	 	 	 	 	 	 ddGZJ	 	 	 	 	 	 ddHZKdSdIZLdcdJZM	 	 	 	 ddKZNddLZOddMZPdSdNZQ xZRS )r   z
    A Scheduler is a graph of BaseSchedulerNodes. It is responsible for
    optimizations such as fusion, reorder, and graph partition.
    c                f    t        d      5  | j                  |       d d d        y # 1 sw Y   y xY w)NzScheduler.__init__)r   _initr   r  s     ri   rY  zScheduler.__init__
  s,    ./ 	JJu	 	 	s   '0c           
         t                     t        j                  _        i  _        t        t               _        t        j                          _        t                _        t        g t        j                  j                  j                         t        j                  j                   j                         t        j                  j"                  j                                _        |D cg c]  } j'                  |       c} _        d  _         j-                           j$                  j/                  t        j                  j                   j                                 j(                  D ]  }|j1                           d  _         j5                          _         j(                  D ci c]  }|j9                         | c} _         j(                  D ci c](  }|j=                         D ]  }|j9                         | * c}} _         j:                  jA                          _!        i  _"        i  _#        tI        jJ                   j(                   j>                   jB                         _         jM                           jO                   j(                         _         jQ                           j(                  D ci c]  }|j9                         | c} _!         jS                          tT        xjV                  tY         j(                        z  c_+        ddl-m.}m/}  | j(                         tY         j(                         _0         jc                           jO                   j(                         _        t        td        tf        tf        f              _4        tj        jl                  $tk        jl                   j(                         _        tj        jn                  r'ddl8m9} |ju                           jS                           jw                   j(                         _        tj        jx                  $tk        jx                   j(                         _         j{                           j}                          tj        j~                  r)t        ddd      5   j                  d        d d d        tj        j                  rdd	lCmB}  | j(                   j>                   jB                  t        t        j                  j                  j                               t        t        j                  j                                      _        tj        j                  stj        j                  rtj        j                  s#dd
lCmG}	  |	 j(                   j>                         t               rvt        j                  rftj        j                  st        j                  rFd}
 j(                  D ]  }t        |j                        sd}
 n |
rddl$mO}  | j(                         ddlPmQ}  |dd  fd       tI        j                   j(                         _         j                          tj        j                  rttj        j                  j                  rZtj        j                  j                  r@ j                   j(                         _         j                   j(                         _         j                          t        j                  jj                  j                  j                  r j                           | j(                         t        j                  j                   j(                          j                          t                _c        i  _d        t        d      j                   fd       t                _g        y c c}w c c}w c c}}w c c}w # 1 sw Y   4xY w)Nr   )log_ir_post_fusionlog_ir_pre_fusionr   )distributed_autotunez#Scheduler.create_combo_kernel_nodesTlog_pt2_compile_eventlog_waitcounter)num_ck_nodes)reorder_for_peak_memory)1assign_memory_planning_info_for_scheduler_buffersF)6align_runtime_estimations_across_all_distributed_ranks)trace_structuredartifactc                     dddS )N#scheduler_nodes_before_comm_overlapstring)r   encodingr   r   ro   ri   rW  z!Scheduler._init.<locals>.<lambda>b  s    A (% ro   c            
         dj                  t        j                        D  cg c]0  \  } }d|  d|j                         z   d|j	                          z   2 c}}       S c c}} w )Nz

zsnode[r  z buffer_names:)r`  r  r  r  r  )r  r  r   s     ri   rW  z!Scheduler._init.<locals>.<lambda>f  sl    6;;
 %.djj$9	 !Aq !1++-(*1+=+=+?*@AB$ s   5A"
)metadata_fn
payload_fngraph_statsc                 ^     j                    j                  t         j                        dS )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesr   r  r  s   ri   rW  z!Scheduler._init.<locals>.<lambda>  s'     33+/+>+>*-djj/ ro   )hr  rY  rT   rs   r   backendsr  _post_grad_graph_counterr  r  count_graph_partition_counterr   r  rs  r   	constantstorchbind_constantsr  create_scheduler_noder  current_nodeupdate_zero_dim_cpu_tensorr  r  default_device_contextget_donated_buffersr"  r   r%  r[  r#  copyr  r  rI  r"   decide_global_ordering_of_commsrW   topological_sort_scheduledead_node_eliminationcompute_ancestorsr'   ir_nodes_pre_fusionr   torch._inductor.debugr  r  r  create_foreach_nodesr   r   logged_slow_fusionr#   _pre_fusion_custom_passdistributed_max_autotune_gemmrv  r  schedulerr  _post_fusion_custom_passrX  finalize_multi_template_bufferscombo_kernelsr   create_combo_kernel_nodesr  memoryget_output_namesdeterministic reorder_for_compute_comm_overlapr  r  r$   6runtime_estimations_align_across_all_distributed_ranksr  r  rL   rd   r  torch._loggingr  $reorder_compute_and_comm_for_overlapprocess_grouped_nodesgraph_partitionr   
cudagraphs%reorder_for_reducing_graph_partitions&maybe_reorder_for_minimizing_partition,reorder_for_partition_with_simple_dependencycompute_last_usager}  r~  test_configstrack_memory_lifecycleinsert_memory_check_nodesr  graph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_indexr   add_rowremoved_ops)r   r  r  rd   r   r  r  r  r  r  has_collectivesr  r  r  s   `            ri   r  zScheduler._init
  s    <>"&'?"@(1(9%5?\!&0%%**,""'') ,,113'
# >CCd003C
9='')##**177+<+<+A+A+CDJJ 	DOO	 ?C# $$& 	# &*ZZ;
 !AJJL!O;
 -1JJ8
$($BRBRBT8
;>CLLNC8
8
 AE@Q@Q@V@V@X 35 13 ::JJ##

 	!!#33DJJ?
""$<@JJ"Gq1::<?"G  	##s4::6#O$**%!$**o!!#33DJJ?
",U38_"="?))577

CDJ//. ))$/""$__TZZ0
**688DDJ,,.5&* $ B
 ..D.AB ))70

  ''177//44671773356DJ ##(O(O11UAJJ 0 0
 ;< WW<<#PP #( JJ D$TYY/*. # K4::V7 CCDJJODJ""$ ""((CCDDTZZPDJJJ4::VDJ!??!!..EE**,4::&	djj) 6@\! :<'//	
 -7Lu D;
8
B #H8B Bs$   5_"8_''-_,_22_77`c                   i }t         j                  j                  D ]d  }t        t         j                  j                  |   t        j
                        s9t        | t         j                  j                  |   d       ||<   f |S )N)r   )rT   rs   graph_inputs_originalra   r&   DonatedBufferrA  )r   name_to_donated_bufr   s      ri   r  zScheduler.get_donated_buffers  sp     GG11 	D!''77=r?O?OP,BGG11$7 $-#D)	 #"ro   c                6    t         j                  j                  S r`   rT   rs   current_devicer  s    ri   r  zScheduler.current_device  s    ww%%%ro   c                .    |t         j                  _        y r`   r  r  s     ri   r  zScheduler.current_device  s    !'ro   c                    t         j                  j                  dd      dk(  rddlm}  || j
                  d       yy)z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr  r  r   r  )r   r   s     ri   r  zScheduler.debug_draw_graph  s1    ::>>:DASH+6 Iro   c                    t         j                  t        j                        r8t         j	                  d|       | j
                  D ]  }|j                           y y )Nz%s:)rr  isEnabledForloggingINFOr  r  r  )r   labelrd   s      ri   debug_print_nodeszScheduler.debug_print_nodes  sF    GLL)HHUE"

 #  "# *ro   c                6   |j                         J d       |j                         rt        | |      S t        |t        j
                  t        j                  f      rt        | |      S t        |t        j                        rt        | |      S t        |      )Nz2All nodes passed to scheduling must have an origin)r4  is_no_opr  ra   r&   r   r(  rb   r  rl  r  r  s     ri   r  zScheduler.create_scheduler_node  s    !- 	
@	
- ==?)$55r00"2C2CDE t,,boo.,T488%d++ro   c                   t               }g }| j                  j                         }t        j                  j
                  j                         D ]  }|D cg c]%  }||v rt        | j                  |   t              s|' }}|s6|j                  |       |D cg c]  }| j                  |    }}t        j                  dkD  }t        | |d|      }|j                  |       |D ]  }|| j                  |<     | j                  D 	cg c]  }	|	j!                         |vs|	 c}	t#        |      z   | _        y c c}w c c}w c c}	w )Nr   Fr4  r7  )r   r  r   rT   rs   listsr   ra   r%  r  r  r#   combo_kernels_autotuner  r   r  r   r   )
r   removed_node_namesfe_nodeskept_node_namesnamesr   r   r7  fe_noderd   s
             ri   r  zScheduler.create_foreach_nodes  sN   .8l11668WW]]))+ 	8E "?*"4#4#4T#:<RS E  %%e,:?@$d''-@F@$;;a?O0*/ /	G OOG$ 807''-81	88 "ZZ
4==?BT+TD
N
5 A
s   *D<EE#Ec                   %&'  G %fddt         t                 %t        j                  %      & j                  D ]  }|j                         D ]  }|j                         }t        |j                  j                  t        j                        rt        |j                               dkD  r^|j                         D ]J  }|&v r/|&v r+&|   }&|   }||z   }&D ]  }&|   |u s&|   |u s|&|<    6|&v r	&|   &|<   C&|   &|<   L   d' fd'	 	 d	 	 	 	 	 	 	 	 	 d&'fd}	i }
t        j                  j                   j#                         D ]  }t        |t$        j&                        r|j(                  D ]  }d|
|<   	 4t        |t        j*                        sO|j-                         D cg c]  }t        |t$        j&                        s|! }}|D ]  }|j(                  D ]  }d|
|<   	   d} j                  D ]s  }|j                  J t/        |j                  j1                         d 	      }|D ]8  }t        |t$        j2                        sJ d
}||
vs&|j                         |
|<   : u  j                  D ]  }t4        j7                  d|j                         |r|j                  J t/        |j                  j9                  d
      d 	      }|D ]d  }||
v sJ | d|
        |
|   x} j:                  |   j                         D ]*  }|j=                  t?        |j                                      , f t        |j@                  jB                        dk(  rGtE        tG        |j@                  jB                              x}rt        |tH              r|jJ                  }nd}|j                         D ]  }t        |jM                               dk  sJ |jM                         D ]  } '|      } |	||       |j=                  t?        ||             &|   jN                  D ]  }|j                         |j                         k(  r%t        |j                  tP              sJ |j                  jS                         D ]?  } '|      }|j=                  tU        ||j                                       |	||d
       A    t        j                  jV                  |j                            D ]8  } |	||d
       |j=                  tU        ||j                         d
             : t        j                  jX                  |j                            D ]'  } |	||d       |j=                  t?        |             ) |j@                  jZ                  D ]6  }t        |tT              r |	|j\                  ||j_                  |             8 |ja                   jb                         |j                         D ]  }|jM                         D ]y  }|j                          jb                   '|      <   |j                          jb                  |<    jd                  jg                  ||       jd                  |j                         <   {   t        j                  ji                         D ]3  }t4        j7                  d|        |	|tk        t?        |                   5 |rt        j                  jl                  D ]  }|j9                  d
      D ]|  }||
v sJ | d|
jo                                 |
|   x}s) j:                  |   jS                         D ]4  }t4        j7                  d||        |	|tk        t?        |                   6 ~   jb                  D ]  }|t        j                  j                   v rE |	|tk        t?        |                   t        j                  jp                  js                  |       d|t        j                  jt                  v s |	|tk        t?        |                    tw        t        j                  j                   jo                               D ci c]  \  }}||
 }}}t        j                  jp                  D cg c]  }||   	 c}t        j                  _<         j                  D ]C  }|j                         D ].  }|j{                  &|j                            jN                         0 E  j|                  D ]-  } j|                  |   j{                  &|   jN                         / t               } | j                  d       &jO                         D ]]  \  }}!| j                         5  |!jN                  D "cg c]  }"|"j                          }#}"| j                  d| d|# d       ddd       _ | j                  d       | j                         j                         }$t        j7                  d       t        j7                  d|$       yc c}w c c}}w c c}w c c}"w # 1 sw Y   xY w)zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        c                  >    e Zd ZdZ	 	 d	 	 	 	 	 ddZddZd	 fdZy)
1Scheduler.compute_dependencies.<locals>.DedupListan  
            This data structure behaves like a list except it makes sure the
            elements remain unique.
            Normally one could use a OrderedSet/dict for this purpose however
            the list in question gets elements appended as it is being
            iterated over which means that we need to keep the list
            semantics.
            Nc                @    |xs g | _         |xs
 t               | _        y r`   )r=  r   
membership)r   r=  r9  s      ri   rY  z:Scheduler.compute_dependencies.<locals>.DedupList.__init__  s    
 #[b
","<
ro   c                    || j                   v ry | j                  j                  |       | j                   j                  |       y r`   )r9  r=  r   r  )r   	node_users     ri   r   z8Scheduler.compute_dependencies.<locals>.DedupList.append  s5    /

!!),##I.ro   c                    t        j                  | j                  |j                        }| j                  |j                  D cg c]  }|| j                  vs| c}z   } ||      S c c}w r`   )r   r  r9  r=  )r   r  new_membershipr'  	new_items	DedupLists        ri   __add__z9Scheduler.compute_dependencies.<locals>.DedupList.__add__	  sc    !+!1!1$//5CSCS!T JJ${{*at.FA* 	 !N;;*s   A+A+r  )r=  zOptional[list[_T]]r9  zOptional[OrderedSet[_T]]r   r8  )r;  rZ   r   r8  )r  DedupList[_T]r   rA  )r   r   r   r   rY  r   r@  )r?  s   ri   r?  r7    s;     -17;=)= 5= 	=/<ro   r?  r   c                N    | j                   v r j                   |          S | S r`   )rI  )r  r  r   s    ri   r  z.Scheduler.compute_dependencies.<locals>.rename6  s,    D)))d33A677Hro   Fc                P     |          j                  t        |||             y r`   )r   r  )used_by_namer  r  r  name_to_usersr  s       ri   add_userz0Scheduler.compute_dependencies.<locals>.add_user;  s)     &./66K9ro   Nc                    | j                   S r`   r  r  s    ri   rW  z0Scheduler.compute_dependencies.<locals>.<lambda>^  s
    AFF ro   r  Tzscheduling %s)unbacked_onlyc                    | j                   S r`   r  r  s    ri   rW  z0Scheduler.compute_dependencies.<locals>.<lambda>q  s
    !&& ro   z not in )r  )mutating_buf)r  )r  zscheduling output %sz+scheduling output %s for unbacked symint %sr0  'z': r  r1  zBUFFER USER LIST
z===== AFTER SCHEDULING =====
%s)r  r   r   r   )FF)
rD  r   r  r  r  r   r  r   r   r8  )Er   rZ   rr  r   r  r[  r   ra   rd   r
  r&   r:   r   r  rT   rs   rs  r   r   r>  r   	TensorBoxr  r  get_unbacked_symbol_defsSymbolrr  r  r  r%  r  r0   r   ro  r  r  r/   r  r  r=  rX   r  r1   additional_buffer_depsadditional_star_depsr   r   r  r  rI  r  r  r  r*  graph_outputsr   mutated_inputsr  r  r  mutated_input_idxsr0  r"  rK   rm  r  r  rt  compute_dependencies_log)(r   rd   buf1	buf1_name	buf2_namelist1list2combinedr  rF  unbacked_symbol_to_origin_nodevalfsrQ  sym_sizehas_non_input_unbacked_defsunbacked_symbol_defsunbacked_symbol_usesr.  r   r   	node_modealt_namer  
other_nameadd_depr%  r  r   r   r   	inp_nameslogbufr  rB  r   r   r?  rE  r  s(   `                                    @@@ri   rW   zScheduler.compute_dependencies  s
   	< 	<@ @K?V?V@
 JJ 	LD((* L MMO	 tyy//?D,,./!3!%!1!1!3 LI M1i=6P -i 8 -i 8#(5=#0 >C -c 2e ;#0#5#>5=c 2> #m33@3Ki03@3Ki0LL	L<	 !&!				 <		 			
 		 		 MO&
 77''..0 
	BC#uzz*** >B9=226>C. (+||~S!Auzz9RASS! BAnn B=A6r:BB
	B ',#JJ 	HD99((( $*		224:J$  * H!!U\\222 /3+::8<215H	H  JJ J	DIIotyy1*yy,,,'-II222F(($
 . GA >> #X&D%EF> <A>>K#'#4#4Q#7#C#C#E GC --gclln.EFGG D$$++,1 d&6&6&=&=!>??S?sI.HH	 	 '') E3,,./1444 # 1 1 3 EH%h/HXt,%%ghY&GH -h 7 = = E==?dmmo=$)$))5FGGG*.))*D*D*F EJ)/
);J -- '
 P %ZtDEEEE* 7799$--/J S$5 !!''4==?D"QR	S 7777H 4$6!!''"234
 ((.. F!$0TYYd.>.>t.DEF %%d&;&;< '')  # 1 1 3 H>AllnD))&*:;69llnD))(3//33HhG ++CLLN;IJ	Z 002 	>HII,h7Xz'(*;<=	>
 'ww,, N111E NA >> #X&D&I&I&K%LM> ;1==q=(,(9(9!(<(M(M(O NHII M ( !
 %Xz'(:K/LMNNN )) 	:Dqww+++z'$-89&&**40***z'$-89	: ,5QWW5I5I5N5N5P+Q
'E4D%K
	 
 )*(>(>&
 $IdO&
"
 JJ 	CD'') CmCLLN;AABC	C // 	SD''-77d8K8Q8QR	S  !c'--/ 	4JC 4/4{{;!;;#c%234 4	4 	c  "))+ &&';< &&'I3Os T@
&
" <4 4s6   5iii8i)i#8ii#i##i,	c           
         ddl m}m}m}m} t        t        j                  j                  j                               } | j                  |      }t        j                  j                  j                  s | j                   j                         t        t        j                  j!                               } | j                  ||      \  }}	}	t#        t%         j                              D 	cg c]  }	g g f c}	|D ]}  }
|
j&                  dk(  r|
j(                  dk(  r"|
j*                  j-                         }|
j.                     d   j1                  |       |
j2                     d   j1                  |        ddlm}  |        	 	 	 	 	 	 d fd}g }t9         j                        D ]H  \  }}|j1                  |       |j1                   |||t%         j                        dz
  k(               J | _
        y c c}	w )Nr   )r  compute_memory_timelineFreeableInputBufferget_freeable_input_bufr   )register_check_mem_opc                X   |    d   }|    d   }|||g}t        j                  t        t        j                  d            t        j
                  j                  j                  j                  g |d       }dj                  |    j                          |_        t        |      S )Nr   r   r  )r  c                $    | |d   |d   |d   dfS )Nr   r   r   )alivedeadis_final_stepr   )tensor_argsr  s     ri   rW  zWScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node.<locals>.<lambda>%  s*    !.q!1 -a 0)6q)9C ro   )r
  r   rr  nontensor_argsunflatten_args
mem_check_)r&   MemoryCheckKernelr:   r}  r  r  _inductor_debugcheck_memory_stepdefaultr  r   r  rl  )step_idxrq  expected_newly_aliveexpected_newly_deadrs  rd   r   step_allocs_deallocss         ri   construct_mem_check_nodezEScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node  s     $8#A!#D "6x"@"C24GWN''!e)<=yy00BBJJ- D %/tzz(/C/L/L/N.O"PD,T488ro   )rq  )rz  r   rq  r   r   rl  )r  r  ri  rj  rk  r   rT   rs   rs  r   r  r}  r~  r#   r  r#  r  rH  r   
size_alloc	size_freer:  r   
start_stepr   end_step#torch._inductor.runtime.debug_utilsrl  r  )r   r  ri  rj  rk  rs  name_to_freeable_input_bufrQ  buf_info_listr  buf_infor  rl  r~  	new_nodesr  rd   r}  s   `                @ri   r  z#Scheduler.insert_memory_check_nodes  s   	
 	
 )31773G3G3L3L3N(O"4::|< 	# %%===

D,, *4AGG4L4L4N)O5JJ&
q! $C

O4C
RHC
 & 	HH""a'H,>,>!,C//1H !4!45a8??I !2!23A6==hG	H 	N	9	9*.	9&	92 	 , 	GAtT"(1DJJRS@S;SU	 
eC
s   3Hc                |  	 t         j                  syg }t        | j                        D ]  }dd	d}|j	                         D ]  }t        	fd|j                  D              }|r\t        j                  d|j                                t        j                  j                  j                  |j                                d} |j                          xr | }|s|j                  |       t        j                  d|j                                t        j                  j                   j                  |j                                |j"                  j$                  D ]  }|j&                  | j(                  v s| j(                  |j&                     j                  }|D cg c]0  }|j*                  j                         |j                         k7  s/|2 c}| j(                  |j&                     _          t-        t        |            | _        | j                  D ]  }|j/                           yc c}w )	z0
        Remove any nodes without users
        Nc                r    | j                   xs* | j                         t        j                  j                  v S r`   )r  r   rT   rs   r  )r  s    ri   can_eliminate_userz;Scheduler.dead_node_elimination.<locals>.can_eliminate_userH  s&    ||Tt}}!'':T:T'TTro   Fc              3  .   K   | ]  } |        y wr`   r   )rg   ur  s     ri   rj   z2Scheduler.dead_node_elimination.<locals>.<genexpr>M  s     #Ma$6q$9#M   zremoved dead buffer: %sTzremoved dead operation: %s)r  r  r   r   )r#   use_dcer   r  r[  rk   r   rr  r  r   rT   rs   r  r  r  r   r  r   r   r   r#  rd   r   r  )
r   updated_nodesrd   active_buffersr   can_eliminater%  r   r  r  s
            @ri   r  zScheduler.dead_node_elimination;  s    ~~
 TZZ( 	DU #N'') * ##M399#M M II7HGG++//?%)N* !% 5 5 77N<NM $$T* 		6H**..t}}? ,,22 DyyD$4$44 $ 0 0 ; A A',="#0AT]]_0TA=((39-	8 (=12
 JJ 	#D  "	#=s   %0H9H9c                    t        t                  t               g dfd|D ]  }|j                         D ]  }||<   	  |D ]
  } |        S )z?
        Ensure nodes is in topologically sorted order
        c                    | vrdj                  |        t        | j                  d       D ]&  }|j                  vr |j                            ( j	                  |        y y )Nc                    | j                   S r`   r  )ds    ri   rW  zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>u  s
    aff ro   r  )r  r  rR  r   r   )r  r   r%  r  seenvisits     ri   r  z2Scheduler.topological_sort_schedule.<locals>.visitr  se    }!!"6"6<LM 2Cxx|3 ,sxx01	2
 a  ro   )r  rX   r   r8  )r   rX   rt  r  )r   r  rd   r   r%  r  r  r  s       @@@@ri   r  z#Scheduler.topological_sort_scheduleh  sy     +,.59V*,	! 	!  	*D--/ *%)T"*	*  	D$K	ro   c                <    t               }t        |t        t        t        t
        t        f      r-|j                  D ]  }|j                  |j                          nt        dt        |       d       fd|D        }t        t         fd|D                    S )Nz+get_unmet_dep_nodes is not implemented for .c              3  X   K   | ]!  }j                   |   j                          # y wr`   )r#  r   r  s     ri   rj   z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>  s%     Xc))#.??AXs   '*c              3  <   K   | ]  }j                   |     y wr`   r  )rg   r  r   s     ri   rj   z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>  s     Qat66q9Qs   )r   ra   rb   rl  r  r   r  rR  r  r   RuntimeErrorr   r   )r   r  
unmet_depsr   unmet_dep_opss   `    ri   _get_unmet_dep_nodeszScheduler._get_unmet_dep_nodes  s    &0l
)&"$	
 // )sxx() =d5k]!L  YZXJQ=QQRRro   c                z   g }t         j                  | j                  d      }i }| j                  D ]P  }| j                  |      }t	        |      ||<   |D ]*  }|j                  |g       }|j                  |       |||<   , R |j                         D 	cg c]  \  }}	|	dk(  s| }
}}	|
rx|j                  |
       |
D ]7  }|j                  |g       D ]  }||xx   dz  cc<    |j                  |       9 |j                         D 	cg c]  \  }}	|	dk(  s| }
}}	|
rx|rJ d       |S c c}	}w c c}	}w )zU
        Sort nodes by their topological order, return a list of node lists.
        r   r   zTopological sort failed!)	rt  fromkeysr  r  r   r  r   r=  r  )r   r  r  childrenrd   r  r   cr  rB  zero_deg_nodesr  s               ri   rL  z!Scheduler._topological_sort_nodes  sF    djj!,#%JJ 	"D,,T2Dd)E$K "LLb) !"	" ).@1a!@@LL(# $LLB/ %D$K1$K%		! -2KKMDDAqQ!VaDND  444y A Es   D1%D1D7D7c                j   i }| j                   D ]w  }t               }|j                  D ]B  }| j                  |j                     j                         }|j                  |       |||   z  }D |||j                         <   ||_        y t        | j                         D ]  \  }}||_
        ||_         y)z.
        Populate each node.ancestors
        N)r  r   rR  r#  r   r   r  r   r   r  rE  rF  )r   name_to_ancestorsrd   r   r   dep_node_namer  s          ri   r  zScheduler.compute_ancestors  s    
 9;JJ 	'D)3I.. > $ 0 0 : K K Mm,.}==	> 2;dmmo.&DN	' %TZZ0 	#KE4"DN"DN	#ro   c                H   t         j                  sy | j                  D ]  }t        |t        t
        f      r#|j                         st         j                  dk7  r=|j                         D ]3  }t        |t              r|j                         r$|j                          5  y )Nhalide)r#   r-  r  ra   rb   r   rN   cpu_backendrl   r  rX  )r   rd   r  s      ri   rX  zScheduler.merge_loops  s    00JJ 	$D d]4F$GHKKMf&8&8H&D) $!%75;L;L;N!!#$	$ro   c                   t        ddd      5  t        d      D ]  }t        |      }t        j	                  d|dz   |       | j                  |d      }t        |      }t        j	                  d	|dz   ||       ||k(  s|dk(  slt        j	                  d
|dz           n t        j                  st        j                  r| j                  |d      }|cddd       S # 1 sw Y   yxY w)zB
        Combine eligible nodes into FusedSchedulerNodes.
        zScheduler.fused_nodesTr  
   z/===== attempting fusion (%d/10): %d nodes =====r   F)is_reorder_roundz=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)	r   rH  r   r  r  fuse_nodes_oncer#   r-  loop_index_inversion_in_fusion)r   r  r  old_lennew_lens        ri   rr  zScheduler.fuse_nodes  s     #4QU
 	 2Y e*  EE
 ,,UU,Ke*  TE	 g%A$$Eq1u ', 1188,,UT,J;	 	 	s   A7C!AC!!C*c                    g }| j                   D ]4  }|j                  t        |t              r|j	                         n|g       6 || _         y)zA
        Unpack GroupedSchedulerNode into regular nodes.
        N)r  r  ra   r  rs  )r   r  rd   s      ri   r  zScheduler.process_grouped_nodes  sJ     .0	JJ 	D!+D2F!GdV	 
ro   c                    t        |      dkD  sJ |d   j                         }|| _        | j                  |      }t	        ddd      5  |j                  |      cddd       S # 1 sw Y   yxY w)
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        r   benchmark_fused_nodesTcompile_time_autotune_time_us)r  dynamo_compile_column_usN)r   r   r  r+  r   r  )r   r  r  r  s       ri   r  zScheduler.benchmark_fused_nodes  st     5zA~~q$$&$""6*#"&%D
 	8
 007	8 	8 	8s   
A%%A.c                    t        |      dkD  sJ |d   j                         }|| _        | j                  |      }t	        d      5  |j                  |||      cddd       S # 1 sw Y   yxY w)r  r   r  hint_overrideN)r   r   r  r+  r   generate_kernel_code_from_nodes)r   r  benchmark_kernelr  r  r  s         ri   r  z)Scheduler.generate_kernel_code_from_nodes   sw     5zA~~q$$&$""6*12 	::'} ; 	 	 	s   A%%A.c                    || _         | j                  |      }t        d      5  |j                  |      cddd       S # 1 sw Y   yxY w)r  r  N)r  r+  r   benchmark_codegened_module)r   moduler  r  s       ri   r  z$Scheduler.benchmark_codegened_module3  sH     %""6*12 	>55f=	> 	> 	>s	   ?Ac           
     0   t        | j                        D ]k  \  }}t        |t              st        |j                  t
        j                        s=|j                  }t        j                  j                  s|j                         \  }}n t        d |j                         D              }t        |t        j                  j
                  j                        rt        j                   ri }||d<   t        j                   D ]k  }|j                  |      }|j#                         D 	
ci c]  \  }	}
t        |	t              r|	|
 }}	}
t%        |j#                         d       d   }|||<   m |j                  j'                  |       n|j                  j)                  |       t
        j*                  j-                  |j.                        5  |j1                         }ddd       j2                  }t        |t
        j4                        sJ |j2                  }t        |t
        j6                        sJ |j8                  rt;        ||j8                         |j<                  |_        | j?                  ||||       n yc c}
}	w # 1 sw Y   xY w)a  
        Finalize a backing choice for MultiTemplateBuffers which did not already have a
        choice finalized through fusion. In the case of an extern choice, this will result
        in replacing the SchedulerNode.

        If a MultiTemplateBuffer did not have any fusion opportunities, finalizing a choice
        will force completion of compilation and benchmarking.
        c              3  |   K   | ]4  }t        |t        j                  j                  j                        r| 6 y wr`   )ra   r}  r~  select_algorithmExternKernelCaller)rg   timings     ri   rj   z<Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>R  s6       &) & % @ @ S S  #r  Nr  c                    | d   S r$  r   r  s    ri   rW  z;Scheduler.finalize_multi_template_buffers.<locals>.<lambda>k  s    qQRt ro   r  r   ) r  r  ra   rb   rd   r&   MultiTemplateBufferr#   r  %force_extern_kernel_in_multi_templateget_min_choicer  choice_timingsr}  r~  r   multi_kernel_hintsr=  rf  finalize_as_triton_callersfinalize_as_triton_callerr  current_originsr<  output_noder   
StorageBoxOperationBufferorigin_noder5   r
  _replace_node)r   r  rd   
multi_nodemin_node_unfusedr  callershinttimingsr  rB  triton_timingschoiceout_tensorboxout_storage
out_buffers                   ri   r  z)Scheduler.finalize_multi_template_buffers?  s:    !, 5	DGAt$.:		2114 "YY
**PP*4*C*C*E'$a'+*4*C*C*E	($ $OO&&?? 00QS(8$*$=$= 3D&0&?&?d&?&SG -4MMO.$(Aq#-a1I#J !"1.N .
 &))=)=)?^%TUV%WF,2GDM3 		<<WE		;;<LMYY..z/A/AB C$4$@$@$BMC+00!+r}}===(--
!*b.@.@AAA))&}j6L6LM$.$5$5
!"":z1dCk5	D:.C Cs   *J
JJ	c                   t        ||       | j                  |      }|| j                  |<   || j                  |j	                         <   || j
                  |j	                         <   i t        j                  |j                  j                  |j                        D ]:  }| j                  j                  |j                  d       x}s,|j                  |<   < dfd} ||j                        |_
         ||j                  j                        |j                  _	        t        |j                         |j                               D ]3  \  }	}
|	| j                   |
j	                         <   |
j"                  |	_        5 |j$                  |_        |j&                  |_        |j(                  |_        |j*                  |_        y )Nc                ,    t        fd| D              S )Nc              3  @   K   | ]  }|j                          y wr`   )r  )rg   r   rI  s     ri   rj   z?Scheduler._replace_node.<locals>.rename_deps.<locals>.<genexpr>  s     Kscjj)9:Kr  r   )r  rI  s    ri   rename_depsz,Scheduler._replace_node.<locals>.rename_deps  s    KdKKKro   )r  rQ  r   rQ  )r  r  r  r%  r   r  r  r  r   r   rR  r  r  r   ru  r[  r#  r   rE  rF  r   rD  )r   r  r  r  rd   new_scheduler_noder   	real_namer  new_outold_outrI  s              @ri   r  zScheduler._replace_node  s    	"*j9!77
C*

1-?$--/*3E0 ??4#3#3#9#94;R;RS 	7C 3377$GGyG.1hh +	7	L 1<111
- 0;**000
&&, !$**,d.>.>.@!
 	*GW 4;DW--/0#MMGM		* (,~~$'+~~$'+~~$(,%ro   c                &    t        d |D              S )Nc              3     K   | ]q  }t        |j                  d       xrU |j                  duxrE t        |j                  j                  d      xr# |j                  j                  j                  dk(   s yw)r   Nscatter_moder  )r  rd   r   r  r  s     ri   rj   z,Scheduler._any_atomic_add.<locals>.<genexpr>  so      

 	 AFFF# 9d"9^49 ((L89
s   A7A9)r   r   	node_lists     ri   _any_atomic_addzScheduler._any_atomic_add  s     

 
 
 	
ro   c           
     t
    !"#$% t        d fD              }t        j                  s|syj                         r(t	        j                         t        j                        r j                         sj                         ryj                         }|d   j                         sJ j                  dk(  rt        j                  dk7  ryj                         }t        t        j                  ||            } j!                  |      ryddlm t'              %|d   j                         J dfd!t(        j*                  j,                  j/                         	 d	 	 	 	 	 d fd	}|rt        d
 fD              rj                         durj                         nj                         $t	        $t        j0                        sJ i  g t        j2                  D ]8  }$j5                  |      }	t7        |	j9                         d       D ]u  \  }
}t	        |
t(        j*                  j:                  j<                        s5$j?                  |
      5  jA                  |
g |||
jB                               ddd       w tE        d      }d}i }D ]V  \  }
}}	 ||jG                          $j?                  |
      5   jW                  |      \  }}|||
<   ||k  r|}|
}ddd       X |$jX                  |<   t	        |tZ              sJ | |<   ; $j5                         }	$j]                         \  }"r j_                  |      n j_                  |      \  #}g d}t7        |	j9                         ta        jb                  d            D ]  \  }
}t	        |
t(        j*                  j                  jZ                        s5s&te        |
d      r|
jf                  $jf                  k7  r]|"#z   k\  r nQ|dz  }|t        jh                  kD  r n7$j?                  |
      5  jA                  |
g ||             ddd        tk              dk(  ryd !"#$ f	d}|S  ||       ||       ||      d! %fd}|S # 1 sw Y   xY w# tH        $ rR}tJ        jM                  tN        jP                        r$tJ        jS                  dsdndtU        |             Y d}~_d}~ww xY w# 1 sw Y   qxY w# 1 sw Y   xY w)
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        c              3     K   | ]>  }|j                         xr( t        |j                         t        j                         @ y wr`   )r  ra   r  r&   r  r  s     ri   rj   z.Scheduler.speedup_by_fusion.<locals>.<genexpr>  sE       
  MMO J1..0"2H2HIJ 
s   AATr   r  r   CompilationErrorNc           
     t   t         j                  t        j                        r| ||z   k  rFt         j	                  dj                         j                         t        ||z   | z  d             y t         j	                  dj                         j                         t        | ||z   z  d             y y )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r  r%  r&  DEBUGr  r  r?   r@   )ms_fusedms1ms2r   r   s      ri   
log_fusionz/Scheduler.speedup_by_fusion.<locals>.log_fusion  s    &&w}}5cCi'$$S..0..0"sSyH&<S%AC	 $$W..0..0 Hc	$:3#?A	 6ro   c                    j                  | d|      }t        j                  |      }j                         sd }||fS j	                  d|      }t        |t              sJ ||fS )NT)r  r  triton_)kernel_namesource_code)r  r   loaduse_process_poolr   ra   r   )r  r  src_codemodfutasync_compiler   s        ri   compile_kernelz3Scheduler.speedup_by_fusion.<locals>.compile_kernel  s     ;;M < H ""8,C 113
 : $**yh*W!#|444:ro   c              3  @   K   | ]  }|j                         d u  y wr`   r  r  s     ri   rj   z.Scheduler.speedup_by_fusion.<locals>.<genexpr>  s#      %
23A!-%
s   c                    | d   S r$  r   r  s    ri   rW  z-Scheduler.speedup_by_fusion.<locals>.<lambda>  s    aPQd ro   r  r  infException in compiling %s: %sr  r  r   allowed_prologue_inpsFc            	     h  	 t        d      } d }i }D ]V  \  }}}	 ||j                          j                  |      5  j                  |	      \  }}|||<   || k  r|} |}d d d        X  |        | z   k  rJ|Ht        j                  r|d <   j                         nj                  |       |j                  d <   yy# t        $ rR}t        j	                  t
        j                        r$t        j                  d
sdndt        |             Y d }~d }~ww xY w# 1 sw Y   xY w)Nr  r  r  r  TF)r  r  rq  r  r%  r&  r  r  r   swap_as_triton_callerr  r#   r  r  r  _choice_timings)min_ms_fusedms_fused_choicenew_timingsr  future	mod_fusedr   r  pathr  epilogue_fusionfuture_choices hint_override_best_fusion_choicer  r  r  r  r   s            ri   benchmark_when_readyz9Scheduler.speedup_by_fusion.<locals>.benchmark_when_readyk  sX   $U|"& 1? 5-FFI!!-"MMO $99&A 	5)-)H)H%"*$
 /7F+#l2+3L.4O	5 	5!56 <c239-/2M00AP8>"==<
 #<<_M 8CJ..t4 K % !%227==A&,, ?2A
z #A
 !!	5 	5s#   C	$D'		D$ADD$'D1	c                    ddl m}  	 d   d   d   fD ]  }||j                           j                  d   
      \  t	        j
                        r	 d       yj                  d   
      \  t	        j
                        r	 d       yj                  d   
      \  t	        j
                        r	 d       y        t        d      rWz   k\  rOfj                  vr?j                  j                  f       t        d      j                  fd	       z   k  S # | $ r Y y	$ r}d
t        |      v rY d }~y d }~ww xY w)Nr   )NoTritonConfigsErrorr   z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelslow_fusionc            	     $      z   z  dS )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratior   )r  r  r  path1path2
path_fuseds   ri   rW  zKScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>  s(    053605365?8@3;sSy3I% ro   Loop-carried variableT))torch._inductor.runtime.triton_heuristicsr  r  r  mathisinfr   r  r  r   r  r   )r  r   r   r  r  r  r   r!  r"  r  r  future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2r  r   r/  s      @@@@@@ri   r  z9Scheduler.speedup_by_fusion.<locals>.benchmark_when_ready  s   A *!,)!,/2  )
 ?JJL) "&!@!@)!,"JC
 zz#CD$!%!@!@)!,"JC
 zz#DE$+/+J+J/2,(Hj
 zz(+CD$xc2 0>$c	1"EN$2I2II//33UENC(7?? 
 $cCi//+ ! ' .#a&8#s<   E AE +5E !5E A3E E.E.E)(E))E.)r  r  r  r  r  r  r   r8  r`   )r  rk  r  Optional[int]r   z)tuple[Optional[LambdaFuture], ModuleType]r9  )6r   r#   benchmark_fusionr  ra   r  r&   TritonTemplateBufferr  rl   r   r   r  r   r  r  r  triton.compiler.errorsr  r  r}  r~  r  AsyncCompiler  r  r  r  r=  r  TritonTemplateCallerr	  r   r  r  r  rq  r  r%  r&  r  r  r   r  r
  r   r  r  operator
itemgetterr  r   max_epilogue_benchmarked_choicesr   )&r   r   r   is_multi_templatenode_list_1node_list_2node_list_fusedr  r  r  r  r  r  r  r  r  r  r   r  r  r!  triton_choicesunfused_timer  r  r  r  r  r'  r(  r)  r  r  r  r  r  r  r/  s&   ```                     @@@@@@@@@@@@@@ri   speedup_by_fusionzScheduler.speedup_by_fusion  sd       
 U^ 
 

 &&/@ u668":Q:QR!! oo'Q**,v ;;%F$6$6($Boo'y{KHI
 0;u% #..0!!!	" 55BBD PT	.	?L	6	  %
8=u~%
 "
 $557tCO # ''),,. 
 j"*@*@AAA  - TVN!'!:!: *R!+!:!:=!I!'(<(<(>N!S IFA% @ @ U U !#99&A &-- &!/$36CWCW""   %U|FJ 1? 5-FFI
!!-"MMO $99&A 5)-)H)H%v*$ /7F+#l2+3L.4O5 55( =H
**=9!/3KLLLBQ0?U*RZ (668N..0FAs # **;7//< C TVNN(.$$&H,?,?,B) V$ "&%//*<*<*U*UV ((?@44
8X8XX39,!#!F$K$KK55f= V"))6*TN?4S*TUV V3V8 >"a'2! 2!h (' !/{ ; .{ ;&4_&E#F FP ('i " % !%227==A&,, ?2A
z #A
 !!5 5bV Vs=   6(R5S$$T 	T-5R?	TATT T*-T7	c                <    | j                   |j                            S )z0Look up the node in Scheduler name_to_fused_node)r  r  r  s     ri   r  zScheduler.get_fused_node  s    &&t':':'<==ro   c                     j                  |       t        |      t        j                  t        j
                        r@t        j                  d       D ]&  }t        j                  d|j                                ( i 	 	 	 	 	 	 d fd	 	 	 	 	 	 d fd} j                  ||      D ]  \  }} |||        j                  |      } j                  |      } j                  |||      sE j                  ||      rX j                  ||      }t        |      r|||f|<   |||f|<   |s ||        t               }j                         D ]j  \  }	}
}|	|v r|j                  |	        j                  |
      |
u sJ  j                  |      |u sJ  |	       sO j                  |
|      rb |
|       l t!        d       } j#                  |      }|S )	a  
        Combine eligible nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuse(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        zfuse_nodes_once, candidates:z  %sc                   t         j                  d| j                         |j                                | j                         }|j                         |k(  sJ j	                  |      j                  | |      }j                  |        j                  |       j                  |       j                  j                  |j                         D ci c]  }|j                         | c}       |S c c}w )Nzfusing %s with %s)r  r  r   r   r+  r  r  r  r  r  rl   )r   r   r  node3r  r8  r   s        ri   fuse_two_nodesz1Scheduler.fuse_nodes_once.<locals>.fuse_two_nodes  s     0%..2BENNDTU%%'F##%///$$V,11%?Eu%u%OOE"##**.3oo.?@u$@ L As   C6c                   j                  |       v sj                  |      v rj                  j                  |       j                  j                  |      d             }|J |\  }}}j                  |d        j                  |d        j                  |      |u sJ j                  |      |u sJ  |       rj                  | |      r ||       j                  |       v rωj                  |      v ry y r`   )r  r  r  will_fusion_create_cycle)	r   r   pending_fusion
is_speedup	node_key1	node_key2r>  pending_fusionsr   s	         ri   resolve_pending_fusionsz:Scheduler.fuse_nodes_once.<locals>.resolve_pending_fusions  s    ##E*o=&&u-@!0!4!4''.#''(;(;E(BDI" &1113A0
Iy##It4##It4**95BBB**95BBB!|t'D'DUE'Ry)4' ##E*o=&&u-@ro   c                    | j                   S r`   r  r  s    ri   rW  z+Scheduler.fuse_nodes_once.<locals>.<lambda>]  s
    !++ ro   r  )r   rX   r   rX   r   rX   r  )r  r   r  r%  r&  r  r  r  get_possible_fusionsr  r   r@  r9  callabler   r  r  r  )r   r  r  rd   rF  r   r   speedupseen_pair_speedup_fnis_speedup_fnrC  rD  r>  r8  rE  s   `           @@@ri   r  zScheduler.fuse_nodes_once  s,    	!!%( '""7==1;<# A  )=)=)?@A  	
	$	->		 	5$	5->	5	52 !55e=MN 	-LE5 $E51''.E''.E}}u.33E5A00>G$.5ue-DOE*.5ue-DOE*ue,)	-, @J|3B3I3I3K 	5/M9i 44 $$]3&&y1Y>>>&&y1Y>>>t'D'D9( y)4	5 {(=>..u5ro   c                   t        | j                        }d}t        | j                        }t        j	                  d|       t        t        j                  |             D ]  \  }}t        j                  |      }t        |      dk  r+|||kD  r n| j                  |      st        j	                  d|       \|dz  }t        j                  dkD  }t        |d   j                  |d|      }t        j                  d	t        |      |       |D ]  }	|j                  |	        |j                  |       | j                   j#                  |j%                         D 
ci c]  }
|
j'                         | c}
       ! t)        |d
       | _        | j+                  | j                        | _        t        j                  d||t        | j                               | j-                  | j                         yc c}
w )z'
        Groups parallel nodes
        r   z2ComboKernels: Generating with num_ck_nodes = %s...r   Nz)ComboKernels: Not speeding up %d-th groupr   Tr-  z0ComboKernels: Combining %d nodes for %d-th groupc                    | j                   S r`   r  r  s    ri   rW  z5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>  s
    q{{ ro   r  zDGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodes)r   r  r   rr  r  r  r  r[  rH  speedup_by_combo_kernelr#   r/  r   r  r  r  r  r  rl   r   r  r  r  )r   r  r8  r  num_nodes_orignumr  r7  r  rd   r  s              ri   r  z#Scheduler.create_combo_kernel_nodesa  s    !,TZZ		FU'&DDTJ
 	NC 3CCINI9~!'EL,@//	:		EsKQJE$;;a?O4!&&*. /	K HHBI
 " )""4()OOK(##**4?4I4I4KLq{*L7	< K-BC
33DJJ?
R

O		
 	!!$**- Ms   !G=
c                H    |D ]  }|j                  | j                          y r`   )r  r  )r   r  rd   s      ri   r  zScheduler.prune_redundant_deps  s%     	?D%%d&=&=>	?ro   c                   
 g 
t        t        t        t        f             d
 fd}t        j                  t
              }|D ]=  } j                  |      r|j                         D ]  }||   j                  |        ? |j                         D ]
  } ||        t        j                  rat        j                  t
              }|D ]&  }t        |dd      }	|	s||	   j                  |       ( |j                         D ]
  } ||         j                  
      

j                   j                  d       t         j#                  dt%        
             
S )z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        c                |   t        |       D ]  \  }}| |dz   |dz   t        j                  z    D ]  }||f}|v rj                  |       j	                  ||      rj                  |       B|j                         s|j                         scj	                  ||      swj                  ||f         y r$  )r  r#   )max_fusion_buffer_group_pairwise_attemptsr  r   r   r  r  )	r  node1_indexr   r   r  r  possible_fusionsr  r   s	        ri   check_all_pairsz7Scheduler.get_possible_fusions.<locals>.check_all_pairs  s    &/&6 @"U"!Ok'FF'G @E
 !%.Cd{ HHSM}}UE3CD(//4++-1A1A1Cu&6J )//?!@@ro   rx   NT)r  reversezfound %d possible fusionsr  r  r   r8  )r   r   rX   rr  r   r   unfusable_noder   r   r   r#   aggressive_fusionrn  *get_possible_fusions_with_highest_priorityr  score_fusion_keyr  r  r   )r   r  r  rX  buffer_names_groupingrd   r   node_groupinggroup_groupingrx   rW  r  s   ` `       @@ri   rH  zScheduler.get_possible_fusions  sn    % 13D DEFH	@ 	@( !, 7 7 = 	8D""4(--/ 8%c*11$78	8
 399; 	+MM*	+ ##(44T:N 7gt4"5)0067 "0!6!6!8 /./  JJ
 	$"7"7F4c:J6KLro   c                    t        t                  d fd|j                         j                  j	                         |j                         j                  j	                         z  |j
                  j                  j	                         |j
                  j                  j	                         z  z
  t         fdD              }|r t        ||      d       |S )z~
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.
        c                   t        | t              rq| vrmj                  |        | j                         j	                        ryt        | j                  z        xs" t        fd| j                  z
  D              S y)NFc              3  H   K   | ]  } j                   |           y wr`   r  rg   r  
found_pathr   s     ri   rj   zIScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>  s+      H #4#:#:1#=>H   ")ra   r   r  r   issubsetr   r   r   )rd   combined_ancestorscombined_namesrf  r   visiteds    ri   rf  z6Scheduler.will_fusion_create_cycle.<locals>.found_path  s    $ 23G8KD!++-667IJ !   ?@ C H!%2D!DH E  ro   c              3  H   K   | ]  } j                   |           y wr`   r  re  s     ri   rj   z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>  s!     WqJt66q9:Wrg  zwill create cycler   )r   r   r   _dictr   r   r   r  )r   r   r   cycleri  rj  rf  rk  s   `   @@@@ri   r@  z"Scheduler.will_fusion_create_cycle  s     /02	 	2 %%'--224'')//4467 	
 OO!!&&(5??+@+@+E+E+GG WDVWW#IeU#$78ro   c                    ddl m 	 	 	 	 d fd} ||      } ||      }t        fd|D              }t        fd|D              }|j                  |      }d}	|D ]  }
	 |	t	        |
d         z  }	  j                  ||      }t        j                  j                  j                  |	d	|z        ry
y# t
        $ r Y  yw xY w)a  
        Return true if fusing the two nodes can potentially increasing peak memory.

        The implementation is more like a heuristic since we don't really know if we are at peak
        or not when trying to fuse these two nodes. The order of nodes may change later which makes the
        peak memory estimation hard.

        Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
        1. find all buffers read by each node with a single user. These buffers are supposed to
           be reused if we don't fuses these 2 nodes
        2. find the intersection of these buffers for the two node and sum the total buffer size.
           If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
           Note that the extra memory allocation is not necessarily causing peak memory increase.
           This is just a heuristic.

        We return true only if the saving for fusion can not trade off the extra memory allocation.
        r   )buffer_reuse_keyc                0   g }| j                   j                  D ]y  }j                  j                  |j                        }|s+t        |j                        dk(  sD|j                  j                         s_|j                  |j                         { |S r$  )
r   r   r#  r  r   r   r   rd   has_tensor_outputr   )rd   r\  r(  r   r   s       ri   _find_single_user_inputszKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputs  sw     F&&,, ,&&**27733syy>Q.3883M3M3OMM#((+, Mro   c              3  .   K   | ]  } |        y wr`   r   rg   r   rp  s     ri   rj   z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>        #Sc$4S$9#Sr  c              3  .   K   | ]  } |        y wr`   r   ru  s     ri   rj   z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>!  rv  r  r   r   F    T)rd   rX   r   zlist[ir.Buffer])r  rp  r   intersectionr   r  r  rT   rs   rt   statically_known_gt)r   r   r   rs  lhs_dep_nodesrhs_dep_nodeslhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadr  	bw_savingrp  s   `           @ri   can_fusion_increase_peak_memoryz)Scheduler.can_fusion_increase_peak_memory  s    * 	6	#		 1707##S]#SS##S]#SS*77G$ 	C3s1v;.	 ,,UE:	 77//iP  s   $B88	CCc                   t        |j                         D cg c]  }|j                          c}|j                         D cg c]  }|j                          c}z         }t        d |j                  j                  D              }t        d |j                  j
                  D              }||z  }t               }	|j                  j                  D ]:  }
| j                  |
j                  |      s |	j                  |
j                         < t        d |j                  j
                  D              t        d |j                  j
                  D              z  }t        d |j                  j                  D              t        d |j                  j                  D              z  }||z
  }||	z
  }||z  }t        |      |kD  S c c}w c c}w )Nc              3  4   K   | ]  }|j                     y wr`   r  r  s     ri   rj   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>B  s     &TCsxx&Tr  c              3  4   K   | ]  }|j                     y wr`   r  r  s     ri   rj   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>C  s     %R3chh%Rr  c              3  4   K   | ]  }|j                     y wr`   r  r  s     ri   rj   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>P  s      $
CHH$
r  c              3  4   K   | ]  }|j                     y wr`   r  r  s     ri   rj   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>R  rV  r  c              3  4   K   | ]  }|j                     y wr`   r  r  s     ri   rj   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>U  s      %
CHH%
r  c              3  4   K   | ]  }|j                     y wr`   r  r  s     ri   rj   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>W  s     DCsxxDr  )
r   rl   r   r   ro  r   $can_buffer_be_removed_through_fusionr   r  r   )r   r   r   	thresholdrd   fused_node_namesnode1_write_namesnode2_read_namesreads_removed_through_fusionwrites_removed_through_fusionr  all_read_namesall_write_namesunique_readsunique_writesunique_io_bufferss                   ri   (fusion_prevent_too_many_reads_and_writesz2Scheduler.fusion_prevent_too_many_reads_and_writes4  s    &).):;T]]_;+0??+<=4t}}=>
 '&T5;L;L;S;S&TT%%R%:K:K:Q:Q%RR'7:K'K$ :D%**11 	BI88 0 .11)..A		B $ $
 % 1 1 7 7$
 
C5+<+<+B+BCCD
 % %
 % 1 1 8 8%
 
D5+<+<+C+CDDE
 &(DD (*GG )=8$%	11M <=s   GG
c                    t        t        |j                  |j                  z
        t        |j                  |j                  z
              }|dkD  S )aB  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heurisitic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )r  r  rE  rF  )r   r   r   proximity_scores       ri   are_long_distant_nodesz Scheduler.are_long_distant_nodesd  sE    * %//12%//12
 ##ro   c                   i }|j                   j                         D ci c]  }|j                  | }}|j                   j                         D ci c]  }|j                  | }}|D ]}  }t        j                  j                  |      }	||   }
||   }t        |
t              rt        |t              sdt        |
       dt        |       ||<   k|
j                         |j                         k7  r(d|
j                          d|j                          ||<   t        |
j                        t        |j                        k7  rd||<   |
j                         }|j                         }||k7  rd| d| ||<   |
j                         |j                         k(  rd|
 d| ||<   Ed}t        |	t        j                        sd|	j                    }d	|
 d| d
| ||<    t#        |      S c c}w c c}w )z}
        Try to decide reasons why fusion fail due to no shared memory even though
        there are common buffers.
        znot MemoryDep: rq   zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: rv  zLayout: zUnknown reason: z. )r   r  r   rT   rs   ra  ra   r/   r   r   rS   r   
get_offsetnormalize_with_stride_orderr&   rd  r
  r   )r   r   r   common_buf_namesreasonsr   node1_name2depnode2_name2depr  r   lhs_deprhs_deplhs_offrhs_off
layout_strs                  ri   decide_fusion_fail_reasonz#Scheduler.decide_fusion_fail_reason  s    383D3D3U3U3WXC#((C-XX383D3D3U3U3WXC#((C-XX( ,	H''$$X.C$X.G$X.Ggy1GY9W%d7m_F4=/J !   "g&7&7&99'(9(9(;'<F7CTCTCVBWX !  W\\*mGLL.II$/!((*G((*G'! '9	y$Q! 3356689 '=WIVG9$U! Jc2#5#56'

|4
"7)6'"ZLI HU,	\ 7|c YXs   G5G:c                   t         j                  syt        d ||fD              ry|j                  j	                         }|j                  j	                         }||z  }|syt        d |j                  D              }||z
  ryt        |      dkD  ryt        |j                  j                        dkD  s"t        |j                  j                        dkD  ryt        t        |j                  j                              }t        t        |j                  j                              }t        |t              rt        |t              sy|j                  j                  D 	ci c]  }	|	j                  |	 }
}	|j                  |
vry|
|j                     }t        |t              sy|j                         }|j                   |j                   k7  r|j"                  |j"                  k7  ry|j"                  |j"                  k7  st        |j$                        dk7  ryt        |j&                  j(                        dk7  ry|j&                  j*                  ryd|j&                  j(                  v rd|j&                  j(                  v sJ t        d |j&                  j-                         D              }t        |      dk7  ryt        t        |            }||j&                  j(                  d   k(  rd}d}n"||j&                  j(                  d   k(  sJ d}d}d	d
lm} |j&                  j2                  d	   }t        |      dk7  ryg }t4        j6                  j9                  |      D ]:  }|j;                  t<        j>                  j@                  jC                  |             < tE        |      } |||d	         }|y|j&                  j(                  |   |j&                  j(                  |<   ||j&                  j(                  |<   |jG                  dd       | jI                  ||      }t        |tJ              sJ tL        jO                  d|       |S c c}	w )aW  
        Attempts to enable fusion between two nodes by inverting indexing patterns.

        This optimization targets cases where node1 has a contiguous write and
        node2 has a contiguous write but discontiguous read. By inverting the
        indexing in node2's read and write operations, we can make them compatible
        with node1 for potential fusion.

        Args:
            node1: First scheduler node (source)
            node2: Second scheduler node (target for inversion)

        Returns:
            int: Fusion score if successful, 0 if optimization not applicable
        r   c              3  <   K   | ]  }|j                           y wr`   r  r  s     ri   rj   zAScheduler.shared_data_after_inverting_indexing.<locals>.<genexpr>  s     2aqxxz2r  c              3  4   K   | ]  }|j                     y wr`   r  r  s     ri   rj   zAScheduler.shared_data_after_inverting_indexing.<locals>.<genexpr>  s      .
CHH.
r  r   r   index0index1c              3      K   | ]  }|  y wr`   r   )rg   exprs     ri   rj   zAScheduler.shared_data_after_inverting_indexing.<locals>.<genexpr>  s     %Ttd%Ts   r   )generate_inverse_formulaTFz!Shared memory after inversion: %d)(r#   r  r   r   buffer_namesr   rR  r   r   ro  r  r  ra   r/   r   r'  r   r   r  r   r   	subblocksget_read_exprs$torch._inductor.invert_expr_analysisr  varsr   Add	make_argsr   rT   rs   rt   combine_modular_indexing_pairsr   r>  r  r   r  r  )r   r   r   node1_buffer_namesnode2_buffer_namescommon_buffer_namesnode2_unmet_dependencies
node2_readnode2_writer   node1_writesnode1_writenode2_read_exprs	read_exprread_expr_indexwrite_expr_indexr  rp  simplified_termstermsimplified_read_exprinverse_formulascores                          ri   $shared_data_after_inverting_indexingz.Scheduler.shared_data_after_inverting_indexing  s   & 442E5>22 #..;;="..;;=03EE" $. .
 % 8 8.
 $
  $&88'(1, u  &&'!+s53D3D3K3K/Lq/P$u006678
4 1 1 8 89:*i0
9
 161B1B1I1IJ##JJ??,.":??3+y1 "++- !2!22  K$4$44??k...#j6J6J2Kq2P u{{))*a/ ;;   222EKK666	
7
 &%Tu{{7Q7Q7S%TT A%./0	 228<<&O' : :8 DDDD&O'Q[[%%a(
z?aII''	2 	D##  ??E	  ##3423GTUW "
 7<kk6P6P7
""?3 8G""#34 	""4/((6%%%%;UCm Ks   "Qc                t   t         j                  rt        d ||fD              ry|j                         s|j                         ry|j                  j                         }|j                  j                         }||z  }|sy|j                  j                         D ci c]  }|j                  | }}|j                  j                         D ci c]  }|j                  | }}g }	|D ]y  }
||
   }||
   }|j                         |j                         k(  s/|	j                  t        j                  j                  j                  |j                         d      ||f       { t        |	      dk(  ryt!        |	t#        j$                  d            \  }}}t'        |t(              rt'        |t(              sy|j*                  |j*                  k7  r3|j-                         |j-                         k(  r| j/                  |      S yd}|j1                         s|j3                  ||      }nV|j1                         s|j3                  ||      }n3t4        j7                  d|j9                         |j9                                |r*t;        j<                  t>        | jA                  ||            S dS c c}w c c}w )a  
        Right now just greedily reorder the loop of node1 to be compatible with node2,
        but ideally we should have some heuristics to reorder the loop for node2
        to be compatible with node1 if that's more efficient.

        Return the amount of shared data re-computed in this method.
        If no such recomputation happens, return -1 (not return 0 since 0 is a valid
        amount of shared data).

        c              3  <   K   | ]  }|j                           y wr`   r  r  s     ri   rj   z>Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>\  s      8
AHHJ8
r  r   r   r   r  Fz?Don't reorder loops since both nodes are reductions: %s v.s. %s)!r#   r-  r   r  r   r  r  r   r  r   rT   rs   rt   r   r   r   r  r0  r1  ra   r/   rZ  r'  dep_size_hintrc   r  r]  r  r   r  r  r   r  )r   r   r   r  r  r  r   r  r  
candidatesr   r  r  _numel	reordereds                  ri   !shared_data_after_reordering_loopz+Scheduler.shared_data_after_reordering_loopL  s     00C 8
!&8
 5
 
 %"3"3"5"..;;="..;;=03EE"383D3D3U3U3WXC#((C-XX383D3D3U3U3WXC#((C-XX 
. 	K$[1G$[1G3356689 !!((2273D3D3FQR2S	 z?a $'zx7J7J17M#N '9-Z5Sw///
   "g&7&7&99))'22	!!#77II##%77II##Q    KKT55eUCD	
 	
g YXs   J0J5c                    t        |t        t        f      xr) |j                          xr t	        |j
                         S )z>
        Is this node unfusable under any conditions.
        )ra   rl  r  r  rP   rd   r  s     ri   r[  zScheduler.unfusable_node  sD    
 t79OPQ C$$&&C7		BB	
ro   c                   |j                         t        j                  j                  k  ry|j	                         }|j                         }d}|||z  kD  r	 |d       yt        d |j                         D              }|t        j                  j                  j                  j                  fk(  r	 |d       yd	d} ||j                         j                        r|j                         s	 |d       yy)
zT
        Heuristics to avoid benchmarking predictably slow prologue fusions
        T皙?z@prologue fusion will not increase amount of bytes read in kernelFc              3     K   | ]J  }|j                   <|j                   j                         D ]  }|j                  dk(  r|j                   ! L y w)Ncall_function)rd   r4  r   r5  )rg   r  r   s      ri   rj   zEScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>  sT      
vv!VV'')	
 tt&	 HH

s   AAz\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsc                <    | j                   dk  xr | j                  S )Nr   )itemsizeis_floating_point)r  s    ri   low_prec_fpzGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fp  s    >>Q&B5+B+BBro   zVprologue fusion that must be upcast to fp32 not profitable for low precision templates)r  ztorch.dtyper   r   )r   rT   rs   invoke_quant_opsrI  rK  r   rl   r}  r  r  constant_pad_ndry  r  r  r  )	r   prologue_noder  r/  
read_byteswrite_bytesBYTES_THRESHOLD_MULTIPLIERr<  r  s	            ri   (check_prologue_fusion_heuristics_fusablez2Scheduler.check_prologue_fusion_heuristics_fusable  s     ,,.!''2J2JJ"88:
#::< &)"'AABRS  
",,.
 
 uyy~~55==??n 	C @@BHHI!>>@h ro   c                <    t        |t              rt        |t              syt        |j                  t        j                        r$t        |j                  t        j                        sy|j                         s|j                         ryt        j                  dk(  ry|j                  |j                  }}|\  }}|\  }}|j                         s,|j                         s||k7  st        |      t        |      k7  ryt        |j                  j                        dkD  s"t        |j                  j                        dkD  ry j                  t        t        |j                  j                                    }	 j                  t        t        |j                  j                                    }
t!        |	|
      t        j"                  kD  ryd fd} ||      s ||      ryg }t%        t'        ||            D ]  \  }\  }}||k7  s|j)                  |       ! t        |      dk7  ry|d   }||   ||   }}t*        j,                  j.                  j1                  ||      r|||fS t*        j,                  j.                  j1                  ||      r|||fS y)ao  
        Fusing two small pointwise nodes significantly reduces kernel overhead
        and launch overhead. However, slightly different sizes would prevent fusion.
        Here, we decide if expanding sizes of one node is profitible by allowing
        fusion, and returns the dimension to expand, node with smaller sizes,
        and new size after expand.
        Nr  r   c                ~   | j                   j                  D ]  }|j                  j                  v rj                  |j                     }n%j                  j                  |j                        }|s]t        j                  j                  j                  ||       st        |j                  t              r y y)NTF)r   r   r   r"  r#  r  rT   rs   r  r  ra   r   r  )rd   r%  r&  r   s      ri   has_reusable_bufferzIScheduler.get_expand_dim_for_pointwise_nodes.<locals>.has_reusable_buffer&  s    ((..  99 ; ;; $ ; ;DII FI $ 0 0 4 4TYY ?I ,,66y$G&y'<'<>TU  ro   r   r   )ra   rb   rd   r&   r   r  r#   r  r   rc   r   r   ro  r  r  r  r  small_memory_access_thresholdr  ru  r   rT   rs   rt   statically_known_lt)r   r   r   n1_sizesn2_sizesn1_iter_sizesn1_reduce_sizesn2_iter_sizesn2_reduce_sizesnode1_write_memorynode2_write_memoryr  mismatch_dimensionsidxn1_sizen2_sizemismatch_dimmismatch_size1mismatch_size2s   `                  ri   "get_expand_dim_for_pointwise_nodesz,Scheduler.get_expand_dim_for_pointwise_nodes  s]    %/z%7W uzz2#4#455::r'8'89 ))+u/M/M/O ) #\\5<<()1&)1& !!#/1=!S%77 u  ''(1,E4E4E4L4L0MPQ0Q "//T%:K:K:R:R5S0TU!//T%:K:K:R:R5S0TU"$67223 	  u%)<U)C !'0]M1R'S 	0#C#'7'!#**3/	0 "#q(*1-,',' ' 77//O66WW11..Q66ro   c                   ||u ryt        |t              r|j                  |      S t        |t              ryt        ||      }|j	                         r0| j                  |j                               j                  ||      ryt        |t              st        |t              r	 |d       yt        |t        t        f      r|j	                         s	 |d       yt        |t        t        f      r|j	                         s	 |d       y|j                         |j                  z  r	 |d       y|j	                         r!t        j                  s	 |d       y|j                         s|j	                         r	 |d       y|j!                         }t        |t"        j$                        s	 |d	       y|j'                         }t)        d
 |j*                  D              |z
  }|j-                         |z  r	 |d       y|j/                         s|j/                         r	 |d       y|j1                         dd D ]B  }	|	j3                         }
|
D ]+  }t5        fd|j6                  D              r" |d         y D t        |t8              s|gn*|j:                  D cg c]  }|j	                         s| c}}t=        |      dk(  sJ |d   }t=        d   j>                        dk(  rSt=        d   j>                  d   j6                        dk(  r+d   j>                  d   j6                  d   j@                  |u s	 |d       y| jC                  |||      sy|j	                         r9|j/                         s |j                         st        jD                  s	 |d       y|j-                         tF        jH                  jJ                  z  s+|j-                         tF        jH                  jJ                  z  r	 |d       y|j                         }|j                         }||k7  r |d||       y~| jM                  |||      }t        |tN              sJ |r<|t        jP                  k  r)t        jR                  r| jU                  ||      }|dk\  r|}t        jV                  rP| jY                  ||      x}r<|\  }}}|j[                  ||       | jM                  ||      }t        |tN              sJ t        j\                  r,|t        jP                  k  r| j_                  ||      }|dk\  r|}t`        jc                  td        jf                        r4t`        ji                  d|jk                         |jk                         |       tF        jl                  jo                  | |||      sy|j                         |j                  z  rY| jq                  ||      xrE tF        jl                  jq                  | |||      xr! | j                  |      jq                  ||      S tF        jl                  js                  | |||      xr! | j                  |      js                  ||      S c c}w )zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        FTz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz2prologue fusion only supported for TritonTemplatesc              3  <   K   | ]  }|j                           y wr`   r  )rg   inps     ri   rj   z%Scheduler.can_fuse.<locals>.<genexpr>  s     Ec3<<>Er  z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesNr   c              3  :   K   | ]  }|j                   v   y wr`   rm   )rg   r  prologue_nodess     ri   rj   z%Scheduler.can_fuse.<locals>.<genexpr>  s     QttyyN:Qrl  z7template prologue can only fuse nodes with a single user   r   zEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)r  z%s and %s has %s shared data):ra   r  r  r  r  r+  r   can_fuse_multi_outputs_templater  rl  r  r   r   r#   prologue_fusionrc   r  r&   r,  get_allowed_prologue_inpsr   r  r  r  rl   r[  rk   r   r   r   r   rK  rd   r  r  rT   rs   no_fuse_buffer_namesr  r   score_fusion_memory_thresholdr-  r  $expand_dimension_for_pointwise_nodesr  rT  r  r  r]  r%  r&  r  r  r   choicesr   can_fuse_verticalcan_fuse_horizontal)r   r   r   can_reorderr  r/  r  r  unsupported_prologue_argsrd   	node_outsr   r  template_snodestemplate_snoder  device2shared_data_scorenew_shared_data_scoreexpand_analysis
expand_dimsmaller_nodeexpand_sizer  s                          @ri   r   zScheduler.can_fuseN  s    E>e45&&u--e45 u%4#3#3$

)
)%
7$8 e12j'7
 ABu8:PQR%%'()u8:PQR%%'()$$&8,-))01!!#u'8'8':HI779Hh(?(?@HI$,$F$F$H! EX__EE'( &
 %%'*CCQR--/53Q3Q3SPQ"__.N&s+ % ,,.	$ %CQsyyQQUV$%% "%);< !&AAaA 
 '1,,,,Q/N N2&../14r*2215;;<A"2&..q177:??>Q[ @@sS**,!!#))12""$qww'C'CC""$qww'C'CC56!!#""$W,fg> 4454M 5 
 +S111 !F$H$HH11$($J$J5RW$X!$)$9!66#FFueTTOT6E3Z{<<ZU $ 8 8 F/555 11!F$H$HH$($M$Mu%! %)$9!))'--8##.  !	 yy!!$u6GH$$&8 &&ue4 MII//eUDUVM$$V,>>ueL 9900eU$5 M""6*>>ueLMs Bs   'Y2=Y2c                   |j                         }t        ||      }t        t              }|j                  D ]j  }| j
                  j                  |j                  |j                        }t        |t              r| j                  |||      rW||   j                  |       l |j                  j                  D ]  }t        |t              s|j                  | j
                  j                  |j                  |j                              }	|	sV|	D ]&  }
| j                  |
|      s|	j!                  |
       (  t#        d t$        j&                  j)                  |j+                               D              }||z  r	 |d       y|j-                         }|D ]E  }| j.                  |   j1                         }|| j2                  |   j4                  z  s= |d        y y)a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.
        c              3  4   K   | ]  }|j                     y wr`   r  r  s     ri   rj   z.Scheduler.can_fuse_vertical.<locals>.<genexpr>$  s      $
 HH$
r  zmemory deps did not matchFz(intermediate nodes between node1 & node2T)r  r  r   r   rR  rI  r  r   ra   r1   r  r   r   ro  r/   fusable_read_and_writer  r   r  r  rv  r   r   r#  r   r  r   )r   r   r   node1_buf_namesr/  remaining_deps_by_namer   r   cd	remainingr(  remaining_depsnode1_op_namesr  s                 ri   r  zScheduler.can_fuse_vertical  s     002u%7B47H++ 	5C((,,SXXsxx@D#w'D,A,A#ue,T"4(//4		5 ##** 		-Bb),.22%%))"''277;I # -B222r:!((,-		- $ $
 445K5R5R5TU$
 

 O+
 +,224" 	D&&t,==?G 7 7 @ J JJ>?		 ro   c                   |j                   |j                         vry|j                  j                  D cg c]  }|j                   |j                  k(  r| }}t        |      dk7  ry|d   t        t              ryt        t              sJ t        j                  t        j                        ry| j                  |j                     }|g}t        |t              r|j                  }d}|D ]R  }	|	j                  j                   D 
cg c]  }
|
j                   |k(  r|
 }}
|s8|dz  }t#        fd|D              rR y |dk  S c c}w c c}
w )NFr   r   c              3     K   | ]q  }t        |t              xr[ t        |j                  t        j
                         xr4 |j                  j                  k(  xr |j                  j                  k(   s y wr`   )ra   r/   r   r   r    TMPr   )rg   r%  r  s     ri   rj   z-Scheduler.fusable_weak_dep.<locals>.<genexpr>_  sm      
 	 4+ ,+DJJAA,JJ%++-, II+,s   A7A:)r   r  r   ro  rJ  r   ra   r0   r/   r   r   r    r  r  r  r   r   rk   )r   weak_depr   r   r  mutating_writesr  relevant_reading_nodesnum_concurrent_readsreading_noder%  relevant_readss       `       ri   r  zScheduler.fusable_weak_dep:  sn    == 6 6 88 **11
zzX222 
 

 1$"eW%%+++u{{DHH5++H,A,AB	"'e78%*\\" 2 	L )44::99	) N 
 " A%  
 +  !	" $q((K
*s   "EEc                   t        |t              rH| j                  j                  |j                  |j                        }||j                  k7  sHt        |j                  t        j                        s$t        |j                  t        j                        ryt        j                  r9|j                  |j                  k7  r |j                         }|j                         }|j                  |j                  k(  xr\ t        |j                        t        |j                        k\  xr/ |j                  d t        |j                         |j                  k(  S t        |t              r| j                  j                  |j                  |j                        }| j                  j                  |j                  |j                        }|j                   |j                   k(  r|j                   ||k(  ryyr)  )ra   r/   rI  r  r   r   r   r    r  r#   r-  rZ  r'  r   r   r0   r  )r   r%  r  	read_name
write_names        ri   r  z Scheduler.fusable_read_and_writem  s`   dI&--11$))TYYGI UZZ'&tzz488<&u{{DHH=00T]]enn5T ~~') 

ekk) ?		Nc%**o5?II/EJJ0EJJ>
 g&--11$))TYYGI..225::uzzJJ		UZZ'JJ*+ro   c                B    t         j                  j                  ||      S r`   )rT   rs   get_dep_size_hint)r   r   r  s      ri   r  zScheduler.dep_size_hint  s    ww((k::ro   c                    fd}|r5t         j                  ||      rt         j                  ||      } ||d      S t        |j                  j
                        t        |j                  j                        z   }t        |j                  j
                        t        |j                  j                        z   }	t        ||	      dz  t        ||	      k  r||	kD  r||}}|j                  j
                  |j                  j                  z  D 
cg c]4  }
|
|j                  j
                  v s|
|j                  j                  v r|
6 }}
 |t         fd|D              d      S |j                  j
                  |j                  j                  z  |j                  j
                  |j                  j                  z  z  } |t         fd|D              d      S c c}
w )zn
        The first term in our fusion score that estimates number of saved
        memory operations.
        c                    r| |fS | S r`   r   )r  is_mix_order_reductionreturn_is_mix_order_reductions     ri   _construct_return_valuez>Scheduler.score_fusion_memory.<locals>._construct_return_value  s"     1 ./ ro   Tr  c              3  B   K   | ]  }j                  |        y wr`   r  )rg   r   r  r   s     ri   rj   z0Scheduler.score_fusion_memory.<locals>.<genexpr>  s     ISD&&sK8Ir  Fc              3  @   K   | ]  }j                  |        y wr`   r/  r  s     ri   rj   z0Scheduler.score_fusion_memory.<locals>.<genexpr>  s     FC""3'Fr  )
r]   r   r   r   r   r   ro  rf  r  r   )r   r   r   r  r,  r  r-  r  node1_dep_lennode2_dep_lenr   r  common_memory_depss   `  ``        ri   r  zScheduler.score_fusion_memory  s   	 %):)C)CE5)Q
 &66ueDE*5$77E--334s5;L;L;S;S7TTE--334s5;L;L;S;S7TT }m,q03}m3TT},$eu !,,22U5F5F5M5MM%++111SE<M<M<T<T5T D  +IDII5  $//558I8I8P8PP##e&7&7&>&>>
 'F3EFF
 	
s   9Gc                   t        |      dk(  r|S i }|D ]  \  }}|j                         |j                         k(  sJ |j                         }t        | j                  |      j	                  ||            }||vr	||fg||<   p||   j                  ||f        t        |j                         t        j                  d            d   }t        |      dkD  sJ |S )Nr   r  r   )
r   r   r   r+  get_fusion_pair_priorityr   rf  r=  r0  r1  )r   rW  "possible_fusions_group_by_priorityr   r   r  fusion_pair_priority&possible_fusions_with_highest_prioritys           ri   r]  z4Scheduler.get_possible_fusions_with_highest_priority  s   
  A%##  	+ - 	LE5##%)9)9);;;;%%'F#&  (AA%O$  $+MMENL23GH 33GHOOEN	 25.446H<O<OPQ<R2

2. 9:Q>>>55ro   c                B    t        j                  j                  | g| S )z-
        Shim for list.sort(key=...)
        )rT   r  score_fusionr  s     ri   r^  zScheduler.score_fusion_key  s     yy%%d3U33ro   c                    t        t        j                  j                               }t	        | j
                        D ]9  }|j                  || j                         |j                  |j                         ; y)zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)
r   rT   rs   r  r   r  r  r  r  rD  )r   r  rd   s      ri   r
  zScheduler.compute_last_usage  s]    
 ))A)A)CDTZZ( 	8D 3T5L5LM&&t7	8ro   c                   t        | j                  t        j                  j                  z
  t        j                  j
                  j                  z
        D ]i  }|| j                  v rT| j                  |   }|j                         s2t        j                  j
                  j                  |j                         f|t        j                  j                  v st        j                  j                  |   }t        |t        j                        r*t        j                  j
                  j                  |       t        |t        j                        r|j                   }t        |t        j"                        r|j%                         sJ t        j                  j
                  j                  |j                          l | j                  j'                          y)z*Free any buffers that are no longer neededN)r  r  rT   rs   r  r  freedr#  r,  codegen_freerd   rs  ra   r&   rd  r  r   r  is_input_bufferclear)r   r   r   r  storages        ri   free_bufferszScheduler.free_buffers  sK   %%gg%%&gg""(()
 	DD
 t'''&&t,<<>GG((55chh?---gg**40c2#5#56GG((55c:R%6%67!hhG"7BMM:w?V?V?XXGG((55gllC)	D, 	!!'')ro   c                    | j                   j                         D ]  }|j                           | j                          y r`   )r  r   flushrB  )r   r  s     ri   rD  zScheduler.flush  s3    }}++- 	GMMO	ro   c                   t        |t              sJ t        d   dxx   dz  cc<   t        j                  t        d            5  |j                          |j                          d d d        |j                  }t        |t        j                        sJ dt        |             |j                  t        j                  j                         | j                          y # 1 sw Y   |xY w)Nrx  extern_callsr   F)increase_kernel_countztype(node)=)ra   rl  r   rT   set_kernel_handlerr+   r)  r  rd   r&   r  r   r  rs   r  rB  )r   scheduler_noderd   s      ri   codegen_extern_callzScheduler.codegen_extern_call  s    .*CDDD
 	^,1,!!&u"EF 	&002##%	& ""$0B[T$ZM2BB0QWW))*	& 	&s   !C""C+c                P   t        |j                        r|j                  
J | d       t        j                  j                  |       t        |j                        }|t        d|j                         t               s|j                  dk(  rLt        j                  j                  |      x}j                  dk  rt        |t        j                               t        |j                        r,|j                  dk(  st!        t        j                                ||       S )Nz( should have been normalized in loweringzUnsupported device type: r      rK  )rN   r   r   rT   rs   add_device_infor*   r  r!   r}  r   get_device_propertiesmajorr2   inspectcurrentframer3   )r   r  device_schedulingdevice_propss       ri   create_backendzScheduler.create_backend(  s    &++&&,,*B 	
h>?	
B 	
'5fkkB$!:6;;-HII|v%%*ZZ%E%Ef%MM\TTWXX(w7K7K7MNN$V[[E-A#G$8$8$:;; &&ro   c                    |J || j                   vr| j                  |      | j                   |<   | j                   |   S r`   )r  rT  r  s     ri   r+  zScheduler.get_backend=  sB    !!!&$($7$7$?DMM&!}}V$$ro   c                    d fd}|j                         D ci c]8  }|j                  *|j                  j                         D ]  } ||      |fd  : }}}t        |j	                               }|rMt        |t        j                  d            \  }}t        j                  j                  j                  |       y y c c}}w )Nc                    | j                   vrLj                   j                  t        | j                  j                        D  ci c]  \  }} | |
 c} }       j                       S c c} }w r`   )r  r  r  rs   r  )r  r  r   s     ri   	get_orderz*Scheduler.enter_context.<locals>.get_orderD  s\    ,,,$$++i>V,WdaQT,WX''** -Xs   A+
r   r  )r  ztorch.fx.Noder   r   )rl   rd   r4  r   r   r  r0  r1  rT   rs   r  enter_context)r   rd   rX  r  r   r<  r  lasts   `       ri   rY  zScheduler.enter_contextC  s    	+ ^^%
vv!VV'')	
  q\1t#

 
 w||~&'x':':1'=>GAtGG  ..t4 
s   =Cc                    	 | j                   |   j                  }t        fd|D              xr || j                  vxr || j
                  vS # t        $ r Y yw xY w)NFc              3  ^   K   | ]$  }|j                   xs |j                         v  & y wr`   )r  r   )rg   r  r  s     ri   rj   zAScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>]  s)     VC3C CCVs   *-)r#  r   KeyErrorrk   rI  r  )r   r   r  r   s     ` ri   r  z.Scheduler.can_buffer_be_removed_through_fusionU  sn    	$$T*00E VPUVV 4D1114D333	
  		s   A 	AAc                    |j                   }t        |t        j                  j                  j
                        r|j                  x}r|j                         }t        |t        j                  j                        r| d|j                   n|}|t        j                  v s|t        j                  v r't        |t        j                  j                        sJ yt        j                  j                  j                  j                  st        j                   ydd}t#        d t$        j&                  j(                  D              }|r|rt*        n|}	t        |t,              rt#         fd|j.                  D              S |j                   J |j1                         s |	d|       yt        |j                   t        j2                        r |	d	|       yt        |j                   t        j4                        r |	d
|       yt7        |j                   dd      r |	d|       yt9        |j                         r |	d|       yt        j                  j:                  rt=        |      r |	d|       yy)zBReturn True if we should partition the inductor graph on this noder  TNc                     y r`   r   )msgrd   s     ri   noop_logz,Scheduler.should_partition.<locals>.noop_log  s    ro   c              3  2   K   | ]  }t        |        y wr`   )rN   )rg   r  s     ri   rj   z-Scheduler.should_partition.<locals>.<genexpr>  s     OVF^Or  c              3  @   K   | ]  }j                  |        y wr`   )should_partition)rg   r  r   s     ri   rj   z-Scheduler.should_partition.<locals>.<genexpr>  s     Mt,,U3Mr  znon gpu opsrm   zDeviceCopy opszConditional opsunbacked_bindingszunbacked binding opszCUDAGraph-unsafe custom opszdynamic shape opsF)r`  r   rd   r   r   r8  )rd   ra   r}  r~  r&   r!  rm  r   _ops
OpOverload_overloadnamer#   custom_should_partition_opsr   r  rB   wrapperr   rT   rs   device_typesrR   r   r   rN   
DeviceCopyConditionalrn  rM   cudagraph_skip_dynamic_graphsr  )
r   rd   
should_logr  r   op_overload_packet_nameop_overload_namera  has_gpu_devicelog_partition_reasons
   `         ri   rd  zScheduler.should_partitionb  s    ))gu11@@A%%%B%&(ggi# b%**"7"78 ++1R-=-=,>?,  (6+M+MM#v'I'II!"ejj&;&;<<< &&--886>>F	
 O!'':N:NOO-7N)PX 	 d./MMMMyy$$${{} T:dii/ !1=dii0 !2>499148 !7dC!$)), !>TJ ==66-d3$%8tDro   c                    i }|j                  t        j                  j                         | j                  D ]3  }|j
                  j                         D ]  \  }}|j                  ||<    5 |S )z~
        Return a mapping from name strings to the corresponding graph inputs or
        base scheduler node outputs.
        )r  rT   rs   rs  r  rM  r=  rd   )r   r%  rd   r   scheduler_buffers        ri   get_name_to_nodeszScheduler.get_name_to_nodes  sr     UWAGG001JJ 	;D*.*>*>*D*D*F ;&&%5%:%:T";	; ro   c           	        t        t        j                  j                        D ci c]  \  }}||
 }}}t        t        j                  j	                               D ci c]  \  }}||
 }}}g t        j                  _        t        |      D ]  \  }}|j                  rg }|j                  D ]"  }|j                  |j                  |             $ g }	|j                  D ]0  }
|	j                  |j                  |
j                                      2 t        j                  j
                  j                  t        |||	|j                                yc c}}w c c}}w )z
        computes a mapping from partition input/output indices to graph input/output
        indices for each partition.
        N)r  rT   rs   rs  r  partition_mapsskip_cudagraphinput_nodesr   r  output_nodesr   rJ   constant_names)r   
signaturesr  r   name_to_graph_input_indexname_to_graph_output_indexpartition_id	signatureinput_mappingoutput_mappingrd   s              ri   compute_graph_partition_mapsz&Scheduler.compute_graph_partition_maps  sT    (11E1E'F%
##tD#I%
! %
 (11I1I1K'L&
##tD#I&
" &
 "$'0'< 	#L)''
 M!-- J$$%>%B%B4%HIJ  N!.. W%%&@&D&DT]]_&UVW GG""))! !",,	!	%
&
s   E!E c                   	 	 	 	 dd	 	 	 	 dd} t               j                  d |D         } |j                  fd|j                         D           ||      }t               }|D ]F  }t        j                  j
                  j                  |      }|j                  |j                         H t        t        |t        j                  d                  S )	ai  
        Returns all symbol inputs which are required to be in scope to successfully
        perform codegen for this graph partition, including:
        - free symbols used in partition nodes
        - free symbols in partition input/node shapes, strides, and offsets. This is needed
          for recording cudagraphs for tensors with dynamic shapes.
        c                    t        | t        j                        r
t               S t        | t        j                        rt        |       S t        dt        |              )zW
            Gets symbols used in input node shapes, strides, and offsets.
            zUnsupported input node type: )ra   r&   rd  r   r  r  r  r   rm   s    ri   get_input_node_symbolszKScheduler.get_graph_partition_symbol_inputs.<locals>.get_input_node_symbols  sN     $ 2 23!|#D")),)$// *,I$t**VWWro   c                &    t        d | D              S )z
            Filters a set of symbols that are required for codegen. Skip symbols
            that are always internal to kernels, such as SymT.TMP, SymT.INDEX,
            and SymT.R0_INDEX.
            c              3     K   | ]N  }t        |t        j                  t        j                  t        j                  t        j
                  f      r| P y wr`   )r   r    SIZEFLOATUNBACKED_INTUNBACKED_FLOAT)rg   rQ  s     ri   rj   zVScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols.<locals>.<genexpr>  sH      !		

))++	 s   AAr   )symbolss    ri   filter_symbolszCScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols  s         ro   c              3  2   K   | ]  }t        |        y wr`   r  r  s     ri   rj   z>Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>  s     It,T2Ir  c              3  4   K   | ]  \  }} |        y wr`   r   )rg   r  rd   r  s      ri   rj   z>Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>!  s     Nwq$$T*Ns   r   r  )rd   z0Union[ir.IRNode, sympy.Expr, ir.TorchBindObject]r   OrderedSet[sympy.Symbol])r  r  r   r  )r   r  r=  rT   rs   rt   ru   r  r   r  r0  
attrgetter)	r   	partitionrz  r  candidate_symbolsresrQ  symplified_sr  s	           @ri   !get_graph_partition_symbol_inputsz+Scheduler.get_graph_partition_symbol_inputs  s    	XB	X%	X 	-	%	, 7Ijl6H6HIyI7
 	 N+:K:K:MN	
 ++<=(2" 	2A77++44Q7LJJ|001	2
 &(*=*=f*EFGGro   c           
         g }t        t        j                  j                               } j	                         }d fdt        t        |      t        |            D ]2  \  }}t               }|D ]+  }	|j                  |	j                  j                                - |j                  |      }
t        j                  j                  |D 	cg c]  }	|	j                   c}	      }t        |j                  |j                   z  D cg c]  }t#        |t$              s|j&                    c}      |z
  }t         fd|D              }t               }|D ]  }	|j                  |	j(                          ||z
  D cg c]  }||v r|
 }}|j                  |       |D ci c]  }||v r|||    }}|D ci c]  }||v r|||v  }}|D cg c]  }||v r||vr| }}|
j                  |       t         fd|
D              }
|
D cg c]  } |      s||    }}|D cg c]!  }|t        j                  j*                  v s |# }} j-                  ||      }t/        ||||||      }|j1                  |       |j3                  ||
z
        }5 |ddd   S c c}	w c c}w c c}w c c}w c c}w c c}w c c}w c c}w )z
        Gets signature for each graph partition, including input nodes, output nodes, and
        whether deallocating an input within graph partition.
        c                    j                   j                  | d      }|yt        |j                  j                  t
              r'j                  j                  | d      x}r |      S yy)z
            Checks if buf_name resolves to a NoneLayout buffer (following mutation_real_name).
            Buffers with NoneLayout are not allocated so graph partition should not
            take them as inputs or outputs.
            NFT)r#  r  ra   rd   r
  r:   r  )r  r   r  is_unallocated_bufferr   s      ri   r  zFScheduler.get_graph_partition_signature.<locals>.is_unallocated_buffer:  sh     ""&&x6C{#((//:6 !% 7 7 ; ;Hd KK9K0;;ro   c              3  V   K   | ]   }j                   j                  ||       " y wr`   r  r  rg   r   r   s     ri   rj   z:Scheduler.get_graph_partition_signature.<locals>.<genexpr>m  ,      / ''++D$7/r   c              3  V   K   | ]   }j                   j                  ||       " y wr`   r  r  s     ri   rj   z:Scheduler.get_graph_partition_signature.<locals>.<genexpr>  r  r   Nr   )r  r   r   r   )r   rT   rs   r  rv  ru  r   r  rM  r   ry  r%   r  r  r   r   ro  ra   r1   r   rD  r  r  r7   r   r  )r   
partitionsskip_cudagraphsr}  unmet_output_namesr%  r  ry  output_namesrd   returned_output_namesr   r'  partition_input_namesr  r   extra_input_namesrz  input_deallocationextra_output_namesr{  r|  symbol_inputspartition_signaturer  s   `                       @ri   get_graph_partition_signaturez'Scheduler.get_graph_partition_signature.  s^    
'(@(@(BC--/	, *-Z (?";*
 h	%I~ -7LL! A##D$8$8$=$=$?@A %1$=$=>P$Q! '11<<.78d!!8K  "-!2!2[5G5G!G)!W5   " %/ /1/ %!
 5?L ! =$++DOO<= 2L@!<' ! !
 "(():; 2<' l4((K  2"<' d222" " 2"<'D8L,L " " "(();<$. /1/ %! 2,T2 T"L  "7$!''BSBS:SN  !BB;M #:"# 12!6!<!<"%::"Kh	T $B${ 9*!
""s6   J

#J$
>J)"J.9J3J8J=%!KKc                   |j                   j                         D ci c]$  \  }}|t        j                  j                  vr||& }}}|j
                  j                         D ci c]$  \  }}|t        j                  j                  vr||& }}}|j                  D cg c].  }|j                         t        j                  j                  vr|0 }}|j                  D cg c]   }|t        j                  j                  vr|" }	}t        |j                  ||||j                  |	      S c c}}w c c}}w c c}w c c}w )z
        Updates the partition signature by removing buffers specified in
        V.graph.removed_buffers. See [Note: Removed Graph Partition Arguments]
        )rz  r=  rT   rs   r  r  r{  maybe_get_namer|  r7   r  ry  )
r   r  r   r:  rz  r\  r  rd   r{  r|  s
             ri   .clean_removed_buffer_from_partition_signaturesz8Scheduler.clean_removed_buffer_from_partition_signatures  sK    !* 5 5 ; ; =
f177222 &L
 
 '99??A
c177222 #I
 
 "..
""$AGG,C,CC 
 
 "00
177222 
 

 '##$$
 	
)






s   )D/')D5!3D;$%E c                p   	
 ddl 	t               g g t        |      D ci c]  \  }}||
 c}}d	 fd
d
fd}|D ]5  }t        |j                  j
                        |<   |   dk(  s. 
|       7 g }d}|t        |      k  rsr}r0	j                        \  }}|j                  |        ||       r0r0	j                        \  }}|j                  |        ||       r0|dz  }|t        |      k  rrzr}|t        |      kD  rt        d      |S c c}}w )a  
        Reorder nodes to minimize the number of partitions via a bfs
        topological sort. This is the optimal reordering such that the
        number of partitions cannot be reduced further. This may be
        sub-optimal for other metrics such as peak memory. This does not
        change relative orders of two cudagraphable nodes, nor the
        relative order of two non_cudagraphable nodes.
        r   Nc                    |    | f}j                  |       rj                  |       y j                  |       y r`   )rd  heappush)rd   node_with_indexcudagraphable_nodesheapqnode_to_indexnon_cudagraphable_nodesr   s     ri   insert_pending_nodeszHScheduler.reorder_for_minimizing_partition.<locals>.insert_pending_nodes  s>    ,T2D9O$$T*6H2ODro   c                    | j                   j                  D ]*  }|   dkD  sJ |xx   dz  cc<   |   dk(  s# |       , y )Nr   r   )rG  
succ_nodes)rd   	succ_noder  node_to_indegrees     ri   update_indegreezCScheduler.reorder_for_minimizing_partition.<locals>.update_indegree  sT    !]]55 4	'	2Q666 +q0+#I.!3(3	4ro   r   z
                Failed to schedule, while loop ran too long when
                reordering for minimizing the num of partitions
                rd   rX   r   r8  )	r  rt  r  r   rG  
pred_nodesheappopr   r  )r   r  r  rd   r  r  	num_itersr  r  r  r  r  r  r  s   `       @@@@@@ri    reorder_for_minimizing_partitionz*Scheduler.reorder_for_minimizing_partition  sU    	9=CEGI4=e4DEysDsE	E 	E	4  	+D%()A)A%BT"%*$T*	+
 -/	#e*$#':)--(?@4%% *
 &--(;<4%% &
 NI #e*$#': s5z!  ] Fs   D2c           	     X   ddl m}m} t        t        j
                  j                               } ||| j                  | j                  t        t        j
                  j                  j                               |      \  }}| j                  |      } ||||      \  }}	||dz  k  r|S |S )zx
        Reorder nodes to minimize the number of partitions if this only slightly
        increase peak memory.
        r   )estimate_peak_memoryprepare_planning_infor  )r  r  r  r   rT   rs   r  r#  r  rs  r   r  )
r   r  r  r  rQ  default_peak_memoryr  reordered_nodesreorder_peak_memoryr  s
             ri   r  z0Scheduler.maybe_reorder_for_minimizing_partition!  s     	H"177#;#;#=>:O##qww++0023;
77 ??F!57"
Q
 !4s!::""ro   c                   g }g }g }dd}|D ]l  }| j                  |      }|r*t        |j                        dk(  r|j                  |       @|r ||      r|j                  |       \|j                  |       n ||z   |z   S )a  
        Reorder a node if it should be partitioned and has simple dependency:
        1. move a partitioned node to the front if it has no dependency
        2. move a partitioned node to the back if it is only used by OutputNode
        3. otherwise do not reorder
        c                    | j                         D ]0  }|j                  D ]  }t        |j                  t              r  y 2 yr)  )r[  r   ra   rd   r*  )rd   r   r+  s      ri   only_output_userzPScheduler.reorder_for_partition_with_simple_dependency.<locals>.only_output_userN  sC    '') %99 %C%chh
;$%% ro   r   r   )rd  r   rR  r   )r   r  frontmiddlebackr  rd   rd  s           ri   r	  z6Scheduler.reorder_for_partition_with_simple_dependency@  s     *,*,(*	  	$D#44T:C(?(?$@A$ET"!&6t&<D!d#	$ v~$$ro   c                n   g }d}g }g }| j                   D ]S  }| j                  |d      }|r)||k7  r$|j                  |       |j                  |       g }|}|j                  |       U |r"|j                  |       |j                  |       | j                  ||      }| j	                  |       ||fS )z
        Given a list of BaseSchedulerNodes, split into a list of
        graph partitions and compute partition input/output signatures.
        T)ro  )r  r  )r  rd  r   r  r  )r   r  ry  cur_partitionr  rd   rd  r}  s           ri   r  zScheduler.graph_partition`  s     +-
')JJ 	'D#44Td4K3C!C!!-0&&~6 "-N  &	' m,"">277!? 8 

 	))*5:%%ro   c                    t        d      5  t        j                  j                  j                  r| j                         n| j                  | j                        	 cd d d        S # 1 sw Y   y xY w)NzScheduler.codegen)r   r}  r~  r#   r  _codegen_partitions_codegenr  r  s    ri   r  zScheduler.codegen  sX    -. 	 ??))99 ((*]]4::.	 	 	s   AA&&A/c                ^   ddl m} t        j                  j                  }t        | j                        }t        j                  j                         5  t        j                  j                  dd| ||       | j                  |       t        t        j                  j                  |      sJ | j                  |      }|t        j                  j                  _        t        j                  j                  j                          t        j                  j                  }t        j                  j                  j                  t        j                  j                         \  }}ddd       t        j                  j                  j#                         t        j                  j                  j%                  ||       t        j                  j                  j&                  j)                  |j*                  D 	cg c]  }	|	j-                          c}	       y# 1 sw Y   xY wc c}	w )z,Codegen a partition given its inputs/outputsr   )SubgraphPythonWrapperCodegenT
partition_)is_subgraphsubgraph_nameparent_wrapper_codepartition_signaturesN)r  r  rT   rs   r  r  r  set_current_wrapper_codeinit_wrapper_coder  ra   r  r  write_prefixr   generateis_inferencedefine_subgraph_launcher_fncodegen_partition_call	allocatedr  r{  r   )
r   r  r  r  r  graph_partition_id
graph_namepartition_coder  rd   s
             ri   _codegen_partition_wrapperz$Scheduler._codegen_partition_wrapper  s    	Bgg22!$"?"?@WW--/ 	TGG%%  *+=*>?$7%.	 &  MM)$ agg224PQQQKKIVI8AAGG  5GG  --/J ! 4 4 = =agg>R>R SNA/	T2 	
88^T	334F	R	&&--)2)?)?@T]]_@	
9	T 	T: As   DH?H*H'c                L     t         j                  d fd       } |       S )Nc               3    K   j                          j                  ryt        j                  j                        rZj                  j                  J d       t
        j                  j                  j                  j                  j                         	 d  j                  rGt        j                  j                        r(t
        j                  j                  j                          d _        y # j                  rGt        j                  j                        r(t
        j                  j                  j                          d _        w xY ww)Ndevice should have an index)
%update_graph_partition_default_devicer  rE   r   r   rT   rs   r  codegen_device_guard_entercodegen_device_guard_exit)r  r   r}  s   ri   ctxz1Scheduler.use_default_device_context.<locals>.ctx  s    66z:N**/@++000 2288D 1D $$??//553..3D//444 GG((BBD.2+	 ..3D//444 GG((BBD.2+s    BEC;  AE;AEE)r   zIterator[None])
contextlibcontextmanager)r   r  r}  r  s   ``` ri   use_default_device_contextz$Scheduler.use_default_device_context  s&     
	"	"	3 
#	3* uro   c                    t        |      dk(  r|d   j                  sy dd}	 	 	 	 	 	 dd}d }t        ||      D ]  \  }}|j                  r ||      } n |y t        ||      D ]  \  }}|j                  s |||      r y  || _        y )Nr   r   c                4    | d   j                         }|J |S r   r   )r  partition_devices     ri   get_cudagraph_partition_devicezWScheduler.update_graph_partition_default_device.<locals>.get_cudagraph_partition_device  s'    (|668#///##ro   c                @    | D ]  }|j                         }||k7  s y yr)  r  )r  target_devicerd   r  s       ri   all_on_target_devicezMScheduler.update_graph_partition_default_device.<locals>.all_on_target_device  s/     " !*]* ! ro   )r  rY   r   r  )r  rY   r  r  r   r   )r   ry  ru  r  )r   r  r}  r  r  cudagraph_partition_devicer  r  s           ri   r  z/Scheduler.update_graph_partition_default_device  s     z?a
1(D(D 	$
	$	5A		 &*"$'
J$? 	 Iy++-KI-V*	 &-$'
J$? 	 Iy''0D51 		 'A#ro   c                :   | j                         \  }}t        |      dkD  r9dt        |       d}t        |d       t        d   dxx   t        |      z  cc<   | j	                  ||      5  t        ||      D ]V  \  }}t        |      dk\  sJ dt        |              |j                  r| j                  |       E| j                  ||       X 	 d	d	d	       t        | j                        }t        j                  j                  j                  |       |d
kD  rqt        j                  j                  J |t        t        j                  j                        k(  s.J d| dt        t        j                  j                                y	y	# 1 sw Y   xY w)z
        Split nodes into partitions and codegen each partition into separate functions.
        This allows further applying different optimizations (e.g., cudagraph) to
        each function.
        r   zcudagraph partition into z partitionsrv  )r`  prefixrx  cudagraph_partitionsz5Each partition must have at least one node but found Nr   zExpect z partition maps but got )r  r   rR   r   r  ru  ry  r  r  r  r  rT   rs   r  set_all_partition_namesrx  )r   r  r}  r`  r  r  num_partitionss          ri   r  zScheduler._codegen_partitions  s    "&!5!5!7
Jz?Q-c*o->kJC)c"=Z !78C
OK8,,ZD 		J(+J
(C J$	99~* KCPYNK[\* ++MM),33IyIJ		J d;;<	44^D A77))555!S)?)?%@@ .))A#aggF\F\B]A^_@ 		J 		Js   -A&FFc                `   t         j                  rdd l}t        j                         }t               }t        |      D ]  }|j                  dk(  r/|j                  |j                  j                  j                  k(  r nQ|j                  |j                  f}||vs"J d|j                   d|j                   d       |j                  |        | j                  | _        | j                  rBt         j                   j"                  r(t$        j&                  j(                  j+                          |D ]H  }t,        j/                  t0        j2                        r4	 t,        j5                  d|j7                         |j9                                | j=                  |       |j?                         x}r|| j                  k7  s |jA                         s|jC                         r| jE                          || j                  k7  r| j                  rGtG        | j                  jH                        r(t$        j&                  j(                  jK                          || _        tG        |jH                        rF|jL                  J d       t$        j&                  j(                  jO                  |jL                         || _(        | jR                  jU                  |jV                         |jC                         rP|jY                  t[        |j]                                     \  }	}
}| j_                  |      ja                  |
||	       nH|jA                         r-tc        jd                  tf        |      }| ji                  |       n|jk                         rqtc        jd                  tl        |      }| j_                  |      }d	d
l7m8} d	dl9m:} tw        |||f      r|}nty        dtI        |             |j{                  |       ntw        |t|              r!| j_                  |      j                  |       nYtw        |t        t        f      r!| j_                  |      j                  |       n"tw        |t              sJ |j                          t         j                   j                  r| j_                  |      j                          | j                  jU                  |j                                | j                  jU                  |j                                tw        |t              r|j?                         }||jH                  dk7  s| j_                  |      j                         s9| jE                          K | j                  | j                  k7  rU| j                  J tG        | j                  jH                        r(t$        j&                  j(                  jK                          | jE                          y # t:        $ r( t,        j5                  d|j7                                Y w xY w)Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0r  r   )CUDACombinedSchedulingr5  ztype(self)=r6  )Lr#   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr   r   r   filename_dynamoconvert_frame__file__linenor  r  r  r   autotune_at_compile_timerT   rs   r  write_get_raw_stream_headerrr  r%  r&  r  r  r   r  rq  rY  r   r   r  rD  rE   r   r  r   r  r  r  r  rD  r  r   rl   r+  codegen_templater  r  rl  rJ  r  r   codegen.cuda_combined_schedulingr  r9  r6  ra   r  codegen_combo_kernelr  codegen_mix_order_reductionr   rb   codegen_noder  r  debug_sync_kernelcodegen_syncr  r  r  r   ready_to_flush)r   r  r}  stackr  framer  rd   r  r  r  r  backend_r  r6  r  s                   ri   r  zScheduler._codegen  s   44.++-E7A|D!%  JJ"22%--*E*E*N*NN~~u||4$ ,U^^,<Aell^ LJ J
  #99 &&6==+Q+QGG  <<> T	!D.
IIO224 t$**v*d111~~''')JJLT000**/@++000 ,,FFH*0D'(5%||7V9VV7,,GGU $D%%,,T__=!484W4W)*51-   (99!8X !{{#<dC((."{{#=tD++F3T8h9O(PQ&G(KDJ=)9::,,T2D"9:  (DDTJD#5}"EF  (55d;!$(>??? }}..  (557''..t/D/D/FG%%,,T-E-E-GHd$:;*&v-((0??AJJLiT	!l $"="== &&222 !4!4!9!9: $$>>@

q ! IIPs   3W<<-X-,X-c                    |d   j                         }| t        j                  _        || _        |J | j                  |      }|j                  |      S )r  r   )r   rT   rs   r   r  r+  benchmark_combo_kernel)r   r  r  r  s       ri   r  z Scheduler.benchmark_combo_kernel  sW     1((* $!!!""6*--i88ro   c                   |}|d   j                         t        fd|D              sJ d       t        j                  syddlm} dg }}t        |      D ]  \  }}|j                         }| j                  |      rt        j                  d       	 | j                  |      \  }	}
t        j                  |	      rt        j                  d|        y		 ||	z  }|j                  |
        	 | j                  |      \  }}}||z
  dk  xs |dk  }t        j!                  t"        j$                        rP||kD  s|r%t        j                  dt'        ||z  d             n$t        j                  dt)        ||z  d             ||z
  |k  xs |S # |$ r.}d
t        |      v rt        j                  d       Y d}~ y d}~ww xY w# |$ r-}d
t        |      v rt        j                  d       Y d}~y d}~ww xY w)r  r   c              3  D   K   | ]  }|j                         k(    y wr`   r  )rg   rd   r  s     ri   rj   z4Scheduler.speedup_by_combo_kernel.<locals>.<genexpr>  s     K44??$.Ks    z<All nodes in a combo kernel group must be on the same deviceTr  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFr#  zCComboKernel benchmark: return True because of loop-carried variableNg333333?z/can fuse (benchmark): fusing causes %sx speedupr  z3cannot fuse (benchmark): fusing causes %sx slowdown)r   rk   r#   r  r-  r  r  rl   r  r  r  r  r%  r&  r   r   r%  r&  r  r?   r@   )r   r  subkernel_nodesr  r  
path1_listr  r  r  r  r  r   r  	ms2_clone_path2_listsmall_kernelr  s                   @ri   rO  z!Scheduler.speedup_by_combo_kernel  s      #..0K?KK 	
J	
K ,,;rZ!/2 	$HAu)I ##I.  R55i@D::b>$$U ! " 2ICd#7	$:
	*.*E*Eo*V'CK Y,9c	""7==1SyL  E#)C2
   Ic	#0
 Y$44M $ *c!f4$$]     	&#a&0  Y 	s<   ?F0G	 G"G GG	G;"G65G66G;c                p    | j                   |   }|j                  J |j                  j                         S r`   )r#  rd   
get_layout)r   r  r   s      ri   get_buffer_layoutzScheduler.get_buffer_layout  s5    x(xx###xx""$$ro   c                   | j                   D ]  }|j                         s|j                  j                  D ]  }t        j
                  j                  j                  |j                        }|s9t        |      dk(  sHt        |j                  t        t        f      ri|j                         g k(  s}t        j
                  j                  j!                  |j                           y r  )r  rN   r   r   rT   rs   r  r  r   r6   ra   r
  r:   r9   r  zero_dim_cpu_tensor_listr  )r   rd   r%  r:  s       ri   r  z$Scheduler.update_zero_dim_cpu_tensor  s    JJ 	HD{{} ,,22 
HDWW3377		BF+F3u< *"MMJ8I+J! #OO-388<<TYYG
H	Hro   )r  zlist[ir.Operation]r   r8  )r   z!dict[str, SchedulerDonatedBuffer]r;  )r  r<  r   r8  r7  )r(  r   r   r8  )rd   r  r   rX   rf  )r  rX   r   r  )r   rg  r  rk  r   tuple[float, str]r`   r  rk  r  r   r  r*  r   r   )r  r   r  r  r   r(  )
r  ir.OperationBufferr  ir.MultiTemplateBufferr  r   rd   rb   r   r8  )r  rk  r   r   )r   rX   r   rX   r   zUnion[bool, Callable[[], bool]])rd   rX   r   rX   )r  r  r  r   r   r  )r  r*  r   r8  rZ  )r  r  r  r   r   1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]r   )r   rX   r   rX   r  r   r   r   )r   rX   r   rX   r  z'Union[tuple[str, ...], OrderedSet[str]]r   r   r   r   )r  rX   r  rX   r/  r  r   r   )r   rX   r   rX   r   z/Optional[tuple[int, SchedulerNode, sympy.Expr]])FT)
r   rX   r   rX   r  r   r  r   r   r   )r  r1   r   rX   r   rX   r   r   )r%  r.   r  r/   r   r   r  )r   r.   r  r   r   r   )TFT)r   rX   r   rX   r  r   r,  r   r  r   r   zint | tuple[int, bool])rW  r,  r   r,  )r  z+tuple[BaseSchedulerNode, BaseSchedulerNode]r   r   )rI  rl  r   r8  )r  r  r   BaseScheduling)r  r<  r   r-  r  )r   r   r  rC  r   r   r  )rd   rX   ro  r   r   r   )r   ;dict[str, Union[ir.IRNode, ir.TorchBindObject, sympy.Expr]])r}  list[GraphPartitionSignature]r   r8  )r  rY   rz  r.  r   r  )r  list[PartitionType]r  z
list[bool]r   r/  )r  r7   r   r7   )r   z9tuple[list[PartitionType], list[GraphPartitionSignature]])r  rY   r  r7   r   r8  )r  r0  r}  r/  r   z'contextlib.AbstractContextManager[None])r  r0  r}  r/  r   r8  r  rk  r   z(tuple[float, float, list[Optional[str]]])r  r  r   r   )r  r   r   z	ir.Layout)Sr   r   r   r   rY  r  r  propertyr  setterr  r)  r  r  rW   r  r  r  r  rL  r  rX  rr  r  r  r  r  r  r  r  r9  r  r  r  r  rH  r@  r  r  r  r  r  r  r[  r  r  r   r  r  r  r  r  r]  r^  r
  rB  rD  rJ  rT  r+  rY  r  rd  rv  r  r  r  r  r  r  r	  r  r  r  r  r  r  r  r  rO  r$  r  r  r  s   @ri   r   r   
  s   
I9V	# & & ( (7#,"HAPFKZ+#Z,	 6S*4#&$6!F	808	8, (,	*  %	
 
&
> 
>*6
>	
>?DB'8&'8 +'8 	'8
 '8 
'8R
}(&}(/@}(	(}(~	>j&j j 
!	jX..`?6 &6  6  
;	6 p,&,/@,	,\7&7/@7	7r.2&.2/@.2MP.2	.2`$&$/@$	$6< < !< B	<
 
<|M&M/@M	M^Y
&Y
/@Y
	Y
v
9(9 )9 	9
 
9v`&`/@`	8`L "*.uM uM !uM 	uM
 $(uM 
uMn3&3/@3	3j-)-)(9-)BS-)	-)f D; !.3*.3
 3
 !3
 	3

 (,3
 $(3
 
 3
j6 Q6	:6@4@4	4	8*4
'*%5$

+:
	
 ;@M%M37M	M^	D '1' 
'RBH BH QBH 
"	BHHL -L @JL 	&L \"
0"
	 "
H?&? 
!?B& 
!>%,%	 %@&	B&@)
 )
 +)
 
	)
V-;X	06-A--A;X-A	-A^ D{z949	19J5X%
Hro   c                  <    e Zd Zd fdZddZddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ		 	 	 	 ddZ
	 	 	 	 	 	 	 	 dd	Z	 d	 	 	 	 	 	 	 dd
ZddZddZddZd dZddZ	 	 	 	 d!dZd"dZ	 	 	 	 	 	 d#dZ	 	 	 	 d$dZ	 d	 	 	 	 	 d%dZ xZS )&r-  c                0    t         |           || _        y r`   )r  rY  r   )r   r   r  s     ri   rY  zBaseScheduling.__init__  s    "ro   c                R    | j                   r| j                   j                          y y r`   )r   rB  r  s    ri   free_buffers_in_schedulerz(BaseScheduling.free_buffers_in_scheduler  s    >>NN'') ro   c                    t               S )z0Return a set of .codegen.common.BackendFeature()r   r  s     ri   get_backend_featuresz#BaseScheduling.get_backend_features  s
    |ro   c                    t         )zO
        Check whether node1 and node2 can be vertically fused or not.
        r  r  s      ri   r  z BaseScheduling.can_fuse_vertical  
     "!ro   c                    t         )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        r  r  s      ri   r  z"BaseScheduling.can_fuse_horizontal   r;  ro   c                     y)au  
        A Multi-Output Template (referenced in #144012) is a template node
        with MultiOutputLayout, and its output buffers are instances of MultiOutput.
        In this context, we verify whether node1 represents the Multi-Output Template
        and node2 corresponds to one of its outputs. If so, we further check if
        backend supports this fusion.
        Fr   r  s      ri   r  z.BaseScheduling.can_fuse_multi_outputs_template(  s     ro   c                    |j                         s|j                         rt        j                  ||      S t        j	                  ||      rt        ||      S t        |t
              r|j                  |      S t        j                  ||      S )z 
        Fuse two nodes
        )	r  r  r  r]   r   r  ra   r  r   r  s      ri   r  zBaseScheduling.fuse4  sx     !1!1!3-225%@@77uE*5%8867??5))%**5%88ro   c                    t         )z[
        Process the iteration sizes in case a transformation needs to be applied.
        r  )r   rB  s     ri   r,  zBaseScheduling.group_fnC  r;  ro   c                    t         )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        r  )r   r  epilogue_nodesr  s       ri   r  zBaseScheduling.codegen_templateK  s
     "!ro   c                    t         zD
        Generate a kernel given a list of pre-fused nodes.
        r  )r   r  r  r  s       ri   r  z.BaseScheduling.generate_kernel_code_from_nodesY  s
     "!ro   c                    t         rC  r  r  s     ri   r  zBaseScheduling.codegen_noded  
     "!ro   c                    t         r`   r  r  s     ri   r  z*BaseScheduling.codegen_mix_order_reductionj  r  ro   c                    t         )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        r  r  s    ri   r  zBaseScheduling.codegen_syncm  rE  ro   c                     y)z
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        Fr   r  s    ri   r  zBaseScheduling.ready_to_flushs  s    
 ro   c                    t         )z]
        Flush the generated kernel and python wrapper code to the source code file.
        r  r  s    ri   rD  zBaseScheduling.flushz  rE  ro   c                    t         )r  r  r  s     ri   r  z$BaseScheduling.benchmark_fused_nodes  
     "!ro   c                    t         )z
        Benchmark a compiled module and return the execution time
        in milliseconds on randomly generated inputs.
        r  )r   r  s     ri   r  z)BaseScheduling.benchmark_codegened_module  s
    
 "!ro   c                     y)z
        Return an unsigned integer which represents the priority of this fusion pair.
        The smaller is with higher priority.
        r   r   r  s      ri   r5  z'BaseScheduling.get_fusion_pair_priority  s     ro   c                    t         )z
        Benchmark the list of nodes to combine and return the execution time
        and memory copy time in milliseconds on randomly generated inputs.
        r  r  s     ri   r  z%BaseScheduling.benchmark_combo_kernel  rK  ro   c                |    |r:ddl m}  |||      }t        j                  j                  j                  ||       y y )Nr   )'set_kernel_post_grad_provenance_tracing)r  rP  rT   rs   r  write_provenance_debug_handle)r   node_scheduler  rP  debug_handles        ri   codegen_commentzBaseScheduling.codegen_comment  s>    
 UBL GG  >>\ ro   )r   zOptional[Scheduler]r7  )r  r  r   zOrderedSet[BackendFeature]r   r  )rB  r  r   z"tuple[tuple[sympy.Expr, ...], ...])r  rX   rA  rk  r  rk  r   Optional[str]r`   r)  )rd   z(Union[FusedSchedulerNode, SchedulerNode]r   r8  )rd   r  r   r8  r9  r'  )r  r   r   r(  r   r1  )rR  rk  r  rU  r   r8  )r   r   r   rY  r7  r9  r  r  r  r  r,  r  r  r  r  r  r  rD  r  r  r5  r  rT  r  r  s   @ri   r-  r-    s   #*"&"/@"	""&"/@"	"
&
/@
	
9&9/@9	9"3"	+""(" 4" 4	"
 
"$ (,		"*	" 	" %		"
 
	""""""0"	""&/@	"4"	1" &*2 # 
	ro   r-  )r   z$torch._inductor.codecache.LocalCache)r  rX   r   r   )r  rX   r   zOptional[Callable[[Any], Any]])r  rX   r   rN  )r  r   r   r   )rd   rX   r  r  r#  rL  r   r8  )r  /Union[FusedSchedulerNode, GroupedSchedulerNode]r   r8  )r  rV  r   r   r   r  r   r8  )r   )r  zlist[list[int]]rB  r  r  r  r   z	list[int])r  r+  r9  r*  r   r8  r9  )rd   z	ir.IRNoder   r  )rd   rX   r   r  )
__future__r   rr  r  r>  r  rP  r  r&  r%  r0  r"  r  r  r  r  r   r   r   r   r   r	   r
   r   r   typing_extensionsr   torch.utils._ordered_setr   r&   r   collections.abcr   r   r   typesr   r   r}  torch._inductor.async_compiletorch.utils._pytreer  _pytreer  torch._dynamo.utilsr   r   torch._inductor.codecacher   r   torch._inductor.irr   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr   torch.utils._sympy.symbolr   r   r    torch.utils._tritonr!   rv  r"   r#   r$   r%   r'   analyze_preserves_zero_maskr(   codegen.commonr)   r*   r+   comm_analysisr,   r-   r.   r/   r0   r1   excr2   r3   fx_utilsr4   r5   r6   r7   r8   r9   r:   r   r;   r  r<   r=   runtime.hintsr>   runtime.runtime_utilsr?   r@   rt   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   virtualizedrT   	getLoggerr   rr  _logginggetArtifactLoggerr  r]  rT  r   rY   r=  rZ   r[   r]   	dataclassr   rA  rX   r  r  r  r  r  r  r  r*  r  rl  r  rb   r  r  r   r  r  r  r  r  r  r  r  r  r  r  r   r-  r   ro   ri   <module>rr     s   "          	     , S S S ' /  <<    $ $ $ 6 ? 7 M > O O * D D D M M ; : 2 $    J ( 7 &    (  g!^^--hA
NN44XO  >>;;$    34y 4T]t_\ \~ h8 h8 h8V 4_ 4 4q1 q1h 2 2(' > T"
 
 #
*  *K
*K4*K ,*K 
	*KZW 1 W"5. 5L*% L*^
@	$@ $ 
	,}** }*@PG0 PGfW:!3 W:t
b, bP #%+#++  + 	+\0%01C0	08 
 
 
> +9??, 4$
&C9H C9HLre ero   