
    ih                        d dl mZ d dlZd dlZd dlmZmZmZmZ d dlZd dl	Z	d dl
mZ d dlmZ d dlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 er6d dl1m2Z2 d dl3m4Z4 d dl5m6Z7 ddl8m9Z9 ddl:m;Z; ddl<m=Z= ddl>m?Z? ddlm@Z@ d dlAmBZB  G d dej                        ZDej                   G d d              ZF G d! d"      ZGy)#    )annotationsN)AnyOptionalTYPE_CHECKINGUnion)next_power_of_2)MixOrderReduction)bound_sympy   )config)
write_text)KernelInputs)make_ktc_generator)get_metric_tableis_metric_table_enabled)DevicePropertiesReductionHint)BaseSchedulerNode	Scheduler	WhyNoFuse)ExternKernelChoice)get_template_heuristic)BaseConfigHeuristicCPUConfigHeuristicCUDAConfigHeuristicMTIAConfigHeuristicROCmConfigHeuristicXPUConfigHeuristic)_use_autotune_backend)V)	Generator)partial)Config)KernelTemplate)SIMDKernelFeatures)TritonKernel)ChoiceCaller)KernelTemplateChoice)
OrderedSetc                      e Zd ZdZddZy)Sortablez>Anything that can be used as a list.sort() key (int/tuple/etc)c                     y N )selfothers     Q/var/www/html/engine/venv/lib/python3.12/site-packages/torch/_inductor/choices.py__lt__zSortable.__lt__5   s        N)r0   ztyping.Selfreturnbool)__name__
__module____qualname____doc__r2   r.   r3   r1   r+   r+   2   s    H5r3   r+   c                  <    e Zd ZU ded<   ded<   ded<   ded<   d Zy)	FusionScoreinttemplate_scorer5   node_type_scorememory_scoreproximity_scorec                   d}| j                   |j                   k7  r| j                   |j                   k  S t        | j                  |j                        t        | j                  |j                        |z  kD  r| j                  |j                  k  S | j                  | j                  | j
                  f|j                  |j                  |j
                  fk  S )zx
        node_type_score has higher priority than memory_score unless
        the memory_score differs too much
           )r=   maxr?   minr>   r@   )r/   r0   	thresholds      r1   r2   zFusionScore.__lt__?   s    
 	%"6"66&&)=)=== !!5#5#56$##U%7%789DE $$u'9'999$$d&7&79M9MN!!!!R
 
 	
r3   N)r6   r7   r8   __annotations__r2   r.   r3   r1   r;   r;   8   s    
r3   r;   c                     e Zd ZdZ	 d	 	 	 ddZ	 d	 	 	 ddZ	 d	 	 	 	 	 	 	 ddZ	 d	 	 	 	 	 	 	 ddZ	 d	 	 	 	 	 	 	 ddZ	 d	 	 	 	 	 	 	 	 	 	 	 ddZ		 d	 	 	 	 	 	 	 	 	 dd	Z
	 	 	 	 	 	 dd
Z	 d	 	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 	 	 ddZedd       Ze	 	 	 	 	 	 dd       Ze	 	 	 	 	 	 	 	 	 	 d d       Ze	 	 	 	 	 	 	 	 	 	 d!d       Ze	 	 	 	 	 	 	 	 	 	 d!d       Ze	 	 	 	 	 	 	 	 	 	 d!d       Ze	 	 	 	 	 	 	 	 d"d       Zy)#InductorChoicesax  
    This class contains a collection of default heuristics that effect performance of our generated
    code.  We try to not put correctness requirements in this file.

    You can override the choices made here by doing:

            class MyHeuristics(InductorChoices):
                ...

            torch._inductor.virtualized.V.set_choices_handler(MyHeuristics())
    c                    |dk(  r.t         j                  j                  
t               S t	               S |dk(  r
t               S |dk(  r
t               S |dk(  r
t               S t               S )Ncudaxpucpumtia)	torchversionhipr   r   r   r   r   r   )r/   device_types     r1   get_config_heuristicsz%InductorChoices.get_config_heuristicsb   sj     & }}  (*,,*,,E!%''E!%''F"&((&((r3   c                D    | j                  |      }|j                         S r-   )rR   get_conv_configs)r/   rQ   conv_heuristicss      r1   rT   z InductorChoices.get_conv_configst   s#     44[A//11r3   c                H    | j                  |      }|j                  ||      S r-   )rR   get_flex_attn_fwd_configsr/   head_dimdtyperQ   flex_heuristicss        r1   get_flex_attention_fwd_configsz.InductorChoices.get_flex_attention_fwd_configs|   '     44[A885IIr3   c                H    | j                  |      }|j                  ||      S r-   )rR   get_flex_attn_bwd_configsrX   s        r1   get_flex_attention_bwd_configsz.InductorChoices.get_flex_attention_bwd_configs   r]   r3   c                H    | j                  |      }|j                  ||      S r-   )rR   get_flex_decode_configsrX   s        r1   rb   z'InductorChoices.get_flex_decode_configs   s'     44[A66xGGr3   Nc                V    g }|j                         D ]  }|j                  |        |S )al  
        This method can be subclassed to perform any override/modification of the choices.
        The incoming parameters are cheap (generators), so you can do any overrides without
        incurring too much cost. Override this method to customize the kernel template choices
        before they are converted to ChoiceCaller objects, which is expensive on template codegen.

        The full list of arguments are here to facilitate any overrides you may want to do,
        as they can be used to start from scratch for each template if so desired.

        Args:
            template_choices: Dictionary mapping template UIDs to generators of KernelTemplateChoice objects
            kernel_inputs: MMKernelInputs containing input tensor nodes and matrix indices
            templates: List of template objects (KernelTemplate or ExternKernelChoice) in use
            op_name: Operation name (e.g., "bmm", "baddbmm", "addmm")
            kwarg_overrides: Optional dict of kwargs to override for each template heuristic

        Returns:
            Flattened list of KernelTemplateChoice objects across all templates
        )valuesextend)r/   template_choiceskernel_inputs	templatesop_namekwarg_overrideschoices
choice_gens           r1   _finalize_template_configsz*InductorChoices._finalize_template_configs   s2    6 /1*113 	'JNN:&	'r3   c                   |j                   }|J d       |j                  }t        |||      }|j                  ||      }|j	                  ||      }	|j                  ||      }
|xs i }t        |||
||j                         |	      S )a  
        Utility to get the KernelTemplateChoice generator for a specific input.

        This is a per template/op call, whereas get_template_configs is an op wide call (all templates).
        Consider when overriding/using at which level you need to make decisions
        z$get_ktc requires a valid device type)templatecsextra_kwargs	overrideslayoutinputs)rQ   uidr   get_template_configsadjust_kernel_inputsget_extra_kwargsr   output_layout)r/   rg   ro   ri   rj   rQ   template_name	heuristicrp   
inputs_valrq   rr   s               r1   get_ktczInductorChoices.get_ktc   s     $//&N(NN&  +=+wO	++
 33M7K
 11-I#)r	!% ..0
 	
r3   c                p   t        |      dkD  r!|d   j                  j                  dk(  r|dvryt        j                  s&t        j
                  st        j                  s|dvryyt        j                  syt        d      ryt        d      st        d      ryt        d	      ryt        d
 |D              S )z
        Check if we need to fix the layout instead of keeping it flexible

        Args:
            ktc: KernelTemplateChoice object

        Returns:
            True if we need to fix the layout, False otherwise
        r   mps)mmaddmmTFCUTLASSCKCKTILECPPc              3  R   K   | ]  }t        |j                  t                ! y wr-   )
isinstancero   r   ).0ktcs     r1   	<genexpr>z6InductorChoices._need_to_fix_layout.<locals>.<genexpr>	  s$      
AD
3<<);<<
s   %')	lenrt   rQ   r   max_autotunemax_autotune_gemm#max_autotune_allow_flexible_layoutsr   any)r/   adjusted_choicesri   s      r1   _need_to_fix_layoutz#InductorChoices._need_to_fix_layout   s    "  1$"))55>7 S D  ##v'?'?==' R C 99 !+ &*?*I ' 
HX
 
 	
r3   c           
         |i }|j                         }t        |      dk  rt        dt        |             |j                         }i }|D ]=  }| j	                  ||||j                  |j                  i             ||j                  <   ? | j                  |||||      }	| j                  |	|      r/|j                  d      }|	D ]  }
||
_	        t        |
d      s|
` |	D 
cg c]  }
|
j                  |
j                   c}
S c c}
w )a  
        Get list of ChoiceCallers for MM templates using template-specific heuristics.

        Args:
            kernel_inputs: MMKernelInputs containing input tensor nodes and matrix indices
            layout: Output layout
            templates: List of template objects (KernelTemplate or ExternKernelChoice)
            op_name: Operation name (e.g., "bmm", "baddbmm", "addmm", "mm_plus_mm")
            kwarg_overrides: Optional dict of kwargs to override for each template heuristic,
                             indexed by template.uid. These only override the per config kwargs, not the extra kwargs
        Returns:
            List of ChoiceCaller objects from the templates
           z#Need at least 2 input tensors, got F)flexible_choice)nodesr   
ValueErrorry   r}   getru   rm   r   rs   hasattrr   choice)r/   rg   rh   ri   rj   input_tensorsrs   rf   ro   r   r   s              r1   rv   z$InductorChoices.get_template_configs  s-   ( " O%++-}!B3}CUBVWXX,,.! 	H-1\\##HLL"5	.X\\*	  ::
 ##$4g>"00%0@F' $#
 3	*$ '7Qs#**:P

QQQs   (D:Dc                    |S )zTHook to change the kwargs passed to TritonKernel, used to apply fixed configurationsr.   )r/   
kernel_clsfeaturesgroupskernel_kwargss        r1   triton_kernel_kwargsz$InductorChoices.triton_kernel_kwargsE  s
     r3   c                   t         j                  j                  ryt         j                  j                  r+t        j
                  j                         j                  dk(  ryt        j
                  j                  j                  | j                  d      }|dk  rd|z  }n	|dk  rd	}nyt        j
                  j                  j                  | j                  |      S )
z>Heuristic to decide if a cooperative reduction should be used.TrL   Fr   )fallback   i   rB   i    )r   tritonforce_cooperative_reductionscooperative_reductionsr    graphget_current_device_or_throwtypesizevars	size_hintnumelstatically_known_geqreduction_numel)r   xhintrE   s      r1    should_use_cooperative_reductionz0InductorChoices.should_use_cooperative_reductionO  s     ==5544ww22499UB  **8>>A*FA:Ib[Iww44$$i
 	
r3   c                (   t         j                  j                  syt        j                  dij                  | j                         d      }| j                         t        j                  t        j                  fvrpt        | j                        }|j                  }|j                  }t        d ||fD              syt        t        |            }t        t        |            }||k7  ry|rD	 |dt        t         j"                  j$                  j'                  | j(                        d      z  z  }t         j                  j,                  r|dz  }t         j"                  j$                  j/                  | j                  |      S # t*        $ r Y ^w xY w)zO
        Heuristic to decide if a persistent reduction should be used.
        Fi   @   c              3     K   | ]]  }t        |t              xs |j                         xr5 |t        j                  j
                  j                  j                         k7   _ y wr-   )r   r<   is_constantrN   utils_sympynumbersIntInfinity)r   bounds     r1   r   zBInductorChoices.should_use_persistent_reduction.<locals>.<genexpr>{  s\      
   s+Bu/@/@/B J!3!3!;!;!G!G!IIJs   A#A%    rB   )r   r   persistent_reductionsr   INNERr   get_reduction_hint
OUTER_TINYr
   r   lowerupperallr   r<   rD   r    r   r   size_hint_or_throwr   r   multi_kernelstatically_known_leq)r   cooperative_reductionrE   boundsr   r   s         r1   should_use_persistent_reductionz/InductorChoices.should_use_persistent_reductionf  sf    }}22

#h))+R
0 	 &&($$1
 
 !!9!9:FLLELLE 
 $U^  #CJ/E#CJ/E ~ R3GG$$77G$  	 ==%%OIww44$$i
 	
  s   /AF 	FFc                   t        j                  |       }|j                  }d}dd}||z  |z  }|z  |z  }	d}
d|
z  }|r|d|z  k\  ry|dk  ry||z  |k  r|}n||z  |	k  rm||z  d|z  z  }||z   dz
  |z  }|||z  z   dz
  ||z  z  t        j                  |      }t        |fd	      }t        |z
        d
k  rt        ||      }n>}n;t        j                  |      }t        |fd	      }t        |z
        dk  r|}n}|||z  z   dz
  ||z  z  S d}d}||z   dz
  |z  }||z  |k  r|}n||z  |	k  rj||z  |z  }||z   dz
  |z  }|||z  z   dz
  ||z  z  t        j                  |      }t        |fd	      }t        |z
        dk  rt        ||      }n>}n;t        j                  |      }t        |fd	      }t        |z
        dk  r|}n}|||z  z   dz
  ||z  z  S )zHeuristic to decide the RSPLIT used for split reductions.
        When a reduction has a small number of outputs there is not enough parallelism,
        so we will do the reduction in two phases.r   i   i   r   r   r   i    c                     t        | z
        S r-   absxtmp_split_sizes    r1   <lambda>z8InductorChoices.reduction_split_factor.<locals>.<lambda>      c!n:L6M r3   )key   c                     t        | z
        S r-   r   r   max_elements_per_threads    r1   r   z8InductorChoices.reduction_split_factor.<locals>.<lambda>      c!>U:U6V r3   2         c                     t        | z
        S r-   r   r   s    r1   r   z8InductorChoices.reduction_split_factor.<locals>.<lambda>  r   r3      c                     t        | z
        S r-   r   r   s    r1   r   z8InductorChoices.reduction_split_factor.<locals>.<lambda>  r   r3   )r   createmulti_processor_countsympydivisorsrD   r   rC   )devicereduction_numel_hint
numel_hintinner_reductionpropsnum_smmin_elements_per_threadthreads_per_smmin_elements_per_devicemax_elements_per_device	num_warpsnum_threads
split_sizetarget_blocksblocks_per_outputr   closestrvals_per_threadxvals_per_blockxblocksr   r   s                       @@r1   reduction_split_factorz&InductorChoices.reduction_split_factor  s    !''/,,"$"%"9F"B^"S"9F"B^"S	9n QZ'#t+#j04KK4
%
25LL & 7AO L%2Z%?!%C
$R!(;9J+JJQN!$55"7 !>>*>?h,MNw/025!$W.E!FJ!/J >>*>?h,VWw!889B>!(J!8J(:+CCaG[(   !!O!O3a7OKG#j03JJ4
%
25LL & 7K H!.!81!< H(+;m+KKaO&6"8 !>>*>?h,MN~/025!$W.E!FJ!/J >>*>?h,VWw!889B>!(J!8J(+;j+HH1L :- r3   c                ^    |dk(  rt         j                  r j                         sj                         rt        d      rvj                  j                         j                  j                         z  t              dkD  r3t        d      j                   fd        t              d       y t              d       yj                         sgj                         sWt        j                               t        j                               z   t         j                  kD  r t              d       y j                        r t              d       yt         j                  4 j                  t         j                        r t              d	       yy
)a  
        Heuristics to prevent fusion applied to both horizontal and vertical fusions.  Heuristics here should not
        be needed for correctness and tweaking them may yield additional performance.

        See also some related heuristics that can be changed via config:
            - config.triton.tiling_prevents_pointwise_fusion
            - config.triton.tiling_prevents_reduction_fusion
            - config.aggressive_fusion (will cause this function to be called more times)
        r   'fusion_failure_due_to_indexing_mismatchc                 B   t         j                  j                  t         j                  j                  j	                         j	                         t        j                               t        j                               t               j                         dS )N)pre_grad_graph_idpost_grad_graph_id
node1_name
node2_namenode1_debug_strnode2_debug_strcommon_buffer_namesfailure_reason)	r    r   graph_idr   get_namer   	debug_strlistdecide_fusion_fail_reason)common_buf_namesnode1node2	schedulers   r1   r   z*InductorChoices.can_fuse.<locals>.<lambda>  su    121A1A23''2L2L*/..*:*/..*:/9%//:K/L/9%//:K/L378H3I.7.Q.Q %u.>/! r3   z'no shared data due to indexing mismatchFzno shared datazexceeds max fusionz Fusion will increase peak memory(fusion_prevent_too_many_reads_and_writesT)r   aggressive_fusionis_reductionr   read_writesbuffer_namesr   r   add_rowr   
is_foreach	get_nodesmax_fusion_sizecan_fusion_increase_peak_memorymax_fusion_unique_io_buffersr
  )r	  r  r  shared_data_scorer  s   ``` @r1   can_fusezInductorChoices.can_fuse  sy     !((E,>,>,@EDVDVDX&'PQ%%224u7H7H7U7U7WW ! '(1,$%NOWW ,IeU+,UV #IeU#$45   "$$&EOO%&U__->)??&BXBXX#IeU#$8944UEB#IeU#$FG //;BB33 $IeU#$NOr3   c                     y)zCHook for heuristics to prevent vertical (producer/consumer) fusionsTr.   r	  r  r  r  s       r1   can_fuse_verticalz!InductorChoices.can_fuse_vertical>  s     r3   c                    t        j                  ||      ry|t        j                  k  r t	        ||      d       y| j                  ||      r t	        ||      d       yy)zEHook for heuristics to prevent horizontal (consumer/consumer) fusionsTscore_fusion_memory_thresholdFz=Nodes are too far away. Fusing them may increase peak memory.)r	   r  r   r  r   are_long_distant_nodesr  s       r1   can_fuse_horizontalz#InductorChoices.can_fuse_horizontalH  sh     %%eU3 vCCC#IeU#$CD++E59#IeU#O r3   c                   t        j                  t        t        t        f   | j                  ||d            \  }}t        t        |j                  |j                  z
        t        |j                  |j                  z
               }|j                         rd}n+d|j                         t        j                  k(  xr |dkD  z   }|j                         |j                         k(  xr |dkD  }t        ||||      S )a  
        Assign a score (higher comes first) to the fusion of node1 and node2.
        When different fusions conflict with each other, this is the way we
        decide what order to run them in.

        Our current score is based on:
        - The type of fusion (template/reduction/etc)
        - Estimate of the saved memory operations
        - Fusions closer together in original graph order
        T)return_is_mix_order_reductionr   r   )typingcasttupler<   r5   score_fusion_memoryrC   r   	min_order	max_orderis_templater   epilogue_fusion_firstr  r;   )r	  r  r  r?   is_mix_order_reductionr@   r=   
type_scores           r1   score_fusionzInductorChoices.score_fusion^  s    " 06{{#t)))uD * 0
,, %//12%//12
 
 N""$(D(DD % 1$N
 '')U-?-?-AAVlUVFV
 	
 	
r3   )rJ   )rQ   Optional[str]r4   r   )rQ   r+  r4   z,partial[Generator[TritonConfig, None, None]])rY   r<   rZ   ztorch.dtyperQ   r+  r4   z	list[Any]r-   )rf   z6dict[str, Generator[KernelTemplateChoice, None, None]]rg   r   rh   /list[Union[KernelTemplate, ExternKernelChoice]]ri   strrj   #Optional[dict[str, dict[str, Any]]]r4   list[KernelTemplateChoice])
rg   r   ro   z)Union[KernelTemplate, ExternKernelChoice]ri   r-  rj   zOptional[dict[str, Any]]r4   z+Generator[KernelTemplateChoice, None, None])r   r/  ri   r-  r4   r5   )
rg   r   rh   r,  ri   r-  rj   r.  r4   zlist[ChoiceCaller])
r   ztype[TritonKernel]r   r%   r   zlist[sympy.Expr]r   dict[str, Any]r4   r0  )r   r%   r4   r5   )r   r%   r   r5   r4   r5   )
r   ztorch.devicer   r<   r   r<   r   r5   r4   r<   )
r	  r   r  r   r  r   r  r<   r4   r5   )r	  r   r  r   r  r   r4   r+   )r6   r7   r8   r9   rR   rT   r\   r`   rb   rm   r}   r   rv   r   staticmethodr   r   r   r  r  r  r*  r.   r3   r1   rH   rH   U   s}   
 ,2)()	)& ,22(2	52 OUJJ$/J>KJ	J OUJJ$/J>KJ	J OUHH$/H>KH	H @DP $ C	
  = 
$J 59&
#&
 <&
 	&

 2&
 
5&
P5
45
 5
 
	5
x @D6R#6R C6R 	6R
 =6R 
6Rp& % !	
 & 
 
 
, :
$:
=A:
	:
 :
x SS!S S 	S
 
S Sj BB B !B 	B
 
B BH   ! 	
 
    ! 	
 
 * ,
,
 ,
 !,
 
	,
 ,
r3   rH   )H
__future__r   dataclassesr   r   r   r   r   r   rN   %torch._inductor.runtime.runtime_utilsr   torch._inductor.schedulerr	   torch.utils._sympy.value_rangesr
    r   	codecacher   rg   r   kernel_template_choicer   metricsr   r   runtime.hintsr   r   r	  r   r   r   select_algorithmr   template_heuristicsr   template_heuristics.tritonr   r   r   r   r   r   r   r   virtualizedr    collections.abcr!   	functoolsr"   r   r#   TritonConfigcodegen.commonr$   codegen.simd_kernel_featuresr%   codegen.tritonr&   irr'   r(   torch.utils._ordered_setr)   Protocolr+   	dataclassr;   rH   r.   r3   r1   <module>rJ     s    "   6 6   A 7 7  ! ' 6 > : > > 0 7  )  )!-.@, <36v 6 
 
 
8v
 v
r3   