
    i                   R   U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZmZmZmZmZm Z  d dl!m!Z! d dl	m"Z" d dl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 d dl1m2Z2m3Z3m4Z4 d dlm5Z5 d dl6Z6d dl7Z7d dl8m9c m:Z; d d	l<m=Z= d d
l>m?Z? d dl@mAZA d dlBmCZC d dlDmEZE d dl8mFZFmGZG e,rd dlHmIZI ddgZJd dlKmLZLmMZMmNZNmOZO e,r\d dlmPZPmQZQmRZR d dl7mSZSmTZTmUZU d dlVmWZW d dlXmYZY d dlZm[Z[ ddl\m]Z] ddl^m_Z_ ddl`maZa ddlbmcZc ddldmeZemfZfmgZgmhZhmiZimjZj ddlkmlZl ddlmmnZnmoZo g d Zp e/d!      Zqej                  d6d"       Zsd d#ltmuZu d d$lvmwZw d d%lxmyZy d d&lzm{Z{ d d'l|m}Z} d d(l~mZ d d)lmZmZmZmZmZ d d*lmZmZ d d+lmZmZ dd,lmZ dd-lmZ ej                  d.k(  Z ej$                  e      Ze7j*                  j-                  ed/      Z e/d0      Zee6j4                  e6j4                  f   Ze*e0e7j8                  ee7j                  f      Zd1d2d3Zd4Zd4Zd4Zd5Zd6Zeedz
  z  d k(  red7k\  sJ d8       d7d9Zd8d:Z G d; d<e6jN                        Z ejR                  d=>       G d? d@             Zd9d:dAZ	 	 	 d;	 	 	 	 	 	 	 	 	 d<dCZ	 	 	 d;	 	 	 	 	 	 	 	 	 d<dDZej                  d=dE       Zd>dFZd?dGZd@dHZdAdIZ	 	 	 	 	 	 dBdJZdCdKZ	 	 	 	 dDdLZdEdMZ	 	 	 	 dFdNZdGdOZdP f	 	 	 	 	 dHdQZ	 	 	 	 	 	 	 	 dIdSZdJdKdTZ	 	 dL	 	 	 	 	 	 	 	 	 dMdUZ	 	 	 	 	 dN	 	 	 	 	 	 	 	 	 	 	 	 	 dOdVZdPdWZdQdXZdRdYZdSdZZdTd[Z e3d\      Z e/d]d=^      Zee&e$ef   ef   Z G d_ d`e+e'eef         ZŐdUdaZƐdVdbZ	 	 	 	 dWdcZ	 	 	 	 dXddZ	 	 	 	 	 	 dYdeZ	 	 	 	 	 	 dZdfZ	 d[	 	 	 	 	 d\dgZ	 	 	 	 	 	 d]dhZ͐d^diZΐd_djZϐd`dkZАdadlZѐdbdmZҐdcdnZӐdddoZԐdedpZՐdfdqZ eg dr      Z	 	 	 	 dgdsZِdhdtZڐdiduZd dlZܐdjdvZg ZdRedw<   dkdxZdjdyZej                  	 	 	 dl	 	 	 	 	 	 	 dmdz       ZeZeZeZdBd{dnd|ZdBd{	 	 	 	 	 	 	 dod}Z ej                  d7      dpd~       Z G d de)      ZejR                   G d d             Z G d d      Z G d de      Zej                  dqd       Z G d d      Z G d de      Z G d de      Zej                  drdsd       Zej                  dtd       Zej                  d=d       ZdtdZ	 d[	 	 	 	 	 	 	 dudZ	 	 	 	 	 	 dvdZdwdZdwdZdBdBd=d	 	 	 	 	 	 	 	 	 dxdZddBd	 	 	 	 	 	 	 dydZdBd	 	 	 	 	 	 	 dzdZdBd	 	 	 	 	 	 	 dzdZ ej                  d      d=d       Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d{dZ d|dZd}dZe0ee6j4                  f   Zded<   ej                  	 d~	 	 	 	 	 	 	 	 	 dd       Zej                  dd       Zej                  dd       Zej                  dd       Zej                  dd       ZddZ	d|dZ
d|dZddZddZ	 	 	 	 	 	 	 	 ddZ	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZd=dZ G d d      Z	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 ddZddZddZddZ	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 ddZej                  	 	 	 	 	 	 dd       Z	 d[	 	 	 	 	 ddZddZddZddZddZddZddZ ej                  dd       Z!dtdÄZ"ej                  dtdĄ       Z#ej                  ddń       Z$ej                  dtdƄ       Z%dtdǄZ&dtdȄZ'ddɄZ(ddʄZ)d=d˄Z*d=d̄Z+dd̈́Z,dfd΄Z- G dτ dej\                        Z/	 	 	 	 	 	 	 	 	 	 ddфZ0dd҄Z1	 	 	 	 ddӄZ2	 d[	 	 	 	 	 ddԄZ3ddՄZ4	 d[	 	 	 	 	 ddքZ5ddׄZ6	 	 	 	 	 	 dd؄Z7	 	 	 	 	 	 	 	 ddلZ8dڄ f	 	 	 	 	 	 	 	 	 	 	 ddۄZ9d܄ f	 	 	 	 	 	 	 	 	 	 	 dd݄Z:ddބZ;dd߄Z<ejR                   G d d             Z=ej                  dd       Z>ddZ?ddZ@ddZAddZB	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZCddZDddZEddZFddZG	 	 	 	 	 	 	 	 ddZHddZI	 	 	 	 	 	 	 	 ddZJddZK	 d[	 	 	 	 	 	 	 ddZL	 	 	 	 	 	 ddZMddZN	 	 	 	 	 	 ddZOd=dZPddZQddddddddZReRj                         D  ci c]  \  } }|| 
 c}} ZT ej                  d      ZVddZWddZXdd ZYddZZej                  dd       Z[ejR                   G d d             Z\i Z]deߐd<   	 	 	 	 	 	 	 	 ddZ^ eE       Z_deߐd	<   dd
Z`d[ddZaddZb e/d      Zc e/d      Zd G d deecedf         Ze e2d=      d[d=d>dd       ZfddZg G d dej\                        Zhej                  dd       Zid=dZjddZkddZlddZmd6dZndÐdZod=dZpdĐdZqdZrdŐd ZsdŐd!ZtdƐd"Zu	 	 d	 	 	 	 	 	 	 	 	 dȐd#Zvdɐd$Zwdʐd%Zxd=d&Zydːd'Zz	 	 d	 	 	 	 	 	 	 d͐d(Z{dΐd)Z| ejR                  d=>       G d* d+             Z}ed,e$f   Z~ee~e}ge~f   Z G d- d.      Z e       Zdϐd/ZdАd0Zdѐd1ZdҐd2ZdӐd3Z eEg d4      Zd}d5Zyc c}} w (      )annotationsN)Callable
Collection	GeneratorIteratorMappingMutableMapping
MutableSet)datetime)StringIO)AnycastConcatenateGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKING	TypeAlias	TypeGuardTypeVarUnion)dataclass_transform	ParamSpecSelf)mock)datasheet_tops)DeviceProperties)_needs_inductor_compile)dtype_abbrs)
OrderedSet)tree_flattentree_map_only)Path!activation_quantization_aten_passinductor_autotune_lookup_table)free_symbolsfree_unbacked_symbolsIterateExprsShapeEnv)IterableSequence
ValuesView)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)Node   )WorkspaceArgPythonWrapperCodegen)DepGraphLowering)BufferExternKernelIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpumtiaTc                     t         D  cg c]#  } t        t        |       j                         s"| % }} t	        |      dk  sJ t	        |      dk(  rd}|S |j                         }|S c c} w )Nr5   r   rF   )	GPU_TYPESgetattrtorchis_availablelenpop)x
avail_gpusgpu_types      O/var/www/html/engine/venv/lib/python3.12/site-packages/torch/_inductor/utils.pyget_gpu_typerV   j   sg    &K'%*;*H*H*J!KJKz?aZA-vHO 4>>>3CHO Ls
   #A'A')get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesconfig)ceildivwin32
perf_hints_Tz.cubinz.spv)rF   rH         @      zmust be power of 2c                *    | t         z   dz
  t          z  S )z/Round up to the nearest multiple of ALIGN_BYTESr5   )ALIGN_BYTES)nbytess    rU   _alignrs      s    [ 1$44    c                   t        | t        j                  t        j                  f      r#t	        t        t        | j                              S t        | t              xs! t        j                  | t              t        k(  S )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdrq   )vs    rU   r|   r|      sQ    !eii+,3{AFF+,,aK599Q#<#KKrt   c                  *    e Zd ZdZdZdZedd       Zy)r~   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr5   Tc                    t        |t        t        j                  f      rt	        t        |            S t        |      r|S y N)rv   intrw   Integerrs   r|   )clsvalues     rU   evalz
align.eval   s6    ec5==12#e*%%uL rt   N)r   
sympy.ExprreturnzOptional[sympy.Expr])__name__
__module____qualname____doc__nargs
is_integerclassmethodr    rt   rU   r~   r~      s!    FEJ rt   r~   T)frozenc                  :    e Zd ZU dZded<   ded<   ded<   ded<   y	)
GraphPartitionMapzP
    Mapping from the partition info (e.g., input/output) to the graph info
    r   idzlist[Optional[int]]input_index_mappingoutput_index_mapping	list[str]constant_namesNr   r   r   r   __annotations__r   rt   rU   r   r      s$    
 	G -,-- rt   r   c           
         |         t         j                  j                          t        j                  t	        d      t         j
                  d      }t         j                  j                  d      }t         j                  j                  d      }|j                          t        d      D ]  }|j                           |          |j                          t         j                  j                          |j                  |      dz  }t        dt	        ||z              }t        dt	        ||z              }	t        |      D ]	  } |          t        |	      D cg c]"  }t         j                  j                  d      $ }}t        |	      D cg c]"  }t         j                  j                  d      $ }}t         j                  j                  t         j                  j                  j                  g      5 }
t         j                  j                          t        |	      D ]q  }|j                          ||   j                          t         j                  j                   j                  d	      5   |         d
d
d
       ||   j                          s t         j                  j                          t        j"                  t%        ||      D cg c]  \  }}|j                  |       c}}      }d
d
d
       t        j&                        j)                         }t*        j-                  d       t*        j-                  
j/                         j1                  dd             t3        |
j5                         D cg c]A  }|j6                  t8        j                  k(  r"t;        j<                  d|j>                        |C c}      }|r"|tA        j&                  d |D              dz  z  }t*        j-                  d|       |S c c}w c c}w # 1 sw Y   xY wc c}}w # 1 sw Y   3xY wc c}w )R  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
        ArF   dtypedeviceTenable_timing   r5   
activitiesRunCudaModuleN
raw eventsself_device_time_totalsort_by	row_limitzfused_abs_max_\dc              3  4   K   | ]  }|j                     y wr   device_time_total.0events     rU   	<genexpr>zfp8_bench.<locals>.<genexpr>
  s     QE33Q        @@profiling results: %s ms)!rN   rF   synchronizeemptyr   float16Eventrecordrangezero_elapsed_timemaxprofilerprofileProfilerActivityCUDAnvtxtensorzipmeanitemlogdebugkey_averagestablerZ   eventsdevice_typerY   rematchname
statistics)fnwarmuprepcachestart_event	end_event_estimate_msn_warmupn_repeatpisetimesresr   filtered_eventss                     rU   	fp8_benchr      sT    D	JJKKJu}}VLE **"""6K

  t 4I1X 
 	JJ**959K 1c&;./0H1c#+,-H 8_ 
 BGxQA5::##$#7QKQ?DXO!!!!5OIO			NN++00
 
  
 
 


 x 	"AKKMN!!#&&7 aL!	" 	

 +.{I+FG41aQ^^AG

" **U

 
 
"CIIlIIann$$-EQS$TU 	
!!Z__4HH0%**=I	 	
	O OOQQQ	

 II(#.JO RO 
 H
 
*	
sE   "'P'PA9P2=PAP2P,9P2AP?P)$P22P<Fc                8    ddl m}   |t              | |||      S )Nr   )may_distort_benchmarking_result)$torch._inductor.runtime.benchmarkingr   _do_bench_using_profiling)r   r   r   is_vetted_benchmarkingr   s        rU   do_bench_using_profilingr     s(    " UE*+DE
FC/ rt   c                j   |sddl m}  |         |         t        j                  j	                          t        j
                  t        d      t        j                  d      }t        j                  j                  d      }t        j                  j                  d      }|j                          t        d      D ]  }|j                           |          |j                          t        j                  j	                          |j                  |      dz  }	t        d	t        ||	z              }
t        d	t        ||	z              }t        |
      D ]	  } |          t        j                  j	                          t        j                  j                  t        j                  j                  j                   g
      5 }t        |      D ]  }|j                           |          t        j                  j	                          ddd       t"        j%                  d       t"        j%                  j'                         j)                  dd             t+        |j-                         D cg c]0  }|j.                  t0        j                   k(  r|j2                  dk7  r|2 c}      }t5        |      |z  dk7  rt7        dt5        |      |      t5        |      |z  }t+        t9        |      D cg c]  \  }}||z  dk7  r| c}}      }|j;                          |j'                         }t"        j%                  d       t"        j%                  |j)                  d             t=        d |D              dz  |z  }t"        j%                  d|       |S # 1 sw Y   xY wc c}w c c}}w )r   r   )may_ban_benchmarkingr   rF   r   Tr   r   r5   r   Nr   r   r   r   zContext SynczYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %szprofiling time breakdown)r   c              3  4   K   | ]  }|j                     y wr   r   r   s     rU   r   z,_do_bench_using_profiling.<locals>.<genexpr>  s     A%e%%Ar   r   r   )r   r   rN   rF   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rZ   r   r   rY   r   rP   RuntimeError	enumerate_build_treesum)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   num_event_per_groupr   actual_eventsr   s                      rU   r   r   *  s    "MD	JJKKJuyyHE **"""6K

  t 4I1X 
 	JJ**959K 1c&;./0H1c#+,-H 8_ 
 
JJ			NN++00
 
  
 ! 
x 	AKKMD		 	

 ! IIlIIann$$-EQS$TU 	
  JOO3

n8T 	
O ?h&!+- 	
 	
 o.9 &o6	
5&&!+ 	
M !..0MII()IIm!!B!/0
A=A
AF
JX
UCII(#.J_! !$	
	
s   AN35N*7N/
N'c                    	 ddl m}  t        j                  j	                  dd       | d uxr% t        t        t        j                  dd       d      S # t        $ r Y yt        $ r}dt        |      v sJ Y d }~yd }~ww xY w)	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr   Fztorchvision::nms does not exist)torchvision.opsr   rN   _C%_dispatch_has_kernel_for_dispatch_keyhasattrrM   opsImportErrorr   str)r   r   s     rU   has_torchvision_roi_alignr    s|    -667I6R$ 
EII}d3[*
 	
   0CF:::s   AA 	A?A?&A::A?c                b   | t        j                  d      j                  S t        | t              rt        j                  |       } | j
                  dvrZ| j                  Nt        | j
                        }t        j                  | j
                  |j                  j                               S | S )Ng        )cpumeta)index)
rN   r   r   rv   r  typer  rW   Workercurrent_devicer   device_interfaces     rU   decode_devicer    s    ~||C '''&#f%{{/)fll.B3FKK@||FKK/?/F/F/U/U/WXXMrt   c                |    t        j                  t        j                  | t        j
                  j                        S r   )	functoolsreduceoperatormulrw   SOne)its    rU   sympy_productr    s#    HLL"eggkk::rt   c           	         t        |       t        |      k(  sJ t        j                  t        d t	        | |      D                    S )Nc              3  ,   K   | ]  \  }}||z    y wr   r   )r   abs      rU   r   zsympy_dot.<locals>.<genexpr>  s     >daAE>s   )rP   rw   expandr   r   )seq1seq2s     rU   	sympy_dotr    s8    t9D	!!!<<>c$o>>??rt   c                \    | D ci c]  }t        |      | c}j                         S c c}w r   )r   values)r  rR   s     rU   uniquer     s'     !BqE1H!((**!s   )c           
     n   t        | t        j                        st        |t        j                        r2t        t        j                  |       t        j                  |            S t        | t
              rt        |t
              s$J |  dt        |        d| dt        |              t        | |      S )Nz: , )rv   rw   Exprr]   sympifyr   r  runtime_ceildiv)numberdenoms     rU   rh   rh     s     &%**%E5::)Fu}}V,emmE.BCC fc"z%'= ("T&\N"UG2d5k];= 65))rt   c                f   | yt        |       j                  d      d   }i dddddd	d
ddddddd	dddddddddddddddddd d!d"dd#d$d%d&}|j                  t        |j	                               D ci c]  }|| c}       t        | t               r| S d'||    S c c}w )(Nz*i8.r   booli1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float8_e8m0fnuu8float4_e2m1fn_x2r   fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64u16u32u64)uint8uint16uint32uint64*)r  splitupdatelistr  rv   )key	dtype_strtysr   s       rU   _type_ofrV    sR   
 {Cs#B'Ii 	G 	z	
 	 	 	w 	$ 	D 	6 	F 	6 	6  	!" 	#$ 	%& 	'( /C4 JJd3::<01112S#&3@aI/?,@@ 2s   
B.c                R    | D cg c]  }t        j                  |       c}S c c}w )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    )rw   r$  lstr   s     rU   convert_shape_to_inductorrZ    s!     '**EMM!***s   $c                    ddl m} t        | t              r| S t        | t        j
                        rt        |       S |j                  j                  j                  j                  | d      S )zL
    Like convert_shape_to_symint, but operates on a single expression.
    r5   VN)hint)
virtualizedr]  rv   r   rw   r   graphsizevars	shape_envcreate_symintnode)r   r]  s     rU   convert_to_symintrd    se      a 	

 !U]]+ F	 !!++==ad=Krt   c                >    | D cg c]  }t        |       c}S c c}w )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    )rd  rX  s     rU   convert_shape_to_symintrf     s     +..Qa ...s   c                N    t        d | j                  j                  D              S )z-
    Does this op overload have aliasing
    c              3  8   K   | ]  }|j                   d u  y wr   )
alias_infor   r  s     rU   r   zis_view.<locals>.<genexpr>  s     FAq||4'Fs   )any_schema	argumentsops    rU   is_viewrp  
  s     F1E1EFFFrt   c                     yNFr   )r   s    rU   <lambda>rs        rt   c                   | j                   dk7  ryt        | j                  t        j                  j
                        s| j                  t        j                  u syt        t        j                  j
                  | j                        }|t        j                  u st        |      rt        fd| j                  D              S t        j                  j                  |j                  v xs  |      S )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc              3  6   K   | ]  }t        |        y wr   )is_pointwise_use)r   uis_pointwise_fns     rU   r   z#is_pointwise_use.<locals>.<genexpr>$  s     KA#A7Ks   )ro  rv   targetrN   _ops
OpOverloadr  getitemr   rp  rz   usersTag	pointwisetags)userz  r{  s    ` rU   rx  rx    s     vv 3::uzz445xGWGW9W%**''4F!!!WV_KKKK99&++-H1HHrt   	list[Any]c           	        t         j                  j                         g dfd} j                  | gt	        t         j
                  |||f       }t        | j                  j                        dk(  r2t        | j                  j                  d   j                        dk(  r|f}j                  |       t         j                  j                  i       }|fS )Nc                `    j                  |        j                  dt                     S )Narg)appendplaceholderrP   )r  g
graph_argss    rU   add_tensor_argz)gen_gm_and_inputs.<locals>.add_tensor_arg/  s,    #}}s3z?"3455rt   r5   r   Tensor)r  torch.Tensorr   r4   )rN   fxGraphrv  r$   r  rP   rl  returnsr  r  outputr3   )r{  r}   kwargsr  nodegmr  r  s         @@rU   gen_gm_and_inputsr  )  s     	A%'J6 1??u||^dF^LD 	FNN""#q(&&q)../8;wHHTN			b!	$Bz>rt   c                h    | dk(  ry t        |       }|j                         r|j                          y y Nr  )rW   rO   r   r
  s     rU   r   r   A  s4    /7$$&$$& 'rt   c                    t        |       t        j                  d       t        j                         }t        |      D ]  } | | }t        |        t        j                         }J ||z
  S )Ni9  )r   rN   manual_seedtimeperf_counterr   )modelexample_inputsr   r   t0r   resultt1s           rU   timedr  I  sr     	d				B5\ 'F 
			B7Nrt   c                    t        j                  t        |      D cg c]  }t        | |||       c}      }t        j                  |      |z  }t        ||z  d       |j                         S c c}w )Nz.6f)rN   r   r   r  medianprintr   )	r  r  r   repeatbaseliner   r   timingstooks	            rU   print_performancer  [  sg     ll>CFmLuneV	4LG << 5(D	TH_S!#99;	 	Ms   A1c                H     t        | |             t        | |fd       y)zKReplace obj.method() with a new method that returns a precomputed constant.c                      S r   r   )r  s   rU   rs  z#precompute_method.<locals>.<lambda>n  s     rt   N)rM   setattr)objmethodr  s     @rU   precompute_methodr  k  s     !WS&!#FC(rt   c                *    |D ]  }t        | |        y)zFReplace methods with new methods that returns a precomputed constants.N)r  )r  methodsr  s      rU   precompute_methodsr  q  s     '#v&'rt   c                <    t        | |kD        t        | |k        z
  S r   )r   )r  r  s     rU   cmpr  w  s    q1u:AE
""rt   c                ~    t        | t              r| g|z  S t        |       dk(  r t        |       | d   g      |z  S | S )Nr5   r   )rv   r   rP   r  )rR   sizes     rU   pad_listliker  {  sC    !SsTz
1v{tAw!v%%Hrt   c                D    t        |       dk(  rg S dd}t        | |      S )Nr   c                n    t        | t              r| S ddlm} t        | |      sJ | j	                         S )Nr5   )rD   )rv   r  	schedulerrD   get_name)elemrD   s     rU   	sort_funcztuple_sorted.<locals>.sort_func  s1    dC K0$ 1222}}rt   rS  )r  rk   r   r  )rP   sorted)rR   r  s     rU   tuple_sortedr    s&    
1v{	 !##rt   PRV)	covariantc                  &    e Zd Zedd       ZddZy)CachedMethodc                     y r   r   )r   s    rU   clear_cachezCachedMethod.clear_cache  s    ),rt   c                     y r   r   selfr}   r  s      rU   __call__zCachedMethod.__call__  rt  rt   N)r   r   r   None)r}   P.argsr  P.kwargsr   r  )r   r   r   staticmethodr  r  r   rt   rU   r  r    s    , ,Drt   r  c           	         | j                   }d| dd| i}t        d| d d dj                         |        t        j                  |       || d         }d
fd	}||_        |S )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        _cache_on_selfc                8    t        |       rt        |        y y r   r   delattrr  rS  s    rU   r  z"cache_on_self.<locals>.clear_cache  s    4D# rt   r  r   r   r  r   execlstripr  wrapsr  )r   r   ctxwrapperr  rS  s        @rU   cache_on_selfr    s    ;;DtfF
C *CF  E "' (+e ,			 FH "ioob!#n&=">?G &GNrt   c                    t        |       S )z]
    Variant of cache_on_self for properties. The only difference is the type signature.
    )r  )r   s    rU   cache_property_on_selfr    s    
 rt   c                     	 	 	 	 d fd}|S )Nc           	         d d| j                    dd| i}t        d d d dj                         |        t        j                  |       |d	         }dfd
}||_        |S )Nr  r   r  r   z            def inner(self: Any, *args: P.args, **kwargs: P.kwargs) -> RV:
                args_kwargs = (args, tuple(sorted(kwargs.items())))

                if not hasattr(self, "z2"):
                    object.__setattr__(self, "z%", {})

                cache = self.z

                try:
                    return cache[args_kwargs]
                except KeyError:
                    pass

                rv = fn(self, *args, **kwargs)

                cache[args_kwargs] = rv
                return rv
            innerc                8    t        |       rt        |        y y r   r  r  s    rU   r  z<cache_on_self_and_args.<locals>.wrapper.<locals>.clear_cache  s    tS!c" "rt   r  r  )r   r  r  r  rS  
class_names       @rU   r  z'cache_on_self_and_args.<locals>.wrapper  s     :,a}F3 Rj' (+e ,//2e 4!U #$ )	
, $	#CL1	# (rt   )r   FN_TYPE[P, RV]r   r  r   )r  r  s   ` rU   cache_on_self_and_argsr    s     
$$	$L Nrt   c           
     ^   ddl m} t        | t              rgt	        j
                  t        j                  | D cg c]0  }t        |d      r"|j                  r|j                  j                  2 c}t                     S t        | |j                        r| j                  S t               S c c}w )Nr5   irr  ) r  rv   rR  r  r  r  or_r   r  originsr"   r=   )node_scheduler  r  s      rU   aggregate_originsr    s     -&LL * 4(TYY 		!! L	
 		
 
M2??	3$$$|s   5B*
c                   t        |       }|dk(  rYd }|D cg c]6  }|j                  dk(  r%d|j                  v r|j                  d    ||      8 }}t        t	        |            }n|dk(  rg }|D ]  }|j                  dk(  sd }d}d|j                  v r|j                  d   d   }n"d|j                  v r|j                  d   d   }d	}|s]t        |d
   t              r|j                  |d
   |z          |j                  |d
   j                  |z           t        t	        |            }n5|dk(  r*|D cg c]  }|j                  dk(  s|j                    }}nt        dj                  dg|z         S c c}w c c}w )Noriginal_atenc                   | j                   d   }d}t        |t        j                  j                        r|j
                  j                  }|S t        |t        j                  j                        rt        |j                               }|S )Nr  r  )
r  rv   rN   r|  r}  _overloadpacketr   HigherOrderOperatorr  r   )originr  rS  s      rU   get_origin_meta_strz2get_fused_kernel_name.<locals>.get_origin_meta_str  so    "KK8MC-)>)>?#33<< J M5::+I+IJ-,,./Jrt   rv  rN   r  source_fn_stackr   fwd_source_fn_stackbackwardr5   inductor_noder   fused)r  ro  r  r  r"   rv   r  r  r   r   NotImplementedErrorjoin)r  descriptive_namesall_originsr  r  sources	source_fnsuffixs           rU   get_fused_kernel_namer  
  s    $M2KO+	 &
yyO+6;;.O,8	  '
 
 G,-	g	%! 	CFyyO+ 	$3 &,= >r BI*fkk9 &,A B2 FI'F ilC0NN9Q<&#89NN9Q<#8#86#AB	C" G,-	o	-&1
"VYY/5QFKK
 
 "!88WI'((G
<
s   ;E=<FFc                   ! t        |       }|D cg c]  }|j                  dk(  s| }}t        j                  t              }t        j                  t              }d |rt        d |D              }t        |      dk(  r_|d   j                   t         d      s/t         j                        D 	ci c]  \  }}	|	|
 }
}}	|
 _        |j                   fd       |D ]Z  }d	|j                  v r|j                  d	   |j                  d	   }d}t        |t        j                   j"                        rt%        |j&                        }n=t        |t        j                   j(                        rt%        |j+                               }|r||   j-                  |j*                         d
|j                  v r<|j                  d
   d   j*                  }||   j-                  |j*                         |j                  j/                  d      dk(  s3||j*                     j-                  |j*                         ]  dnd}|j0                   d| ddj3                  |j5                                ddj3                  |j5                                d}|j0                   dg}t7        |j9                               D ]@  \  }}|j-                  |j0                   d| ddj3                  t7        |                    B  dddlm |j-                  |j0                   d       t               }g }t        | j>                        sddl m!} 	 	 	 	 	 	 d)fd}d*d!d+!fd}| D ]  }	t        |	d      r|	jD                  t        |	jD                  d      r|	jD                  jF                  |	jD                  jF                  D ]  }|j*                  |v r|jI                  |j*                         |j                  jK                  |j*                        }|U |||j*                        \  }}|j-                  |j0                   d| d  ||       d!| d        t        |	jD                  d"      s|	jD                  jL                  )|	jD                  jL                  D ]T  }|j                  jK                  |j*                        }|+ |||j*                        \  }}|j-                  d#|z          V  |D ]2  }|j-                  |j0                   d|jO                  d$%              4 |j-                  |j0                   d&d'j3                  |              |d(j3                  |      fS c c}w c c}	}w ),aH  
    Retrieves metadata information for a kernel.
    Args:
        node_schedule (Union[Sequence[BaseSchedulerNode], ExternKernel]):
            Either a sequence of BaseSchedulerNode objects or an ExternKernel instance.
        wrapper (PythonWrapperCodegen):
            An instance of PythonWrapperCodegen, used to define the code comment format.
    Returns:
        tuple[str, str]:
            A tuple containing two strings:
                - The first string represents the kernel's metadata.
                - The second string represent the kernel's detailed metadata.
    rv  Nc              3  4   K   | ]  }|j                     y wr   )r`  )r   ns     rU   r   z&get_kernel_metadata.<locals>.<genexpr>^  s     "Cq177"Cr   r5   r   )_inductor_kernel_metadata_node_to_idx_mapc                "    j                   |    S r   )r	  )r  single_graphs    rU   rs  z%get_kernel_metadata.<locals>.<lambda>f  s    lTTUVW rt   r  r  	from_nodepartitioner_tagis_backwardzTopologically SortedUnsorted z Source Nodes: [r"  z], Original ATen: []z" Source node to ATen node mapping:z   z => r  z Graph fragment:r\  c                >   t        | j                        rAt        | j                  j                        r!| j                  j                  j                  }n| j                  }||}n|j
                  }	 | j                         }||fS # t        $ r d }Y ||fS w xY wr   )rv   	TensorBoxdata
StorageBoxorigin_noder   
get_layoutr  )bufferrw_namer  r   layoutr  s        rU   get_buffer_infoz,get_kernel_metadata.<locals>.get_buffer_info  s     fbll3
KK9 #)++"2"2">">K"("4"4K&"D&++D"#..0F V|# + "!FV|#"s   7B BBc           	     d    ddj                  | D cg c]  }t        |       c}       dS c c}w )N[r"  r  )r  r  )shaperR   s     rU   stringify_shapez,get_kernel_metadata.<locals>.stringify_shape  s-    499e%<c!f%<=>a@@%<s   -
c                    | y | j                          } | j                         }| j                   }dt        | j                      | | | dS )Nr  ")r  strider   r!   r   )r  shape_annotationstride_annotationdevice_annotationr  s       rU   stringfy_layoutz,get_kernel_metadata.<locals>.stringfy_layout  sl    >&5fkk&B%C '6v}}'E&F!'-}}o! FLL123C2D()*;)<A?rt   read_writesreadsz   %z
 : Tensor z = PlaceHolder[target=writes%T)include_tensor_metadataz
   return ,
)r  z2Union[ir.TensorBox, ir.Buffer, ir.TorchBindObject]r  r  r   ztuple[str, ir.Layout | None])r  zIterable[int]r   r  )r  zir.Layout | Noner   r  )(r  ro  collectionsdefaultdictrR  r"   rP   r`  r   r   nodesr	  sortr  rv   rN   r|  r}  r  r  r  r   r  getcommentr  keysr  itemsr  r  r=   r_  r]  r'  r(  addtry_get_bufferr)  format_node)"r  r  r  r  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsidxr  node_to_idx_mapr  r  rS  sort_strmetadatadetailed_metadataoriginal_noder0  	all_reads
all_writesr]  r  r&  rr  
input_namer  woutput_namer   r  r  r  s"                                  @@@rU   get_kernel_metadatarI  A  s   $ $M2K+6W&)):VfWNW ,,T2N$006
 L""CN"CC}")!,22L<)TU8A,BTBT8U"Vfc11c6"V"VIXFW     8dii'DIIo,F,R IIo6MC-)>)>?-778M5::+I+IJ-,,./"3'..tyy9$))#))K(+00C3&&tyy1YY]],->499%,,TYY78  *6)A%zH??
1XJ&6tyyATATAV7W6X Y99%7%<%<%>?@	C  $OO,,NOP &~';';'= > 
u  s=/diiu6N5OP	

   GOO#44D!EF%/\	 "
-9&$J$UX$-$(A
 # =q-0AMM4I1=='2q}}7J7J7V]]00 66Y.$!aff-!"!7!7!?!>$-<VQVV-L*
F)00&/tJ<z.v677Mj\YZ\ AMM84,,8]]11 =!"!7!7!?!>$)8)HQ"))#*;<=-=< # 	D$$??#3t'7'7PT'7'U&VW	
 	  GOO#4Jsxx
?S>T!UVTYY0111I X #Ws   VV7Vc                    t        |       } t        |       }| rV| j                         }|j                  D ]4  }|r	 ||      r||vs|j	                  |       | j                  |       6 | rV|S )zJReturns the set of nodes whose values depend on those within initial_queue)rR  r"   rQ   r  r6  r  )initial_queueskip_filterdominated_setr  users        rU   dominated_nodesrO    sz    
 'M}-M
  "JJ 	+D{40=(!!$'$$T*	+  rt   c                4  	 ddl m d	fd	t        |      \  }}|D cg c]  } 	|      s|j                   }}t        |       \  }}|D cg c]  } 	|      s|j                   }}t	        t        j                  g ||       S c c}w c c}w )Nr5   r  c                F   t        | j                        r | j                        S t        | j                        r | j                        S t        | j                        xr9 t        | j
                  j                  j                  j                  f       S r   )	rv   r  r  r  r>   ComputedBufferInputsKernelInputBufferTemplateBuffer)r  r  is_unrealized_nodes    rU   rV  z*gather_origins.<locals>.is_unrealized_node  s    a&%aff--a'%aff--!RYY' 

!!!!	1
 -
 	
rt   )r  r>   r   r*  )r  r  r#   r  r"   	itertoolschain)
r}   r  kwargs_flattenr   valkwargs_originsargs_flattenargs_originsr  rV  s
           @@rU   gather_originsr^    s     
" %V,NA-;Wc?QRU?VckkWNW"4(OL!+7SC;Mc;RCKKSLSiooE|EnEFF XSs   BBB Bc                J    dddfddfddfd |       S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    c                    t        | t        j                        xr, t        | j                        dk(  xr | j                  d   dk(  S )N   r   r   )rv   rw   MulrP   r}   )exprs    rU   is_neg_leadzsympy_str.<locals>.is_neg_lead  s:    tUYY'VC		Na,?VDIIaLTVDV	
rt   c                `   t        | t        j                        rt        | j                        dk(  rO | j                  d         r: | j                  d          d | j                  d   j                  d          S dj                  t        | j                              S  |       S )Nra  r5   r   z - z + )rv   rw   rx   rP   r}   r  r{   )rc  rd  sympy_str_muls    rU   sympy_str_addz sympy_str.<locals>.sympy_str_add  s    dEII& 499~"{499Q<'@'		!56c-		RSHYHYZ[H\:]9^__zz#mTYY"?@@ &&rt   c                    t        | t        j                        rE |       rd | j                  d          S dj	                  t        | j                              S  |       S )N-r5   z * )rv   rw   rb  r}   r  r{   )rc  rd  sympy_str_atoms    rU   rf  z sympy_str.<locals>.sympy_str_mul#  s[    dEII&4  >$))A,7899zz#ndii"@AA!$''rt   c                   t        | t        j                        r| j                  S t        | t        j                  t        j
                  f      rd |        dS t        | t        t        t        t        f      rC| j                  j                   ddj                  t        t        | j                               dS t!        |       S )N()r"  )rv   rw   Symbolr   rx   rb  ra   r^   r_   r`   funcr   r  r{   	sympy_strr}   r  )rc  rg  s    rU   rj  z!sympy_str.<locals>.sympy_str_atom.  s    dELL)99uyy%))45}T*+1--(HMNii(()499SDII5N+O*PPQRRt9rt   )rc  r   r   r*  rc  r   r   r  r   )rc  rd  rg  rj  rf  s    @@@@rU   rp  rp    s$    

	'	( rt   c                    ddl m} t        j                  r3t	        |j
                  dd       x}r|j                  dk7  rt        |       S t        j                         S )Nr5   r\  current_node
index_expr)
r_  r]  rg   compute_all_boundsrM   interpreterr{  rd   re   unknown)r  r]  fx_nodes      rU   get_bounds_index_exprry  ;  sN     	!!~tDDWDNNl*5!!""$$rt   c                    | d   dk(  S )Nr   rE  r   )prefixs    rU   prefix_is_reductionr|  I  s    !9rt   c                J    | t         j                  k7  sJ t        | |dd      S )9
    Used to generate an integer-nonnegative symbol.
    Tintegernonnegative)rc   SIZErb   )r{  r=  s     rU   sympy_index_symbol_with_prefixr  M  s)     TYY vsDdCCrt   c                N    | xs t         j                  xr t         j                  S r   )rg   debug_index_assertsassert_indirect_indexing)checks    rU   generate_assertr  Y  s    /V//TV5T5TTrt   c                F    | d   dk7  sJ t        j                  | dd      S )r~  r   r   Tr  )rw   rn  r   s    rU   sympy_index_symbolr  ]  s)     7c>> <<d==rt   c                    	 	 	 	 	 	 dd}t        j                  |       j                  |j                         D ci c]  \  }}| |||       c}}      S c c}}w )z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    c                    t        | t        j                        sJ t        |t              r,t        j                  || j
                  | j                        S |S )Nr  )rv   rw   r#  r  rn  r   is_nonnegative)replacedreplacements     rU   	to_symbolzsympy_subs.<locals>.to_symbolo  sP     (EJJ///k3'<< ++$33  rt   )r  r   r  zUnion[sympy.Expr, str]r   sympy.Symbol)rw   r$  xreplacer5  )rc  replacementsr  kr   s        rU   
sympy_subsr  i  sf    +A	 ==''(4(:(:(<=1IaO	= =s   A
c                    t        | t        j                        xs^ t        | t        j                        xrB t	        d t        j                  | j                         | j                               D              S )Nc              3  2   K   | ]  }t        |        y wr   is_symbolicr   rR   s     rU   r   zis_symbolic.<locals>.<genexpr>  s     N1AN   )	rv   rN   r1   r  rk  rW  rX  r  r"  )r  s    rU   r  r    sS    a& 1ell# 	ON	!((*(MNNrt   c                 &    t        d | D              S )Nc              3  2   K   | ]  }t        |        y wr   r  rj  s     rU   r   z"any_is_symbolic.<locals>.<genexpr>  s     ,!{1~,r  rk  )r}   s    rU   any_is_symbolicr    s    ,t,,,rt   )z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultrun_and_save_rng_staterun_with_rng_statezaten._local_scalar_densezaten._assert_scalarc                    ddl m} | j                  j                  D ];  }t	        |      r|c S |j
                  j                  d      x}0 ||      s9|c S  y )Nr   )r)   rZ  )%torch.fx.experimental.symbolic_shapesr)   r`  r0  is_cudagraph_unsafe_fx_noder  r2  )r  r)   r  rZ  s       rU   %get_first_incompatible_cudagraph_noder    sV     L &t,K99==''C49Ns9SK rt   c                    t        t        t        | j                  j                                    }|j
                  dk(  sJ |S )z$Get the output node from an FX graphr  )nextiterreversedr`  r0  ro  )r  	last_nodes     rU   output_noder    s6    T(288>>234I<<8###rt   c                    | j                   j                  d      }t        d |D              }t        |       j                  d   }t        |t              r|n|f}t        d |D              }||z  S )Nr  rn  c              3     K   | ]P  }t        |j                  j                  d       t        j                        r|j                  d    j
                   R ywrZ  N)rv   r  r2  rN   r  r   )r   r  s     rU   r   z"get_all_devices.<locals>.<genexpr>  sB      9diimmE*ELL9 			%9s   AAr   c              3     K   | ]t  }t        |t        j                  j                        rNt        |j                  j                  d       t        j                        r|j                  d    j                   v ywr  )rv   rN   r  r4   r  r2  r  r   )r   r  s     rU   r   z"get_all_devices.<locals>.<genexpr>  sS      7c588==)sxx||E*ELL9 	7s   A:A<)r`  
find_nodesr"   r  r}   rv   tuple)r  placeholder_nodesinput_devicesout_argout_argsout_devicess         rU   get_all_devicesr    s}    ++}+=.8 9%9 /M "o""1%G$We4w7*H,6 77 -K ;&&rt   c                 |   t        t        j                  j                               D ]  } | j	                  d      st        j                  |    }|j
                  D ]  }|j	                  d      st        ||      }t        |t        j                  j                  j                  j                        sZ|j                  D ]i  }t        |t        j                  j                  j                  j                        s<|j                  j                   j"                  j%                          k  t        j                  | =  dt        j                  v rRt        j                  d   }t'        |j(                  j*                  j,                        `|j(                  j*                  `t1        j2                          y )Nz&torch._inductor.runtime.compile_tasks.triton_ztriton.runtime.driver)rR  sysmodulesr4  
startswith__dict__rM   rv   rN   	_inductorruntimetriton_heuristicsCachingAutotunercompile_resultsTritonCompileResultkernelrunmod__del__r  driveractiveutilsinstancegccollect)module_namem	attr_namer  r  r  s         rU   unload_xpu_triton_pydsr    sK   CKK,,./ %%%&NOKK$ 	<I##I. I.EOO33EEVV #)"8"8 <%"!OO33EEYY
 #MM--1199;<	< KK$#%( #++-kk12""(()2JJ#JJLrt   _registered_cachesc                    t        | d      rt        | j                        st        |  d      t        j                  |        | S )zh
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_cache().
    cache_clearz# does not have a cache_clear method)r   callabler  AttributeErrorr  r  r  s    rU   clear_on_fresh_cacher    s?    
 3&hs.Gu$GHIIc"Jrt   c                 :    t         D ]  } | j                           y)z&
    Clear all registered caches.
    N)r  r  r  s    rU   clear_cachesr    s     " rt   c              #    K   t                ddlm}  |t        j                  |            	 t
        j                  j                  t        j                  di      5  t        j                  d        |t        j                  j                  d            }t
        j                  j                  t        j                  d|i      5  d t        | t              rt        |       dk(  sJ d	       t        j                  j!                  |      rtt        j"                  |      }| j%                  |D ci c]D  }d
|vr>|t        j                  j'                  t        j                  j                  ||            F c}       ddd       ddd       |rUt)               r(t*        j,                  j/                         r
t1                t3        j4                  t)               fd       t                yc c}w # 1 sw Y   xxY w# 1 sw Y   |xY w# t6        $ r t        j9                  d        w xY w# t                w xY ww)z
    Contextmanager that provides a clean tmp cachedir for pt2 caches.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    r   )normalize_path_separator)dirTORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNz!expected empty cache_entries dictz.lockc                4    t         j                  d|      S )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)ro  pathr  inductor_cache_dirs      rU   rs  zfresh_cache.<locals>.<lambda>2  s    S[[@&% 6A 6 rt   )ignore_errorsonerrorz(on error, temporary cache dir kept at %s)r  torch._inductor.cpp_builderr  tempfilemkdtempr   patchdictosenvironr   r   r  r  rv   rP   existslistdirrQ  getsize
is_windowsrN   rH   rO   r  shutilrmtree	Exceptionr  )cache_entriesr  deleter  triton_cache_dirfilesfr  s          @rU   fresh_cacher    s     ND1(2B2Bs2KL)ZZ__JJ24FG
 	 II35GH7/:  .@BR-ST mT2}-2W4WW2ww~~&67 "

+; <%,, */$%#*!#3 !"277??277<<@PRS3T#U U	( |		 6 6 8&(MM" )l  	5 	 	H  >@RS 	sn   -I0H !A-HA-H;A	G=HHAH 2I=HH	HHH !H;;H> >I

I)reversec                   | j                   }t        t        |             }t        t	        ||d            }|st        t        |            S |S )NTrS  r  )__getitem__r   rP   rR  r  r  )seqr  gettera_rsort_idxs        rU   argsortr  E  sE    __F
C/C F3FD9:HHX&''Ort   c          	     4    d fd}t        |      D cg c]9  \  }}|t        |t        j                        r|j                  j
                  n|f; }}}t        |t        j                  |      |      }|D cg c]  \  }}|	 }}}|S c c}}w c c}}w )Nc                n    | \  }}|\  }}dfd} |||k        ry |||kD        ry||k  ry||kD  ryy)Nc                N    t        | t              r| S j                  | d      S )NT)size_oblivious)rv   r*  evaluate_expr)rc  rb  s    rU   evaluatez*argsort_sym.<locals>.cmp.<locals>.evaluate_  s(    $%**4*EErt   r   r5   r   )rc  z%Union[bool, torch.SymInt, sympy.Expr]r   r*  r   )r  r  a_idxa_valb_idxb_valr  rb  s          rU   r  zargsort_sym.<locals>.cmp[  sT    uu	F
 EEM"EEM"
 5=5=rt   r  )r  tuple[int, sympy.Expr]r  r  r   r   )	r   rv   rN   r1   r  rc  r  r  
cmp_to_key)	rb  r  r  r  r=  r   exprsr   r  s	   `        rU   argsort_symr  U  s    4  nC 
Z5<<8affkka@E  5i2237IE %&fc1c&F&M
 's   >B=Bc                t    | t         j                  k(  ryt        j                  d|       j                         S )Nro   r   r   )rN   rN  r   element_sizer  s    rU   get_dtype_sizer  |  s-     ;;r'4466rt   c                      e Zd ZU ded<   y)LineContextr   contextNr   r   r   r   r   rt   rU   r  r    s    Lrt   r  c                  "    e Zd ZU ded<   ded<   y)ValueWithLineMapr  r   zlist[tuple[int, LineContext]]line_mapNr  r   rt   rU   r  r    s    J++rt   r  c                      e Zd ZdZdddZej                  dd       ZddZddZ	ddZ
ddZddZdd	Zdd
ZddZ	 	 	 	 ddZdd dZdd!dZdd!dZ	 d"	 	 	 	 	 d#dZd$dZddZd%dZd&dZy)'IndentedBuffer   c                     g | _         || _        y r   )_lines_indent)r  initial_indents     rU   __init__zIndentedBuffer.__init__  s    GI%rt   c              #  b   K   | j                   }	 || _         d  || _         y # || _         w xY wwr   )tabwidth)r  r(  prevs      rU   set_tabwidthzIndentedBuffer.set_tabwidth  s,     }}	!$DM DMDDMs   /# /	,/c                   t               }d}g }| j                  D ]  }t        |t              r
 |       }|1t        |t              r|j                  ||j                  f       K|}t        |t              sJ |j                  |       |j                  d       |d|j                  d      z   z  } t        |j                         |      S )Nr5   r-  )r   r#  rv   DeferredLineBaser  r  r  r  writecountr  getvalue)r  bufr   linemaplilines         rU   getvaluewithlinemapz"IndentedBuffer.getvaluewithlinemap  s    j13++ 	&B"./t<B,2::/dC(((IIdOIIdOTZZ%%%A	&  88rt   c                6    | j                         j                  S r   )r4  r   r  s    rU   r/  zIndentedBuffer.getvalue  s    '')///rt   c                f   t               }| j                  D ]  }t        |t              r
 |       }|t        |t              r.|}t        |t
              sJ |j                  d      r|j                  |d d        h|j                  |       |j                  d        |j                         S )N\r   r-  )	r   r#  rv   r,  r  r  endswithr-  r/  )r  r0  r2  r3  s       rU   getrawvaluezIndentedBuffer.getrawvalue  s    j++ 	 B"./t<B,dC(((}}T"		$s)$		$		$	   ||~rt   c                8    | j                   j                          y r   )r#  clearr6  s    rU   r<  zIndentedBuffer.clear  s    rt   c                ,    t        | j                        S r   )r*  r#  r6  s    rU   __bool__zIndentedBuffer.__bool__  s    DKK  rt   c                :    d| j                   | j                  z  z  S )Nr  )r$  r(  r6  s    rU   r{  zIndentedBuffer.prefix  s    dllT]]233rt   c                &    | j                  d       y )Nr-  	writeliner6  s    rU   newlinezIndentedBuffer.newline  s    trt   c                   t        |t              r| j                  j                  |       y t        |t              r9| j                  j                  |j                  | j                                      y |j                         r.| j                  j                  | j                          |        y | j                  j                  d       y Nr  )rv   r  r#  r  r,  with_prefixr{  stripr  r3  s     rU   rB  zIndentedBuffer.writeline  s    dK(KKt$./KKt//>?ZZ\KK$++-78KKr"rt   c                4    |D ]  }| j                  |        y r   rA  )r  linesr3  s      rU   
writelineszIndentedBuffer.writelines  s      	!DNN4 	!rt   c                H     t         j                  d fd       } |       S )Nc               3     K   xj                    z  c_         	 d  xj                    z  c_         y # xj                    z  c_         w xY wwr   r$  )offsetr  s   rU   r  z"IndentedBuffer.indent.<locals>.ctx  s9     LLF"L'&&s   A4 AAAr   Iterator[None])
contextlibcontextmanager)r  rO  r  s   `` rU   indentzIndentedBuffer.indent  s$    		"	"	' 
#	' urt   c                .    | xj                   |z  c_         y r   rN  r  rO  s     rU   	do_indentzIndentedBuffer.do_indent      rt   c                .    | xj                   |z  c_         y r   rN  rV  s     rU   do_unindentzIndentedBuffer.do_unindent  rX  rt   c           	        t        |t              rt        d      }|j                  D ]E  }t        |t              r|st        |t        |      t        |j                               z
        }G t        j                  |      rd}|j                  D ]P  }t        |t              r| j                  j                  |       /t        j                  | |t        |      d         R y t        j                  |      }|r|j                         }|sy |j                         }|j!                  d      D ]  }| j                  |        y )Ninfr   r-  )rv   r   floatr#  r  minrP   r  mathisinfr  rB  r   textwrapdedentrstriprP  )r  
other_coderG  rb  r3  r   s         rU   splicezIndentedBuffer.splice  s    j.15\F")) I!$4 TS5G)GHFI zz&!")) HdK0KK&&t,",,T4F3FG	H "4J'..0
#**,J%%d+ "q!"rt   c                    t        | j                        }| j                  D cg c]
  } ||       c}|_        |S c c}w N)r%  )r   r$  r#  )r  ro  r   r3  s       rU   r{   zIndentedBuffer.map  s4    DLL9-1[[9Td4j9

 :s   >c                @    t        |        d| j                          dS )Nrl  rm  )r  r/  r6  s    rU   __repr__zIndentedBuffer.__repr__  s     t*Qt}}/q11rt   c                    | j                   |j                   k(  sJ t        | j                         }|j                  | j                         |j                  |j                         |S rg  )r$  r   rK  r#  )r  otherr   s      rU   __add__zIndentedBuffer.__add__  sK    ||u}},,,DLL9t{{#u||$
rt   c                    || j                   v S r   )r#  )r  new_lines     rU   containszIndentedBuffer.contains!  s    4;;&&rt   Nr   )r%  r   r   r  )r(  r   r   rQ  )r   r  r   r  r   r  r   r*  )r3  z)Union[LineContext, DeferredLineBase, str]r   r  )rJ  z3Sequence[Union[LineContext, DeferredLineBase, str]]r   r  r   )rO  r   r   'contextlib.AbstractContextManager[None])rO  r   r   r  )F)rd  zUnion[IndentedBuffer, str]rG  r*  r   r  )ro  zCallable[[Any], Any]r   r   )rk  r   r   r   )rn  z)Union[DeferredLineBase, LineContext, str]r   r*  )r   r   r   r(  r&  rR  rS  r*  r4  r/  r:  r<  r>  r{  rC  rB  rK  rT  rW  rZ  re  r{   ri  rl  ro  r   rt   rU   r   r     s    H& ! !9(0(!4#!H!	!	 EJ"4"=A"	"4
2'rt   r   c                  (     e Zd Zd fdZddZ xZS )FakeIndentedBufferc                "    t         |           y r   )superr&  )r  	__class__s    rU   r&  zFakeIndentedBuffer.__init__&  s    rt   c                V    |dk(  rt         j                  | |      S t        d| d      )Nry  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )r  r   s     rU   r|  z#FakeIndentedBuffer.__getattribute__)  s;    ;**466!$ (= =
 	
rt   rr  )r   r  r   r   )r   r   r   r&  r|  __classcell__ry  s   @rU   rv  rv  %  s    
rt   rv  c               #     K   t         j                  t         j                  }} 	 d  | |ct         _        t         _        y # | |ct         _        t         _        w xY wwr   )r  stdoutstderr)initial_stdoutinitial_stderrs     rU   restore_stdout_stderrr  4  s@     %(ZZNN@!/
CJ
CJs   !AA  A AAc                  P    e Zd ZdZddZddZddZddZddZddZ	ddZ
dd	Zy
)r,  z.A line that can be 'unwritten' at a later timec                6    |j                         sd}|| _        y rE  )rG  r3  rH  s     rU   r&  zDeferredLineBase.__init__@  s    zz|D	rt   c                    t         )zJReturns either self.line or None to indicate the line has been 'unwritten'r  r6  s    rU   r  zDeferredLineBase.__call__E      !!rt   c                    t         )z3Returns a new deferred line with the same conditionr  rH  s     rU   	_new_linezDeferredLineBase._new_lineI  r  rt   c                @    | j                  | | j                         S r   r  r3  )r  r{  s     rU   rF  zDeferredLineBase.with_prefixM  s    ~~455rt   c                T    | j                  | j                  j                               S r   )r  r3  r  r6  s    rU   r  zDeferredLineBase.lstripP  s    ~~dii..011rt   c                >    | j                  | j                  |         S r   r  )r  r  s     rU   r   zDeferredLineBase.__getitem__S  s    ~~dii.//rt   c                ,    t        | j                        S r   )r*  r3  r6  s    rU   r>  zDeferredLineBase.__bool__V  s    DIIrt   c                ,    t        | j                        S r   )rP   r3  r6  s    rU   __len__zDeferredLineBase.__len__Y  s    499~rt   N)r3  r  )r   zUnion[str, None])r3  r  r   r   )r{  r  r   r   )r   r   )r  zUnion[int, slice]r   r   rs  r   r   )r   r   r   r   r&  r  r  rF  r  r   r>  r  r   rt   rU   r,  r,  =  s-    8
""620rt   r,  c                  4     e Zd ZdZd fdZddZddZ xZS )DelayReplaceLinez6At end of codegen call `line.replace(key, value_fn())`c                @    t         |   |       || _        || _        y r   )rx  r&  rS  value_fn)r  rS  r  r3  ry  s       rU   r&  zDelayReplaceLine.__init__`  s     rt   c                j    | j                   j                  | j                  | j                               S r   )r3  replacerS  r  r6  s    rU   r  zDelayReplaceLine.__call__e  s#    yy  4==?;;rt   c                D    t        | j                  | j                  |      S r   )r  rS  r  rH  s     rU   r  zDelayReplaceLine._new_lineh  s    $-->>rt   )rS  r  r  zCallable[[], str]r3  r  rq  )r3  r  r   r  r   r   r   r   r&  r  r  r}  r~  s   @rU   r  r  ]  s    @!
<?rt   r  c                  4     e Zd ZdZd fdZddZddZ xZS )DelayMaybeLinez7At end of codegen return `line if `pred_fn() else None`c                2    t         |   |       || _        y r   )rx  r&  pred_fn)r  r  r3  ry  s      rU   r&  zDelayMaybeLine.__init__o  s    rt   c                >    | j                         r| j                  S d S r   )r  r3  r6  s    rU   r  zDelayMaybeLine.__call__s  s     LLNtyy44rt   c                .    t        | j                  |      S r   )r  r  rH  s     rU   r  zDelayMaybeLine._new_linev  s    dllD11rt   )r  zCallable[[], bool]r3  r  )r   z
str | None)r3  r  r   r  r  r~  s   @rU   r  r  l  s    A52rt   r  c                   t        | t        j                        r| }nt        j                  t               |       }t	        j
                  |      }t        j                  j                  rC|j                  J |j                  dk  s|j                  dk(  rt        j                  d       yy|j                  dk(  rdnd}|j                  }||k  rt        j                  d	||d
       yy)N	   
   z6GPU arch does not support max_autotune_gemm mode usageFTrH   rl   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)rv   rN   r   rV   r   createversionhipmajorr   r  r  multi_processor_count)index_or_devicer   propr  r  s        rU   
is_big_gpur  z  s    /5<<0 lno>""6*D }}zz%%%::>TZZ2-KKPQKK5(bbG**I7:%I> 	 	
 rt   c                     t         j                  j                         r(t         j                  j                         j                  S t         j
                  j                  d      j                  S )NrF   )rN   rH   rO   get_device_propertiesgpu_subslice_countrF   r  r   rt   rU   get_max_num_smsr    sF    yyyy..0CCC::++F3IIIrt   c                     t         j                  j                         syt         j                  j                  t         j                  j	                               } | j
                  dk(  S )zEReturns true if the device is a NVIDIA B200, otherwise returns false.Fr  )rN   rF   rO   r  r	  r  )device_propertiess    rU   
using_b200r    sJ     ::""$

889R9R9TU""b((rt   c                     t         j                  j                         r
t               S t         j                  j                         } t               | | z
  S dz
  S )zFHandle experimental carveout if set otherwise return hardware SM countr   )rN   rH   rO   r  r   _get_sm_carveout_experimental)carveouts    rU   get_num_smsr    sJ     yy  xx557HH,@HHaHHrt   c                    ddl m}m} |
t               }|j	                  d      }|| z  t
        z  } |||| |j                               S )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r5   )r6   WorkspaceZeroModeF)r.  	zero_moder   
outer_name)codegen.commonr6   r  r  	from_boolTMA_DESCRIPTOR_SIZEunique_name)num_tma_descriptorsr   num_programsr6   r  r  r  s          rU   get_tma_workspace_argr    sZ     @"}!++E2I--0CCD+<++-	 rt   c                    | j                   |vr!t        j                  d| j                   |       t        | j                  j
                        xr% | j                   |v xr t        | j                        S )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r  r  )r  allowed_layout_dtypess     rU   _use_template_for_gpur    sf     ||00		RLL!	
 	v}}!!" 	&LL11	&v}}%rt   c                    | j                         t        j                  j                         j                  d      D cg c]  }|j	                          c}v S c c}w Nr,  )upperrg   max_autotune_gemm_backendsrP  rG  backendrR   s     rU   _use_autotune_backendr    M    ==?!<<BBDJJ3O	      Ac                    | j                         t        j                  j                         j                  d      D cg c]  }|j	                          c}v S c c}w r  )r  rg   max_autotune_conv_backendsrP  rG  r  s     rU   _use_conv_autotune_backendr    r  r  )enable_int32enable_float8check_max_autotunec                  ddl m}m} t        j                  t        j
                  t        j                  g}|r>t        j                  t        j
                  t        j                  t        j                  g}|r/|j                  t        j                  t        j                  g       t        | j                  j                        xr t        | |      xs) | j                  j                  dk(  xr | j                  |v xrS t         j"                  xs t         j$                  xs | xr* t'        d      xr  || j                  |j(                        S )Nr5   )BackendFeaturehas_backend_featurer  TRITON)r  r  r  rN   r   r:  r<  rD  extendr4  r5  r  r   r  r  r   rg   max_autotunemax_autotune_gemmr  TRITON_TEMPLATES)r  r  r  r  r  r  layout_dtypess          rU   use_triton_templater    s    D]]ENNEMMBMu{{Se1153D3DEF v}}))* A)&-@O ""e+M0M
	P   VF$<$<VDV@V
	P "(+
	P  ~/N/NOrt   output_layout
add_guardsc                    ddl m} ddlm d
fddfd}dfd	 	 	 	 	 	 	 	 	 	 dfd |       xr t	        fd	|D              xr  ||       S )u  
    Return True iff *all* supplied tensors satisfy the CUDA-12.9 TMA constraints
    that Triton relies on today.
    * https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

    A tensor is accepted when:
      * 2 ≤ rank ≤ 5
      * dtype ∈ {FP16, BF16, FP8-E4M3FN}
      * Every logical size ≥ 2
      * Base pointer 16-byte aligned
      * All "outer" dims have 16-byte aligned strides
      * The “inner” dim has stride 1 (contiguous)
      * For FP8 tensors, inner dim ≥ 32
    r   )has_triton_tma_devicer5   r\  c                X    j                   j                  j                  | t              S r   )r`  ra  statically_known_multiple_ofTMA_ALIGNMENT)
expr_bytesr]  s    rU   _alignedzcan_use_tma.<locals>._aligned  s     ww<<ZWWrt   c                    | y| j                   }| j                  }| j                  } | j                        sy |||d      S )NTFallow_float32)r  r"  r   rO  )r  sizesstridesr   r  _is_tma_compatibles       rU   _is_tma_compatible_layoutz.can_use_tma.<locals>._is_tma_compatible_layout  sH    >-- &!%%tLLrt   c                    | j                         }| j                         }| j                         }| j                         j                  j
                  v ry |||d      S )NFr  )get_size
get_stride	get_dtyper  r`  unaligned_buffers)r  r  r  r   r]  r  s       rU   _is_tma_compatible_matrixz.can_use_tma.<locals>._is_tma_compatible_matrix#  sR    

,,. ::<177444!%%uMMrt   c                   t        |       }|j                  }|dk  s|dkD  ry|t        j                  t        j                  t        j
                  fvr|r|t        j                  k7  ryrKj                  j                  j                  |       }j                  j                  j                  |      }nd| D cg c]'  }j                  j                  j                  |      ) }}|D 	cg c]'  }	j                  j                  j                  |	      ) }}	t        fd|D              ryt        |      D 
	cg c]-  \  }
}	j                  j                  j                  |	d      r|
/ }}
}	t        |      dk7  ry|d   }t        |      D ]  \  }
}	|
|k(  r |	|z        r y ||   } ||z        sy|t        j
                  k(  r'j                  j                  j                  |d      syyc c}w c c}	w c c}	}
w )	Nra  r   Fc              3  l   K   | ]+  }j                   j                  j                  |d         - ywra  N)r`  ra  statically_known_geq)r   r   r]  s     rU   r   z:can_use_tma.<locals>._is_tma_compatible.<locals>.<genexpr>I  s+     P1177##88A>>Ps   14r5   r       T)rP   itemsizerN   r   r:  r4  r<  r`  ra  guard_int_seqsymbolic_hintrk  r   statically_known_equalsr  )r  r  r   r  rankr   sizes_i	strides_ir   str   r  	inner_idx	inner_dimr]  r  r  s                 rU   r  z'can_use_tma.<locals>._is_tma_compatible.  s    5z>> !8tax 8K8KLL%--!7gg&&44U;G((66w?IBGHQqww''55a8HGHFMN))77;NIN PPP
 #9-
2ww77A> 
 

 u:?!H	 y) 	EArI~BM*		 I&		H,- E'''0@0@0U0Ur1
 G IN
s   <,G:.,G??2Hc              3  .   K   | ]  } |        y wr   r   )r   r  r  s     rU   r   zcan_use_tma.<locals>.<genexpr>l  s     ?)!,?   )r  Union[int, sympy.Expr]r   r*  )r  Optional[Layout]r   r*  )r  r>   r   r*  )
r  Sequence[sympy.Expr]r  zSequence[_IntLike]r   torch.dtyper  r*  r   r*  )torch.utils._tritonr  r_  r]  rz   )	r  r  matricesr  r  r]  r  r  r  s	    `   @@@@rU   can_use_tmar    s~    " :XM	N:#:#: : 	:
 
:z 	 	5?h??	5%m4rt   )r  c                    t         j                  j                  r| nd }t        d |D              xr( t	        |||dxr t         j                  j
                  S )Nc              3  T   K   | ]   }t        |j                               d k(   " ywr  )rP   r  )r   r  s     rU   r   z*use_triton_tma_template.<locals>.<genexpr>v  s      5qC

"5s   &(r  )rg   r  enable_template_tma_storerz   r  enable_persistent_tma_matmul)r  r  r  r  s       rU   use_triton_tma_templater  q  sM     %mmEE]4F5H55 	7JO	7MM66rt   c                T    t        || |dsyddlm} ddlm}  |       xr  |       S )Nr  Fr   )%has_triton_tensor_descriptor_host_tmar5   is_datacenter_blackwell_arch)r  r  r  codegen.cuda.cuda_envr  )r  r  r  r  r  s        rU   !use_triton_blackwell_tma_templater  |  s2     #	: IC 12U7S7UUrt   )maxsizec                 d    	 t         j                  j                  d      duS # t        $ r Y yw xY w)zCheck if CuTeDSL is importable; cache the result for reuse.

    Call ensure_cute_available.cache_clear() after installing CuTeDSL
    in the same interpreter to retry the import.
    zcutlass.cuteNF)	importlibutil	find_specr   r   rt   rU   ensure_cute_availabler#    s3    ~~''7tCC s    # 	//c                n   t               syt        d      syddlm} t	        |j
                  j                        sy |       syt        j                  g}	t        ||	      syt        j                  st        j                  syt        | ||      syt        d | |fD              ry|r|ry|y||yy)a  
    Returns True if we can use the blackwell kernel for grouped mm.
    Required conditions:
        1. CuTeDSL backend is enabled
        2. CuTeDSL is available
        3. We are on a blackwell arch
        4. The dtype is bf16
        5. Max autotune or max autotune gemm is enabled
        6. A, B, and the output are 16B aligned
        7. We are not using dynamic shapes
        8. A is 2d
        9. B is 3d
        10. Offsets are provided
        11. Bias and Scale are not provided
    FCUTEDSLr5   r  )r  c              3  2   K   | ]  }t        |        y wr   )
is_dynamicr  s     rU   r   z3use_blackwell_cutedsl_grouped_mm.<locals>.<genexpr>  s     
1Q:a=
1r  T)r#  r  r  r  r  r   r  rN   r:  r  rg   r  r  r  rk  )
mat_amat_br  a_is_2db_is_2doffsbiasscale_resultr  r  s
             rU    use_blackwell_cutedsl_grouped_mmr/    s    2 !" +C&--$$%')^^$M 76#;#; ue6:

15%.
11g|<3rt   c                <   ddl m} |j                  j                  j	                  ||z  |z  d      }|dk  s|t
        j                  j                  k  ryddlm	} t        j                  j                  ryt        j                  t        j                  t        j                  g}t!        | |      xr/ t
        j"                  xs t
        j$                  xr t'        d      }|r6 |       s/t(        j+                  d	t
        j                  j,                         y|S )
Nr5   r\  r   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir %s is set correctly. Skipping CUTLASS backend for now.)r_  r]  r`  ra  	size_hintrg   rF   cutlass_backend_min_gemm_sizecodegen.cuda.cutlass_utilsr3  rN   r  r  r   r:  rD  r  r  r  r  r   r  cutlass_dir)	r  r  r  r  r]  	gemm_sizer3  r  r   s	            rU   use_cutlass_templater:    s      **1q519r*BIA~V[[%N%NN> }} ]]ENNEKK@Mfm4 	-  <F$<$<	-!),  !#KK4 ''	 Jrt   c                    t         j                  j                  j                         }|dk(  ry| j                         |j	                  d      D cg c]  }|j                          c}v S c c}w )z8Check if CUTLASS should be used for the given operation.ALLTr,  )rg   rF   cutlass_enabled_opsr  rP  rG  )op_nameenabled_opsrR   s      rU   _use_cutlass_for_opr@    sU    ++11779Ke==?+2C2CC2HIQqwwyIIIIs   A,r   _IntLikec           
        ddl m} t        j                  j                  |z  }t
        j                  j                   xr |j                  j                  j                  t        j                  t        j                  ||| z        t        j                  |||z                    xrO |j                  j                   xr6 |j                  j                   xr t        j                  j                   dkD  S )Nr   r\  )torch._inductor.virtualizedr]  rg   r  decompose_k_thresholdrN   r  r  r`  ra  statically_known_truerw   AndGeaot_modecpp_wrappernum_decompose_k_splits)r  r  r  threshold_multipler]  rD  s         rU   use_decompose_k_choicerL    s     ."MM??BTT MM 		5GG22II1A561A56
		5    		5 ###		5 MM0014rt   c           
        t         j                  j                  }ddlm} t        t        j                  j                        xr |j                  j                  j                  t        j                  t        j                  ||| z        t        j                  |||z                    xr0 |j                  j                   xr |j                  j                    S )z
    Check if we should use the contiguous subgraph transform.
    This transform makes the second matrix contiguous before the matmul.
    r   r\  )rg   rocmcontiguous_thresholdrC  r]  r*  rN   r  r  r`  ra  rE  rw   rF  rG  rH  rI  )r  r  r  rO  r]  s        rU   use_contiguousrP    s     ";;;; . 	U]] 	$GG22II01450145
	$    	$ ###
rt   c                   t         j                  j                  }g d}t        |t        j
                        r|j                  s|S |dk(  rg S t        | t        j
                        r| j                  r&t        |t        j
                        r|j                  sd}nt        || z  ||z        }d}t	        j                  |      }|D cg c]  }||k  r||k\  r| }}g g g }}
}	|D ]Z  }||z  }|dk  r||dz
  z  dk(  r|dk\  r|	j                  |       0|dz  dk(  r|
j                  |       J|j                  |       \ t         j                  dk(  r|	|
z   |z   S |	|
z   |z   }|d | S c c}w )	N)rl   r  rn   rm      r   rR  ra  rm   r5   r  
EXHAUSTIVE)rg   r  rJ  rv   rw   r#  	is_numberr^  divisorsr  max_autotune_gemm_search_space)r  r  r  k_splits_limitdefault_k_splitsmax_k_splitmin_k_splitrU  divisorpow_of_2_divisorsmul_of_32_divisorsrest_of_splitsdkPartbest_splitss                  rU   get_k_splitsrb  4  s    ]]99N .!UZZ 	1		1ejj!!++1ejj!!++!q&!q&)K~~a H  k!g&< 	H  =?B>) %Q 3; EAI!#$$Q'RZ1_%%a( !!!$%" ,,< #55FF#&88>IK''=s   
E,c                T    t         j                  j                  |       j                  S r   )rN   rF   r  gcnArchNamer   s    rU   _rocm_native_device_arch_namerf  m  s    ::++F3???rt   c                     	 dd l } ddlm}m} ddlm} t        j                  j                  | j                        }||||fS # t        $ r dd}dd} G d d      }d }Y %w xY w)	Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationc                     g S r   r   r   rt   rU   rh  z*try_import_ck_lib.<locals>.gen_ops_library      Irt   c                     g S r   r   r   rt   rU   ri  z.try_import_ck_lib.<locals>.gen_ops_preselected  rl  rt   c                      e Zd Zy)*try_import_ck_lib.<locals>.CKGemmOperationN)r   r   r   r   rt   rU   rj  ro    s    rt   rj  )r   r  )ck4inductor(ck4inductor.universal_gemm.gen_instancesrh  ri  ck4inductor.universal_gemm.oprj  r  r  dirname__file__r   )rp  rh  ri  rj  package_dirnames        rU   try_import_ck_librv  r  sl    	
	
 ''//+*>*>? O-@/QQ  			 	 s   ;A A#"A#c                    t         j                  st         j                  syt        j                  j
                  sy| j                  j                  dk7  ryt        | j                        }t         j                  j                  D ci c]  }|j                  d      d   | c}xs |j                  d      d   |i}|j                         t         j                  j                  z  D cg c]  }||   	 }}|sy| j                  t        j                  t        j                   t        j"                  fvryt%               \  }}}}|st&        j)                  d       y|t         j                  _        yc c}w c c}w )NFrF   :r   z,Please pip install Composable Kernel packageT)rg   r  r  rN   r  r  r   r  rf  rN  archrP  r4  ck_supported_archr   r   r:  r<  rv  r   r  ck_dir)r  native_archr  requested_archsrequested_supported_archsck_package_dirnamer   s          rU   use_ck_templater    s<   6#;#;==}}V# 0>K39;;3C3CDaqwws|A)D #q!;IO
 !%%'&++*G*GG! 	! ! %||EMM5>>5==II"3"51aBC+FKK+ E!s   E6,E;c                    ddl m} t        d      xr= t        |       xr0 |j                  j
                  j                  ||z  |z  d      dkD  S )Nr5   r\  CKr   r1  r   r_  r]  r  r  r`  ra  r5  r  r  r  r  r]  s        rU   use_ck_gemm_templater    sR     	d# 	CF#	CGG&&q1uqy2&>Brt   c                    ddl m} t        d      xr= t        |       xr0 |j                  j
                  j                  ||z  |z  d      dkD  S )Nr5   r\  CKTILEr   r1  r   r  r  s        rU   use_ck_tile_gemm_templater    sR     	h' 	CF#	CGG&&q1uqy2&>Brt   c                2    t        d      xr t        |       S )Nr  )r  r  r  s    rU   use_ck_conv_templater    s    %d+G0GGrt   c                |    t         j                  xs t         j                  xr | j                  j                  dk(  S r  )rg   r  r  r   r  r  s    rU   _use_template_for_cpur    s2    7v77&
--


%&rt   c                   ddl m} t        |j                  |      sJ |j                  j                  }|j                  j
                  }t        |       xrX |j                         t        j                  k(  xr5 t        |      dk(  xr% t        |      dk(  xr |d   |d   k(  xr |d   dk(  }t        | ||d      xr |j                  j                         xs |S )Nr5   )r?      ra  F)require_constant_mat2)r  r?   rv   r  r  r"  r  r  rN   r<  rP   use_cpp_gemm_templateis_contiguous)r  mat1mat2r?   	mat1_sizemat1_stridemat1_each_batch_is_contiguouss          rU   use_cpp_bmm_templater    s     dkk6***
   I++$$Kf% 	"NN-	"^q 	" "	" ^y|+		"
 ^q  " !t5Q !!#D'Drt   c                `   ddl m} ddlm} ddlm}	 ddlm}
 t        |       rt        d      syt        j                  j                  sy|j                         t        j                  t        j                   fv }t        j"                  t        j$                  t        j&                  t        j                  g} |
|||r| j(                  nd ||      \  }}}} }}t+        ||f      ryt-        ||j.                        r|j1                         } |	|j                               \  }} |d	||||j                         |j                         |t3               | |

      }dd}| j(                  |v xr= |d uxr7  ||      xr- t-        ||j4                        xr |j7                         xs | S )Nr5   r  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtypemat2_transposeduse_4x2_dim
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsuse_refq_group_sizec                N    | j                          | j                         d   dk(  S )Nr   r5   )freeze_layoutr  rR   s    rU   is_last_dim_stride1z2use_cpp_gemm_template.<locals>.is_last_dim_stride1	  s"    	||~b!Q&&rt   )rR   r>   r   r*  )r  r  codegen.cpp_micro_gemmr  codegen.cpp_utilsr  kernel.mm_commonr  r  r  rg   cppweight_prepackr  rN   rK  r@  r<  r:  halfr   has_free_symbolsrv   BaseViewunwrap_viewparallel_num_threadsr  is_module_buffer)r  r  r  r  r  is_woq_int4r  r  r  r  r  	int8_gemmr  r  r  r  r  r   r  r  s                       rU   r  r    s    9M) (0Ee0L::$$ U[[%**$==I]]ENNEJJLM")"+&,,'#Aq!VT4 A$$!@AQROL!"			NN$^^%!(*!J'
 	% 	Cd"	C%	C tR]]+	C ""$A,A(Art   c                 b    t         j                  xs t         j                   xs t        d      S )NATEN)rg   r  r  r  r   rt   rU   use_aten_gemm_kernelsr  ,	  s-    7v77 '	v	&'rt   c                  T    e Zd ZU  ej                  d      Zded<   ddZddZd	dZ	y)
DebugDirManagerr   r  prev_debug_namec                @    t        t        j                        | _        y r   )r  r  counterr   r6  s    rU   r&  zDebugDirManager.__init__6	  s    ../rt   c                    t         j                  j                  j                  | _        | j                   d| j
                   | _        | j                  t         j                  j                  _        y )N_tmp_)rN   _dynamorg   debug_dir_rootr  r   new_namer6  s    rU   	__enter__zDebugDirManager.__enter__9	  sM    $}}33BB//0dggY?.2mm+rt   c                    t        j                  | j                         | j                  t        j
                  j                  _        y r   )r  r  r  r  rN   r  rg   r  )r  r}   s     rU   __exit__zDebugDirManager.__exit__>	  s*    dmm$.2.B.B+rt   Nrr  )r}   r   r   r  )
r   r   r   rW  r.  r  r   r&  r  r  r   rt   rU   r  r  2	  s(    iooa G0<
Crt   r  c                   ddl m} t               dfd}t        j                  j                  |d|      5  t        j                  j                           | |i |}d d d        t              fS # 1 sw Y   xY w)Nr5   r:   c                (    j                  |        y r   )r6  codesource_codess    rU   save_output_codez*run_and_get_code.<locals>.save_output_codeL	  s    rt   r  r  r  r   r  )
r`  r;   r"   r   r  r{  rN   r  resetrR  )r   r}   r  r;   r  r  r  s         @rU   run_and_get_coder  C	  st    
 %$.LL 
		=*<>N	O %T$V$% 4%%%% %s   'A55A>c                    t        | g|i |\  }}g }|D ]6  }|j                  t        j                  d|t        j                               8 ||fS )Nz	'''.*?''')r  r  r   findallDOTALL)r   r}   r  r  r  kernelsr  s          rU   run_and_get_kernelsr  U	  sZ     ,B@@@FLG Brzz,bii@AB7?rt   c                &     d fd}t        |      S )Nc                 R            } | j                         j                          | S r   )r   r  )r  r   s    rU   run_with_backwardz1run_fw_bw_and_get_code.<locals>.run_with_backwarda	  s!    

rt   )r   r   )r  )r   r  s   ` rU   run_fw_bw_and_get_coder  `	  s    
 -..rt   c                X   ddl m} g dfdd	fd}t        j                  j	                  |d|      5  t        j                  j	                  |d      5  t
        j                  j                           | |i |}ddd       ddd       S # 1 sw Y   xY w# 1 sw Y   S xY w)
zLGet the inductor-generated code, but skip any actual compilation or running.r5   r:   c                (    j                  |        y r   r  r  s    rU   r  z"get_code.<locals>.save_output_codeo	  s    D!rt   c                     G d d      }| j                   r| j                         n| j                         \  }} |j                         |r |j                          |       S )Nc                       e Zd ZdZddZddZy)@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton modulec                     y r   r   r6  s    rU   r&  zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__v	  s    rt   c                     y r   r   r  s      rU   callzEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.cally	  s    rt   Nrr  r}   r   r  r   r   r  )r   r   r   r   r&  r  r   rt   rU   DummyModuler  s	  s    Frt   r  )rI  codegen_with_cpp_wrappercodegenr   )r  r  wrapper_codekernel_coder  s       rU   patched_compile_to_modulez+get_code.<locals>.patched_compile_to_moduler	  s]    	 	 04/?/?D))+T\\^ 	"k 	++,[../}rt   compile_to_moduler  Nr  )r  r;   r   r   )r`  r;   r   r  r{  rN   r  r  )r   r}   r  r;   r  r   r  r  s         @@rU   get_coder  i	  s    $ L", 	

.0I	
  	

-);=MN	  	          s#   "B'BBB	BB)c                |    t        | g|i |}dt        |      cxk  rdk  sn J dt        |              |d   S Nr5   ra  z%expected one or two code outputs got r   )r  rP   )r   r}   r  r  s       rU   get_triton_coder  	  sQ    B000LL!&Q& 
/L0A/BC& ?rt   c                    t        | g|i |\  }}dt        |      cxk  rdk  sn J dt        |              |d   S r  )r  rP   )r   r}   r  r   r  s        rU   run_and_get_triton_coder  	  sW     'r;D;F;OA|L!&Q& 
/L0A/BC& ?rt   c                    ddl m ddlm} |j                  g dfd}t
        j                  j                  |d|      5   | |i |}d d d        |fS # 1 sw Y   fS xY w)Nr   r:   rB   c                 ^     | i | | d   }t        |      sJ j                  |       y )Nra  )rv   r  )r}   r  r`  r;   graph_lowerings	real_inits      rU   	fake_initz-run_and_get_graph_lowering.<locals>.fake_init	  s7    4"6"Q%///u%rt   r&  r  )torch._inductor.graphr;   torch._inductor.output_coderC   r&  r   r  r{  )	r   r}   r  rC   r  r  r;   r  r  s	         @@@rU   run_and_get_graph_loweringr  	  sq     4;((IO& 
		?J		B %T$V$% ?""% ?""s   	AA(c              #     K   ddl m} |j                  |    }	 t        j                  ||      |j                  | <   d ||j                  | <   y# ||j                  | <   w xY ww)z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)torch._inductorr  	loweringsr  partial)aten_opoverride_fnr  orig_fns       rU   override_loweringr  	  s`      )  )G.&/&7&7W&M7#&-7#g7#s   A$'A  A$A!!A$c                     ddl m} |j                  d fd}t        j                  j
                  j                  |d|      S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerc                B     | |        | |      }r	 | |       |S r   r   )r  r0  outr  post_fnpre_fns      rU   r  z(add_scheduler_init_hook.<locals>.wrapper	  s+    y% i'Iu%
rt   r&  )r  r   r0  r   r   r   )torch._inductor.schedulerr  r&  unittestr   r  r{  )r  r  r  r  r  s   ``  @rU   add_scheduler_init_hookr  	  s9     4  G ==%%iWEErt   c                z    t         j                  rt        j                  |        yt        j	                  |        y)z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)rg   developer_warningsr   r  info)msgs    rU   developer_warningr  	  s$       Crt   c                    	 t         j                  j                  d      } | dz   t        t         j                        k  rTt        t         j                  | dz            dkD  r2t         j                  | dz      d   dk7  rt         j                  | dz      S t         j                  D ]#  }|j                  d      s|t        d      d c S  y# t        $ r Y Bw xY w)a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr5   r   ri  z--only=N)r  argvr  rP   
ValueErrorr  )r=  r  s     rU   get_benchmark_namer  	  s    	hhnnX&!Gc#((m#CHHS1W%&*q!!$+88C!G$$ xx )>>)$s9~'(()   s   BC 	CCc                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    ywr5   Nr   r  s     rU   r   zis_ones.<locals>.<genexpr>
       %!qAv%   rz   r5  s    rU   is_onesr  
      %u%%%rt   c                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    yw)r   Nr   r  s     rU   r   zis_zeros.<locals>.<genexpr>
  r  r  r  r  s    rU   is_zerosr   
  r  rt   c                &    t        d | D              S )Nc              3     K   | ]@  }t        |t        j                        r$|j                  t        j                  d       k(   B yw)r  N)rv   rN   r  r   )r   r   s     rU   r   z is_cpu_device.<locals>.<genexpr>
  s8      dELL) 	u||E**s   AAr  )inputss    rU   is_cpu_devicer$  
  s       rt   c                    t        | t        j                        sJ d       | j                  rt        j
                  S t        j                  S )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)rv   rw   r#  r   rN   rF  r>  )rZ  s    rU   get_sympy_Expr_dtyper&  #
  s=    c5::& B& ~~{{}}rt   c              /     K   | r-t        j                  j                  |i |5 }| d d d        y d  y # 1 sw Y   y xY wwr   )rN   r   r   )should_profiler}   r  r   s       rU   maybe_profiler)  -
  sE     ^^##T4V4 	G	 	 		 	s   "A7AA Ac                 l    t         j                  j                  } | dk  rt        j                         } | S Nr5   )rg   r  threadsrN   get_num_threads)r,  s    rU   r  r  6
  s+    jj  G{'')Nrt   c                     ddl m}   |        }|j                  dt        j                  j
                  rd      S d      S )Nr5   )get_backend_options
num_stagesra  r  )runtime.triton_helpersr/  r2  rN   r  r  )r/  optionss     rU   get_backend_num_stagesr3  =
  s2    ;!#G;;|%--*;*;QCCCCrt   c                   t        | t        j                  j                  j                  j
                        }||S ddlm}m} t        j                  j                         xr! t        j                  j                         dk\  }| t        j                  t        j                  t        j                  fv sJ t        j                  |      j                   j#                  d      rddlm}  |       }| t        j                  t        j                  fv r|r	 || |      S t        j                  j                  j                  j
                  r |t        j                  |      S  |t        j                  |      S | t        j                  t        j                  fv r
|r ||       S t        j                  j                  j                  j
                  r |t        j                        S  |t        j                        S )z
    We don't want to throw errors in this function. First check to see if the device is in device_info.py,
    then fall back to the inaccurate triton estimation.
    )is_tf32r   )get_max_simd_tflopsget_max_tensorcore_tflops)ro   r   
clock_rate)max_clock_rate)r   rN   backendsrF   matmul
allow_tf32triton.testingr6  r7  rO   get_device_capabilityr   r:  r<  inspect	signature
parametersr2  torch._utils_internalr9  )r   ds_topsr6  r7  SM80OrLaterr9  sm_clocks          rU   get_device_tflopsrF  E
  sm    UENN,?,?,F,F,Q,QRGM**))+ 

0P0P0R W 1K
 U]]ENNEMMBBBB,-88<<\J8!#U]]ENN33,UH==>>%%00,U]]HEE&u}}h??U]]ENN33,U33>>%%00,U]];; 'u}}55rt   c                     ddl m}   |        S )Nr   get_dram_gbps)r=  rI  rH  s    rU   get_gpu_dram_gbpsrJ  q
  s    ,?rt   c                 x    ddl m}  | j                  j                  j	                  d      j                  dd      S )Nr   r  max_shared_mem)triton.runtimer  r  r  r  r2  rL  s    rU   get_gpu_shared_memoryrO  x
  s0    % ==44Q7;;<LaPPrt   c                     t         j                  j                         rUt         j                  j                         j                  } t         j                  j                         j
                  }|| z  S d} d}|| z  S )Nr  i   )rN   rF   rO   r  	warp_sizemax_threads_per_block)rQ  rR  s     rU   get_max_numwarpsrS  
  sh    zz JJ446@@	 %

 @ @ B X X
 !I-- 	 $ I--rt   c                $    | j                  d      S )Nwelford)r  reduction_types    rU   is_welford_reductionrX  
  s    $$Y//rt   c                (    t        |       ry| dk(  ryy)Nr  online_softmax_reducera  r5   )rX  rV  s    rU   reduction_num_outputsr[  
  s    N+	2	2rt   c                 0    t        j                         dk(  S )NLinux)platformsystemr   rt   rU   is_linuxr`  
  s    ??''rt   c                 (    t         j                  dk(  S )Nri   )r  r^  r   rt   rU   r  r  
  s    <<7""rt   c                &    t        d | D              S )Nc              3  n   K   | ]-  }t        |t        j                        xr |j                    / y wr   )rv   rw   r#  rT  r  s     rU   r   z#has_free_symbols.<locals>.<genexpr>
  s)     Jz!UZZ(<_<Js   35r  )itrs    rU   r  r  
  s    JcJJJrt   c            	     x   ddl m} | D ]  }t        ||j                  |j                  |j
                  |j                  |j                  f      r=t        |j                         xs d      st        |j                         xs d      s yt        ||j                        st        dt        |              y)Nr5   r  r   Tzunexpected type for is_dynamic F)r  r  rv   r  r  r  rR  r<   r  maybe_get_sizemaybe_get_strider>   	TypeErrorr  )r}   r  ts      rU   r'  r'  
  s     IbmmR[[":K:KRYYW
   0 0 2 8b9=M""$*> Aryy)=d1gYGHHI rt   c                      e Zd ZdZdZy)PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r   r   r   rl  rm  r   rt   rU   rk  rk  
  s      K *rt   rk  c                ~   ddl m} t        j                  dd      5 }t	        j
                         }t	        j
                         } t        |t        |            j                  |  t        d|j                   |       t        |j                  |       t        j                         }t        ||      5   | |j                         d d d        t        j                         |z
  }	 ||j                         |j                  j                          |j                          t        d	|j                   |       t        |j                  |       |j!                         |j!                         k(  }
t"        j%                  d
||j&                  |
|	       d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)Nr5   )stable_topological_sortrG  zutf-8)modeencoding)r  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherro  r  NamedTemporaryFileior   r\   rX   	propagater  r`  r   nowr[   lint	recompiler/  r   r  r   )ro  r  inpr  ro  r  	before_ioafter_io
start_timetime_elapsedri  s              rU   pass_execution_and_saver  
  sU    9		$	$
 
 
KKM	;;=C	R#3C#89CCSI	"(($1-bhhY'\\^
#B, 	N	||~
2)


#!,bhhX& H$5$5$77hFF	
+
 
	 	
 
s%   BF3;F'CF3'F0	,F33F<c                ~    ddl m} t        | |j                        xr  t        | j                  |j
                        S )zB
    Check if input buffer is a multi-outputs template buffer
    r5   r  )r  r  rv   CppTemplateBufferr  MultiOutputLayout	input_bufr  s     rU   is_multi_outputs_templater  
  s9     i!5!56 :"..< rt   c                    ddl m} t        | |j                        xr2 t	        | j
                        dk(  xr t        | j
                  d         S )zL
    Check if input buffer is a output of multi-outputs template buffer
    r5   r  r   )r  r  rv   MultiOutputrP   r#  r  r  s     rU   #is_output_of_multi_outputs_templater  
  sL      	9bnn- 	;	  !Q&	;%i&6&6q&9:rt   c                &   | yddl m} t        | |j                        xr- t        | |j                         xr |d u xs | j
                  |u xsA t        |       |j                  u xr' t        t        j                  j                  d      xr; | j
                  t        j                  j                  j                  j                  k(  xs t        t        j                  j                  d      xr; | j
                  t        j                  j                  j                  j                  k(  xsa t        t        j                  j                  d      xr; | j
                  t        j                  j                  j                  j                  k(  S )NFr5   r  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  rv   _CollectiveKernel_WaitKernelop_overloadr  FallbackKernelr   rN   r   torchrecr  defaultr  r  r  ro  r  s      rU   is_collectiver  
  sG    | 	4--. 	3400	34Z14++r1 	T
b''' 	

 		**,?@ U$$		(:(:(L(L(T(TT
 		**,DE E$$99%%<<DDE 		**,CD Y$$		(:(:(P(P(X(XX/rt   c                <    ddl m} t        |       |j                  u S Nr5   r  )r  r  r  r  )r  r  s     rU   is_waitr  $  s    :''rt   c                    ddl m} t        | |      rt        d | j                  D              S t        | j                        xr |d u xs  ||       S )Nr   GroupedSchedulerNodec              3  2   K   | ]  }t        |        y wr   )contains_collectiver  s     rU   r   z&contains_collective.<locals>.<genexpr>1  s     @a&q)@r  )r	  r  rv   rk  snodesr  r  )snode	filter_fnr  s      rU   r  r  *  sJ     ?%-.@5<<@@@$P)t*;*Oy?OPrt   c                    ddl m} t        | |      rt        d | j                  D              S t        | j                        S )Nr   r  c              3  2   K   | ]  }t        |        y wr   )contains_waitr  s     rU   r   z contains_wait.<locals>.<genexpr>:  s     :=#:r  )r	  r  rv   rk  r  r  r  )r  r  s     rU   r  r  6  s4    >%-.:U\\:::uzz""rt   c                    ddl m} t        |t        j                  j
                        r|g}t        | |j                        xr | j                  |v S r  )r  r  rv   rN   r|  r}  r  r  r  s      rU   is_fallback_opr  ?  sE     "ejj++,TdB--.I43C3Cr3IIrt   c                B    |||    j                   j                            S r   )defining_opr  )buf_namename_to_bufname_to_fused_nodes      rU   buf_name_to_fused_snoder  J  s#     k(3??HHJKKrt   c                     yrr  r   r  s    rU   rs  rs  U  rt  rt   c                     ||       ry |j                  |        | j                  D ].  }t        |j                  ||      }||v rt	        |||||       0 y )Ncriteria_cb)r6  unmet_dependenciesr  r   find_recursive_deps_of_node)r  collected_node_setr  r  r  depdefining_op_for_deps          rU   r  r  P  sn     55!'' 
5HHk#5
 "44##	

rt   c                     yrr  r   r  s    rU   rs  rs  n  rt  rt   c           	     z    ||       ry |j                  |        | j                         D ]  }|j                  D ]}  }|j                  J |j                  j	                         dk(  r/|j                  j	                         |vrL||j                  j	                            }||v rnt        |||||         y )NOUTPUTr  )r6  get_outputsr  r  r  find_recursive_users_of_node)r  r  r  r  r  orN  user_ops           rU   r  r  i  s     55!  GG 	D99(((yy!!#x/yy!!#+==(););)=>G,,(""'	rt   c                b    t         j                  j                  j                  rdnd}|| z
  |z
  S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)ra  r   )rN   
_functorchrg   functionalize_rng_ops)dynamo_gm_num_inputsaot_fw_gm_num_inputsnum_rng_seed_offset_inputss      rU   num_fw_fixed_argumentsr    s6     $$::   "669SSSrt   c                    dd}d}g }| j                   j                  D ]0  }|j                  dk(  s ||      r|j                  |       |dz  }2 |t	        t        t        |                  k(  sJ t        |      S )z>
    Infers which inputs are static for a backwards graph
    c                ~    d| j                   vxr. d| j                   vxr d| j                   vxr d| j                   vS )Ntangentsbwd_seedbwd_base_offsetbwd_rng_stater  r  s    rU   is_saved_tensorz'count_tangents.<locals>.is_saved_tensor  sH    aff$ .!&&(.!/.  qvv-		
rt   r   r  r5   )rR   r4   r   r*  )r`  r0  ro  r  rR  r   rP   )fx_gr  	arg_countstatic_arg_idxsr  s        rU   count_tangentsr    s    

 IOZZ 44= q!&&y1NI	 d5_)=#>????rt   c                  2    e Zd ZU ded<   ddZedd       Zy)	BoxedBoolr*  r   c                    | j                   S r   )r   r6  s    rU   r>  zBoxedBool.__bool__  s    zzrt   c                6    t        | t              r	d| _        | S yrr  )rv   r  r   r  s    rU   disablezBoxedBool.disable  s    c9%CIJrt   Nrs  )r  r   r   zUnion[BoxedBool, bool])r   r   r   r   r>  r  r  r   rt   rU   r  r    s     K  rt   r  c              #      K   ddl m} |j                  	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fd}t        j                  j                  |d|      5  d  d d d        y # 1 sw Y   y xY ww)Nr5   r7   c                @    j                  |        | |||||      S r   r  )r  kernel_namer  r@  gpucpp_definitionkernel_listorig_define_kernels         rU   define_kernelz.collect_defined_kernels.<locals>.define_kernel  s-     	;'!+{Hc>
 	
rt   r  )NTN)r  r8   r  r  r  r  r@  Optional[str]r  r*  r  r  r   r   )codegen.wrapperr8   r  r   r  r{  )r  r8   r  r  s   `  @rU   collect_defined_kernelsr    s     5-;; #'(,
"

 
  	

 
 &
 

 
		/-	P   s   AA*A	A*A'#A*c                    | dz   S )N__original__r   r  s    rU    get_cloned_parameter_buffer_namer    s    .  rt   c                    | t         v S r   )rL   re  s    rU   r  r    s    Yrt   c                &    | dk7  xr t        |       S )NrG   )r  re  s    rU   device_need_guardr    s    U?-vf~-rt   c                N   | t         j                  k(  r?t         j                  j                         r!t         j                  j	                         dk  S | t         j                  k(  rt         j
                  j                         ry| t         j                  t         j                  fv S )N)r  r   T)rN   r:  rF   rO   r>  rH   rF  r*  r  s    rU   ,needs_fallback_due_to_atomic_add_limitationsr    sk    5::#:#:#<zz//1F::	%..	 UYY%;%;%=ejj111rt   c                   | j                   t        j                  j                  j                  t        j                  j                  j
                  fv r|y| j                   t        j                  j                  j                  k(  rdnd}|d |fvxs |xr t        |      xr t        |      xs | j                   t        j                  j                  j                  k(  xrW |dk(  xrP |xrL |dk(  xrE t        j                  j                  xr) t        j                  j                  xs t               dk7  xs? ||k(  xr" |t        j                  t        j                  fv xs t        j                          S )NFr6  r   r  r5   )overloadpacketrN   r   atenscatter_reduce_scatter_reducescatter_r  r  rg   r  fallback_scatter_reduce_sumdynamic_threadsr  r*  rF  $are_deterministic_algorithms_enabled)r  rW  
self_dtype	src_dtypesrc_device_typesrc_is_tensor	reduce_tys          rU   use_scatter_fallbackr    sZ    	""IINN**EIINN,I,IJ	K" ++uyy~~/F/FFE 
 	tY// 	8 H'H<YG		8 &&%))..*H*HH L%'LL  5(L 

66	L
 ++J/C/E/J	8 i'SJ5::u{{:S,S	8 557!rt   c                   ddl m}m} ddlm} t        dt        |        d       t        |       D ]  \  }}t        d|dd       ||u rt        d	       '||u rt        d
       7t        ||      r|j                         }t        |rdnd d       |r:|j                  J t        d|j                  j                  j                          t        d       |j                  j                  D ]  }t        |        t        d       |j                  j                  D ]  }t        |        t!        dt#        |              y)z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr  3rx  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdr  r  r	  r  r  rP   r   rv   is_reductionr  r  reduction_hintr'  r(  r)  r   r  )r  r  r  r  r=  r  is_redr  s           rU   dump_node_scheduler    s=   
 O7	M 236
:;}- H	T#al?"$%%%%&m,&&(FfU$/?@yy,,,01N1N0OPQ*''-- c
+''.. c
 !9$t*FGG'Hrt   c                z    ddl m}  || j                         t        | j                        z  t
        z  dk(        S )Nr   )rE  )r  rE  storage_offsetr  r   GPU_ALIGN_BYTES)r   rE  s     rU   tensor_is_alignedr  +  s:     L 				 >&,,#?	??RVWW rt   c                |    t        | j                  j                        syt        j                  xs t        |       S rr  )r  r   r  rg   assume_aligned_inputsr  )example_inputs    rU   should_assume_input_alignedr	  9  s2     -&&++,''K+<]+KKrt   c                 6   t         j                  j                  j                         } | st	        j
                         S | j                  r| j                  j                  st	        j
                         S | j                  j                  }|j                         S r   )	rN   _guardsTracingContexttry_getrR  nullcontextrr  rb  suppress_guards)tracing_contextrb  s     rU   #maybe_get_suppress_shape_guards_ctxr  B  sv    
 mm22::<O%%'' $$O,E,E,O,O%%''))33I$$&&rt   c                   t         j                  j                  j                  t        dd      5  t
        j                  j                          dd l}dd l	} |j                         } |j                  |      }ddlm} |j                  |       |j                  }|j!                  |j"                          | |i |}	|j%                         }
|j!                  |       |j'                  |       d d d        |	|
fS # 1 sw Y   	
fS xY w)Nr   Tr   )output_code_log)r
  r   r  r{  rg   rN   r  r  rv  loggingr   StreamHandlertorch._inductor.codecacher  
addHandlerlevelsetLevelDEBUGr/  removeHandler)r   r}   r  rv  r  log_capture_stringchr  
prev_levelr  r   s              rU   run_and_get_cpp_coder  R  s     
			#	#FGT	: *(R[[]"W""#56=""2&$**
  /T$V$'')  ,%%b)*  19!*  19s   CC>>D
c                   t        |       }||j                  S | D ]  }t        |t        j                        r|j
                  j                  c S t        |t        j                        sP|j                         D ]6  }t        |t        j                        s|j
                  j                  c c S  |j                         D ]6  }t        |t        j                        s|j
                  j                  c c S   y r   )	rX   rb  rv   rN   r1   r  r  r  r"  )r#  rr  inputr  r"  s        rU   shape_env_from_inputsr"  k  s     (I """  1eU\\*::''' eU\\*

 /dELL199.../  ,,. 1fell3!;;00011 rt   c                <     t              dk(  r S d fd}|S )Nr   c                z    t        |       \  }} |       }t        |      rt        j                  ||       |S r   )copy_misaligned_inputsrP   rN   _foreach_copy_)
new_inputsold_tensorsnew_tensorsr  inputs_to_checkr  mutated_input_idxss       rU   r  z)align_inputs_from_check_idxs.<locals>.run  sE    #9);$
 [ J {  k:
rt   )r'  list[InputType]r   r   )rP   )r  r*  r+  r  s   ``` rU   align_inputs_from_check_idxsr-    s#    
 ?q  Jrt   c                T   d| j                         v rd}n;t        d t        | j                         | j                               D              dz   }t	        j
                  | |fd      j                         }t	        j
                  || j                         | j                               S )Nr   c              3  2   K   | ]  \  }}|d z
  |z    ywr  r   )r   r  r"  s      rU   r   z)clone_preserve_strides.<locals>.<genexpr>  s     Tf$Tr  r5   r   )r  r   r   r"  rN   
as_stridedclone)rR   needed_sizer  s      rU   clone_preserve_stridesr3    s    AFFH} T#affh
:STTWXX 	 a+6<<>FFAFFHahhj99rt   c                2   g }g }|du}|D ]  }| |   }t        |t        j                        sJ dt        |              |j	                         t
        z  sMt        |      | |<   |s^||v sc|j                  |       |j                  | |           ||fS )z
    Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
    cloned tensor which is in `return_pair_idxs`.
    Nz Expected tensors only, but got: )rv   rN   r  r  data_ptr	ALIGNMENTr3  r  )r'  check_inputs_idxsreturn_pair_idxsr(  r)  ret_pair_definedr   _inps           rU   r%  r%    s     ')K&(K (t3 
2!}$- 	
.tDzl;	
- ==?Y&248JqMA)9$9""4("":a=1
2 ##rt   c                    g }|D ]N  }| |   }t        |t        j                        s#|j                         t        z  dk(  s>|j                  |       P t        |      t        |      k7  r|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )rv   rN   r  r5  r6  r  rP   )r#  static_input_idxsaligned_static_input_idxsr=  r!  s        rU   remove_unaligned_input_idxsr>    st     !#  2seU\\*0@90LQR/R%,,S12 $%->)??((rt   c                P   ddl m} t        j                  t        j                        j
                  }|j                  j                  j                  }|j                  j                  j                  j                  }t        j                  r'|j                  j                  j                  | |       y|j                  j                  j                  | |k        ry|j                  r)|j                  j                  j                  | dk        ry ||       xr  ||       |k  S )Nr5   r\  Tg@xDF)r_  r]  rN   iinforD  r   r`  ra  r5  rb  has_hintrg   assume_32bit_indexing	check_leqrE  aot_compilation)r   r]  int_maxr5  rA  s        rU   expr_fits_within_32bitrF    s    kk%++&**G  **Iww))22H##	""1g. 	ww--a7l; 	 7711!d(;  A;29Q<722rt   c                   t         j                  j                  j                         }||j                  t        |j                        dk(  sJ t        |       |j                  J |j                  D ]  }||j                  j                  d        !dt         j                  j                  j                         x}r|j                  dfd|j                  j                  t        fd|D                      y y y )Nr   Fc                f    t        |       S rj                  |       S j                  |       S r   )r   deserialize_symexprevaluate_symexpr)r   fakify_first_callrb  s    rU   map_exprz4set_tracing_context_output_strides.<locals>.map_expr  s7     ("1v((<<Q??$55a88rt   c              3  .   K   | ]  } |        y wr   r   )r   r   rL  s     rU   r   z5set_tracing_context_output_strides.<locals>.<genexpr>  s     5!(1+5r  )r   r   r   z,Union[float, int, SymInt, SymFloat, SymBool])
rN   r  r  r  output_stridesrP   r"  r  rK  r  )r  compiled_graphr  r  r  rK  rL  rb  s        @@@rU   "set_tracing_context_output_stridesrP    s     mm**224Gw55A7))*a///).9	,,888#22 	E}&&--d3$)!--66>>@@3@(+(=(=%9 &&--5u55		  Brt   c                    t         j                  t         j                  S t        j                         syt        j                  j                         ry	 ddlm}  | t        j                  j                  d      k\  S # t        $ r Y yw xY w)NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
rg   fx_graph_remote_cache	is_fbcoderN   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacherS  ModuleNotFoundErrorjustknobs_getval_intrR  s    rU    should_use_remote_fx_graph_cacher[     s    ##/+++,,.H  5#8#8#M#M8$    s   A> >	B
	B
c                0    t        j                  dd|       S )Nz[^a-zA-Z0-9_]r   )r   subr  s    rU   normalize_namer^  3  s    66"C..rt   ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2z^.*[.]c                l    t         j                  dt        |             }t        j	                  ||      S )z"Convert torch.dtype to triton typetl.)_triton_type_rer]  r  _triton_type_mappingr2  )r   triton_type_names     rU   triton_typerd  I  s.    &**5#e*=##$46FGGrt   c                    t         j                  | |       }|j                  dd      }t        t        |      }t        |t        j                        sJ |S )Nr`  r  )_torch_triton_mappingr2  r  rM   rN   rv   r   )r   adjusted_type	type_namer  s       rU   triton_type_to_torchri  O  sL    )--eU;M%%eR0Iy)Ii---rt   c                   | j                    xr | j                         |j                         k(  xr | j                         |j                         k(  xr | j                  |j                  k(  xr{ | j                  |j                  k(  xr` | j                         j                         |j                         j                         k(  xr! | j                         |j                         k(  S r   )	is_mkldnnr  r"  r   r   untyped_storager5  r  r  r   s     rU   is_same_tensorrn  W  s    NN 	<IIK5::<'	<KKMU\\^+	< JJ%++%	< KK5<<'		<
   "++-1F1F1H1Q1Q1SS	< !U%9%9%;;rt   c                v   | j                   xr | j                         |j                         k(  xr | j                  |j                  k(  xrn | j                  |j                  k(  xrS t        j
                  j                  j                  |       t        j
                  j                  j                  |      k(  S r   )rk  r  r   r   rN   r   mkldnnr5  rm  s     rU   is_same_mkldnn_tensorrq  c  s     	PIIK5::<'	PJJ%++%	P KK5<<'	P II%%d+uyy/?/?/H/H/OOrt   c                      y)N)r`  isnanlogical_notlogical_andsignbitand_leltgegteqner  xorr   r   rt   rU   boolean_opsr  m  s    rt   c                  "    e Zd ZU ded<   ded<   y)OpDtypeRuler2   type_promotion_kindOptional[torch.dtype]override_return_dtypeNr  r   rt   rU   r  r    s    8800rt   r  zdict[str, OpDtypeRule]op_dtype_propagation_rulesc                *    t        ||      t        | <   y r   )r  r  )r   r  r  s      rU   #register_op_dtype_propagation_rulesr    s    
 (32(t$rt   zOrderedSet[str]op_requires_libdevice_fp64c                .    t         j                  |        y r   )r  r6  r  s    rU   #register_op_requires_libdevice_fp64r    s    ""4(rt   c                    ddl m} | s$|j                  j                         j                  } | dk(  rt
        j                  S | dk(  ry| dk(  rt
        j                  S t
        j                  S )Nr   r\  r  rG   rH   )	rC  r]  r`  get_current_device_or_throwr  rg   cpu_backendxpu_backendcuda_backend)r   r]  s     rU   get_current_backendr    s_    -gg99;@@e!!!				!!!"""rt   c                    | t         j                  t         j                  fv r7t        j                  j
                  rt               dk(  rt         j                  S | S )z"Maybe upcast [b]float16 to float32r  )rN   r   r:  rg   r  codegen_upcast_to_fp32r  r<  r  s    rU   upcast_compute_typer    s@     	%--00MM00!X-}}Lrt   KeyTypeValTypec                  Z    e Zd ZdZddZddZddZddZdddZddZ	dd	Z
dd
ZddZy)
ScopedDictz
    A dictionary-like object that allows for scoped updates. It maintains
    an original dictionary and a set of new items that can override
    the original items within the scope.  The original dictionary is
    unmodified.
    c                     || _         i | _        y r   original_dict	new_items)r  r  s     rU   r&  zScopedDict.__init__  s    *13rt   c                Z    || j                   v r| j                   |   S | j                  |   S r   r  r  r  s     rU   r   zScopedDict.__getitem__  s.    $.. >>#&&!!#&&rt   c                "    || j                   |<   y r   )r  )r  rS  r   s      rU   __setitem__zScopedDict.__setitem__  s    #srt   c                >    || j                   v xs || j                  v S r   r  r  s     rU   __contains__zScopedDict.__contains__  s!    dnn$At/A/A(AArt   Nc                t    || j                   v r| j                   |   S | j                  j                  ||      S r   )r  r  r2  )r  rS  r  s      rU   r2  zScopedDict.get  s6    $.. >>#&&!!%%c733rt   c                z    t        | j                        }| j                  D ]  }|| j                  vs|dz  } |S r+  )rP   r  r  )r  r  r  s      rU   r  zScopedDict.__len__  sC    ""# 	A***Q	 rt   c              #     K   | j                   E d {    | j                  D ]  }|| j                   vs|  y 7 )wr   r  )r  r  s     rU   __iter__zScopedDict.__iter__  s@     %%%% 	A***	 	&s   ><!>>c                H    t        | j                  xs | j                        S r   )r*  r  r  r6  s    rU   r>  zScopedDict.__bool__  s    D&&8$..99rt   c                    t         r   r  r  s     rU   __delitem__zScopedDict.__delitem__  s    !!rt   )r  Mapping[KeyType, ValType])rS  r  r   r  )rS  r  r   r  r   r  )rS  r{  r   r*  r   )rS  r  r  Optional[ValType]r   r  r  )r   zIterator[KeyType]rs  )rS  r  r   r  )r   r   r   r   r&  r   r  r  r2  r  r  r>  r  r   rt   rU   r  r    s5    4'
$B4
:"rt   r  )frozen_defaultc              (    dfd}| |S  ||       S )Nc                4    t        j                  | d      S )NT)kw_onlyr   )dataclasses	dataclass)r   r   s    rU   wrapzir_dataclass.<locals>.wrap  s    $$S$vFFrt   )r   rk   r   rk   r   )r   r   r  s    ` rU   ir_dataclassr    s    G {9rt   c                     t         j                  j                  j                         } | "| j                  r| j                  j
                  S y r   )rN   r  r  r  fw_metadatabw_donated_idxs)r  s    rU   get_donated_idxsr    s=    mm22::<O"'B'B**:::rt   c                       e Zd ZdZdZdZdZdZy)TritonAttrsDescriptorVersionr   r5   ra  r  r!  N)r   r   r   V0_NO_TRITONV1_COMPILERV2_BACKENDSV3_BACKENDS_TUPLEV4_DICTr   rt   rU   r  r    s     LKK	  Grt   r  c                 P   t         j                  j                  d      t        j                  S dd l} dd l} t        | j                  j                  d      rt        j                  S t        | j                  j                  d      rt        j                  S t        j                  S )Nr  r   AttrsDescriptor)r   r!  r"  r  r  triton.backends.compilertriton.compiler.compilerr   r:  compilerr  r  r  )r  s    rU   #get_triton_attrs_descriptor_versionr    s{    ~~)1+888##v''):; ,777	))+<	=+777 ,333rt   c                 8    t               t        j                  k(  S r   )r  r  r  r   rt   rU   triton_version_uses_attrs_dictr    s    .04P4X4XXXrt   c                j   ddl m} | j                  }t        |t        j
                  j                        sy|t        j                  j                  j                  j                  t        j                  j                  j                  j                  t        j                  j                  j                  j                  fv rq ||| j                  | j                  d      }|O|\  }}|d   }|D ]@  }||j                  d   j                   t        j"                  t        j$                  fv s@ y y)a  
    Check if an FX node is cudagraph-unsafe based on its input arguments.

    Some ops are only cudagraph-unsafe depending on their inputs (e.g., index_put
    with boolean indices triggers .nonzero() during capture, but integer indices
    are safe).
    r   )normalize_functionFT)normalize_to_only_use_kwargsindicesrZ  )torch.fx.operator_schemasr  r{  rv   rN   r|  r}  r   r  	index_putr  
index_put__unsafe_index_putr}   r  r  r   r*  rK  )rx  r  r{  
normalizedr   r  r  r=  s           rU   ,_fx_node_is_input_dependent_cudagraph_unsafer  "  s     =^^Ffejj334 		  ((		!!))		((00 
 (GLL'..t

 !"IAvY'G  ?sxx'<'<JJKKA (    rt   c                   | j                   }t        |      t        v ryt        |t        j
                  j                        r1t        j                  j                  j                  |j                  v ryt        |       ry| j                  j                  d      x}Kt        |t        t        f      s|gn|}|D ]+  }t        |t        j                         s|j"                  s+ y y)a   
    Check if an FX node is cudagraph-unsafe.

    This includes:
    - Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
    - Ops with the cudagraph_unsafe tag
    - Input-dependent unsafe ops (e.g., index_put with boolean indices)
    - Ops with sparse tensor outputs
    TrZ  F)r{  r  FORBIDDEN_CUDAGRAPH_OPSrv   rN   r|  r}  r   r  cudagraph_unsafer  r  r  r2  rR  r  r  	is_sparse)rx  r{  rZ  valsr   s        rU   r  r  F  s     ^^F 6{-- 	65::001HHLL))V[[8 4G< ||&&3&sT5M:u 	A!U\\*q{{	 rt   c                    ddl m} t        | |j                  |j                  f      ryt        | |j
                  |j                  f      syt        | dd      }|t        |      ryy)ah  
    Returns True if the node is an op that is not cudagraphable.
    This includes:
    - Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
    - Ops with the cudagraph_unsafe tag
    - index_put_ with boolean indices (triggers .nonzero() during capture)
    - Control flow nodes (Conditional, WhileLoop)
    - Ops with sparse tensor outputs
    r5   r  TFrx  N)	r  r  rv   Conditional	WhileLoopr  r=   rM   r  )r  r  rx  s      rU   is_cudagraph_unsafe_opr  k  sa      $67dR..@AdIt,G:7Crt   c                    t         j                  j                  dd      } t        j                         rUddlm}  |       }|rFt         j                  j                  |dd      }| r!t         j                  j                  || g      n|} | S )NLD_LIBRARY_PATHr  r   )get_runtime_pathr  lib)
r  r  r2  rg   rU  libfb.py.parutilr  r  r  pathsep)r  r  runtime_pathlib_paths       rU   get_ld_library_pathr    sg    ::>>+R0D5')ww||L)UCH8<2::??Hd#34(DKrt   c                F    ddl m} t        | |      xr | j                  d uS )Nr   )SubgraphPythonWrapperCodegen)torch._inductor.codegen.wrapperr  rv   partition_signatures)r  r  s     rU   #is_codegen_graph_partition_subgraphr    s*    L 	789 	5((4rt   c                     t         j                  j                  j                  j                  xs t
        j                  d uxr$ t         j                  j                  j                  S r   )rN   r  rg   r  
cudagraphs&_unstable_customized_partition_wrapperr  graph_partitionr   rt   rU   is_using_cudagraph_partitionr    sN    %%00 	F199E1 //
 
 
0
01rt   c                    ddl m} |j                  j                  j	                  | d      r6|j                  j                  j                  | d      rt        j                  S t        j                  S )Nr5   r\  l        i   )	r_  r]  r`  ra  statically_known_ltr  rN   rD  rF  )r  r]  s     rU   dtype_from_sizer    sP    ww++e
''


/
/h
?{{{{rt   )r  rH   c                h    | dk(  r(t         j                  j                  j                         S d| v ryy)z;
    Returns True if the device supports MKL-DNN BF16.
    r  rH   TF)rN   r   rp  _is_mkldnn_bf16_supportedr   s    rU   is_mkldnn_bf16_supportedr    3     eyy99;;	+	rt   c                h    | dk(  r(t         j                  j                  j                         S d| v ryy)z;
    Returns True if the device supports MKL-DNN FP16.
    r  rH   TF)rN   r   rp  _is_mkldnn_fp16_supportedr  s    rU   is_mkldnn_fp16_supportedr    r  rt   c           
     n   |D cg c]  }t        t        |             }}| D ]R  }t        |      t        |      k(  sJ t        |      D ])  \  }}t        ||   t        t        |                  ||<   + T g }|j	                  dj                  d t        ||      D                     t        |      t        |      dz  z   t        |      dz
  z   }|j	                  d|z         | D ]3  }|j	                  dj                  d t        ||      D                     5 dj                  |      S c c}w )N|c              3  6   K   | ]  \  }}d || dd   ywr  r  Nr   )r   hrG  s      rU   r   ztabulate_2d.<locals>.<genexpr>  s$     H41aAa0tWA,H   ra  r5   ri  c              3  6   K   | ]  \  }}d || dd   ywr  r   )r   r   rG  s      rU   r   ztabulate_2d.<locals>.<genexpr>  s$     Htq!!QCp4lHr  r-  )rP   r  r   r   r  r  r   r   )elementsheadersr   widthsrowr   rJ  total_widths           rU   tabulate_2dr    s   #*+ac#a&k+F+ 43x3w<'''cN 	4DAqF1Is3q6{3F1I	44 E	LLH3w3GHHIf+Vq1S[1_EK	LL{"# JSXXHs37GHHIJ99U ,s   D2c              #     K   t        | j                               t        |j                               z  }|D ]3  }| j                  |      }|j                  |      }|||n|||n|f 5 yw)a  
    Zip two dictionaries together, replacing missing keys with default values.

    Args:
        dict1 (dict): The first dictionary.
        dict2 (dict): The second dictionary.
        d1_default (Any): the default value for the first dictionary
        d2_default (Any): the default value for the second dictionary

    Yields:
        tuple: A tuple containing the key, the value from dict1 (or d1_default if missing),
               and the value from dict2 (or d2_default if missing).
    N)r"   r4  r2  )dict1dict2
d1_default
d2_defaultall_keysrS  value1value2s           rU   	zip_dictsr    sv     ( %**,'*UZZ\*BBH  	
33 (Fj(Fj
 	
	
s   A-A/c                T   	 	 	 	 	 	 	 	 dd}	 	 	 	 	 	 	 	 dd}| j                  dt        j                  j                        }| j	                         } |rm || dd        || dd        || dt
        j                  j                           || dd	        || d
t        j                  j                          || dd       | j                  dt        j                  j                        }| j                  dt        j                  j                        }|dk(  r|rt        d      | S )a6  
    Ensures the configuration is internally consistent for standalone AOTInductor.

    If `aot_inductor_mode.compile_standalone` is set to True in the provided
    `config_patches` (or falls back to the global config), this function ensures
    that the following configs are also enabled:
        - `aot_inductor.package_cpp_only`

    Args:
        config_patches (dict[str, Any]): A dictionary of user-provided config
            overrides for AOTInductor compilation.

    Returns:
        dict[str, Any]: The possibly-updated `config_patches` dictionary.
    c                    | j                  |t        t        |            }||| |<   y |s||k7  rt        d| d| d      y y )NzInvalid config: =z3 when aot_inductor_mode.compile_standalone is True.)r2  rM   rg   r   config_patchesconfig_nameconfig_valuer   s       rU   patch_configz2maybe_aoti_standalone_config.<locals>.patch_config  s]     "";0LM=*6N;'5L0";-q>qr  1rt   c                    | j                  |t        t        |            }||k7  rt        j	                  d||       || |<   y )NzDOverriding: %s=%s when aot_inductor_mode.compile_standalone is True.)r2  rM   rg   r   r  r  s       rU   force_patch_configz8maybe_aoti_standalone_config.<locals>.force_patch_config  sF     "";0LML KKV
 '3{#rt   z$aot_inductor_mode.compile_standalonezaot_inductor.package_cpp_onlyTz aot_inductor.embed_kernel_binaryz#aot_inductor.emit_multi_arch_kernelz+aot_inductor.model_name_for_generated_files
aoti_modelzaot_inductor.link_libtorchzaot_inductor.dynamic_linkageFz"aot_inductor.cross_target_platformz$aot_inductor.package_constants_in_sowindowszconfig.aot_inductor.package_constants_in_so is not supported for windows cross-compilation. Please use config.aot_inductor.package_constants_on_disk_format = binary_blob.)r  dict[str, Any]r  r  r  r   r   r  )r2  rg   aot_inductor_modecompile_standalonecopyrN   r  r  test_configsuse_libtorchaot_inductorcross_target_platformpackage_constants_in_sor   )r  r  r  r  r   r!  s         rU   maybe_aoti_standalone_configr"    sk   "	&	58	HK			
3&
358
3HK
3	
3 (++.  33
 $((*N^%DdK^%GNAu}}GXGXCX	
 	I<	
 	(,,	

 	>+I5Q*..,11
 -00.33
 	).E]
 	

 rt   c                   t         j                  j                  r(t         j                  j                  dk(  rt	        d      t         j                  j                  r0t         j                  j
                  dk(  rt	        d      d}d}||fS t         j                  j                  dk(  rd}d}||fS | dk  ryd}t        j                          }||fS )	a  
    Decide whether we should mmap weights, and whether to store the weights with .so.

    If force_mmap_weights or package_constants_on_disk_format == "binary_blob" configs are set, respect the config.

    Returns tuple (use_external_weights, use_mmap_weights).
    binary_blobzconfig.aot_inductor.package_constants_on_disk_format = binary_blob and config.aot_inductor.force_mmap_weights cannot both be True.r  zKwhen cross_target_platform is windows, use_mmap_weights should not be true.TFi 5w)FF)rg   r  force_mmap_weights package_constants_on_disk_formatr   r   rU  )consts_sizeuse_mmap_weightsuse_external_weightss      rU   determine_aoti_mmap_flagsr*  P  s     	..@@MQJ
 	

 --44	A]   $#%555;;}L# #%555m# !++--!111rt   c                     ddl m}  | j                  j                  }|yt	        |t
              st        d      |dk(  ryt        j                  d|      st        d      y)zL
    Validates if a model name is suitable for use in code generation.

    r   rf   Tz4Invalid AOTI model name: Model name must be a stringr  z^[a-zA-Z_][a-zA-Z0-9_]*$zVInvalid AOTI model name: Model name can only contain letters, numbers, and underscores)	r  rg   r  model_name_for_generated_filesrv   r  r  r   r   )rg   
model_names     rU   is_valid_aoti_model_namer.  y  sh    
 '$$CCJj#&OPPR 88/<d
 	
 rt   c                2    |rt        |       S t        |       S r   )r)   r(   )rR   unbacked_onlys     rU   get_free_symbolsr1    s    $Q''Art   c                    t         j                  j                  sy| |  }|rE|j                  x}r7|j	                         x}r%|j
                  j                  dd      x}r| d| }t        j                  |       y)z
    Cudagraph partition may lead to extra memory overhead so we
    log partition reasons to help users understand the overhead.
    Nstack_tracez. Found from : 
 )	rg   r  r  r  get_origin_noder  r2  perf_hint_logr  )r  r{  r  warning_msgir_noderx  r3  s          rU   maybe_log_cudagraph_partitionr8    s     ==##HSE"K 			!W!//11W1#LL,,]DAA[A$%7}E+&rt   c                    i t         j                  dt         j                  j                  dt         j                  j	                  t
        j                              i} t        j                         rt        j                  d      | d<   | S )zA
    Get a base environment for running Python subprocesses.
    
PYTHONPATHTORCH_CUSTOM_PYTHONPATHr  
PYTHONHOME)r  r  r2  r  r  r  r  rg   rU  	sysconfigget_path)envs    rU   python_subprocess_envr@    sl    

** 	bjjnn%rzzsxx'@
	C  %..v6LJrt   c                  &    e Zd ZU dZded<   ded<   y)CUDAGraphWrapperMetadataz
    Metadata for Customized CUDAGraphWrapper.

    Currently assumes there is 1 dynamo graph and will extend to
    multiple graphs in the future.
    r   num_partitionspartition_indexNr   r   rt   rU   rB  rB    s      rt   rB  .c                      e Zd ZU dZded<   y)CUDAGraphWrapperNzOptional[CUDAGraphWrapperType]r  )r   r   r   r  r   r   rt   rU   rF  rF    s    .2G+2rt   rF  c                    | t         _        y r   )r  r  )r  s    rU   !set_customized_partition_wrappersrH    s    5<*2rt   c                8   | j                   j                  }| j                   j                  g || j                   j                  | j                   j                        }| j                   j                  }t        j                  ||f      \  }}dd}|D cg c]7  } ||      r+t        j                  j                  j                  |d      n|9 }}dddfd}|D cg c]
  } ||       }}t        j                  ||      \  }}||fS c c}w c c}w )	Nc                    t        | t        j                  j                  j                        xr/ t        | t        j                  j                  j
                         S r   )rv   rN   r  r  r>   GeneratorStater  s    rU   _is_tensor_irz(snode_args_kwargs.<locals>._is_tensor_ir  sH    !U__//667 

u!!00A
 =
 	
rt   F)guard_shapec                2    t        j                  | ||      S )Nr   )rN   r   )r  r   r   s      rU   _tensorz"snode_args_kwargs.<locals>._tensor  s    {{4uV<<rt   c                    t        | t        j                        s| S  | j                         | j                  | j
                        }|S r   )rv   rN   r  r  r   r   )r   r  rO  s     rU   to_real_tensorz)snode_args_kwargs.<locals>.to_real_tensor  s7    !U\\*Haffh2
rt   rs  )r   r  )r   r   r   r   )r  r#  fill_non_provided_argsconstant_argsr  pytreer#   rN   r  r  ir_node_to_tensortree_unflatten)	r  r}   r  	flat_argsflat_args_pytree_specrL  r  rQ  rO  s	           @rU   snode_args_kwargsrY    s   ::D::,,*$*))*

D ZZF'-':':D&>'J$I$
 	   	,,QE,B	I = -66q"6I6((4IJLD&<%  7s   <D$Dc                    ddl m} | j                  }|j                  j                  r(|j	                  |j                  j                  dz         }|j                  d      S )Nr5   r\  r   )primals_r  fwd_rng_stater  r  )r_  r]  r   r`  removeprefixr  )r  r]  dep_names      rU   is_nonfreeable_buffersr_    sN    xxH 	ww||(();<I rt   c                p    t        ||  dz        5 }|j                         cddd       S # 1 sw Y   yxY w)z,Load a template file and return its content.z	.py.jinjaN)openread)r   template_dirr  s      rU   load_templaterd  -  s6    	lvY//	0 Avvx  s   ,5c                v   | j                   }t        |t        j                  j                  t        j                  j
                  f      sJ dt        |              t        j                  syt        t        j                  j                  j                  j                  t        j                  j                  j                  j                  g      }||v ryt        t        j                  j                  j                   g      }t        |t        j                  j
                        r||v S t#        |        S )zLDecide whether fallback for a node. This is only used in inductor lite mode.z6Expected OpOverload or HigherOrderOperator, but found F)r{  rv   rN   r|  r}  r  r  rg   fallback_by_defaultr"   r   r  _assert_scalarr  lift_fresh_copyhigher_order triton_kernel_wrapper_functionalr    )r  r{  "skip_fallback_due_to_dynamic_shapefallback_hopss       rU   should_fallback_by_defaultrm  3  s    [[F&&

(F(FG O	?V~NO  %% *4IINN))11IINN**22	
*& 33 				@	@AM &%**889&&&t,,,rt   )	z-torch.ops._c10d_functional.all_reduce.defaultz.torch.ops._c10d_functional.all_reduce_.defaultz9torch.ops._c10d_functional.all_gather_into_tensor.defaultz8torch.ops._c10d_functional.reduce_scatter_tensor.defaultz4torch.ops._c10d_functional.all_to_all_single.defaultz6torch.ops._c10d_functional_autograd.all_reduce.defaultzBtorch.ops._c10d_functional_autograd.all_gather_into_tensor.defaultzAtorch.ops._c10d_functional_autograd.reduce_scatter_tensor.defaultz=torch.ops._c10d_functional_autograd.all_to_all_single.defaultc                    | t         v S )z0Check if an operation is a collective operation.)COLLECTIVE_OPS)r>  s    rU   is_collective_oprp  h  s    n$$rt   rq  )rr   r   r   r   )r   r   r   r*  )   d   )r   Callable[[], Any]r   r   r   r   r   r]  )rq  rr  F)
r   rs  r   r   r   r   r   r*  r   r]  rs  )r   z"Union[Optional[torch.device], str]r   torch.device)r  zIterable[sympy.Expr]r   r   )r  r  r  r  r   r   )r  zIterable[_T]r   zValuesView[_T])r&  r  r'  r  r   r  )rS  r  r   r  )rY  z"Iterable[Union[int, torch.SymInt]]r   zlist[sympy.Expr])r   r  r   zUnion[int, torch.SymInt])rY  z Iterable[Union[int, sympy.Expr]]r   zlist[Union[int, torch.SymInt]])ro  torch._ops.OpOverloadr   r*  )r  r4   rz  z'Callable[[torch._ops.OpOverload], bool]r   r*  )r{  r   r}   r  r  r  r   z&tuple[GraphModule, list[torch.Tensor]])rF   )r   r  r   r  )r5   rF   )
r  Callable[..., Any]r  Sequence[Any]r   r   r   r  r   r]  )r   r  r  g      ?rF   )r  rv  r  rw  r   r   r  r   r  r]  r   r  r   r]  )r  r   r  r  r   r  )r  r   r  r   r   r  )r  r   r  r   r   r   )rR   zUnion[int, Sequence[int]]r  r   r   Sequence[int])rR   ztuple[_T, ...]r   zlist[_T])r   z!Callable[Concatenate[Any, P], RV]r   CachedMethod[P, RV])r   zCallable[P, RV]r   ry  )r  r  r   z*Callable[[FN_TYPE[P, RV]], FN_TYPE[P, RV]])r  0Union[Sequence[BaseSchedulerNode], ExternKernel]r   zOrderedSet[Node])r  Sequence[BaseSchedulerNode]r   z8Literal[True, 'torch', 'original_aten', 'inductor_node']r   r  )r  rz  r  r8   r   ztuple[str, str]r   )rK  zIterable[torch.fx.Node]rL  zOptional[Callable[[Any], bool]]r   OrderedSet[torch.fx.Node])r}   zSequence[IRNode]r  zdict[str, IRNode]r   r|  rq  )r  r   r   zValueRanges[Any])r{  r  r   r*  )r{  rc   r=  r   r   r  )r  r*  r   r*  )r   r  r   r  )rc  r   r  zdict[sympy.Expr, Any]r   r   )r  r   r   z,TypeGuard[Union[torch.SymInt, torch.Tensor]])r}   r   r   r*  )r  torch.fx.GraphModuler   zOptional[torch.fx.Node])r  r}  r   r4   )r  r}  r   zOrderedSet[torch.device]rr  )r  r   r   r   )NNT)r  zOptional[dict[str, Any]]r  r  r  r*  r   rQ  )r  rw  r  r*  r   	list[int])rb  r+   r  z.Sequence[Union[int, torch.SymInt, sympy.Expr]]r  r*  r   r~  )r   r  r   r   rP  rp  )r  zUnion[int, torch.device]r   r*  r  )r  r   r   rt  r  Optional[int]r   r6   )r  r?   r  zlist[torch.dtype]r   r*  )r  r  r   r*  )
r  r?   r  r*  r  r*  r  r*  r   r*  )r  r>   r  r  r  r*  r   r*  )r  r>   r  r?   r  r*  r   r*  )r(  r   r)  r   r  r?   r*  r*  r+  r*  r,  Optional[Any]r-  r  r.  r  r   r*  )
r  r?   r  r   r  r   r  r   r   r*  )r>  r  r   r*  r   )
r  rA  r  rA  r  rA  rK  r   r   r*  )r  rA  r  rA  r  rA  r   r*  )r  rA  r  rA  r  rA  r   r~  )r   r  r   r  )r   zQtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]])r  r?   r   r*  )r  r?   r  zUnion[ReinterpretView, Buffer]r  r>   r   r*  )FTFN)r  r?   r  r>   r  r>   r  r*  r  r*  r  r*  r  r  r   r*  )r   Callable[P, _T]r}   r  r  r  r   ztuple[_T, list[str]])r   rv  r   ztuple[Any, list[str]])r   r  r}   r  r  r  r   r   )r   r  r}   r  r  r  r   r  )r   r  r}   r  r  r  r   ztuple[Any, list[GraphLowering]])r  rv  r   rv  r   rQ  )r  rv  r  zOptional[Callable[..., Any]]r   r   )r  r  r   r  )r   r  )r5  rw  r   r*  )r#  zSequence[torch.Tensor]r   r*  )rZ  r   r   r  )r(  r*  r}   r   r  r   r   zIterator[Any])r   r  r   r]  )rW  r  r   r*  )rW  r  r   r   )rd  zIterable[Any]r   r*  )
ro  rv  r  r3   r{  rw  r  r  r   r  )r  z"Optional[Union[Buffer, Operation]]r   r*  )r  z Optional[Union[Node, Operation]]ro  z!Optional[torch._ops.OperatorBase]r   r*  )r  z"Optional[Union[IRNode, Operation]]r   r*  )r  rD   r  z-Optional[Callable[[BaseSchedulerNode], bool]]r   r*  )r  rD   r   r*  )r  zOptional[Operation]ro  z?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]r   r*  )r  r  r  r  r  r  r   r   )r  rD   r  zMutableSet[BaseSchedulerNode]r  zdict[str, SchedulerBuffer]r  zdict[str, BaseSchedulerNode]r  zCallable[[Any], bool]r   r  )r  r   r  r   r   r   )r  r}  r   r   )r  r   r   rQ  )r   r  r   r  )r   r  r   r*  )r   r  r   r*  )r   r  r   r*  )r  ru  rW  r  r  r  r  r  r  r  r  r*  r   r*  )r  r{  r   r  )r   r  r   r*  )r  r  r   r*  )r   rt  )r   r  r}   r  r  r  r   ztuple[_T, str])r#  Sequence[InputType]r   zOptional[ShapeEnv])r  Callable[[list[InputType]], _T]r*  rx  r+  zOrderedSet[int]r   r  )rR   r  r   r  )r'  r,  r7  rx  r8  zOptional[OrderedSet[int]]r   z-tuple[list[torch.Tensor], list[torch.Tensor]])r#  r  r<  rx  r   rx  )r   r   r   r*  )r  rw  rO  rC   r   r  )r   r  r   r  )r   r  r   r  )r  r  r   r  r   r*  )r   ztuple[str, ...])r   r  r  r2   r  r  r   r  )r   r  r   r  )r   r  r   r  )r   r  r   r  )r   zOptional[type[Any]]r   r*  r   r   )r   zOptional[list[int]])r   r  )rx  torch.fx.Noder   r*  )r  r@   r   r*  )r  r8   r   r*  )r  r   r   r  )r   r  r   r*  )r  zSequence[Sequence[T]]r  zSequence[T]r   r  )NN)
r  r  r  r  r  ValType | Noner  r  r   zEGenerator[tuple[KeyType, ValType | None, ValType | None], None, None])r  r  r   r  )r'  r   r   ztuple[bool, bool])rR   r*   r0  r*  r   zOrderedSet[sympy.Symbol])zcudagraph partition due to N)r  r  r{  r  r  zOptional[BaseSchedulerNode]r   r  )r   zdict[str, str])r  CUDAGraphWrapperTyper   r  )r  rD   r   z tuple[list[Any], dict[str, Any]])r  r9   r   r*  )r   r  rc  r%   r   r  )r  r  r   r*  (  
__future__r   r.  rR  r  enumr  r   r?  rv  rW  r  r_  r  r  r^  r   r  r   r  r=  r  ra  r  r
  collections.abcr   r   r   r   r   r	   r
   r   r   typingr   r   r   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   rw   rN   torch.utils._pytreer  _pytreerT  $torch._inductor.analysis.device_infor   torch._inductor.runtime.hintsr   !torch.fx.passes.regional_inductorr    torch.utils._dtype_abbrsr!   torch.utils._ordered_setr"   r#   r$   pathlibr%   OPTIMUS_EXCLUDE_POST_GRADr  r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   torch._prims_commonr2   torch.fxr3   torch.fx.noder4   r  r6   r  r8   dependenciesr9   r`  r;   r  r<   r=   r>   r?   r@   rA   output_coderC   r  rD   rE   rL   rJ   r   rV   torch._dynamo.device_interfacerW   torch._dynamo.utilsrX   torch.autogradrY   torch.autograd.profiler_utilrZ   (torch.fx.passes.graph_transform_observerr[   torch.fx.passes.shape_propr\   torch.utils._sympy.functionsr]   r^   r_   r`   ra   torch.utils._sympy.symbolrb   rc   torch.utils._sympy.value_rangesrd   re   r  rg   runtime.runtime_utilsrh   r%  _IS_WINDOWS	getLoggerr   r   _logginggetArtifactLoggerr5  rk   r  r#  	VarRangesr  r   	InputTypeGPU_KERNEL_BIN_EXTSr  r6  r  r  rq   rs   r|   Functionr~   r  r   r   r   r   r  r  r  r  r   rV  rZ  rd  rf  rp  rx  r  r   r  r  r  r  r  r  r  r  r  FN_TYPEr  r  r  r  r  r  rI  rO  r^  rp  ry  r|  r  r  r  r  r  r  	frozensetr  r  r  r  r  r  r  r   r  r  rS  r  clear_on_fresh_inductor_cacheclear_inductor_cachesfresh_inductor_cacher  r  	lru_cacher  r  r  r   rv  r  r,  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r#  r/  r:  r@  rA  rL  rP  rb  rf  rv  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r$  r&  r)  r  r3  rF  rJ  rO  rS  rX  r[  r`  r  r  r'  Enumrk  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r"  r-  r3  r%  r>  rF  rP  r[  r^  rb  r5  rf  compilera  rd  ri  rn  rq  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  SUPPORTED_MKLDNN_DEVICESr  r  r  r  r"  r*  r.  r1  r8  r@  rB  PartitionFnTyper  rF  r  rH  rY  r_  rd  rm  ro  rp  )r  r   s   00rU   <module>r     s1   "        	     	  	   
             C B    $ $ ? : E 0 / ;  ($ 
  >>//C$",5!$TT,= +	CL
   D 0 % 2 K 0  8 D  = llg%g!00<H T]UZZ'(	U5<<ell:;<	'7 	 {Q'A-+2B XDX XB5
LENN  d#  $"GX #(	 
 !	
 4 #(	[[[ 
[ !	[
 [|  ;@
+*"*+A**#AL+	+++"/	)/#/G @OI	I<I 
I0 *8+0' 	!  	
 ( %'!  	
    )'#$  cNTT"
;sAv&*
+E8WQU^ E:++/+\C,4).4)O4) 	4)nW2CW2!W2 W2x 48*0 (G
G$5GG:,^%	DU	>2- $ $'& 
< !# I "	 .29+9	9 9 	9 9z !5 $ "  49 ( 	$$	7$ 	$
 $N Q7 7*  , , ,
S' S'l
 
 @ @ @?' ?2% 2  8 J J ) )I #'   	(+<	  #  	
  
< :>RWpp&6pKOp	ph BG&,:>	 BGVV&,V:>V	V  Q	  	>>> > 	>
 > > >  > 
>BBJ CO,) , EF!)?B	 *  . 5( 5(p @ @ R R:"JH&8@F	: ""&"&==
= = 	=
  = =  = 
=@'C C"&&& & 	&$ &2:/(V		 &	2:		## &#2:#$#* ...@.. .$ IMFF)EFF*	B&&   D D (6 (6V  Q.0(#K(*$)) * 

 
"- 
4A 
HK 
	 
F1	" -1#
*#)# 
#L( @D	Q	Q<	Q 
	Q#J
JGJ 
JLL .LDRLL *=

5
 ,
 5	

 '
 

< *=5 , 5	
 ' 
:T 2     ,!.2$&$!$ $ 	$
 $ $ 
$NH>L'  &2:2:*" ( %	0	: 37$$$$ 0$ 3	$<$ $$3N!3B	:&/ '#)* $%
  +?*D*D*FG$!QAG  "**Y'H	  & 1 1 1
 68 2 7
8 1 
	 /9l O :)# # )

)
-" 01 -"` D)t  *499  4 42Y!H"J4
1 * 		& "&!%	 
$ 
$ 
  
 	 

 K 
FRj&2R6 :(,'	'' &' 
	'26 d#  $ 38$./@ 3 3 *:); &= F
"-L 
%K Hs   p#