
    i`                    ,   U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Zd dl
Z
d dlZd dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmc mZ  d dl!m"Z"m#Z# d dl$m%Z% d dl&m'Z( d d	l)m*Z* d d
l+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3 d dl4m5Z5m6Z6 d dl7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZC ddlDm'Z' ddlEmFZF ddlGmHZHmIZImJZJmKZK ddlLmMZM ddlNmOZOmPZP ddlQmRZR ddlSmTZT ddlUmVZVmWZW ddlXmYZYmZZZm[Z[ erd dl\Z\e'j                  Z^e_e`d<    ej                  eb      Zcej                  e`d<   ej                  j                  Zfej                  j                  Zge G d d             Zhe G d  d!             Zie G d" d#             Zjd$ej                  d%e_fd&Zld'ej                  d%e_fd(Znd'ej                  d%e_fd)Zod$ej                  d%epfd*Zq G d+ d,      Zrd- Zs er       Zt	 	 dd.ej                  d/evej                     d0evej                     d1eveO   d2eew   d3e_d%ej                  fd4Zxd$ej                  d%e_fd5Zyd$ej                  d%e_fd6Zzd$ej                  d%e_fd7Z{d$ej                  d%e_fd8Z|d$ej                  d%e_fd9Z}d$ej                  d%e_fd:Z~d$ej                  d%e_fd;Zd$ej                  d%e_fd<Zd$ej                  d%e_fd=Zd$ej                  d%e_fd>Zd$ej                  d%e_fd?Zd$ej                  d%e_fd@ZdAej                  d%eevej                     evej                     eveO   eveO   f   fdBZdCevej                     dDewfdEZdFeevej                     eej                  dGf   f   d%epfdHZ	 	 	 ddIej8                  j                  d$ej8                  j                  dJedKedLepf
dMZdIej8                  j                  d$ej8                  j                  dNej8                  j                  dOej                  dPedQedLepd%ej8                  j                  fdRZdSej                  d%efdTZd%evej                     fdUZd$ej8                  j                  d%e_fdVZd%ej                  fdWZdXej                  d%efdYZdIej8                  j                  d%dfdZZdIej8                  j                  d%dfd[Zd\ej                  d]ej                  d^eewej                  f   d%dfd_Z	 ddCevej                     d\ej                  d]ej                  d`eeAej                        d%df
daZddbdAej                  dCevej                     dcevej                     ddepd`eeAej                        d%eej                  ej                  f   fdeZdddfdAej                  dgeevep      d`eeAej                        d%eej                  ej                  f   fdhZ epdi      Zdjepd%epfdkZd$ej                  d%epfdlZdIej                  fdmZej<                  dn        Zdoeej                  epf   d%eveej                  epf      fdpZdqej                  d%ej                  fdrZdsej8                  j                  dtej8                  j                  duej8                  j                  dvej8                  j                  dwejD                  dxepdyej8                  j                  dzej8                  j                  fd{ZdAej                  dsej                  dtej                  d|epd%eej                  ej                  f   f
d}ZdAej                  d%dfd~ZdAej                  d%dfdZd ZdAej                  de_d%ej                  fdZ	 dd.ej                  deidejdeeAej                        fdZd Zd%ehfdZdIej                  fdZd.ej                  deve   deve   dedeidevej                     d%eeevep   evep   f   fdZd dlmZ dej                  depd%ej                  fdZd Z	 dd.ej                  deid%evej                     fdZd.ej8                  j                  dCevej8                  j                     fdZd Zd Z	 ddddAej                  dgeevep      d%eej                  ej                  f   fdZ	 	 	 	 	 ddej8                  j                  dewdewde_deeewevew   f      de_deew   d%dfdZy)    Ndefaultdict)Callable)	dataclassreplace)AnyOptionalTYPE_CHECKINGUnion)countersis_node_meta_valid)(create_structured_trace_for_min_cut_info)config)
is_builtin)trace_structured)extract_tensor_metadata)BackwardState)is_sym_nodepy_sym_types)magic_methodsmethod_to_operator)find_symbol_binding_fx_nodesfree_symbolshint_intis_symbol_binding_fx_nodestatically_known_falsestatically_known_true)graph_drawer)
OrderedSet)CheckpointPolicy   )GraphInfoProvider)dp_knapsackdp_knapsack_sliding_hirschberggreedy_knapsackilp_knapsack)KnapsackEvaluator)	AOTOutputSavedForBackwardsAOTOutput)_is_functional_graph)get_aot_graph_name)get_cuda_generator_meta_valis_with_effects)fx_graph_cseget_aten_targetraise_getitemsAOT_PARTITIONER_DEBUGlogc                      e Zd ZU dZee   ed<   ee   ed<   ee   ed<   ee   ed<   ee   ed<   dej                  fdZ	dej                  fd	Z
dej                  fd
Zdej                  fdZdej                  fdZy)OpTypesz8Class for keeping track of different operator categoriesfusible_opscompute_intensive_ops
random_opsview_opsrecomputable_opsnodec                 0    t        |      | j                  v S N)r/   r5   selfr:   s     W/var/www/html/engine/venv/lib/python3.12/site-packages/torch/_functorch/partitioners.py
is_fusiblezOpTypes.is_fusibleT   s    t$(8(888    c                 0    t        |      | j                  v S r<   )r/   r6   r=   s     r?   is_compute_intensivezOpTypes.is_compute_intensiveW   s    t$(B(BBBrA   c                 0    t        |      | j                  v S r<   )r/   r7   r=   s     r?   	is_randomzOpTypes.is_randomZ   s    t$77rA   c                 0    t        |      | j                  v S r<   )r/   r8   r=   s     r?   is_viewzOpTypes.is_view]   s    t$55rA   c                 0    t        |      | j                  v S r<   )r/   r9   r=   s     r?   is_recomputablezOpTypes.is_recomputable`   s    t$(=(===rA   N)__name__
__module____qualname____doc__r   r   __annotations__fxNoder@   rC   rE   rG   rI    rA   r?   r4   r4   J   s    BH%%%h//8$$"" **9rww 9C C8bgg 86BGG 6>BGG >rA   r4   c                      e Zd ZU eej
                     ed<   eej
                     ed<   eej
                     ed<   eej
                     ed<   eej
                  e	f   ed<   eej
                     ed<   e
j                  deej
                     fd       Zd	ej
                  defd
Zd	ej
                  defdZd	ej
                  defdZd	ej
                  de	fdZy)NodeInfoinputs_required_fw_nodesrequired_bw_nodesunclaimed_nodesfw_orderstatic_lifetime_input_nodesreturnc                 F     t        d  j                  D         fd      S )Nc              3       K   | ]  }|  y wr<   rQ   .0ns     r?   	<genexpr>z-NodeInfo.required_fw_nodes.<locals>.<genexpr>s   s     01Q0s   c                 "    j                   |    S r<   )rX   )r_   r>   s    r?   <lambda>z,NodeInfo.required_fw_nodes.<locals>.<lambda>s   s    a@P rA   key)sortedrU   r>   s   `r?   required_fw_nodeszNodeInfo.required_fw_nodesp   s!    0//06P
 	
rA   r_   c                     || j                   v S r<   )rU   r>   r_   s     r?   is_required_fwzNodeInfo.is_required_fwv   s    D++++rA   c                     || j                   v S r<   )rV   ri   s     r?   is_required_bwzNodeInfo.is_required_bwy   s    D****rA   c                     || j                   v S r<   )rW   ri   s     r?   is_unclaimedzNodeInfo.is_unclaimed|   s    D((((rA   c                 R    || j                   v sJ d| d       | j                  |   S )NNode z not in fw nodes!)rU   rX   ri   s     r?   get_fw_orderzNodeInfo.get_fw_order   s4    D+++IuQC7H-II+}}QrA   N)rJ   rK   rL   listrO   rP   rN   r   dictint	functoolscached_propertyrg   boolrj   rl   rn   rq   rQ   rA   r?   rS   rS   d   s     M"277++!"''**((277C<  !+BGG!44
4= 
 

, ,D ,+ +D +)bgg )$ ) bgg  #  rA   rS   c                   @    e Zd ZU eed<   eed<   eed<   eed<   eed<   y)MinCutOptionsban_if_used_far_apartban_if_long_fusible_chainsban_if_materialized_backwardban_if_not_in_allowlistban_if_reductionN)rJ   rK   rL   rw   rN   rQ   rA   r?   ry   ry      s      $$"&&!!rA   ry   r:   rZ   c                 z    | j                   j                  dd       t        j                  t        j                  fv S )N	recompute)metagetr    MUST_RECOMPUTEPREFER_RECOMPUTEr:   s    r?   must_recomputer      s5    99==d+''))0  rA   fx_gc                 T    | j                   j                  D ]  }t        |      s y y)NTF)graphnodesr   r   r:   s     r?   has_recomputable_opsr      s+    

   $ rA   c                     | j                   j                  D ]W  }t        |      st        |j                  d      s&t
        j                  j                  |j                  j                  v sW y y)NtagsTF)	r   r   r   hasattrtargettorchTagnondeterministic_seededr   r   s     r?   has_recomputable_rng_opsr      sU    

   4 V,		11T[[5E5EE rA   c                     t        | j                  d   t        j                  t        j                  f      ryt        | j                  d   t        j
                        sJ y)Nvalr!      )
isinstancer   r   SymIntSymBoolSymFloatr   s    r?   sym_node_sizer      sE    $))E"U\\5==$ABdii&777rA   c                       e Zd Zd Zy)InvalidNodeBasec                      y)NzInvalid NoderQ   rf   s    r?   __repr__zInvalidNodeBase.__repr__   s    rA   N)rJ   rK   rL   r   rQ   rA   r?   r   r      s    rA   r   c                 6    t        | j                  dd       dk7  S )N	namespace_c10d_functional)getattrr   r   s    r?   is_not_collectiver      s    4;;T26HHHrA   joint_graphrT   outputsoutputs_descssubgraphignore_must_be_in_fw_bwc                    t        j                         }i |D ]3  }|j                  |j                        }|j                  |_        ||<   5 | j
                  D ]:  }|s<t        |      r|dk7  r||vr
t        |<   $t        |      r|dk7  r||vr
t        |<   B|v rG|j                  dk(  r
t        |<   `|j                  dk(  rt        j                  |j                  i |j                  }	|	D 
cg c]/  }
t        |
t         j                        rt        |
   t               1 }	}
t#        |	      r
t        |<   |j%                  |fd      |<   |j                  dk(  r|j%                  |fd      |<   *|j                  dk(  s;= g }|D ]s  }
t        |
t         j                        rF|
vrt'        d	|
 d
      t        |
   t               rJ d	|
 d       |j)                  |
          c|j)                  |
       u |j+                  t-        |            }||j                  d<   |j/                          |j1                          |S c c}
w )a  
    Given a graph, extracts out a subgraph that takes the specified nodes as
    inputs and returns the specified outputs.

    This includes specifying non-placeholder nodes as inputs.

    The general strategy is to initialize all inputs with proxies as we
    encounter them, and trace through the graph, only keeping values which take
    in valid proxies. Then, all dead code is eliminated.
    backwardforwardplaceholdercall_functionc                     |    S r<   rQ   xenvs    r?   rb   z4_extract_graph_with_inputs_outputs.<locals>.<lambda>       CF rA   get_attrc                     |    S r<   rQ   r   s    r?   rb   z4_extract_graph_with_inputs_outputs.<locals>.<lambda>   r   rA   outputrp   z couldn't be found in envz was invalid, but is outputdesc)rO   Graphr   namer   r   _must_be_in_backwardInvalidNode_must_be_in_forwardoppytreearg_tree_leavesargskwargsr   rP   r   any	node_copyRuntimeErrorappendr   tupleeliminate_dead_codelint)r   rT   r   r   r   r   	new_graphr:   new_nodeall_argsr   output_valuesoutr   s                @r?   "_extract_graph_with_inputs_outputsr      sd   $ 
I
C  ((3		D	 !! )&$T*
*&'D	 $D)	)&'D	3; WW%#CIWW'--tyyHDKKHH "a) 3q6?3H 
 8}'D	!++D2BCCIWW
"!++D2BCCIWW S)T M 	$a!|"U1#-F#GHH!#a&/: s56:   Q(  #	$ 

5/
0C$CHHV!!#NN?s   84Ic                     | j                   dk(  xr3 dt        | j                        vxr t        |        xr t	        |        S Nr   tangents)r   strr   _is_bwd_seed_offset_is_fwd_seed_offsetr   s    r?   
_is_primalr     sK    =  	*c$++..	*#D))	* $D))	rA   c                 R    | j                   dk(  xr dt        | j                        v S r   r   r   r   r   s    r?   _is_tangentr     s$    77m#F
c$++6F(FFrA   c                    t         j                  xry t        | j                  t        j
                  j                        xr t        | j                         xs1 | j                  t        j                  j                  j                  k(  S r<   )r   is_non_builtin_to_includer   r   r   _ops
OpOverloadr   opshigher_order triton_kernel_wrapper_functionalr   s    r?   r   r   !  s`    ++ 	DKK!6!6	7	W
4;;@W<W 	R;;%))00QQQrA   c                     | j                   dk(  xr0 dt        | j                        v xs dt        | j                        v S )Nr   bwd_seedbwd_base_offsetr   r   s    r?   r   r   (  =    77m# c$++&&O*;s4;;?O*OrA   c                     | j                   dk(  xr0 dt        | j                        v xs dt        | j                        v S )Nr   fwd_seedfwd_base_offsetr   r   s    r?   r   r   .  r   rA   c                 v    | j                   dk(  xr) t        | j                  j                  d      t              S )Nr   r   )r   r   r   r   r   r   s    r?   _is_backward_stater   4  s*    77m#W
499==3G(WWrA   c                 @    | j                   j                  dd       dk(  S )Npartitioner_tagis_backwardr   r   r   s    r?   _has_tag_is_backwardr   8  s    99==*D1]BBrA   c                 @    | j                   j                  dd       dk(  S )Nr   
is_forwardr   r   s    r?   _has_tag_is_forwardr   <  s    99==*D1\AArA   c                 @    | j                   j                  dd       dk(  S )Nr   must_be_in_forwardr   r   s    r?   _has_tag_must_be_in_forwardr   @  s    99==*D15IIIrA   c                 @    | j                   j                  dd       dk(  S )Nr   must_be_in_backwardr   r   s    r?   _has_tag_must_be_in_backwardr   D  s    99==*D15JJJrA   c                    t        |       ryt        |       xsP t        | j                  t        j
                  j                        xr  | j                  j                  j                  }t        |        xr t        |        xr |S NT)r   r-   r   r   r   r   r   _schema
is_mutabler   r   r:   r   s     r?   r   r   H  sy    "4( & 4;;

 5 56 	+KK** 
 !&& 	,T22	rA   c                     t        |       ryt        |       xsP t        | j                  t        j
                  j                        xr  | j                  j                  j                  }t        |       xr |S r   )
r   r-   r   r   r   r   r   r   r   r   r   s     r?   r   r   V  sa    #D) & 4;;

 5 56 	+KK**   %4*4rA   joint_modulec          	      l   t        j                  d | j                  j                  d      D         }t        j                  t	        t        | j                  j                  d                  j                  j                  dd gt        |      z              }|d | }||d  }|d | }||d  }||||fS )Nc              3   4   K   | ]  }|j                     y wr<   r   r^   r:   s     r?   r`   z+_extract_fwd_bwd_outputs.<locals>.<genexpr>d  s     	K$))	K   r   r   r   )	r   r   r   
find_nodesnextiterr   r   len)r   num_fwd_outputsr   r   fwd_outputsbwd_outputsfwd_outputs_descsbwd_outputs_descss           r?   _extract_fwd_bwd_outputsr  `  s     $$	K 2 2 = = = J	KG **T,$$//8/<=>CCGGTFS\)	
M
 *?+K/*+K%&67%o&67%68IIIrA   saved_valuesr   c                 V    | D ]$  }|j                   |k(  s| j                  |        y  y r<   )r   remove)r  r   saved_values      r?   _remove_by_namer  r  s0    # t#,rA   fwd_module_outputs.c                     t        |       }t        t        |       dz
  dd      D ]  }t        | |         r|dz   } |S  |S )Nr!   )r  ranger   )r  idxis      r?   find_first_sym_noder  y  sX      
!C3)*Q.B7 -a01a%CJ	 JrA   r   maxminpositionc           	         | j                  |      5  | j                  t        j                  j                  j
                  j                  |f      }t        j                  j                  j
                  j                  |j                  d         |j                  d<   t        |j                  d         |j                  d<   d d d        | j                        5  | j                  t        j                  j                  j                  j                  |dgdf      }t        j                  j                  j                  j                  |j                  d   dgd      |j                  d<   t        |j                  d         |j                  d<   d d d        | j                        5  | j                  t        j                  j                  j                  j                  |t        j                  f      }t        j                  j                  j                  j                  |j                  d   t        j                        |j                  d<   t        |j                  d         |j                  d<   d d d        | j                        5  | j                  t        j                  j                  j                  j                  ||f      }t        j                  j                  j                  j                  |j                  d   |      |j                  d<   t        |j                  d         |j                  d<   d d d        | j                        5  | j                  t        j                  j                  j                  j                  |f      }	t        j                  j                  j                  j                  |j                  d         |	j                  d<   t        |	j                  d         |	j                  d<   d d d        | j                  	      5  | j                  t        j                  j                  j                  j                   |	|f      }
t        j                  j                  j                  j!                  |	j                  d   |      |
j                  d<   t        |
j                  d         |
j                  d<   d d d        | j                  
      5  | j                  t        j                  j                  j                  j                  |
t        j"                  fd| d|j$                         }t        j                  j                  j                  j                  |
j                  d   t        j"                        |j                  d<   t        |j                  d         |j                  d<   d d d        |S # 1 sw Y   +xY w# 1 sw Y   fxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   	xY w# 1 sw Y   HxY w# 1 sw Y   S xY w)	Nr  r   tensor_metar  Tfp8_scale_pos__r   r   )inserting_afterr   r   r   atenabsdefaultr   r   amaxprimsconvert_element_typefloat64	clamp_min
reciprocalmulTensorfloat32r   )r   r:   r  r  r  abs_node	amax_nodeamax_64_nodeclamp_min_nodereciprocal_nodemul_node
scale_nodes               r?   calculate_quantization_scalingr7    s$    
		t	$ U&&IINN&& ' 
  %yy~~1199$))E:JKe'>x}}U?S'Tm$U 
		x	( W''IINN''RD$' ( 
	 !&		 3 3 ; ;MM% 2$!
	u )@	u@U(V	}%W 
		y	) 

**IIOO0088U]]+ + 
 $)99??#G#G#O#ONN5!5==$
%  ,Ce$,
-(

 
		|	, 

,,IINN$$,,$ - 
 &+YY^^%=%=%E%Ee$c&
E" .E&.
M*

 
		~	. 

--IINN%%-- " . 
 ',iinn&?&?&G&G&'
U# /F  '/
]+

 
			/ U&&IINN%%!3' ' 
  %yy~~1188  ' 
e (?x}}U?S'Tm$U 
		x	( 	Y((IIOO0088EMM*!(1TYYK8 ) 


 "'!E!E!M!MMM% %--"

 *AQVAW)X
&	Y IU UW W

 



 



 

U U	Y sZ   B3W.B9W;0CXB5X(B3X"4B5X/C"X<.W8;XXX"X,/X9<Yr6  
quant_typer+  	clamp_maxc           	      	   | j                  |      5  | j                  t        j                  j                  j
                  j                  |t        j                  f      }t        j                  j                  j
                  j                  |j                  d   t        j                        |j                  d<   t        |j                  d         |j                  d<   d d d        | j                        5  | j                  t        j                  j                  j                  j                  ||f      }t        j                  j                  j                  j                  |j                  d   |j                  d         |j                  d<   t        |j                  d         |j                  d<   d d d        | j                        5  | j                  t        j                  j                  j                  j                  ||f      }	t        j                  j                  j                  j                  |j                  d   |      |	j                  d<   t        |	j                  d         |	j                  d<   d d d        | j                  	      5  | j                  t        j                  j                  j                  j                  |	|f      }
t        j                  j                  j                  j                  |	j                  d   |      |
j                  d<   t        |
j                  d         |
j                  d<   d d d        | j                  
      5  | j                  t        j                  j                  j
                  j                  |
|fd| d|j                         }t        j                  j                  j
                  j                  |
j                  d   |      |j                  d<   t        |j                  d         |j                  d<   d d d        |S # 1 sw Y   bxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   S xY w)Nr  r   r  fp8_quant_pos_r!  r"  )r#  r   r   r   r(  r)  r&  r/  r   r   r$  r-  r.  r+  r9  r   )r   r:   r6  r8  r+  r9  r  target_node_32scaled_target_nodeclamp_min_scaled_nodeclamp_max_scaled_nodequant_activation_nodes               r?   perform_quantizationrA    s    
		z	* 

,,IIOO0088& - 
 &+YY__%I%I%Q%QIIeemm&
E" .E&.
M*

 
		~	. 

"00IINN%% *- 1 
 */););)B)B&
(>*
& 2I##E*2
.

 
		1	2 

 % 3 3IINN$$,,$i0 !4 !
 -2IINN,D,D,L,L##E*I-
""5) 5L!&&u-5
""=1

 
		4	5 

 % 3 3IINN$$,,'3 !4 !
 -2IINN,D,D,L,L!&&u-y-
""5) 5L!&&u-5
""=1

 
		4	5 
 % 3 3IIOO0088'4!(1TYYK8 !4 !
 IIOO0088%**51: 	""5)
 5L!&&u-5
""=1
 ! u

 



 



 



 


 ! s@   CQ<CQB5Q%B5Q*3CQ7QQQ'*Q47Rtensorc                 R    | j                         }| j                         }||z  dz  S )z
    Calculate the size of a PyTorch tensor in megabytes (MB).

    Args:
        tensor (torch.Tensor): Input tensor

    Returns:
        float: Memory size in MB
    i   )numelelement_size)rB  num_elementsrE  s      r?   calculate_tensor_sizerG    s-     <<>L&&(L<'K88rA   c            	          t         j                  j                  j                  d   j	                  dd      } | j                  d      D cg c]$  }t        t         |j                  d      d         & } }| S c c}w )N!activation_quantization_aten_passallowed_dtypesztorch.bfloat16;.r  )r   	_inductorr   post_grad_fusion_optionsr   splitr   )rJ  dtypes     r?   get_allowed_dtypesrQ  )  sz    __++DD+	c
,-  ;I:N:Ns:S16u{{3'+,N  s   )A8c                 B   t               }t        |       r| j                  d   j                  |vryt        j
                  j                  j                  d   j                  dd      }t        | j                  d         }t        j
                  j                  j                  d   j                  dd      s||k\  S t        j
                  j                  j                  d   j                  dd      rt        ||k\        xs t        ||k\         S t        ||k\        S )Nr   FrI  
size_in_mbd   skip_dynamo_guardsquantize_dynamic_shape)rQ  r   r   rP  r   rM  r   rN  r   rG  r   r   )r:   rJ  size_thresholdrS  s       r?   should_quantizerX  3  s   ')Nd#tyy'7'='=^'S__++DD+	c,  'tyy'78J??!!::+	c
&' ^++ ??!!::/

#&
./ )n, J+J.,HIIJ
 )~)EFFrA   c                      t         j                  j                  j                  d   j	                  dd      } t        t         | j                  d      d         S )NrI  r8  ztorch.float8_e5m2rL  r  )r   rM  r   rN  r   r   rO  )r8  s    r?   get_quant_typerZ  M  sN    ''@@+	c,+,  5***3/344rA   rP  c                 \    t        j                  |       }|j                  |j                  fS )z
    Calculate the range of values for a given torch.dtype.
    Args:
        dtype (torch.dtype): The input dtype.
    Returns:
        tuple: A tuple containing the minimum and maximum values.
    )r   finfor  r  )rP  infos     r?   calculate_ranger^  U  s%     ;;uD88TXXrA   c           
         | j                  d      d   }|j                  d   }t               }t        |      \  }}t	               }g g }}t        |      D ]  \  }	}
|
j                  j                  dd      s$t        j                  j                  j                  d   j                  dd      rOt        | |
|d	|	      }t        | |
|||||	      }t        |      s|j                  |       n|j                  |       n| j!                  |
      5  | j#                  t        j$                  j&                  j(                  j*                  |
|fd
|	 d|
j,                         }t        j$                  j&                  j(                  j+                  |
j                  d   |      |j                  d<   t/        |j                  d         |j                  d<   d d d        ||	<    t        |      D 
cg c]  \  }}
|j                  ||
       }}}
t1        |      }||z   }|r|d | |z   ||d  z   }|j3                  dt5        |             t6        d   dxx   dz  cc<   y # 1 sw Y   xY wc c}
}w )Nr   r  r   saved_for_quantizationFrI  use_scalingT-q=r;  r!  r"  r   r  inductor%activation_quantization_fwd_aten_passr!   )r  r   rZ  r^  rs   	enumerater   r   r   rM  r   rN  r7  rA  r   r   r#  r   r   r(  r)  r&  r   r   r  
update_argr   r   )r   r   r
  r8  r+  r9  position_to_quanttensor_scale_nodessym_scale_nodesr  r:   r6  
quant_noder  output_updated_argsr  scale_nodess                    r?   quantize_activation_fwrm  a  sz   *1-F++a.K!J*:6Iy*,b#K0 %5$99==159%%>>3c-&' <4E8

 24ZIx
 #:.&--j9#**:6 **40 !&!4!4		<<DD"J/-hZqD "5 "J 		<<DD IIe,j OOE*
 6M".6JOOM2 +5h'K%5T 7@6L+21da& 
 1
2C$6K%36I#$6OO 	 a234Z@AQFAA &s   CI$5I0$I-	c           
      
  	 | j                   D cg c]  }|j                  dk(  s| }}d }|D ]~  }|j                  j                  dd      s!|j                  j	                  d       |j                  j	                  d      }t
        j                  j                  j                  d   j                  dd      r| j                  |      5  d|j                  j                  dd	      z   	t        	fd
|D              }d d d        | j                        5  | j                  t
        j                  j                  j                   j"                  ||f      }t
        j                  j                  j                   j#                  |j                  d   |      |j                  d<   t%        |j                  d         |j                  d<   d d d        | j                  |      5  | j                  t
        j                  j&                  j(                  j*                  ||f      }t
        j                  j&                  j(                  j+                  |j                  d   |j                  d         |j                  d<   t%        |j                  d         |j                  d<   d d d        | j                        5  | j                  t
        j                  j                  j                   j"                  ||f      }t
        j                  j                  j                   j#                  |j                  d   |      |j                  d<   t%        |j                  d         |j                  d<   d d d        n| j                  |      5  | j                  t
        j                  j                  j                   j"                  ||fdt-        |j                        z         }t
        j                  j                  j                   j#                  |j                  d   |      |j                  d<   t%        |j                  d         |j                  d<   d d d        t/        |j0                  j3                               D ]   }|k7  s	||k7  s|j5                  ||       "  t6        d   dxx   dz  cc<   y c c}w # 1 sw Y   xY w# 1 sw Y   	xY w# 1 sw Y   ;xY w# 1 sw Y   xY w# 1 sw Y   xY w)Nr   r`  Fdequant_typerI  ra  
fp8_scale_
fp8_quant_ c              3   @   K   | ]  }|j                   k(  r|  y wr<   r   )r^   	bwd_input
scale_names     r?   r`   z)quantize_activation_bw.<locals>.<genexpr>  s%      &%$>>Z7 "&s   r  r   r  dequant_r"  rc  %activation_quantization_bwd_aten_passr!   )r   r   r   r   popr   rM  r   rN  r#  r   r   r  r   r   r(  r)  r&  r   r$  divr.  r   rr   userskeysreplace_input_withr   )
r   r:   	bw_inputsactivation_nodero  r6  divided_target_node_32dequant_nodeuserrv  s
            @r?   quantize_activation_bwr    si   "'++J$M1IJIJO H@99==159IIMM2399==8L%%>>3c-'( **40 !-		0A0A,PR0S!SJ!% &)2& "J **:6 &+&9&9		<<DD"L1 ': 'O
 		<<DD IIe,l $((/
 ;R',,U3;O((7 **?; 
-2-@-@		**11-z: .A .* :?9K9K9R9R',,U3Z__U5K:*//6 00F0K0KE0RS +//>
 **+AB #(#6#6		<<DD4lC $7 $L
 		<<DD277> !%%e,
 8O$))%08L%%m4  **40 #(#6#6		<<DD"L1'#dii.8 $7 $L 		<<DD IIe,l !%%e,
 8O$))%08L%%m4 TZZ__./ @<'DO,C++D,?@MH@T Z@AQFAY K  
 
  sJ   SS4SB5S*CS%B5S2CS>S	S"	%S/	2S;	>T	
fwd_module
bwd_modulebwd_module_inputsc                     t        dd  fd       t         j                         t        dd  fd       t        dd fd        j                  j                  d	
      d   j                  d   }|D ]  }d|j
                  v s|t        j                  dd|j
                           }j                  j                  |      5  j                  j                  |j
                        }d d d        |j                  d   }j                  j                  |j                         d|j                  d<   ||j                  d<   |j                  |       j                  j                  |        t        j                  j                   j"                  d   j%                  dd      rt'        j                  j                  d
            }|d   }	t)        |      D ]  }
t+        |
      r|
}	 n  j                  j                  d	
      d   j                  d   }|D ]  }d|j
                  v sj                  j                  |	      5  j                  j                  |j
                        }d d d        j                  j                  |j                         |}	 t-        j                         t        dd fd       y # 1 sw Y   xY w# 1 sw Y   gxY w)Nartifactc                      dddS )N,before_activation_quantization_fwd_aten_passstringr   encodingrQ   rQ   rA   r?   rb   z5perform_fp8_activation_quantization.<locals>.<lambda>      B 
 rA   c                  ,     j                  ddd      S NFT)print_outputinclude_strideinclude_deviceprint_readabler  s   r?   rb   z5perform_fp8_activation_quantization.<locals>.<lambda>      :44tD 5 
 rA   metadata_fn
payload_fnc                      dddS )N+after_activation_quantization_fwd_aten_passr  r  rQ   rQ   rA   r?   rb   z5perform_fp8_activation_quantization.<locals>.<lambda>      A 
 rA   c                  ,     j                  ddd      S r  r  r  s   r?   rb   z5perform_fp8_activation_quantization.<locals>.<lambda>	  r  rA   c                      dddS )N,before_activation_quantization_bwd_aten_passr  r  rQ   rQ   rA   r?   rb   z5perform_fp8_activation_quantization.<locals>.<lambda>  r  rA   c                  ,     j                  ddd      S r  r  r  s   r?   rb   z5perform_fp8_activation_quantization.<locals>.<lambda>  r  rA   r   r  r   rq  z^fp8_quant_pos_\d+_rr  rt  ro  Tr`  rI  ra  r   r  rp  c                      dddS )N+after_activation_quantization_bwd_aten_passr  r  rQ   rQ   rA   r?   rb   z5perform_fp8_activation_quantization.<locals>.<lambda>A  r  rA   c                  ,     j                  ddd      S r  r  r  s   r?   rb   z5perform_fp8_activation_quantization.<locals>.<lambda>E  r  rA   )r   rm  r   r  r   r   resubr#  r   r   updatereplace_all_uses_with
erase_noder   rM  r   rN  r   rr   reversedr   r  )r  r  r  quant_fwd_module_outputsfwd_noderu  quant_bwd_inputro  quant_bwd_module_inputsbwd_input_locbw_inputscaled_fwd_module_outputsscale_bwd_inputs   ``           r?   #perform_fp8_activation_quantizationr    s   
 

	 :++,

	 

	  *//::h:GJOOPQR, 38==()-r8==AI !!11)< S","2"2">">HMM">"RS$>>.9L  ''6=AO  !9:3?O  0++O<''	23 66+	c- #'z'7'7'B'Bm'B'T"U/3 !89 	Hx( (	
 %/$4$4$?$?8$?$LQ$O$T$TUV$W!1 	0Hx}},%%55mD W&0&6&6&B&B&B&VOW$$++HMM: /	0 :++,

	?S S0W Ws   'K'K$K!	$K-	rY   c                 2   t         j                  j                  dd       	 y |r|D cg c]  }|j                   c}ng }| D ci c]  }|j                  | }}t        j
                  j                  j                  d   j                  dd      r)| D ci c]  }d|j                  vs|j                  |  }}|j                  j                  d      d   j                  d   }|j                  j                  d      D ci c]  }|j                  | }}d}	|D ]  }|j                  |v st        |      s|j                  |v r!t        j                  d	|j                         Md
|j                  d<   |j                  d   j                  |j                  d<   d
||j                     j                  d<   |j                  d   j                  ||j                     j                  d<   d
}	 |	rt        |||       y y c c}w c c}w c c}w c c}w )NrI  exclude_primalsFprimalsr   r  r   r   z*Skipping quantization of static input %s: Tr`  r   ro  )inductor_configrN  r   r   r   rM  r   r   r  r   rX  r2   debugr   rP  r  )
r  r  r  rY   r:   static_input_namessaved_values_namesr  r  should_perform_fp8_quants
             r?   enable_activation_quantizationr  K  s    	0044/	
 	
 	 '  ;;t; 
 7CCd$))T/CC66+	c
U#$ )5
 $	8RDIItO
 
 $))444A!DII!L$.$4$4$?$?=$?$Q 		4   %" 
,99**t/Dyy..		F		R26DII./(,		%(8(>(>DIIn%JNdii(--.FG@D		%@P@V@Vdii(--n='+$
,  +J
DUV  9 	< D
s   HH
H*HH)rY   saved_sym_nodesr	  c                   t        | |      \  }}}}| j                  j                  d      }	g t        t        |	      }
g t        t
        |	      }g t        t        |	      }g t        t        |	      }g t        t        |	      }t        | j                  ||z   |z   |z   ||d      }t        j                  j                         }|j                  d      D ]  }|j                  s-t        ||j                         t        ||j                         <|rIt!        d |j                  D              r-t        ||j                         t        ||j                         t        |      st        ||j                         |rJ  t#               }g }g }|D ]C  }t%        |      }|r#|j'                  |       |j)                  |       3|j)                  |       E t+        | j                        }t-        j.                  |||      D ]]  }d|j0                  vrt3        |j0                  d         |z
  }t5        |d       D ]  }||vr|j)                  ||           ||z  }_ |j7                          |j9                  ||z          t        | j                  |
|z   ||z   |z   |t;        t=        |      t=        |      z         D cg c]  }t?        |       c}z   d	      }t        | j                  ||z   |z   |z   |z   ||d      }t@        jB                  jE                  | |      }t@        jB                  jE                  | |      }tG        ||||       ||fS c c}w )
Nr	  r   r  r   c              3      K   | ]X  }|j                   t        j                  j                  j                  j
                  u xr t        |j                        d k(   Z ywr   N)r   r   r   r   wait_tensorr&  r  r{  r]   s     r?   r`   z+_extract_fwd_bwd_modules.<locals>.<genexpr>  sS      )
  HH		22>>FFF "AGG!")
s   AA r   c                     | j                   S r<   rt  )ss    r?   rb   z*_extract_fwd_bwd_modules.<locals>.<lambda>  s
    166 rA   rc   r   )$r  r   r  filterr   r   r   r   r   r   r   distributedis_availabler{  r  r   allr   r   addr   r   	itertoolschainr   r   re   clearextendr  r  r)   rO   _lazy_graph_module_make_graph_moduler  )r   r  r  r	  rY   r
  r  r  r  placeholdersprimal_inputstangent_inputsfwd_seed_offset_inputsbwd_seed_offset_inputsbackward_state_inputs	bwd_graphdistributed_enabledr:   saved_symbolssaved_sym_nodes_bindingsaved_sym_nodes_derivedsymbolsymbol_bindingsnew_symbolsr  r  	fwd_graphr  r  s                                r?   _extract_fwd_bwd_modulesr  z  s    	!O CK/1B  %%00M0BL7fZ67M9vk<89NIv&9<HIIv&9<HIGf%7FG2,&7:PPI  ++88:$$$6 )zzL$))4OTYY7
 !S )
 ZZ)
 &

 L$))4OTYY7%L$))4(((')2 /9lM     1*40f%#**40#**401 3<3E3EFO 7~V %		!"499U#34}D)9: 	?A '#**?1+=>	? 	$%" 25LLM 3..l"_4 3|,s?/CCD
 'q)
	

 	
I 3
	
	 !	!  		 
 	
I &&99,	RJ&&99,	RJ"j*.I z!!/
s   M!)static_lifetime_input_indicesrY   r  c                   g }d}| j                   j                  D ]&  }t        |      st        |      st	        |      s%|}( |J | j                   j                  D ]$  }t        |      s|j                  |       ||u s$ n t        d |D              }t        |       }	t        |       }
|	rIt        | j                         d   $t        j                  d       t        | |||      S t        | d      } t        j                   st#        |        t%        |        |g }t'        | ||      }g }g }t(        j*                  j-                         d }d	 }fd
}| j                   j                  D ]  }|j.                  |vr|j0                  dk(  r$|j.                  d | j3                         D        v rF|j4                  t(        j6                  j8                  j:                  j<                  u rt?        |      r|j                  |        ||      r|j@                  jC                  d      tD        jF                  k(  r|j                  |        ||      r)|	rJ d||j4                  f       |j                  |        ||      s|j0                  dk7  sJ d| d       |jH                  D cg c]  }|j.                  |vs| }}tK        d |D              r|jM                  |       tO        |      r|j                  |        tQ        tR        jU                  |      jW                               }tQ        tR        jU                  |      jW                               }t        jX                  rtY        | j                   |      }||jZ                  }t]        | ||||      \  }}|j                   j_                  t`               |j                   j_                  t`               |	r'|
rtc        | ||te        |            \  }}tg        |      }ti        |      }ti        |      }tk        |d      }te        |jl                        dkD  rtk        |d      }||fS c c}w )a  
    Partitions the :attr:`joint_module` in a manner that closely resembles the
    behavior observed in the original ``.forward()`` and ``.backward()`` of the
    callable, i.e., the resulting forward graph contains those operators that
    are executed in the original ``.forward()`` callable passed to
    :func:`aot_function`.

    The default partitioner collects the operators that are between the forward
    inputs and the forward outputs. This helps in finding the tensors which have
    to be stashed for the backward pass. These stashed tensors become the output
    of the generated forward graph. The remaining operators are then placed in
    the backward graph.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    Nc              3   T   K   | ]   }|j                   d k7  s|j                   " ywr   Nr   r   r  s     r?   r`   z$default_partition.<locals>.<genexpr>   s$      $tww(/B		$   ((r   zxTrying to unsafely apply AC to a non-functional graph with the default partitioner. Falling back to min-cut partitioner.)r	  r  Tis_default_partitionc                     d| j                   v xs= t        | j                   j                  d      t        j                  j
                        S )Nr  r   )r   r   r   r   _subclasses
FakeTensorr   s    r?   	is_tensorz$default_partition.<locals>.is_tensorF  s=    		) 
ZIIMM% %"3"3">">.
 	
rA   c                 n    t        d | j                  D              xr t        | j                        dkD  S )Nc              3   V   K   | ]!  }|j                   t        j                  k(   # y wr<   )r   operatorgetitemr^   r  s     r?   r`   z=default_partition.<locals>.is_multi_output.<locals>.<genexpr>M  s     GDx///Gs   ')r   )r  r{  r  r   s    r?   is_multi_outputz*default_partition.<locals>.is_multi_outputK  s.    GDJJGG $DJJ!#	
rA   c                     | j                  d      xrO | j                  dvxr?  xs: | j                  t        j                  j
                  j                  j                  uS )NF)impure_random)r   r   )	is_impurer   r   r   r   r   r  r&  )r:   r  s    r?   r  z$default_partition.<locals>.is_impureQ  sc     NNN/ 		 (' U;;eii&@&@&L&L&T&TT	
rA   r   c              3   &   K   | ]	  \  }}|  y wr<   rQ   )r^   kvs      r?   r`   z$default_partition.<locals>.<genexpr>f  s      3
!QA3
s   r   z,Trying to apply AC on a graph with impure opr   z	Expected z to be a tensorc              3   2   K   | ]  }t        |        y wr<   r   r]   s     r?   r`   z$default_partition.<locals>.<genexpr>  s     7!{1~7   r  r	  rY   )is_impure_nodeFr   )7r   r   r   r   r   r   r   r   r   r   r*   warningswarn#min_cut_rematerialization_partitioncleanup_recompute_tagsr   (unsafe_allow_optimization_of_collectivesforce_save_collectivesforce_save_bw_mutation_srcclassify_nodesr   r  r  r   r   named_modulesr   r   r$  _assert_scalarr&  r   r   r   r    	MUST_SAVEr{  r  r  r   rr   rs   fromkeysr|  _sync_decision_cross_ranksrY   r  r   r   functionalize_rng_opsr  #reordering_to_mimic_autograd_enginer0   thread_graphsafe_rng_from_hopsrV   )r   _joint_inputsr	  r  rY   forward_nodes	last_noder:   forward_node_namesgraph_has_recomputable_opsgraph_has_recomputable_rng_ops	node_infor  r  r  r  r  r_   backward_usages	fw_module	bw_moduler  s                        @r?   default_partitionr    sQ   @ MI""(( t$
4(8<OPT<UI    ""(( 4   &9	
 $ $+$  "6l!C%=l%K"! 2 23A6B MML 7 /.K	  .lQUV::|,|,$,(*%3_I LO++88:



$ ""(( ,&99..77j TYY 3
&4463
 &
 ;;%))..77???t ""4(4 99==%)9)C)CC%T?1 >4 1
 %$''_"< 	
v_-	
< '+jjUAFFBT4T1UU777 ""?3d#%Y,&\ l388:;L4==9>>@AO((1,2D2DlS"*&/&K&K#3''$?Iy OO''7H'IOO''7H'I!)#8iC4H$ Iy 8	B	 y)Iy)I.yeLI
9&&'!+29$O	ig Vs   9Q-Q-g    .ArD  c                      | |j                   z  S r<   )itemsize)rD  rP  s     r?   _tensor_nbytesr    s    5>>!!rA   c                 V   dt         fdd| j                  v r| j                  d   }t        |t              ryt        |t        t
        f      rt        fd|D              S t        |t              r"t        fd|j                         D              S t        |t        j                        r |      S t        dt        |       d|        | j                  d	k(  s:| j                  t        j                  j                   j"                  j$                  u ry
t        d|  d      )NrZ   c                     t        | t        j                        syt        t	        | j                         d      | j                        S )Nr      fallback)r   r   r.  r  r   rD  rP  r   s    r?   object_nbytesz_size_of.<locals>.object_nbytes  s1    !U\\*hqwwy4@!''JJrA   r   r!   c              3   .   K   | ]  } |        y wr<   rQ   )r^   r_   r#  s     r?   r`   z_size_of.<locals>.<genexpr>  s     5A}Q'5   c              3   4   K   | ]  \  }} |        y wr<   rQ   )r^   r!  r_   r#  s      r?   r`   z_size_of.<locals>.<genexpr>  s     @DAq}Q'@   zUnknown metadata type z	 on node r   r   rp   zO didn't have `val` metadata; we should always have `val` metadata on the nodes.)rt   r   r   r   rr   r   sumrs   itemsr   r.  r   typer   r   r   r$  r  r&  )r:   r   r#  s     @r?   _size_ofr+    s    KC K
 		iic<( dE]+5555T"@CIIK@@@U\\* %%3DI;ivNOOww*uyy~~/L/L/T/T T

vde rA   c           	      2   ddl m}  |t              }| j                  D ]3  }|j                  dk(  s||j
                  j                  xx   dz  cc<   5 t        j                  dt        |j                         t        j                  d      d             y )Nr   r   r   r!   z%sTrd   reverse)collectionsr   rt   r   r   r   rJ   r2   r]  re   r)  r  
itemgetter)r   r   cntr:   s       r?   
_count_opsr2    sr    '%c*C +77o%$$%*%+ HHT6#))+8+>+>q+A4PQrA   c                     g } t        t        j                  j                        D ]  }t	        t        j                  j                  |      }t        |t        j                  j                        sL|j                         D ]G  }t	        ||      }t        j                  j                  |j                  v s6| j                  |          | S r<   )dirr   r   r$  r   r   r   OpOverloadPacket	overloadsr   	pointwiser   r   )r   	attr_nameopoverloadpacketoverloadop_overloads        r?   pointwise_opsr<    s    
C( 
	"599>>9=*EJJ,G,GH(224 	H!"2H=Kyy""k&6&66

+,	
 JrA   	depth_mapc                     | D ci c]7  }t        |t        j                  j                  j                        s2|||   9 }}t        |j                         t        j                  d      d      S c c}w )Nr!   Tr-  )	r   r   rO   r:   rP   re   r)  r  r0  )r   r=  arg
arg_depthss       r?   sort_depthsrA    sf    '+ #z#uxx}}?Q?Q/RYs^J  *""$(*=*=a*@$OOs
   3A2A2gmc                   
 t        j                         i 
| j                  j                  d      D ]  }j	                  |
fd      
|<    t        | j                  j                        D ci c]  \  }}||
 c}}
fd}t        t        t        | j                  j                              }d}t        j                  }|D ]#  }|j                  D ]  }|   |k  s|   }|} % || S t        | j                  j                        d|    D ]U  }|j                  dk(  s|j                  t        j                   j"                  j$                  j&                  u sN ||       W t        | j                  j                        |   d D ]
  } ||        t        j                   j)                  |       }	|	S c c}}w )a  
    This pass finds the first bwd node in the graph (by looking at users of
    tangents) and then reorders the graph by walking from this node to all the
    way to the end of the graph. At each op in this traversal, we insert this op
    in a new graph and try to bring only the relevant subgraph from the other
    non-bwd edges relevant for this op. This closely mimics the behavior of
    autograd engine.

    Why is this pass required in the first place?

    This is an artifact of how partitioners work today. The starting point of
    partitioner is a joint graph, which is fwd and then bwd graph. In the case
    of checkpointing, we keep portions of fwd graph in their original place in
    the joint graph, while obtaining a bwd graph. As a result, the resulting bwd
    graph has copies of recomputed fwd subgraphs followed by the original bwd
    graph. If we run this naively, this leads to bad memory footprint, because
    the fwd subgraphs are live for way longer duration than necessary. This pass
    reorders the operations such that we prioritize the ops for the original bwd
    graph while only realizing those ops from the fwd graph that are necessary
    at any given point in the graph.
    r   r  c                     |    S r<   rQ   r   s    r?   rb   z5reordering_to_mimic_autograd_engine.<locals>.<lambda>  s    A rA   c                 *   | g}t               }t        |      dkD  rH|j                         } | |v s| v r'|j                  |        || j                  z  }t        |      dkD  rHt        |fd      }|D ]  } j                  | fd      | <    y )Nr   c                     |    S r<   rQ   )r_   orders    r?   rb   zSreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph.<locals>.<lambda>-  s    %( rA   rc   c                     |    S r<   rQ   r   s    r?   rb   zSreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph.<locals>.<lambda>/  r   rA   )r   r  ry  r  all_input_nodesre   r   )r:   	cur_nodesinsertable_nodesr   r   rG  s      r?   insert_node_in_graphzAreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph  s    F	0:)nq ==?D''43;  & ---I )nq  ""28JK$ 	DD!++D2BCCI	DrA   Nr   )rO   r   r   r  r   re  r   rr   r  r   mathinfr{  r   r   r   r   r$  copy_r&  GraphModule)rB  r:   r  rL  r  first_node_in_bwdminimum_ordertangentr  new_gmr   r   rG  s             @@@r?   r  r    s   . 
I"$C ##}#5 @''.>?D	@ )2"((..(AB93T3YBED& &bhhnn=>NHHM! )MM 	)DT{]* %d$(!	))  	 RXX^^$%?u->'?@ '77o%$++9M9M9U9U*U &' RXX^^$U+<%=%?@ #T"# XX!!"i0FM[ Cs   5G	r  r  fw_nodebw_nodedevice	rng_countlast_fwd_inputlast_bwd_inputc                    |j                   }|J | j                  }	|j                  }
t        j                  j                  j
                  }| j                  j                  |      5  | j                  j                  d|       }t        |      |j                  d<   |}ddd       |j                  j                  |      5  |j                  j                  d|       }t        |      |j                  d<   |}ddd       t        |j                        }|d<   | j                  j                  |      5  |	j                  d||j                  g|j                  |      }ddd       |j                         |	j!                  |       t        |j                        }|d<   |
j#                  |      5  |
j                  d||j                  g|j                  |      }|j                  |       |
j!                  |       ddd       ||fS # 1 sw Y   zxY w# 1 sw Y   +xY w# 1 sw Y   xY w# 1 sw Y   ||fS xY w)a%  
    Note [CUDA Graph Safe RNG Functionalization]

    CUDA Graph capture doesn't work with get_rng_state and set_rng_state because these functions operate on CPU values,
    while CUDA Graph RNG capture uses on-device CUDA tensors. To solve this, we use graphsafe_set_state with a
    CUDA Generator registered to the CUDA Graph before capture begins. graphsafe_set_state updates the generator's pointer
    to reference a different GeneratorImpl, ensuring subsequent calls are correctly forwarded to the desired generator
    (and its cuda-tensor RNG state during graph capture).

    For each RNG operation's forward/backward pair:

    - We create two generators initialized with identical values
    - Each forward and backward call advances its respective generator equally
    - This keeps generators synchronized so forward and backward operations use matching RNG values

    When forward is called multiple times before backward (causing desynchronization):

    - We save the forward RNG state
    - We update the backward Generator's state before executing backward

    Before each CUDA Graph replay, replay_prologue updates captured RNG pointers with current states, ensuring backward Generator
    changes are reflected during replay.

    This function modifies both forward and backward computation graphs by:

    Creating RNG state placeholders for both passes
    Updating the forward node to use graph-safe RNG state
    Updating the backward node to use graph-safe RNG state

    For more details: https://github.com/pytorch/pytorch/issues/113541
    Nfwd_rng_state_r   bwd_rng_state_	rng_stater   r   r   )indexr   r   _prims	rng_primsgraphsafe_run_with_rng_stater#  r   r,   r   rs   r   create_noder   r   r  r  inserting_before)r  r  rU  rV  rW  rX  rY  rZ  
device_idxfw_graphbw_graphrc  fwd_rng_statebwd_rng_state	fw_kwargsfunctional_fw_node
bwd_kwargs
rng_outputs                     r?   %apply_graphsafe_rng_functionalizationro  M  s2   R J!!!HH#(<<#9#9#V#V  
	(	(	8 '!33nYK4PQ$?
$K5!&' 
	(	(	8 '!33nYK4PQ$?
$K5!&	' W^^$I*Ik		(	(	1 
%11(..07<<0	 2 

 !!"45  gnn%J+J{		"	"7	+ %))(..07<<0	 * 

 	%%j1G$% >))M' '' '
 
% >))s1   (9H9H&:.H3=AH?H#&H03H<?Inum_sym_nodesc                   ' t        j                         }d }dt        t        j                     fd'dt        t        j                     fd} ||       } ||      } ||      }	i }
| j
                  j                  D ]  }t        |      st        |j                  d      s&t        j                  j                  |j                  j                  v sW||j                     }||j                     }|	|j                     }||d|
|<    t        j                  j                  j                   }t        j                  j                  j"                  }d }|j
                  j%                  d	      D ]  }d
|j                  v s|} n |t'        d      g }t)        t+        |j
                  j%                  d	                  }t)        t+        |j
                  j%                  d	                  }t-        'fd|
j/                         D              }|j1                  t        j                  d             t3        |      dkD  }t        j4                  j6                  }t6        j8                  xr* | xr% |j:                   xs |j<                  j>                  }tA        |
j/                               D ]"  \  }}|d   }|d   } '|      }|j
                  }|j
                  }|r'|%|jB                  dk(  rtE        ||||||||      \  }}Z|jG                  |      5  |jI                  d||j                  g|jJ                  |jL                        }|jI                  dtN        jP                  |dfi       } ||      |jR                  d<   |jI                  dtN        jP                  |dfi       } tU        jT                  |jR                        | _)        |jW                  |        |jY                  |       |j[                  |       d d d        |jG                  |      5  dt)        |       }!|j]                  |!      }" ||      |"jR                  d<   d d d        |jG                  |      5  |jI                  d|"|j                  g|jJ                  |jL                        } |jW                  |        |jY                  |       d d d        % |rt)        t_        |j
                  j%                  d	                  }#|#jJ                  d   }$t3        |$      |z
  }%|$d |% ta        |      z   |$|%d  z   }&|j
                  jc                  |&       |j
                  jY                  |#       |je                          |je                          ||fS # 1 sw Y   xY w# 1 sw Y   LxY w# 1 sw Y   xY w)Nc                    i }| j                   j                  D ]i  }|j                  dk(  st        |j                  d      s*t
        j                  j                  |j                  j                  v s[|||j                  <   k |S )Nr   r   )
r   r   r   r   r   r   r   r   r   r   )gmodrandom_nodesr:   s      r?   get_rng_opsz*functionalize_rng_ops.<locals>.get_rng_ops  sl    JJ$$ 	/D?*DKK0II559I9II*.TYY'	/ rA   rZ   c                     d| j                   vry| j                   d   }t        |t              s|f}|D ]D  }t        |t        j                        s|j
                  j                  dk(  s8|j
                  c S  t        j
                  d      S )zV
        Check the example value of the node outputs to find the device type.
        r   Ncudacpu)r   r   r   r   r.  rW  r*  )r:   
candidates	candidates      r?   
get_devicez)functionalize_rng_ops.<locals>.get_device  s     		!YYu%
*e,$J# 	,I)U\\2##((F2$+++	,
 ||E""rA   rW  c                 (   ddl m}  |       }|J |5  | E| j                  dk(  r6|j                  t        j
                  j                               cd d d        S |j                  t	        j                               cd d d        S # 1 sw Y   y xY w)Nr   )detect_fake_moderw  )torch._guardsr}  r*  from_tensorr   rw  get_rng_state)rW  r}  	fake_modes      r?   get_sample_rng_statez3functionalize_rng_ops.<locals>.get_sample_rng_state  s    2$&	$$$ 	@!fkkV&; ,,UZZ-E-E-GH	@ 	@ (()<)<)>?	@ 	@ 	@s   >B"BBr   )fwdbwdr   r  rS  zaCouldn't find tangent node in graph inputs. This is unexpected, please file a bug if you see thisc              3   4   K   | ]  } |d            yw)r  NrQ   )r^   	node_pairr{  s     r?   r`   z(functionalize_rng_ops.<locals>.<genexpr>  s       )2
9U#$r'  rx  r!   r  r  rw  r   r_  r   r   rng_state_output_r   )3r  countr	   r   rW  r   r   r   r   r   r   r   r   r   ra  rb  run_and_save_rng_staterun_with_rng_stater  r   r  r  r   valuesdiscardr  rM  r   graphsafe_rng_functionalizationfallback_randomtest_configs*graphsafe_rng_func_ignores_fallback_randomre  r*  ro  re  rd  r   r   r  r  r   copyr  r  r   r   r  r   r   	recompile)(r   r  r  rp  uidru  r  joint_graph_rng_opsfw_graph_rng_opsbw_graph_rng_opsrecomputable_rng_ops_mapr:   	base_noderU  rV  run_and_save_rngr  bw_tangent_start_nodefw_rng_state_outputsrY  rZ  devicesmulti_cuda_devices
ind_config'use_rng_graphsafe_rng_functionalizationrX  r  rW  rg  rh  rl  statern  
state_namebw_rng_state_nodefw_output_node
fw_outputssym_node_start_idxr   r{  s(                                          @r?   r  r    s   2 //
C	#HU\\2 #$@Xell%; @ &l3"9-"9-!""(( 	S4 V,		11T[[5E5EE+DII6I&tyy1G&tyy1G:A'2R$Y/	S ||--DD//BB **m*< 		!$(! $o
 	
 (9??#=#=#=#OPQN(9??#=#=#=#OPQN 6N6U6U6W G OOELL'( W) ''J.. 	
""	
 *** R&&QQ , !**B*I*I*K L G-	9E"E"G$???? 4"v%-R	.*NN **73 3%-%9%9#$!..87<<8">>	 &: &" !,,#$$,a0	 -  %9$@

5!%11#$$*  2 
 #'))GLL"9
--j9##G,$++E2;3@ **+@A M0c<
$,$8$8$D!0DV0L!&&u-M
 **73 	-%11#&+W^^KgllK">>	 2 
 --j9##G,	- 	-}G-X d9??#=#=#=#JKL#((+
 _}<**+()*+,-. 	
 	w'"">2iA3 3@M M
	- 	-s&   #C.V*5V)8AV6V&	)V3	6W 	c                    | j                   j                  D ]t  }t        |j                  t        j
                  j                        s2|j                  j                  dk(  sLt        |      rXt        j                  |j                  d<   v y)z
    By default, the partitioner is not allowed to recompute collectives
    unless they come from a user-annotated AC region.
    See Note [Recomputing collectives in the partitioner]
    r   r   N)r   r   r   r   r   r   r   r   r   r    r	  r   )r   r:   s     r?   r  r    sh     ""(( @t{{EJJ$9$9:%%);;"4(%5%?%?DIIk"@rA   c                    t               }t        | j                  j                        D ]  }|j                  dk(  r|j
                  t        j                  j                  j                  j                  u }|rrt        |      r|j                  |j                  d          t        |      s|j                  d   |v st        j                   |j                  d   j"                  d<    y  y )Nr   r   r!   r   )r   r  r   r   r   r   r   r   r$  rO  r&  r   r  r   r   r    r	  r   )r   has_mutation_in_bwr:   is_copy_s       r?   r  r    s     5?L++112 77h;;%)).."6"6">">>+D1"&&tyy|4*40TYYq\EW5W1A1K1K		!!!+. !rA   c                     | j                   t        j                  k7  ry| j                  d   }d|j                  vxr | j
                  dk(  S )NFr   r  r   )r   r  r  r   r   r   )r:   parents     r?   is_getitem_of_multi_outputr    sA    {{h&&&YYq\F+J?0JJrA   r  c                   | j                   j                  D ]T  }t        |      r|j                  D ]i  }t        |      sd|j                  v sd|j                  v s-|j                  d   |j                  d   kD  sMt
        j                  |j                  d<   k |j                  j                  dd      st        d |j                  D              rt
        j                  |j                  d<   d|j                  vst        d |j                  D              st        |      rd|j                  d   j                  v r4|s8t
        j                  |j                  d<   W | S )a  
    If there are two consecutive checkpointed blocks with no operator in
    between, we would still want to stash the tensor at the boundary of
    checkpointed blocks. The following pass makes the last output node
    non-recomputable to allow for that.
    ac_graph_idr   has_backward_hookFc              3   2   K   | ]  }t        |        y wr<   r   r  s     r?   r`   z)cleanup_recompute_tags.<locals>.<genexpr>  s      E)-t$Er  c              3   2   K   | ]  }t        |        y wr<   r  r  s     r?   r`   z)cleanup_recompute_tags.<locals>.<genexpr>  s     @TN4(@r  r   )r   r   r   r{  r   r    r	  r   r   r  r   )r   r  r:   r  s       r?   r  r    s3    ""(( +@$

 H"4(%2%2		-0499]3KK-=-G-GDIIk*H yy}}0%8 E15E B& *:)C)C		+&*@TZZ@@ +40]diiPQlFWFW5W$ &6%?%?DIIk"W+@X rA   r  min_cut_optionsdont_banc                   %&'()*+,-./0 
t               t               /t        rQt        d | j                  D              }|t        d /j                  D              z
  }t
        j                  d|       d &d '&'/fd(	 dd l}(/fd	**/fd
}(fd)dt        f)/fd}	|j                         .t               %%./fd}
| j                  D ]L  }|j                  dk(  r|j                  v rm|j                  vr0.j                  |j                   dz   dt"        j$                         `.j                  |j                   dz   dt"        j$                         t'        |      r0.j                  |j                   dz   dt"        j$                         t)        |      st+        |      r |
|       j-                  |      r ||      r |
|       d|j.                  vxr d|j.                  vxs8 d|j.                  v xr( t1        |j.                  d   t2        j4                         }t7        |      rt        t9        |            }nQ|r<t1        |j.                  j;                  d      t<              rdnt"        j$                  }n |	|j>                        }.j                  |j                   dz   |j                   dz   |       |j@                  D ]>  }.j                  |j                   dz   |j                   dz   t"        j$                         @ O dtB        tD        jF                     dtH        dtH        f(fd}jJ                  r(jL                  D ]  }|j@                  D cg c]$  }j-                  |      rjO                  |      & }}|j@                  D cg c]  }j-                  |      s| }}tQ        |      dkD  sw ||tS        |            }tU        |j@                        D ]x  }j-                  |      sjO                  |      |kD  s* (||      s4|%v r9t
        j                  d|jO                  |      ||jO                  |              |
|       z  jV                  r^t               }| j                  D ]D  }j-                  |      sjO                  |      |fg}jO                  |      }tQ        |      dkD  sJtY        jZ                  |      \  }}||v r,|j]                  |       jO                  |      |dz   kD  rNtQ        |      dk(  r@t
        j                  d||jO                  |      jO                  |              |
|       |j@                  D ]J  }j-                  |      s (||      s|%vs$tY        j^                  |jO                  |      |f       L tQ        |      dkD  rG 	 |ja                  .dd      \  }}|\  }-t               }.fd |D        D ]   \  0}|jo                  -0fd!|D               " t               }|D ](  \  } }!| d d" |!d d# k(  sJ | d d" }"|j]                  |"       * tq        |       +ts        | j                        D #ci c]  \  }#}||#
 c}}#,tu        +fd$|D        ,fd%&      }$|$%fS # t        $ r}t        d      |d }~ww xY wc c}w c c}w # tb        $ ri t
        j                  d       t
        j                  dje                  |jf                  jh                  jk                  .                   tm        .        w xY wc c}}#w )'Nc              3      K   | ]H  }|j                   d k(  r7t        |j                  d      r!t        |j                  j                         J yw)r   _overloadpacketN)r   r   r   r   r  r  s     r?   r`   z solve_min_cut.<locals>.<genexpr>  sA      &
ww/)gdkkCT.U ++,&
s   AAc              3   2   K   | ]  }t        |        y wr<   )r   r^   r  s     r?   r`   z solve_min_cut.<locals>.<genexpr>  s      4
CF4
r  z&Ops banned from re-materialization: %sc                 D   |j                   t        j                  j                  j                  k7  ry|j
                  d   }t        j                  j                  j                  |      \  }}|D ].  }|j                  |   }| |u r yt        |t              s)| |v s. y yNFr   T)r   r   r   r   auto_functionalizedr   _higher_order_opsauto_functionalizeget_mutable_argsr   r   rr   )ab
mutable_opmutable_arg_namesr!  r   r?  s          r?   !can_fuse_into_auto_functionalizedz8solve_min_cut.<locals>.can_fuse_into_auto_functionalized  s    88uyy--AAAVVAY
 ##66GG
S	
% 	 D((4.CCx#t$8	  rA   c                     |j                   t        j                  j                  j                  k7  ry|j
                  d   }|D ]  }|j
                  d   |   }| |u s y y)NFtensors_to_cloner   T)r   r   r   r   r   r   )r  r  r  r   r?  s        r?   .can_fuse_into_triton_kernel_wrapper_functionalzEsolve_min_cut.<locals>.can_fuse_into_triton_kernel_wrapper_functional
  sb    88uyy--NNNHH%78% 	D((8$T*CCx	 rA   c                 b   t        |      t        j                  k(  ry | |      ry | |      ry| j                  t        j
                  u r>| j                  d   j                  t        j                  j                  j                  u ryj                  |       xr j                  |      S )NTr   F)r/   r$  catr   r  r  r   r   r   r   r   r@   )r  r  r  r  op_typess     r?   r@   z!solve_min_cut.<locals>.is_fusible  s     1),Q29!Q?HH(((q	  yy%%FFG
 ""1%@(*=*=a*@@rA   r   zANeed networkx installed to perform smart recomputation heuristicsc                 <   j                  |       ryt        | g      }t        |      dkD  ro|j                         }|j                  D ]A  }j                  |      s ||      s yj                  |      s1|j                  |       C t        |      dkD  royr  )rG   r   r  ry  r{  rj   r  )r:   rJ  curr  r@   r  r  s       r?   is_materialized_backwardsz0solve_min_cut.<locals>.is_materialized_backwards.  s    D!v&	)nq --/C		 ( //5jd>S##D)MM$'	( )nq  rA   c                 h   | j                   dk7  ry| j                  t        j                  u ry| j                  j                  dd       t        j                  k(  ryt        j                  rj                  |       ry| j                  t        j                  j                  t        j                  j                  fv ryj                  rj!                  |       s/yj#                  |       sj%                  |       st'        |       ryj(                  r3 |       r+t*        j-                  d| t/        | j0                               y| j2                  dk  r| j2                  t        j4                  kD  ryj6                  r/t9        d | j:                  D              }t=        |       }|dz  |k  S y)	Nr   Fr   Tzmaterialized backwards: %s %si  c              3   h   K   | ]*  }t        |t        j                        st        |       , y wr<   )r   rO   rP   r+  r  s     r?   r`   zBsolve_min_cut.<locals>.should_ban_recomputation.<locals>.<genexpr>k  s&      % !*Q2H%s   22r   )r   r   r  r  r   r   r    r	  r   recompute_viewsrG   r$  lift_fresh_copyr&  
lift_freshr}   rI   rE   rC   r   r|   r2   r  r   r{  dist_from_bwmax_dist_from_bwr~   r(  r   r+  )r:   input_tensors_sizeoutput_sizer  r  r  s      r?   should_ban_recomputationz/solve_min_cut.<locals>.should_ban_recomputation<  sn   77o%;;(***99==d+/?/I/II!!h&6&6t&<;;4//779P9PQQ22++D1 ""4(006,T2 77<U=
 II5tU4::=NO t#(9(9F<S<S(S ++!$ %%)YY% " #4.K?%777rA   c                 f      j                   dk(  ryt         fd j                  D               S )Nr   Tc              3   0   K   | ]  } |        y wr<   rQ   )r^   r  r@   r:   s     r?   r`   z9solve_min_cut.<locals>.is_materialized.<locals>.<genexpr>v  s     E$z$-Es   )r   r  r{  )r:   r@   s   `r?   is_materializedz&solve_min_cut.<locals>.is_materializedr  s*    77m#E$**EEEErA   rZ   c           
         t         j                  r| |v ryt        |       }t         j                  r!j	                  |       rt
        j                  S t        | j                  d   t              r-t        | j                  d   t        j                        st        S t        |dt        t        | j                   d      d      z  z        } |       r|S |dz  S )Nr   r   g?rT  r!      )r    treat_parameters_as_free_to_saver+  r  rG   rM  rN  r   r   r   r   r   INT_INFrt   r  r  r  )r:   rY   mem_szr  r  s      r?   get_node_weightz&solve_min_cut.<locals>.get_node_weightx  s    3333$!!h&6&6t&< 88Odii&5dii.= Vsc#d.?.?*Eq&IIJK4 MA:rA   c                    j                  |       ry| v r\t        | j                  t        j                  j
                        xr | j                  j                  dk(  }t        j                  s|syt        |       ryd| j                  v r(t        | j                  d   t        j                        ryj                  |        j                  d| j                  dz   t        j                          y)NFr   r   source_incapacityT)rG   r   r   r   r   r   r   r   r  r   r   r   r  add_edger   rM  rN  )r:   is_collectivebanned_nodesr  nx_graphr  s     r?   ban_recomputation_if_allowedz3solve_min_cut.<locals>.ban_recomputation_if_allowed  s    D!8 4;;

(=(=> @KK))-??  >>m $DII*TYYu-=u~~"N
 	(DII$5IrA   r   r  sinkr  _outr   r          start_nodes	max_rangec                    g }| D ]*  }t        j                  |
j                  |      |df       , t        |      dkD  rt        j                  |      \  }}}|s
j                  |      S |j
                  D ]_  }
j                  |      s
j                  |      |kD  r*
j                  |      | 	||      f}||vsJt        j                  ||       a t        |      dkD  r|S )z
        Finds the first unfusible node in the chain of nodes starting from
        `start_nodes` and returns its position.
        Tr   )heapqheappushrq   r  heappopr{  rj   )r  r  sorted_nodesr_   r!  r:   node_is_fusibler  r   r@   r  s            r?   find_first_unfusiblez+solve_min_cut.<locals>.find_first_unfusible  s    
 9; 	OANN<)*@*@*CQ)MN	O ,!#',}}\'B$At_" --d33

 
:++D1 --d3i? !..t4"4.6C
 ,.|S9
:	 ,!# rA   z1used above/below fusible %s:(%s) -> %s -> %s:(%s)rT  ztoo long %s %s %s %sr  z-Failed to compute min-cut on following graph:
c              3   ,   K   | ]  }||   f  y wr<   rQ   )r^   r_   r  s     r?   r`   z solve_min_cut.<locals>.<genexpr>l  s     8Q$8s   c              3   0   K   | ]  }|v s|f  y wr<   rQ   )r^   r  non_reachableus     r?   r`   z solve_min_cut.<locals>.<genexpr>m  s     Aa=.@q!fAs   		c              3   (   K   | ]	  }|     y wr<   rQ   r^   r:   name_to_nodes     r?   r`   z solve_min_cut.<locals>.<genexpr>y  s     2d	2s   c                     |    S r<   rQ   )r   node_idxs    r?   rb   zsolve_min_cut.<locals>.<lambda>y  s    (1+ rA   rc   );r   get_default_op_listr1   r   r9   r2   r]  networkxImportErrorr   floatDiGraphr   rV   rT   r  r   rM  rN  r   r   r   rj   r   r   r   r.  r   r   r   r   rY   r{  rr   rO   rP   rt   rz   rg   rq   r  r  r   r{   r  r  r  r  minimum_cut	Exceptionjoin	readwriteedgelistgenerate_edgelistvisualize_min_cut_graphr  get_name_to_nodere  re   )1r   r  r  r  joint_module_opsops_ignorednxer  r  r  r:   is_non_tensor_nodeweightr  r  	used_nodeordersfw_usersfirst_unfusible_usevisited
start_nodefusiblestart_orderr!  r  	cut_value	partition	reachablecutsetnbrs	cut_nodesnode_innode_out	node_namer  r  r  r  r  r@   r  r  r   r  r  r  r  r  s1    ```                                 @@@@@@@@@@@@r?   solve_min_cutr'    s    <"$H% &
#))&
 

 ' 4
$554
 *
 
 	9;G"A&4lFe < zz|H(2L6 !! 3X77h9...9+++!!$))e"3Vdhh!O dii&0&488L$
 dii%/$((Kd248(.
 ##D).Ft.L(. "E}DII'EUtyy SDIIe4Dell)S%S 	 t=./F!$))--"6FDHH  %T9+P+PQF$))e+TYY-?&QJJ 	XDdii&0$))e2CdhhW	Xe3XL$rww- C C 4 ,,"44 	;I &OO++D1 &&t,F  "+I4L4LT4RH  6{Q&:8S[&Q#!)//2 ;D!006%2248;NN&y$7</$O%%229=/ %2248 5T:!;	;P 11'1|%++ !	VJ++J7''
3Z@2G $00<Kg,"w/3'>C  **3/+2CCG)HH."!..s3!..z: 15II VD!006&sD1 4w1G1G1Mt0TUV) g,"!	VF!~~h&I	9  )I}*4,F8i8 B4AdAAB ",I# !s|x},,,CRL	i !
 $K0L+4[5F5F+GHic4c	HH2	28ML %%e
  O
	Z
R  @A2<<00BB8LMN)	& Is=   [ )[0;[5[58[: %]/	[-[(([-:A2],c                    dd l }dd l}|j                  j                  |       j	                         }|j                  |      d   }|j                         D ]c  }| |j                            |j                            d   }|j                  t        |             |t        d      k(  sS|j                  d       e t        j                  d       |j                  d       y )Nr   r  rN  redz2Visualizing the failed graph to min_cut_failed.svgzmin_cut_failed.svg)r  pydotnx_pydotto_pydot	to_stringgraph_from_dot_data	get_edges
get_sourceget_destination	set_labelr   r  	set_colorr2   r]  	write_svg)r  r  r*  
dot_format	dot_graphedger  s          r?   r  r  ~  s    %%h/99;J))*5a8I##% "$//+,T-A-A-CDZPs6{#U5\!NN5!" HHAB,-rA   c                  x   g t         j                  t         j                  t         j                  t         j                  t         j
                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                   t         j"                  t         j$                  t         j&                  t         j(                  t         j*                  t         j,                  t         j.                  t         j0                  t         j2                  t         j4                  t         j6                  t         j8                  t         j:                  t         j<                  t         j>                  t         j@                  t         jB                  t         jD                  t         jF                  t         jH                  t         jJ                  t         jL                  t         jN                  t         jP                  t         jR                  t         jT                  t         jV                  t         jX                  t         jZ                  t         j\                  t         j^                  t         j`                  t         jb                  t         jd                  t         jf                  t         jh                  t         jj                  t         jl                  t         jn                  t         jp                  t         jr                  t         jt                  t         jv                  t         jx                  t         jz                  t         j|                  t         j~                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t        j                  t         j                  t         j                  t         j                  t         j                  } t         j                  t         j                  t         j                  g}|t         j                  t         j                  t         j                  t        j                  t         j                  t         j                  t         j                  t         j                  t         j                  g	z  }|}| g t        j                  t        j                  t         j                  t         j                  t         j                  t        j                  t        j                  t         j                  t         j                  t        j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t        j                  t        j                  z  } | t         j                  t         j                  gz  } | |z  } | t               z  } | t         j                  gz  } | t        D cg c]  }t        |       c}z  } t        |       }t        t        dt        f      t         j                  t         j                  t         j                  g      }t         j                  t         j                  t         j                  t         j                   t         j                  t         j                  t         j                  t         j                  t         j
                  t         j                  t         j                  g}||z  }t        |t        |      |t        |      |      S c c}w )N.)r$  r  r  rz  atan2r-  r  r  pow	remainderfmod__and____or____xor__
__lshift__
__rshift__eqnegegtleltr%  bitwise_notceilfloorfracnegreluroundsilutruncr2   log10log1plog2lgammaexpexpm1erferfccosacoscoshsinasinsinhtanatantanhatanhsqrtrsqrtr,  sigmoidsoftplus	thresholdthreshold_backwardclampwherelerpaddcmulgelugelu_backwardr(  mean_grad_sum_to_sizesum_to_sizer'  totype_asr  r  squeeze	unsqueezersub_to_copyaliasviewslicetr(  broadcast_in_dimexpand
as_stridedpermuteselectrO  r)  clone	full_likevarstd_unsafe_viewreshapebroadcast_tensorsscalar_tensorones	new_zerosr  arangetriuvar_meanisinfr   fullzerosempty
empty_likeargmaxmaximumiota'_low_memory_max_pool_offsets_to_indicesr`  gatherr<  
zeros_liker   r   r   r   r   native_dropout	rand_like
randn_likemmconvolutionconvolution_backwardbmmaddmm#_scaled_dot_product_flash_attention'_scaled_dot_product_efficient_attention_flash_attention_forward_efficient_attention_forwardupsample_bilinear2d
_scaled_mmr4   )default_recomputable_opsrecomputable_view_opsr8   mr9   r7   r6   r5   s           r?   r  r    s   L0L0L0 	L0 	

	L0
 	L0 	L0 	L0 	L0 	L0 			L0 	L0 	L0 	L0 	L0 	L0  	!L0" 	#L0$ 	%L0& 	'L0( 	)L0* 	+L0, 	-L0. 	/L00 			1L02 	

3L04 			5L06 	7L08 			9L0: 	

;L0< 			=L0> 	

?L0@ 	AL0B 	

CL0D 	

EL0F 			GL0H 	IL0J 	KL0L 	

ML0N 	OL0P 			QL0R 	SL0T 			UL0V 			WL0X 	YL0Z 			[L0\ 			]L0^ 	_L0` 			aL0b 			cL0d 	

eL0f 			gL0h 	

iL0j 	kL0l 	mL0n 	oL0p 	qL0r 	sL0t 	

uL0v 	

wL0x 			yL0z 	{L0| 			}L0~ 	L0@ 	AL0B 			CL0D 	EL0F 	GL0H 			IL0J 	KL0L 	ML0N 	OL0P 	QL0R 	SL0T 			UL0V 	WL0Z "\\4>>4::F		




 
 %H $!		$!""$! 	

$! 		$!
 	$! 			$! 			$! 	$! 	$! 	$! 	$! 	$! 			$! 	$! 	

$!  	!$!" 	#$!$ 	%$!& 			'$!( 	)$!* 	+$!, 	-$!. 			/$!0 	1$!2 	

3$!4 	5$!6 			7$!8 	9$!: 	

;$!< 	

=$!> 	?$!@ 	A$!B 	C$!D 	

E$!F 	55G$! $L T[[ 99(/!   N1!3A!6 NN!":;HS#X./			dnndoo>J 	!!

0044%%))   #Z/K()8 + !Os   7d7c                 J    i }| j                   D ]  }|||j                  <    |S r<   )r   r   )r   r   r:   s      r?   r  r  7	  s.    L '"&TYY'rA   memoryruntimes
max_memoryall_recomputable_banned_nodesc                    t         j                  }|dk(  rt        |||      S |dk(  rt        |||      S |dk(  rt	        |||      S |dk(  rt        |||      S |dk(  rZt        j                  d       t        j                  | |||      }t	        ||t        |      j                  t        |	            S t        |      r ||| |||      \  }}	d
||	fS t        d|       )Ngreedyilpdpr$   dynamic_memory_budget_dpzdynamic_memory_budget_dp is an experimental solver. It does not guarantee performance improvements. Additionally, it is not guaranteed to be stable.)r   r   recorded_knapsack_input_memories recorded_knapsack_input_runtimes)graph_info_provider)knapsack_algomax_mem_budgetr  z,Not aware of memory budget knapsack solver: )r   activation_memory_budget_solverr%   r&   r#   r$   r2   warningr"   inialize_from_graphr'   get_knee_point_memory_budgetcallabler   )
r   r  r  r  r  r  SOLVERr  saved_node_idxrecomp_node_idxs
             r?   #_optimize_runtime_with_given_memoryr  >	  s     33Fvx<<	5FHj99	468Z88	3	3-fh
KK	-	-?	

 0CC#*G-3-5	
 $7**)) + 	
 		
 
&	*0KY8U+
' ^_55I&RSSrA   no_dispatchr   r!  c                     t        | j                        }fd}|D cg c]
  } ||       }}| j                         D cg c]
  } ||       }}| j                  ||      S c c}w c c}w )Nc                     t        |       S )Nr   )r   )dr!  s    r?   realize_symbolz8_remove_symbols_without_guarding.<locals>.realize_symbolu	  s    H--rA   )stride)rr   shaper  new_empty_strided)r   r!  r  r  r  r  s    `    r?    _remove_symbols_without_guardingr  r	  sk    ME. )..1^A.E.)*4AnQ4F4uV44 /4s   A'A,c                 F   	 t         j                  }d }|dk(  ry|dk(  rat               5  ddlm} t        j                  | j                   j                  f      \  	|j                  	 fd      }|cd d d        S |dk(  rudd	l
m} t        j                  | j                   j                  f      \  	 |d
      5 }  j                  i 	 d d d        j                         }t        |d      S t        d|       # 1 sw Y   y xY w# 1 sw Y   ?xY w)Nc                 z   t        | t        j                        rAt        | j                  d   t        j
                        rt        | j                  d   d      S t        | t        j                        rAt        | j                  d   t        j                        rt        | j                  d   d      S t        | t        j                        r(t        | j                  d   t        j                        ryt        | t        j                        r(t        | j                  d   t        j                        ry| S )Nr   r  r         ?T)r   rO   rP   r   r   r.  r  r   r   r   r   r"  s    r?   materialize_argz)estimate_runtime.<locals>.materialize_arg	  s    a!j&M3AFF5MDQQ277#
166%=%,,(OAFF5MD99277#
166%=%..(Q277#
166%=%--(PHrA   testingr!   profiler   )benchmarkerc                  (     j                    i S r<   )r   )r   r   r:   s   r?   rb   z"estimate_runtime.<locals>.<lambda>	  s    ;4;;3O3O rA   flops)FlopCounterModeF)displayz Not aware of runtime estimator: )r   *activation_memory_budget_runtime_estimatorr  $torch._inductor.runtime.benchmarkingr  r   tree_mapr   r   benchmark_gputorch.utils.flop_counterr  r   get_total_flopsr  r   )
r:   RUNTIME_MODEr  r  msr  modecounted_flopsr   r   s
   `       @@r?   estimate_runtimer  }	  s   DDL
 y 		"] 	H!???TYY<TULD&**+OPB	 	 
	 <DKK8PQfU+ 	)tDKK((	),,.=!$$=l^LMM#	 		) 	)s   ADDDD c                 P    !"#$%&'()* |dkD  s|dk  rt        d|       t        t        j                  t        j                  t        j
                  t        j                  t        j                        }t        j                  rt        |dddd      }|dk(  rj                  S t         |      \  }}|dk(  r|S dt        t        j                     dt        fd	! !j                        ( !|      &&(k  r|S &(fd
}dt        t        j                     f!&(fd"t        |ddd      }t         |      \  }} "|      |k  r|S t        |d      t               \  }	}
 "|	      |k  r|	S ddlm# t%        #fdj                  D              %dt$        t        j                     dt        t        j                     f#%fd} ||
      }|D cg c]1  }|j&                  j)                  dd      t*        j,                  k(  r|3 }}|D cg c]	  }||vs| }}t/        |t0        d       t3               dk(  rj                  |z   S  D cg c]  } |t1        |             c}' D cg c]  }t5        |       c}*ddlm)  ')*fd$t        j:                  r"$ *fd} |d       |d      g}|d   dd  |d   dd  k7  r|d   |d   fg}|r|j=                         \  }}|d   |d   z
  dk  r#|j?                  |       |j?                  |       F ||d   |d   z   dz        }|dd  |dd  k7  r|j?                  ||f       |dd  |dd  k7  r|j?                  ||f       |r|jA                          dd l!m"} |D cg c]  }|d   	 }}|D cg c]  }|d   	 }}|jG                  d       |jI                  ||d        tK        |      D ]"  \  }}|jM                  |d!|||   fd"d#d$%       $ |jO                  d&       |jQ                  d'       |jS                  d(       |jU                  d       |jW                         }|jY                          t[        j\                         }t        j^                  't        j^                  }t[        j`                  |d)       d*}tb        jd                  jg                         r?tb        jd                  ji                         r!d+tb        jd                  jk                          }tZ        jl                  jo                  |d,| d-tq                d.      }|js                  |       tt        jw                  d/|        $| 0      d   S c c}w c c}w c c}w c c}w c c}w c c}w )1Nr!   r   zJThe valid ranges for memory budget are 0 <= m <= 1. The provided value is )rz   r{   r|   r}   r~   F)rz   r{   r|   r}   r  rZ   c                 :    t        t        t        |             dz  S N    eA)r(  mapr+  )r  s    r?   estimate_activations_sizez:choose_saved_values_set.<locals>.estimate_activations_size	  s    3x./#55rA   c                     | dz  z
  z  S r  rQ   )szmax_act_sizemin_act_sizes    r?   get_normalized_sizez4choose_saved_values_set.<locals>.get_normalized_size	  s    S\L899rA   activationsc                 &     |       z
  z
  z  S r<   rQ   )r  r  r  r  s    r?   get_mem_ratioz.choose_saved_values_set.<locals>.get_mem_ratio	  s"    )+6E<'
 	
rA   )rz   r{   r|   )r}   )get_node_storagec              3   .   K   | ]  } |        y wr<   rQ   )r^   r:   r  s     r?   r`   z*choose_saved_values_set.<locals>.<genexpr>	  s     T4 0 6Tr%  r  c                     | D cg c]1  }|j                   t        d      k  r |      vst        |      r|3 c}S c c}w r  )r  rt   r   )r  r  r  input_storagess     r?   get_recomputable_banned_nodesz>choose_saved_values_set.<locals>.get_recomputable_banned_nodes	  sL    
 "
 S)$Q'~=03 
 	
 
s   6?r   Tr-  r  c                            5  t        |t        | d      |      \  }}}d d d        t               }D ]  }	 |j                  |           |j                        sJ t        |||      \  }}	t        r+t        ||D 
cg c]  }
t        |
       c}
|	       |fS # 1 sw Y   xY w# t        $ r Y w xY wc c}
w )Nr   )	r   r  saved_node_idxsrecomputable_node_idxsexpected_runtimememories_banned_nodes normalized_memories_banned_nodesruntimes_banned_nodesmin_cut_saved_values)
r  r  r   r  BaseExceptionissubsetr'  r1   r   r+  )memory_budgetr  r   r  r  r  r  r  r  r!  r  aggressive_optionsr  r  r  r  s              r?   get_saved_values_knapsackz:choose_saved_values_set.<locals>.get_saved_values_knapsack 
  s   ] 	
 4%%M1%-	 &		 )3) 	C:3?@	   !>???'	
a !4'.K /'=!1)F'$%HQK' 2G&;%1 ---W	 	$ ! $'s#   B/ B;C
/B8;	CCc                 N     |       \  }}| t              |z
   |      fS )N)r  r   )r(  )r  r  r  r  r  r   r  r  s      r?   estimate_for_budgetz4choose_saved_values_set.<locals>.estimate_for_budgetP
  s@    -FYK.*L* )*-==l+ rA   r  r  gMbP?r  )
      )figsizeo)markerz.4fzoffset points)r   r  center)
textcoordsxytexthazMemory Budgetz Runtime of Recomputed Componentsz:Pareto Frontier of Memory Budget vs. Recomputation Runtime)exist_okrr  _rank_memory_budget_paretor!  z.svgz%Generated Pareto frontier curve at %s)r  r  r   )<r   ry   r   ban_recompute_used_far_apart!ban_recompute_long_fusible_chains#ban_recompute_materialized_backwardban_recompute_not_in_allowlistban_recompute_reductionsaggressive_recomputationr   rT   r'  rr   rO   rP   r  torch._inductor.fx_utilsr  r   r   r   r    r	  re   r+  r  r  torch.utils._mode_utilsr  visualize_memory_budget_paretory  r   sortmatplotlib.pyplotpyplotfigureplotre  annotatexlabelylabeltitlegridgcfshowosgetcwdmemory_budget_pareto_dirmakedirsr   r  r  is_initializedget_rankpathr
  r+   savefigr2   r  )+r   r  r  r  runtime_optimized_saved_valuesr!  r  more_aggressive_optionsmore_aggressive_saved_values%aggressive_recomputation_saved_valuesr  r  recomputable_banned_nodesr  must_save_nodesr:   r  optionsbisectslhsrhsmidpltitemx_valuesy_valuestxtfigfig_dirrank_suffixfig_namer   r  r  r  r  r  r  r  r  r  r  r  s+   ``                             @@@@@@@@@@@@r?   choose_saved_values_setrA  	  s   
 qMA-XYfXgh
 	
 $$AA#)#K#K%+%O%O & E E88O &&!"'',).$)
 (5)%"A --6RWW 6% 6 -Y-=-=>L,-KLL|#--:
4= 

 &##(%*	 '4Y 7'# ! 12]B++  % ;HY 2;7)< :;mK449T9CSCSTTN
 )
	bgg
  !>l K +66::k5)-=-G-GG 	
O  -!0H! ! %+!x%! ()Q./112O-.HQK( ,I#' 4,. ,.\ ,,	 	 's+-@-EF1:ab>WQZ^+
GAJ/0G";;=Sq6CF?T)NN3'NN3')3q6CF?a*?@qr7c!"g%NNC:.qr7c!"g%NNC:.  	'(/0DG00(/0DG00 	

7
#8C0  ) 	FAsLLs)hqk"*  	 	

?#

56		NOggi
))+**655GKK$/))+0A0A0P0P0R"5#4#4#=#=#?"@AK77<<+K=:L:N9OtT
 	H;XF %#yk	 	c
!d 10s*   '6V
#	V-V&VV"V4V#c                    ddl m d }fd}t        j                  j	                         rct        j                  j                         rDt        j                  j                         dkD  r" ||       r ||       rt               5          5  |D cg c]  }|j                   c}g}t        t        j                  j                               D cg c]  }g  }}t        j                  j                  ||d          t        |       }g }	i }
t        |      D ]w  \  }}|D cg c]  }||   	 }}d}|D ]C  }t        |      }||z  }|t        j                  j                         k(  s5||
|j                  <   E ||
d<   |	j                  |       y t        j                   |	t        j                  j"                  j%                               }t        j                  j'                  |t        j                  j"                  j(                  j*                         t-        t        j.                  |      j1                               }d	| d
|
 t3        dd fd       ||   D cg c]  }||   	 }}d d d        d d d        |S |S c c}w c c}w c c}w c c}w # 1 sw Y   )xY w# 1 sw Y   |S xY w)Nr   )unset_fake_temporarilyc                     | j                   D ]K  }t        |j                  t        j                  j
                        s2|j                  j                  dv sK y y)N>   c10d_functionalr   TF)r   r   r   r   r   r   r   )r   r:   s     r?   has_collectivesz3_sync_decision_cross_ranks.<locals>.has_collectives
  sM    %% 	DUZZ22++''+RR		
 rA   c                     dj                  d | j                  D              }t        j                  |j	                  d            j                         }t        t        j                  j                               D cg c]  }d  c}t               5          5  t        j                  j                  |       d d d        d d d        t        fdD              S c c}w # 1 sw Y   *xY w# 1 sw Y   .xY w)N/c              3   4   K   | ]  }|j                     y wr<   rt  )r^   r   s     r?   r`   zE_sync_decision_cross_ranks.<locals>.has_same_nodes.<locals>.<genexpr>
  s     >qAFF>r  zutf-8c              3   .   K   | ]  }d    |k(    ywr  rQ   )r^   r   
all_inputss     r?   r`   zE_sync_decision_cross_ranks.<locals>.has_same_nodes.<locals>.<genexpr>
  s     :!:a=A%:r%  )r
  r   hashlibsha256encode	hexdigestr  r   r  get_world_sizer  all_gather_objectr  )r   node_strrT   r!  rK  rC  s       @r?   has_same_nodesz2_sync_decision_cross_ranks.<locals>.has_same_nodes
  s    
 88>K,=,=>> 89CCE$)%*;*;*J*J*L$MNqdN
] 	D24 	D//
FC	D 	D :z:::	 O	D 	D 	D 	Ds*    	C#C4!C(?C4(C1	-C44C=r!   z
total size)rW  r  zpicked_rank_idx=z, saved_nodes of current rank=r  c                      dddS )N)aot_joint_graph_sync_decision_cross_ranksr  r  rQ   rQ   rA   r?   rb   z,_sync_decision_cross_ranks.<locals>.<lambda>
  s    G (% rA   c                       S r<   rQ   )sync_decision_cross_ranks_strs   r?   rb   z,_sync_decision_cross_ranks.<locals>.<lambda>
  s    #@ rA   r  )torch._subclasses.fake_tensorrC  r   r  r  r)  rP  r  r   r  rQ  r  re  r+  r*  r   rB  distributed_c10d_get_object_coll_device
all_reduceReduceOpMAXrt   argminr9  r   )r   r  rF  rS  r   objectsr!  saved_ops_names_all_ranksr   saved_sizessaved_ops_with_sizesr  saved_ops_namesop_namesaved_nodes
saved_sizer:   size_of_nodesaved_sizes_tensorpicked_rank_idxr_   rW  rC  s                        @@r?   r  r  
  s    E; 	&&(,,.,,.2K(;'] *	24 *	(45156G!%"3"3"B"B"DE::% : //0I7ST:V+K8L%'K35 (12K(L 	/$_DST|G4TT
' GD#+D>L,.Je//88:::F,TYY7	G
 6@$\2"":.	/ "'((99QQS" (("u'8'8'I'I'R'R'V'V )  "%,,/A"B"G"G"IJO.>>OOm  oC  nD  -E) A *C?)S$%QL Q*	 *	X <W 6: U:Q*	 *	 *	X sb   J5J)J--J)	J#A	J),J
8:J)3D J)3J$?J)J5J))J2	.J55J?c                    d}|rdnd}t        t        | j                  j                  d                  }| j                  j                  dt        j
                  j                  j                        D ]  }t        | |j                  d   j                        }t        |t        j                        sBg }|j                  j                  d      D ]  }||j                  v s| j                  j                  |      5  | j                  j!                  | d|       }	|d	z  }|j"                  d
   |	j"                  d
<   |	}|j%                  |	       ddd        |s| j                  j                  |      5  | j                  j'                  dt        j
                  j                  j                  g |j                  |i       }
|j)                  |
d       ddd       |j"                  j+                  d      }|r8|\  }}g ||D cg c]  }|j"                  d
    c}}||f
j"                  d<   | j                  j-                  |        | S # 1 sw Y   xY w# 1 sw Y   xY wc c}w )u  
    Graph-safe RNG lets torch.compile use CUDA Graphs for graphs with RNG ops.
    For graphs without HOPs, the partitioner adds placeholder nodes
    fwd_rng_state_* and bw_rng_state_* to the forward and backward graphs. At
    runtime, the AOTDispatcher retrieves these RNG states and passes them to the
    compiled graphs.

    This works well for no-HOP graphs. With HOPs, the partitioner runs
    recursively: it first partitions the HOP (producing forward/backward HOP
    subgraphs) and then stitches them back into the outer joint graph. For HOPs
    that contain RNG ops, the outer joint graph now includes HOP subgraph
    modules with extra RNG placeholders. We must thread these placeholders
    through the outer module partitioned forward and backward graphs—this
    function does exactly that. It collects the RNG placeholder nodes from the
    HOPs and creates corresponding placeholders in the outer forward and
    backward graphs.

    There is a catch: for a short period, the joint graph is in a “bad” state.
    The HOP subgraphs expect additional inputs (because of the new
    placeholders), but the outer graph call sites don't yet provide them. We
    can't fix this in the joint graph because the joint graph's input signature
    is fixed (primals, tangents). As a compromise, we keep the joint graph in
    somewhat of a bad state for some time and, once the outer forward and
    backward graphs are partitioned, insert the corresponding RNG placeholders
    and wire up the calls.
    r   rj  ri  r   r  r   )r   r   r!  r!   r   NT)propagate_metaeager_input_vals)r  r  r   r  r   r   r   invoke_subgraphr   r   r   r   rO   rP  r   r#  r   r   r   rd  r  r   r  )moduler   rX  
rng_string
last_inputhop_noder   new_rng_inputsplaceholder_noder^  new_hop_node_with_fixed_args
eager_vals
eager_argseager_kwargsinpnew_eager_argss                   r?   r  r  
  sf   8 I$/_Jhv||66-6HIJJLL++599#9#9#I#I ,  ,2 68==#3#:#:;h/N$,NN$=$=$=$O 9 !1!6!66  55jA 9$*LL$<$<)l!I;7%	 "Q	0@0E0Ee0L	u-%.
&--i89 9	9 \\11(; 	39<<3K3K'		..>>9(--9.9	40 224T 3 	 &]]../AB
/9,J&#&5CDc#((5/D&N
 '$M0556HI ''1Y,2\ MI9 9	 	" Es    AI  A#I-I9 I*-I6	c                    t        | j                        t               | j                  j                  D ]m  }|j                  dk(  r d|j
                  v rj                  |       nt        |      rj                  |       |v sSj                  |j                         o t        t        t        | j                  j                              }t        t        t        | j                  j                              }||z   }t        | |      \  }}}	}
j                  d |D               t        | j                  |||	d      }t        fd|j                  D              t        fd| j                  j                  D              }t        fdt!        |      D              }d	}i }| j                  j                  D ]  }|v s|||<   |d
z  } t#        ||||      S )Nr   r   r  c              3   F   K   | ]  }||j                   dk7  s|  y w)Nr   r  )r^   r  s     r?   r`   z!classify_nodes.<locals>.<genexpr>O  s$      !-ADDH4Ds   !!!r   c              3   Z   K   | ]"  }|j                   d k7  r|j                      $ ywr  r  r  s     r?   r`   z!classify_nodes.<locals>.<genexpr>U  s.      877h 	TYY8s   (+c              3   2   K   | ]  }|vr|vr|  y wr<   rQ   )r^   r:   rV   rg   s     r?   r`   z!classify_nodes.<locals>.<genexpr>Z  s*      6((T9J-J 	6s   c              3   2   K   | ]  \  }}|v s|  y wr<   rQ   )r^   r  pr  s      r?   r`   z!classify_nodes.<locals>.<genexpr>_  s"      -a!7T2T-s   r   r!   )r  r   r   r   r   r   r  r   r  r{  rr   r  r   r   r  r   re  rS   )r   r  r	  r:   r  r  rT   r
  r  r  r  forward_only_graphrW   rY   fw_cntrX   r   rV   rg   s    `              @@@r?   r  r  =  s   #L$6$67L-7\""(( 177m#
dkk(A!!$'!$'!!$'$$$$TZZ01 
L,>,>,D,DEFM!&)<l>P>P>V>V"WX33F O CK/1B    <FK1BI .8 8&,,8 .
 ,6 6 &&,,6 ,O
 #- -.- # FH""(( $$#HTNaKF # rA   )r  c          	      `	   | j                   j                          | j                          | j                   }t        j                  rt        |      }|| _         | j                   }t        |       }t        |       }	|rt        | d      } t        j                  st        |        t        |        |g }t        | ||      }
t        |
j                        dk(  rt        | ||||
j                         S t#        | j                   j$                        D ]  }|j&                  dk(  rt)        d      |_        #|
j-                  |      sd|_        <t)        d      |_        |j.                  D ]*  }t1        |j*                  |j*                  dz         |_        ,  t        j2                  }|j$                  D ]=  }t5        |j6                  j9                  d	d      t:              s.|j6                  d	   } n t=        ||
|
      }t        j>                  rt?        ||      }tA        tC        tD        |            }tA        tC        d |            }tG        | ||||
j                         \  }}|r|	rtI        | ||t        |            \  }}tK        |      }t        jL                  rddl'm&}  |||||
j                          tQ        |      }tQ        |      }tS        |d      }tS        |d      }tT        rtW        |D cg c]  }tY        |      t[        |      f c}      }t]        d |D              dz  }t^        ja                  d|       t^        ja                  d|       tc        d |j                   j$                  D              }tc        d |j                   j$                  D              }||z  }te        t(              }|j                   j$                  D ]R  }|jf                  |v sti        |jj                  d      s)|t[        |jj                  jl                        xx   dz  cc<   T t^        ja                  dt        |      t        |      t        |             tW        |jo                         tq        jr                  d      d      }t^        ja                  d|       ||fS c c}w )ax  
    Partitions the joint graph such that the backward recomputes the forward.
    Recomputing helps in trading off memory bandwidth with computation.

    To create the fwd and bwd graph, we copy the joint graph, manually set the
    outputs to just original forward or backward outputs. And then we run the
    resulting graphs through dead code elimination.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.
        _joint_inputs: The inputs to the joint graph. This is unused.
        compiler: This option determines the default set of recomputable ops.
            Currently, there are two options: ``nvfuser`` and ``inductor``.
        recomputable_ops: This is an optional set of recomputable ops. If this
            is not None, then this set of ops will be used instead of the
            default set of ops.
        num_fwd_outputs: The number of outputs from the forward graph.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    Fr  Nr   )r	  r  rY   r   r  r!   r  )r  c                     t        |        S r<   r  )r_   s    r?   rb   z5min_cut_rematerialization_partition.<locals>.<lambda>  s    [^); rA   r  )enable_activation_offloadingr  Tc              3   2   K   | ]  }t        |        y wr<   )r+  r  s     r?   r`   z6min_cut_rematerialization_partition.<locals>.<genexpr>  s     'J'Jr  z'Theoretical Activations Stored: %.2f GBz,Theoretical Per Activation Storage Sizes: %sc              3   T   K   | ]   }|j                   d k(  s|j                   " ywr   Nr  r  s     r?   r`   z6min_cut_rematerialization_partition.<locals>.<genexpr>  $      %
477o;UDII%
r  c              3   T   K   | ]   }|j                   d k(  s|j                   " ywr  r  r  s     r?   r`   z6min_cut_rematerialization_partition.<locals>.<genexpr>  r  r  r  z# remat/fw/bw: %d/%d/%dr-  zCount of Ops Rematerialized: %s):r   r   r  r   cser.   r   r   r  r  r  r  r  r  rV   r  rY   r  r   r   rt   r  rj   r{  r  activation_memory_budgetr   r   r   r  rA  r  rr   r  r   r  r  r  r  ,_activation_offloading.activation_offloadingr0   r  r1   re   r+  r   r(  r2   r]  r   r   r   r   r   r  r)  r  r0  )r   r  compilerr	  r  r   	cse_graphr   r  r  r  r:   r  r  r  r  r  r  r  r  sorted_sizestotal_activations_size_gbfw_module_nodesbw_module_nodesremat_nodescountsrematerialized_opss                              r?   r  r  r  sB   D **,D zz &	&$$K!5l!C%=l%K"!-lQVW::|,|,$,(*%3_I 9&&'1, +*G(1(M(M
 	
 ++112 R77h #CD))$/ !D #CD

 R$'(9(94;L;Lq;P$Q!RR 33M!! diimmOT:EB IIo6M +#L ((1+|L6+|<=O;\JKL 4''$-$I$IIy ")#8iC4H$ Iy 4I>I **	
 	%11		
 y)Iy)I.yeLI.ydKIlKSV4KL %('J\'J$JS$P!:<UV 	?N$ %
"+//"7"7%
 
 % %
"+//"7"7%
 
 &7!,S!1OO)) 	>DyyK'GDKKAR,Ss4;;6678A=8	> 	%  		
 $LLN 3 3A 6
 	24FGi= Ls   ?R+tracedfnamefigname
clear_metaprogparse_stack_tracedot_graph_shapec                    |rWt        j                  | j                        }t        j                  | |      } | j                  j
                  D ]	  }i |_         t        j                  j                  |      \  }	}
|
sdt        j                  z   }
t        j                  d|	|
       t        j                  | |||      }|j!                         }t#        |d|
j%                  d      z         }|	 |
 }|	 ||       y  |||       y )NrL  zWriting FX graph to file: %s%s)r  r  write_)r  )r  deepcopyr   rO   rP  r   r   r%  r+  splitextr   torch_compile_graph_formatr2   r]  r   FxGraphDrawerget_main_dot_graphr   lstrip)r  r  r  r  r  r  r  r   r:   baseextgr   write_methods                 r?   
draw_graphr    s     MM&,,/		2LL&& 	DDI	  'ID#F555HH-tS9""+'		A 	
A1hC89LfSENE|UU&rA   )NF)g      @rb  r   r<   )r!   )rc  )fx_graphTNFN)r  ru   rL  r  r  loggingrM  r  r%  os.pathr  r  r/  r   collections.abcr   dataclassesr   r   typingr   r	   r
   r   r   torch._inductor.inductor_primstorch.distributedtorch.fxrO   torch.utils._pytreeutils_pytreer   torch._dynamo.utilsr   r   ;torch._functorch._activation_checkpointing.ac_logging_utilsr   torch._inductorr   r  torch._library.utilsr   torch._loggingr   rX  r   %torch.fx.experimental._backward_stater   "torch.fx.experimental.proxy_tensorr   r   torch.fx.experimental.sym_noder   r   %torch.fx.experimental.symbolic_shapesr   r   r   r   r   r   torch.fx.passesr   torch.utils._ordered_setr   torch.utils.checkpointr    rr  -_activation_checkpointing.graph_info_providerr"   "_activation_checkpointing.knapsackr#   r$   r%   r&   ,_activation_checkpointing.knapsack_evaluatorr'   _aot_autograd.descriptorsr(   r)   _aot_autograd.functional_utilsr*   _aot_autograd.logging_utilsr+   _aot_autograd.utilsr,   r-   compile_utilsr.   r/   r0   sympydebug_partitionerr1   rw   rN   	getLoggerrJ   r2   Loggerr   r$  r(  r4   rS   ry   rP   r   rP  r   r   rt   r   r   r   r   r   rr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r7  rP  rA  r.  rG  rQ  rX  rZ  r^  rm  r  rs   r  r  r  r  r  r  r+  r2  cacher<  rA  r  rW  ro  r  r  r  r  r  r'  r  r  r  r  r  r  r  r  rA  r  r  r  r  r  rQ   rA   r?   <module>r     sG           	  	  # $ * 6 6  %   $ $ < 6 + + A ? H L  ) / 3  L  L L @ ; M H H  %66 t 6'g''1W^^ 1yy~~		 > > >2      >    T r~~ $ 2>> d  C  I  #$)WWMW "'']W 	?	W
 smW "W XXWtRWW  Gbgg G$ GBGG  bgg $ bgg $ XRWW X XCrww C4 CBbgg B$ BJbgg J$ JKrww K4 Kbgg $ 5rww 54 5J..J
4=$rww-i$y/IJJ$$rww- s d277mU277C<-@@A K88>>K
((--K 
K 
	K
 K\C!88>>C!
((--C! C! 	C!
 C! C! C! XX]]C!L9%,, 95 9"D- G%((-- GD G45 5	5;; 	5 	=G%((.. =GT =G@MG%((.. MGT MG`WWW CL)W 
	W| BF	,Wrww-,W,W ,W "**RWW*=!>	,W
 
,Wj BFx"..x"rww-x" "'']x"
 x" "**RWW*=!>x" 2>>2>>)*x"@ :>AE@ ..@ 
 $,DI#6@  "**RWW*=!>@  2>>2>>)*@ F c("# " "277 s :Rbhh R  "Pbggsl!3 PU277C<=P8Q PKBNN Kr~~ K\Z*xx##Z*xx##Z* XX]]Z* XX]]	Z*
 LLZ* Z* HHMMZ* HHMMZ*zR ..R ~~R  ~~R  	R 
 2>>2>>)*R j@ @D @R^^  .K5..5;?5^^5x /3	W&W&W& #W& z"''*+	W&t."eW ePBHH -T-TK-T 5k-T 	-T
 -T $(=-T 5$s)T#Y&'-T` 05 5 5 5$NT u	u	u	 
"'']	u	pNN/3EHHMM/BNbM`2p g  :>g ..g  $,DI#6g  2>>2>>)*g Z ,0#%)'HH  '' ' 	'
 5d3i(
)' ' c]' 
'rA   