
    iT                        d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlmZ d dlmZ e
rd dlmZ d d	fd
efdZ e j&                  ed d      Z e j&                  ed d	      Ze G d d             Ze G d d             Z G d d      Z G d d      Zd"dZd d dfdZd Zd#dZe G d d             Ze G d d              Zd! Z y)$    N)deque)	dataclass)AnyLiteralTYPE_CHECKINGprofile)
DeviceType)_KinetoEventc                     | j                   S N)childrenxs    O/var/www/html/engine/venv/lib/python3.12/site-packages/torch/profiler/_utils.py<lambda>r      s
    1::     Freversec              #      K   |rt         nd }t         ||             }|r4 ||      }|  | ||            D ]  }|j                  |        |r3y y w)Nc                     | S r    r   s    r   r   z_traverse.<locals>.<lambda>   s    q r   )reversedr   append)treenext_fnchildren_fnr   order	remaining
curr_eventchild_events           r   	_traverser!      s\     H[EeDk"I
Y'
 Z!89 	*K[)	* s   AAAc                 "    | j                         S r   )popr   s    r   r   r      s    aeeg r   T)r   r   c                 "    | j                         S r   )popleftr   s    r   r   r      s     r   c                   V    e Zd ZU dZeed<   dZeed<   dZeed<   dZeed<   e	d        Z
y)EventMetricsr   duration_time_nsself_time_nsidle_time_nsqueue_depthc                 T    | j                   dk(  ry| j                  | j                   z  S )Nr   g        )r(   r*   selfs    r   fraction_idle_timezEventMetrics.fraction_idle_time(   s*      A%  4#8#888r   N)__name__
__module____qualname__r(   int__annotations__r)   r*   r+   propertyr/   r   r   r   r'   r'   !   s=    cL#L#K9 9r   r'   c                   0    e Zd ZU eed<   eed<   dZeed<   y)Intervalstartendr   r+   N)r0   r1   r2   r3   r4   r+   r   r   r   r7   r7   /   s    J	HKr   r7   c                   >    e Zd Zd	dZd Zd ZdefdZdee	   fdZ
y)
EventKeyreturnNc                     || _         y r   event)r.   r?   s     r   __init__zEventKey.__init__7   s	    
r   c                 @    t        | j                  j                        S r   )hashr?   idr-   s    r   __hash__zEventKey.__hash__:   s    DJJMM""r   c                 \    | j                   j                  |j                   j                  k(  S r   )r?   rC   )r.   others     r   __eq__zEventKey.__eq__=   s    zz}}..r   c                 0    | j                   j                   S r   )r?   namer-   s    r   __repr__zEventKey.__repr__@   s    **//"#r   	intervalsc                    d}t        |d       }|rgt        | j                  j                  |d   j                        }t        | j                  j                  |d   j                        }||k  r|||z
  z  }d\  }}|t        |      k  r||   }||   }|dz  }|j                  |j                  kD  r2|j                  |j                  kD  r|dz  }U|j                  |_        |}t        | j                  j                  |j                        }t        | j                  j                  |j                        }||k  r|||z
  z  }|t        |      k  r|S )Nr   c                     | j                   S r   r8   r   s    r   r   z,EventKey.intervals_overlap.<locals>.<lambda>E   s
    AGG r   key)r      rQ   )	sortedmaxr?   start_time_nsr8   minend_time_nsr9   len)	r.   rK   overlap_timeoverlap_startoverlap_endijprev_intervalcurr_intervals	            r   intervals_overlapzEventKey.intervals_overlapC   sJ   9*;<	

 8 8)A,:L:LMMdjj44il6F6FGK{*m ;;1#i. %aLM%aLMFA  =#6#66 $$}'8'88FA*7*;*;M'A

 8 8-:M:MNMdjj44m6G6GHK{*m ;;! #i. $ r   r<   N)r0   r1   r2   r@   rD   rG   strrJ   listr7   r_   r   r   r   r;   r;   6   s-    #/$# $4> r   r;   c                   J    e Zd ZdeddfdZddZd ZddZd Zdd	e	d
e
fdZy)BasicEvaluationprofr<   Nc                 J   || _         i | _        | j                          t        | j                  j	                         d       | _        | j
                  D cg c]  }|j                   c}| _        g | _        | j                         | _
        | j                          y c c}w )Nc                 .    | j                   j                  S r   )r?   rT   r   s    r   r   z*BasicEvaluation.__init__.<locals>.<lambda>j   s    qww/D/D r   rO   )r	   metricscompute_self_timerR   keys
event_keysr?   eventscuda_eventscompute_queue_depthqueue_depth_listcompute_idle_time)r.   re   es      r   r@   zBasicEvaluation.__init__e   s    57  LL%D
 )-81qww8/1 $ 8 8 :  9s   B c                 6   | j                   j                  J t        | j                   j                  j                               }|r|j	                         }|j
                  }|j                  D ]"  }||j
                  z  }|j                  |       $ t        |      | j                  vs!J d|j                   d|j                          t        |      | j                  t        |      <   |j
                  | j                  t        |         _        |ryy)zM
        Computes event's self time(total time - time in child ops).
        NzDuplicate id: z, )r)   )r	   kineto_resultsr   experimental_event_treer#   r(   r   r   r;   rh   rC   rI   r'   )r.   stackr   	self_timer    s        r   ri   z!BasicEvaluation.compute_self_timeq   s     ||**666dll11IIKL J"33I)22 *[999	[)* J't||;  r*//1BC; 2>91UDLL*-. ",!<!< LL$ r   c                    | j                   j                  J | j                   j                  j                         }d d t        fd|D        d       }t        fd|D        d       }t        ||z   d       | _        i }d	}|D ]  t        |fd
|      }||<   ||n|}  d	}d}||z   | j                  z   }	d }
g }|	j                  |
       |	D ]  }t        |d      rE|j                         dz  }|j                         |j                         z   dz  }||v r
||   ||   }t        |d      r@|j                         }|j                         |j                         z   }||v r/||   *||   }n$t        |d      r|j                  }|j                  }|t        |      k  r@||   j                         k  r*|dz  }|t        |      k  r||   j                         |k  r*||z
  dz   }t        |d	      }t        |d      st        |d      r|j!                  t#        |             ]t        |d      sk|| j$                  t'        |         _         |S )z
        Computes queue_depth at each event. This will calculate the queue depth data for
        All the events in the tree.
        This will return a list of Interval of queue depth data of cuda launch and kernels.
        c                 `    h d}t        t        | d|             t        fd|D              S )z+Check if the event is a CUDA launch kernel.>   cudaLaunchKernel__cudaLaunchKernelcudaLaunchKernelExCcudaLaunchCooperativeKernel&cudaLaunchCooperativeKernelMultiDevicerI   c              3   @   K   | ]  }j                  |        y wr   )
startswith.0patternrI   s     r   	<genexpr>zUBasicEvaluation.compute_queue_depth.<locals>.is_cuda_launch_kernel.<locals>.<genexpr>   s     OGtw/Os   )ra   getattrany)rq   launch_patternsrI   s     @r   is_cuda_launch_kernelzBBasicEvaluation.compute_queue_depth.<locals>.is_cuda_launch_kernel   s0    O wq&!,-DOOOOr   c                     | j                         t        j                  k7  ryt        t	        | d|             j                         h d}t        fd|D               S )z,Check if the event is a CUDA runtime kernel.FrI   >   cpymemfreeallocc              3   &   K   | ]  }|v  
 y wr   r   r   s     r   r   zNBasicEvaluation.compute_queue_depth.<locals>.is_cuda_kernel.<locals>.<genexpr>   s     Kw7d?Ks   )device_typer
   CUDAra   r   lowerr   )rq   exclude_patternsrI   s     @r   is_cuda_kernelz;BasicEvaluation.compute_queue_depth.<locals>.is_cuda_kernel   sS     }}*//1wq&!,-335D  ?K:JKKKKr   c              3   4   K   | ]  } |      s|  y wr   r   )r   rq   r   s     r   r   z6BasicEvaluation.compute_queue_depth.<locals>.<genexpr>   s     D1+@+CQD   c                 "    | j                         S r   start_nsr   s    r   r   z5BasicEvaluation.compute_queue_depth.<locals>.<lambda>       !**, r   rO   c              3   4   K   | ]  } |      s|  y wr   r   )r   rq   r   s     r   r   z6BasicEvaluation.compute_queue_depth.<locals>.<genexpr>   s     =1>!+<Q=r   c                 "    | j                         S r   r   r   s    r   r   z5BasicEvaluation.compute_queue_depth.<locals>.<lambda>   r   r   c                 "    | j                         S r   r   r   s    r   r   z5BasicEvaluation.compute_queue_depth.<locals>.<lambda>   s    1::< r   r   c                 F    | j                         j                         k(  S r   )linked_correlation_id)r   cuda_launch_events    r   r   z5BasicEvaluation.compute_queue_depth.<locals>.<lambda>   s!    !113$::<= r   rN   c                     t        | d      r| j                         dz  S t        | d      r| j                         S t        | d      r| j                  S t	        d      )Nstart_us  r   rT   zUnknown Event Type)hasattrr   r   rT   	Exceptionr>   s    r   new_old_event_comparatorzEBasicEvaluation.compute_queue_depth.<locals>.new_old_event_comparator   sW    uj)~~'$..uj)~~''uo.***011r   r   r   r   rT   rQ   )r	   rs   rl   rR   rm   index_of_first_matchsortr   r   duration_usr   duration_nsrT   rV   rW   rS   r   r7   rh   r;   r+   )r.   cuda_event_listcuda_launch_eventscuda_kernel_eventskernel_mappinglast_mapped_kernelindexcurrent_kernel_indexspawned_kernel_index
all_eventsr   ro   r?   
start_timeend_timecurrent_queue_depthr   r   r   s                   @@@r   rn   z#BasicEvaluation.compute_queue_depth   s    ||**666,,55<<>
	P	L $DD&
 $==&

 "!339O
 35!3 	T("=(	E 16N,-*/*;AS	T  !!'*<<t{{J
	2 ,.45  	PEuj)"^^-4
!NN,u/@/@/BBdJN*~e/D/P+9%+@(uj)"^^-
 >>+e.?.?.AAN*~e/D/P+9%+@(0"00
 ,, %s+='>>'(<=FFHZW$)$ %s+='>>'(<=FFHZW #79M"MPQ"Q"%&91"=uj)WUJ-G ''Z3FG 0<OXe_-9A 	PD  r   c                    d}d}g }| j                   r| j                  rw|t        | j                  d   j                  | j                   d   j                        t        | j                   d   j
                  | j                  d   j                        gz  }| j                   D ][  }|j                  dk(  r|s|j
                  }d}|j                  dkD  s2|s5|j                  t        ||j                               d}] | j                  D cg c]  }|j                   }}|D ]7  }t        |      j                  |      | j                  t        |         _        9 yc c}w )z4
        Computes idle time of the profile.
        Fr   r   TN)ro   rl   r7   rT   r8   r9   rV   r+   r   rh   r?   r;   r_   r*   )r.   idle
idle_startidle_intervals
data_pointrq   
event_listr?   s           r   rp   z!BasicEvaluation.compute_idle_time   sD   
 
)+  T[[Q55t7L7LQ7O7U7UV..r266B8S8ST N
 // 	J%%*4'^^
%%)d%%hz:;K;K&LM	 (,||4!agg4
4 	0E9A:/ LL%)6	0 5s   E c                    ddl }t        t        | j                              }|D cg c]  }|j                   }}dd}g }d}|t        |      k  r||   kD  r|dz  }t        |dz   t        |            D ]i  }	t        |fd|	      }
t        ||	|
      }|%||   |k\  s.|j                  t        ||   j                  ||   j                               |
|
n|} n |dz  }|t        |      k  r| j                  D cg c]  }|j                  |      r| }}|r|j                  |D cg c]  }| j                  |   j                   c}|j                         }|j                  |D cg c]  }| j                  |   j"                   c}|j                         }||j%                  |      z
  |j'                  |      z  }||j%                  |      z
  |j'                  |      z  }|d	|z  z   }t)        t+        ||d
      t-        j.                  d      d
      D cg c]  \  }}|	 }}}|d| }|S c c}w c c}w c c}w c c}w c c}}w )a  
        Filter and Rank the events based on some heuristics:
        1) Events that are in the falling phase of the queue depth.
        2) Events that have a high idle_time, self_time difference.

        Parameters:
            length: The number of events to return.
        r   N   rQ   c                     | k  S r   r   )r   bottom_threasholds    r   r   z-BasicEvaluation.rank_events.<locals>.<lambda>.  s    .?)? r   rN   )r8   r9   )dtypeg333333?T)strict)rP   r   )torchrb   r   ro   r+   rW   ranger   argmaxr   r7   r8   rh   r_   tensorr)   float32r/   meanstdrR   zipoperator
itemgetter)r.   lengthr   ro   rq   	qd_valuestop_threasholddecrease_intervalr[   r\   next_minimum_idxpeak_idxr?   r   rv   	idle_timenormalized_gainnormalized_selfheuristic_score_list_r   s                       @r   rank_eventszBasicEvaluation.rank_events  s    	)>)> ?@,<=qQ]]=	=#i. |//Q1q5#i.1  $8?q$  ")1:JK 'Ih,?>,Q%,, ,X6<<>Nq>Q>W>W
 -=,H(aA!" FA+ #i. 2 
&&'89 

 

 ?IJee$11Jmm % I EOPEe$77Pmm % I  )5::i+@@EIIiDXXO(5::i+@@EIIiDXXO#2S?5J#J 
 !',jF ++A. !Au J  $GV,Js >:
 K Qs   II	5 I7 I.Ir   print_enablec                    | j                  |      }|s|S |rdnd}|dj                  |D cg c]@  }d d| dt        |j                         d| j                  |   j
                  dz  d	d
d 	B c}      z  }|rt        |       |S c c}w )NzOptimizable events:
zNo events to optimize

zP--------------------------------------------------------------------------------z
Event:                z
Source code location: z
Percentage idle time: d   z.2fz%
)r   joinsource_code_locationr?   rh   r/   print)r.   r   r   r   outputr?   s         r   get_optimizable_eventsz&BasicEvaluation.get_optimizable_events[  s    %%f-
,6(<U$)) (  J g +EKK89 :||E*==CCH I	
		
 		
 &Ms   AB	
r`   )rQ   T)r0   r1   r2   r	   r@   ri   rn   rp   r   r3   boolr   r   r   r   rd   rd   d   s@    
!W 
! 
!=,m ^08GRS D r   rd   c                 z    ||t        |       k\  rt        |       }t        ||      D ]  } || |         s|c S  y r   )rW   r   )seq	predicater8   r9   r[   s        r   r   r   p  sF    
{cSXo#h5# SVH r   c                     | S r   r   r   s    r   r   r   y  s    a r   c                 h    | || } t        |       dk(  ry | j                  t        | |            |z   S )Nr   rO   )rW   r   rS   )r   rP   r8   r9   s       r   r   r   y  s6    
eC.C
3x1}99S#&'%//r   c                 ~    | ;t        j                  d| j                        }|| j                  } 1| j                  S y)Nz
\.py\(.*\)zNo source code location found)researchrI   parent)r?   matchs     r   r   r     s:    

		-4=LLEzz*r   c                  J    ddl m}   |        5  	 d d d        y # 1 sw Y   y xY w)Nr   r   )torch.autograd.profilerr	   r   s    r   _init_for_cuda_graphsr     s"    /	   s   "c                   l    e Zd ZU dZeed<   ed   ed<   ed   dz  ed<   eez  dz  ed<   eee	f   ed	<   y)
TimelineEventz-Represents an event in the profiler timeline.	timestamp)r8   r9   regular
event_typefilenamenodeNmarker_type
identifierr?   )
r0   r1   r2   __doc__r3   r4   r   ra   dictr   r   r   r   r   r     sD    7N122+,t33c	D  S>r   r   c                   V    e Zd ZU dZed   ed<   eez  ed<   edz  ed<   dZ	edz  ed<   y)ContextStackEntryz5Represents a context (filename or node) in the stack.r   context_typer   Nmetadatatid)
r0   r1   r2   r   r   r4   ra   r3   r   r   r   r   r   r   r     s3    ?,--c	TkCtr   r   c           
         ddl m} | j                  dg       }g d }fd}|D ]q  }d|vsd|vr ||      r;|d   d	d
 }|j                  d      r |d||       8	 t	        |      } |d|       O|d   }j                  t        |ddd|             s j                  d        g }	D ]  }
|
j                  xdk(  r` |
j                  J |
j                  dk(  rzt        |
j                  t              sJ |j                  |
j                        }|
j                  j                  d      }|	j                  t        d|
j                  ||             |
j                  dk(  sd}|
j                  j                  d      }t!        |	      D ]/  }|j"                  dk(  s|j$                  |k(  s#|j&                  } n |s|j                  di       }|
j                  |v s=||
j                     }|	j                  t        d|
j                  ||             uxdk(  rm t)        t+        |	      dz
  dd      D ]N  }|	|   }|
j                  |j"                  k(  s"|
j                  |j                  k(  s<|	j-                  |         dk(  sd}d}|
j                  j                  d      }t!        |	      D ]h  }|j$                  |k(  s|j"                  dk(  s#|j&                  s0|j&                  j                  dd      }|j&                  j                  dd      } n |s|s|
j                  j/                  di       }|r||d<   |s||d<    y# t
        $ r Y w xY w)an  
    Maps recorded profiler events to their corresponding fx nodes and adds stack traces.

    Builds a timeline of all events (regular ops and FX markers for filenames/nodes),
    sorts by timestamp, then processes chronologically while maintaining a context stack of active
    filename/node scopes. Regular events are augmented with stack traces and node names from the
    innermost active context. Runtime is O(n log n) for n events.

    Args:
        traced_data: Json of profiler events from Chrome trace

    Returns:
        Dict mapping recorded event names to their aten operations with added stack traces
    r   )_FX_METADATA_REGISTRYtraceEventsc                     | j                  d      dk(  xrD | j                  dd      j                  d      xr! | j                  dd      j                  d      S )Ncatcpu_oprI    z## z ##)getr   endswithr>   s    r   is_fx_marker_eventzLmap_recorded_events_to_aten_ops_with_stack_trace.<locals>.is_fx_marker_event  sT    IIe( 6		&"%0076		&"%..u5	
r   c           	          |d   }||d   z   }j                  t        |d| ||             j                  t        |d| ||             y )Ntsdurr8   r9   )r   r   )r   r   r?   start_tsend_tsevent_timelines        r   append_fx_marker_eventzPmap_recorded_events_to_aten_ops_with_stack_trace.<locals>.append_fx_marker_event  sX    ;E%L((GZUK	
 	&%ZG	
r   r  r  rI      z.pyr   r   r   Nc                     | j                   S r   )r   r   s    r   r   zBmap_recorded_events_to_aten_ops_with_stack_trace.<locals>.<lambda>  s
    akk r   rO   r8   r   node_metadatar9   rQ   r   stack_tracezNo model stack trace availabler  args	node_name)torch.fx.tracebackr  r  r  r3   
ValueErrorr   r   r   r   r   r   
isinstancera   r?   r   r   r   r   r   r   rW   r#   
setdefault)traced_datar  trace_eventsr	  r  r?   content
node_indexr  context_stacktimeline_eventr   r   current_file_metadata	ctx_entryr  	node_metar[   current_stack_tracecurrent_node_name	event_tidr  r  s                         @r   0map_recorded_events_to_aten_ops_with_stack_tracer(    s    9??="5L +-N

  YuU 2e$FmAb)G&&z7EB!$WJ 'vz5A T{H!!-)T4QV"WX'Y, 12 .0M ) L>''%00<<<!--;%n&?&?EEE4889R9RSH(..2259C!(()&(A(A8S
 $//69,0)(..2259C%-m%< "	%22j@ ) 44=4F4F1!" -(=(A(A/SU(V)44E5B . 9 96I *00 1$*N,E,EyRU!" s=1A5r2> A -a 0I&22i6L6LL*559M9MM%))!,  '+#$(!*0044U;	!)-!8 
"I }}	1$11V;	@R@R2;2D2D2H2H -/O3/ 1:0B0B0F0Fvr0R- "
" '*;)//::62FD*.A]+(,=[)YL>! " s   M))	M65M6)r   Nr`   )!	functoolsr   r   collectionsr   dataclassesr   typingr   r   r   r   r	   torch.profilerr
   torch.autogradr   r   r!   partialtraverse_dfstraverse_bfsr'   r7   r;   rd   r   r   r   r   r   r   r(  r   r   r   <module>r2     s     	  ! . . + % + *>u * * !y  4EtT y  ,e
 
9 
9 
9   + +\I IX  qd 0+      O>r   