
    i?                     p   d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	m
Z
 d dlmZ d dlZd dlmZ g dZ G d d	e      Zd
 Zd Zd Zd Z G d d      Z G d d      Z edg d      Z G d de      Z G d de      Z G d de      Z G d d      Zd ZdZdZ d Z!d#d Z"	 	 	 	 	 	 	 	 	 	 d$d!Z#d" Z$y)%    N)defaultdict
namedtuple)
attrgetter)AnyOptional)
deprecated)
DeviceType)	EventListFormattedTimesMixinIntervalKernelFunctionEventFunctionEventAvgStringTableMemRecordsAccc                        e Zd ZdZ fdZd Zd Zd Zd Zd Z	e
d        Z	 	 	 	 	 	 	 	 dd	Zd
 Zd ZdedefdZ	 	 	 ddZd Z xZS )r
   a>
  A list of profiling events with helper methods for analysis and visualization.

    EventList extends the standard Python list to provide specialized methods for
    working with profiling events (FunctionEvent or FunctionEventAvg objects).
    It includes utilities for aggregating statistics, formatting output tables,
    and exporting profiling data.

    This class is typically returned by profiler methods and should not be
    instantiated directly by users.

    Args:
        *args: Standard list arguments.
        use_device (str, optional): Device type for profiling ("cuda", "xpu", etc.).
        profile_memory (bool, optional): Whether memory profiling was enabled. Default: False.
        with_flops (bool, optional): Whether to include FLOP counts. Default: False.

    Attributes:
        _use_device (str): Device type being profiled.
        _profile_memory (bool): Whether memory profiling is enabled.
        _with_flops (bool): Whether FLOP counting is enabled.
        _tree_built (bool): Whether the event tree structure has been built.

    Key Methods:
        table(...): Format events as a table string for display.
        export_chrome_trace(path): Export to Chrome tracing format.
        export_stacks(path, metric): Export stack traces with metrics.
        key_averages(...): Compute averaged statistics grouped by operation name.
        total_average(): Compute aggregate totals across all events (sums, not averages).

    Properties:
        self_cpu_time_total: Sum of self CPU time across all events.

    Example::

        import torch
        from torch.profiler import profile, ProfilerActivity

        with profile(activities=[ProfilerActivity.CPU]) as prof:
            x = torch.randn(100, 100)
            y = torch.matmul(x, x)

        # EventList is returned by prof.events()
        events = prof.events()

        # Display as formatted table
        print(
            events.table(
                sort_by="cpu_time_total", row_limit=20, top_level_events_only=False
            )
        )

        # Export to Chrome tracing format
        events.export_chrome_trace("trace.json")

        # Get averaged statistics
        avg_events = events.key_averages()
        print(avg_events.table())

        # Export stack traces
        events.export_stacks("stacks.txt", "self_cpu_time_total")

    See Also:
        - :class:`FunctionEvent`: Individual profiling event
        - :class:`FunctionEventAvg`: Averaged profiling statistics
        - :meth:`table`: Format events as a readable table
        - :meth:`key_averages`: Aggregate events by operation name
    c                     |j                  dd       }|j                  dd      }|j                  dd      }t        |   |i | || _        || _        d| _        || _        y )N
use_deviceprofile_memoryF
with_flops)popsuper__init___use_device_profile_memory_tree_built_with_flops)selfargskwargsr   r   r   	__class__s         V/var/www/html/engine/venv/lib/python3.12/site-packages/torch/autograd/profiler_util.pyr   zEventList.__init___   sf    ZZd3
$4e<ZZe4
$)&)%- %    c                 r    | j                          | j                          | j                          d| _        y )NT)_populate_cpu_children_remove_dup_nodes_set_backward_stacktracesr   r   s    r"   _build_treezEventList._build_treej   s.    ##% &&(r#   c                 "    | j                         S N)tabler(   s    r"   __str__zEventList.__str__p   s    zz|r#   c                    	 t               }t        t        |             D ]  }| |   j                  | |   j                  j                  | |   j                  k(  s=t        | |   j                  j
                        dk(  sc| |   j
                  | |   j                  _        | |   j                  | |   j                  _        | |   j
                  D ]  }| |   j                  |_         |j                  |        t        |      dk(  ry t        |       D cg c]  \  }}||vs| }}}| j                          | j                  |       Rc c}}w )N   r   )setrangelen
cpu_parentnamecpu_childrenkernelsadd	enumerateclearextend)r   	to_deleteidxchindevnew_evtss          r"   r&   zEventList._remove_dup_nodess   s2   ISY' 
'I((4S	,,11T#Y^^CDI00==>!C8<S	8N8NDI((53793D3DDI((0"3i44 =(,S	(<(<=MM#&
' 9~"*3D/RwsBS	=QRHRJJLKK!+ " Ss   E*Ec                    | D cg c]-  }|j                   s|j                  t        j                  k(  r|/ }}t	        |t        d            }t        j                  |d       }|D ]  \  }}t	        |d       }g }|D ]  }	t        |      dkD  r|d   }
|	j                  j                  |
j                  j                  k\  s-|	j                  j                  |
j                  j                  kD  r|j                          nG|
j                  |	       |	j                  t        d|	j                          |	j#                  |
       nt        |      dkD  r|j%                  |	         yc c}w )	a4  Populate child events into each underlying FunctionEvent object.

        One event is a child of another if [s1, e1) is inside [s2, e2). Where
        s1 and e1 would be start and end of the child event's interval. And
        s2 and e2 start and end of the parent event's interval

        Example: In event list [[0, 10], [1, 3], [3, 4]] would have make [0, 10]
        be a parent of two other intervals.

        If for any reason two intervals intersect only partially, this function
        will not record a parent child relationship between then.
        threadkeyc                 2    | j                   | j                  fS r+   )rB   node_idevents    r"   <lambda>z2EventList._populate_cpu_children.<locals>.<lambda>   s    u||U]]&C r#   c                 \    | j                   j                  | j                   j                   gS r+   )
time_rangestartendrG   s    r"   rI   z2EventList._populate_cpu_children.<locals>.<lambda>   s&    5#3#3#9#9E<L<L<P<P;P"Q r#   r   Nz(There is already a CPU parent event for )is_asyncdevice_typer	   CPUsortedr   	itertoolsgroupbyr2   rK   rL   rM   r   append_cpu_childr3   AssertionErrorrD   set_cpu_parentappend)r   evtsync_eventseventsthreads
_thread_idthread_eventsthread_events_current_eventsrH   parents              r"   r%   z EventList._populate_cpu_children   ss   $ 
<<COOz~~$E 
 

 8$
 ##C
  *1 	-%J#QN 35N' -.)A-+B/F((..&2C2C2G2GG ++//&2C2C2G2GG '**,//6 ++7"0"J599+ V#  ,,V4 .)A-" %%e,%-	-9
s   2E0c                 V   fdi }| D ]D  } |      |j                   |j                  |j                  f}||vs6|j                   ||<   F | D ]S  } |      }||j                  t	        d      |j                  |j                  f}|j                  |g       |_         U y )Nc                 P    | y | j                   dk(  r| S  | j                        S Nr/   )scoper3   )rY   	bw_parents    r"   rf   z6EventList._set_backward_stacktraces.<locals>.bw_parent   s*    {a
 00r#   z1Expected fwd_thread to be set for backward parent)stacksequence_nrrB   
fwd_threadrV   get)r   
fwd_stacksrY   tprf   s        @r"   r'   z#EventList._set_backward_stacktraces   s    	1 
 	.C~%#))*?__cjj1J&$'IIJqM		.  	2C#A}<<'(K  ]]ALL1&NN1b1		2r#   c                 &    t        d | D              S )Nc              3   4   K   | ]  }|j                     y wr+   )self_cpu_time_total.0rH   s     r"   	<genexpr>z0EventList.self_cpu_time_total.<locals>.<genexpr>   s     ?5,,?   )sumr(   s    r"   rp   zEventList.self_cpu_time_total   s    ?$???r#   c	                 V    t        | ||||||| j                  | j                  ||      S )a  Print an EventList as a nicely formatted table.

        Args:
            sort_by (str, optional): Attribute used to sort entries. By default
                they are printed in the same order as they were registered.
                Valid keys include: ``cpu_time``, ``cuda_time``, ``xpu_time``,
                ``cpu_time_total``, ``cuda_time_total``, ``xpu_time_total``,
                ``cpu_memory_usage``, ``cuda_memory_usage``, ``xpu_memory_usage``,
                ``self_cpu_memory_usage``, ``self_cuda_memory_usage``,
                ``self_xpu_memory_usage``, ``count``.
            top_level_events_only(bool, optional): Boolean flag to determine the
                selection of events to display. If true, the profiler will only
                display events at top level like top-level invocation of python
                `lstm`, python `add` or other functions, nested events like low-level
                cpu/cuda/xpu ops events are omitted for profiler result readability.
            time_unit(str, optional): A time unit to be used for all values in the
                table. Valid options are: ``s``, ``ms`` and ``us``.

        Returns:
            A string containing the table.
        )
sort_by	row_limitmax_src_column_widthmax_name_column_widthmax_shapes_column_widthheaderr   r   top_level_events_only	time_unit)_build_tabler   r   )	r   rw   rx   ry   rz   r{   r|   r}   r~   s	            r"   r,   zEventList.table   s?    @ !5"7$;//''"7
 	
r#   c                 ^   ddl }| j                  sdn| j                  }t        |d      5 }d}|j                  d       | D ]  }|j                  |j                  dj                  |j                  |j                  j                  |j                  j                         |j                  s|j                  nd|j                   d|j                   d	             |j                  D ]P  }|j                  d
|j                   d|j                  j                   d|j                   d| d| d       |dz  }R  t        |       dkD  r=|j                  |j                         dz
  |j                          |j#                          |j                  d       ddd       y# 1 sw Y   yxY w)zExport an EventList as a Chrome tracing tools file.

        The checkpoint can be later loaded and inspected under ``chrome://tracing`` URL.

        Args:
            path (str): Path where the trace will be written.
        r   Ncudaw[zc{{"name": "{}", "ph": "X", "ts": {}, "dur": {}, "tid": {}, "pid": "CPU functions", "args": {{}}}}, z
" node_id:z, thread_id:z "z
{"name": "z", "ph": "s", "ts": z	, "tid": z , "pid": "CPU functions", "id": z, "cat": "cpu_to_z", "args": {}}, r/      ])osr   openwrite
trace_nameformatrK   rL   
elapsed_us	is_remoterB   rF   r6   r2   seektellSEEK_SETtruncate)r   pathr   device_namefnext_idrY   _s           r"   export_chrome_tracezEventList.export_chrome_trace  s    	$($4$4f$:J:J$_ +	G GGCL !!>>)' (.v,,113"}} 

)#++l3::,bQ(   !A GG%cnn%5 6!!$!5!5 6 7""%** .!!(	 ***5 7((	 qLG!'!!D 4y1}qvvx!|R[[1

GGCLW+	 +	 +	s   E/F##F,c                 
    g dS )N)rp   self_cuda_time_totalself_xpu_time_totalself_privateuse1_time_total r(   s    r"   supported_export_stacks_metricsz)EventList.supported_export_stacks_metricsU  s    
 	
r#   r   metricc           	         || j                         vr%t        dt        | j                               z         t        j                  dd      }t	        |d      5 }| D ]  }|j
                  st        |j
                        dkD  s)t        ||j                  dd      j                  dd      j                  d	d            }t        |      dkD  std
}t        |j
                        D ]  }||j                  |      z  }|dz  } |d d dz   t        t        |            z   }|j                  |dz           	 d d d        y # 1 sw Y   y xY w)Nzmetric should be one of: z ;	
____r   r   r   devicexpuprivateuse1 ;rN    
)r   
ValueErrorstr	maketransr   rg   r2   getattrreplaceintreversed	translater   )	r   r   r   translate_tabler   rY   metric_value	stack_strentrys	            r"   export_stackszEventList.export_stacks]  s>   ==??+d::<=>  --&9$_ 	2 299SYY!!3#*vx8 1 9	$L <(1,$&	%-cii%8 -E%)III%,I- %.crNS$83s<?P;Q$Q		D 012	2 	2 	2s    E-EA
EA(EEc                    | j                   st        d      t        t              }dt        t
        df   ffd}| D ]  | ||||         j                         ! t        |j                         | j                  | j                  | j                        }|D ])  j                  d| _        |sd_        |r#d_        + |S )a  Averages all function events over their keys.

        Args:
            group_by_input_shapes: group entries by
                (event name, input shapes) rather than just event name.
                This is useful to see which input shapes contribute to the runtime
                the most and may help with size-specific optimizations or
                choosing the best candidates for quantization (aka fitting a roof line)

            group_by_stack_n: group by top n stack trace entries

            group_by_overload_name: Differentiate operators by their overload name e.g. aten::add.Tensor
            and aten::add.out will be aggregated separately

        Returns:
            An EventList containing FunctionEventAvg objects.
        z5Expected tree to be built before calling key_averagesreturn.c                    t        | j                        t        | j                        t        | j                        t        | j                        t        | j
                        g}|r|j                  j                         |r$|j                  t        | j                               |dkD  r|| j                  d | z  }t        |      S Nr   )r   rD   rF   rP   	is_legacyis_user_annotationrX   overload_nameinput_shapesrg   tuple)rH   group_by_input_shapesgroup_by_stack_ngroup_by_overload_namerD   rY   s        r"   get_keyz'EventList.key_averages.<locals>.get_key  s     EIIEMM"E%%&EOO$E,,-C &

3,,-$

3u1123!#u{{#4$455:r#   r   r   r   Nr   )r   rV   r   r   r   r   r7   r
   valuesr   r   r   rg   r   r   )r   r   r   r   statsr   avg_listrY   s          @r"   key_averageszEventList.key_averagesu  s    .  G  :EEU9V	38_	$  	C.0@BX c#h	 LLN''//''	
  	'C		"3#34CI(#% )$&!	' r#   c                 N    t               }| D ]  }||z  }d|_         d|_        |S )a,  Compute aggregate statistics across all events.

        Accumulates statistics from all events into a single FunctionEventAvg object.
        This is primarily useful for computing total metrics (total CPU time, total
        memory usage, etc.) across the entire profiling session, regardless of
        operation type.

        Note:
            This sums up times and counts across ALL different operations, so the
            "average" metrics (like cpu_time) represent the average time per operation
            call across the entire session, mixing all operation types together.
            For per-operation averages, use :meth:`key_averages` instead.

        Returns:
            FunctionEventAvg: A single aggregate object with key="Total" containing
                accumulated statistics.

        NTotal)r   rD   )r   
total_statrY   s      r"   total_averagezEventList.total_average  s;    & &'
 	"C#J!JN	" !
r#   )Nd   K   7   P   NFN)Fr   F)__name__
__module____qualname____doc__r   r)   r-   r&   r%   r'   propertyrp   r,   r   r   r   r   r   r   __classcell__)r!   s   @r"   r
   r
      s    BH	& "0D-L24 @ @
   "#,
\6p
2# 2s 24 $$	BHr#   r
   c                 N    d}d}| |k\  r	| |z  ddS | |k\  r	| |z  ddS | ddS )+Define how to format time in FunctionEvent.    .A     @@.3fsmsusr   )time_usUS_IN_SECONDUS_IN_MSs      r"   _format_timer     sU    "LH,L(-Q//(H$S),,c]"r#   c                 L    |dk(  r| dk7  rt        d|        y| dz  |z  ddS )r   r   zExpected time_us == 0 but got NaNg      Y@.2f%)rV   )r   total_time_uss     r"   _format_time_sharer     s@    a< #A'!KLLo-c2!44r#   c                     d}d|z  }d|z  }t        |       |k\  r| dz  |z  ddS t        |       |k\  r| dz  |z  ddS t        |       |k\  r| dz  |z  ddS t        |       dz   S )z&Return a formatted memory size string.i         ?r   z GBz MBz KBz B)absr   )nbytesKBMBGBs       r"   _format_memoryr     s    	B	B	B
6{b3,#C(,,	V	3,#C(,,	V	3,#C(,,6{T!!r#   c                       t         fd      S )Nc                 .    t        t        |             S r+   )r   r   )r   r4   s    r"   rI   z!_attr_formatter.<locals>.<lambda>  s    gdD.A!B r#   )r   r4   s   `r"   _attr_formatterr     s    BCCr#   c                       e Zd ZdZ ed      Z ed      Z ed      Z ed      Z ed      Z	 ed      Z
ed        Zed	        Ze ed
e      d               Zy)r   z{Helpers for FunctionEvent and FunctionEventAvg.

    The subclass should define `*_time_total` and `count` attributes.
    cpu_timedevice_timecpu_time_totaldevice_time_totalrp   self_device_time_totalc                 \    | j                   dk(  rdS d| j                  z  | j                   z  S Nr   g        r   )countr   r(   s    r"   r   zFormattedTimesMixin.cpu_time  s+    jjAosQ31D1D+Dtzz+QQr#   c                 \    | j                   dk(  rdS d| j                  z  | j                   z  S r   )r   r   r(   s    r"   r   zFormattedTimesMixin.device_time  s+    jjAosT31G1G+G$**+TTr#   z<`cuda_time` is deprecated, please use `device_time` instead.categoryc                     | j                   S r+   )r   r(   s    r"   	cuda_timezFormattedTimesMixin.cuda_time  s     r#   N)r   r   r   r   r   cpu_time_strdevice_time_strcpu_time_total_strdevice_time_total_strself_cpu_time_total_strself_device_time_total_strr   r   r   r   FutureWarningr   r   r#   r"   r   r     s    
 #:.L%m4O()9:+,?@-.CD!01I!JR R U U F 	 
 r#   r   c                       e Zd Zd Zd Zy)r   c                      || _         || _        y r+   )rL   rM   )r   rL   rM   s      r"   r   zInterval.__init__  s    
r#   c                 4    | j                   | j                  z
  S )z4
        Returns the length of the interval
        )rM   rL   r(   s    r"   r   zInterval.elapsed_us  s     xx$**$$r#   N)r   r   r   r   r   r   r#   r"   r   r     s    %r#   r   r   )r4   r   durationc                   ^   e Zd ZdZddddddddddddej
                  dddddddddfdZd Zd Zd	 Z	e
d
        Ze
d        Ze
 ede      d               Ze
d        Ze
d        Ze
d        Ze
 ede      d               Ze
d        Ze
 ede      d               Ze
d        Zd Zy)r   a  Profiling information about a single function.

    FunctionEvent records the execution of a single operation during profiling.
    These events are obtained from the profiler/kineto and contain detailed
    timing and memory usage information.

    .. note::
        FunctionEvent objects are typically created by the profiler/kineto and should not
        be instantiated directly by users. Access them through the profiler's output.

    Attributes:
        id (int): Unique identifier for this event.
        node_id (int): Node identifier for distributed profiling (-1 if not applicable).
        name (str): Name of the profiled function/operator.
        overload_name (str): Overload name for the operator (requires _ExperimentalConfig(capture_overload_names=True) set).
        trace_name (str): Same as name, just changes ProfilerStep* to ProfilerStep#
        time_range (Interval): Time interval containing start and end timestamps in microseconds.
        thread (int): Thread ID where the operation started.
        fwd_thread (int): Thread ID of the corresponding forward operation.
        kernels (List[Kernel]): List of device kernels launched by this operation.
        count (int): Number of times this event was called (usually 1).
        cpu_children (List[FunctionEvent]): Direct CPU child operations.
        cpu_parent (FunctionEvent): Direct CPU parent operation.
        input_shapes (Tuple[int, ...]): Shapes of input tensors (requires record_shapes=true).
        concrete_inputs (List[Any]): Concrete input values (requires record_shapes=true).
        kwinputs (Dict[str, Any]): Keyword arguments (requires record_shapes=true).
        stack (List[str]): Python stack trace where the operation was called (requires with_stack=true).
        scope (int): at::RecordScope identifier (0=forward, 1=backward, etc.).
        use_device (str): Device type being profiled ("cuda", "xpu", etc.).
        cpu_memory_usage (int): CPU memory allocated in bytes.
        device_memory_usage (int): Device memory allocated in bytes.
        is_async (bool): Whether this is an asynchronous operation.
        is_remote (bool): Whether this operation occurred on a remote node.
        sequence_nr (int): Sequence number for autograd operations.
        device_type (DeviceType): Type of device (CPU, CUDA, XPU, PrivateUse1, etc.).
        device_index (int): Index of the device (e.g., GPU 0, 1, 2).
        device_resource_id (int): Resource ID on the device (ie. stream ID).
        is_legacy (bool): Whether this is from the legacy profiler.
        flops (int): Estimated floating point operations.
        is_user_annotation (bool): Whether this is a user-annotated region.
        metadata_json (str): Additional metadata in JSON format.

    Properties:
        cpu_time_total (float): Total CPU time in microseconds.
        device_time_total (float): Total device (CUDA/XPU/etc) time in microseconds.
        self_cpu_time_total (float): CPU time excluding child operations.
        self_device_time_total (float): Device time excluding child operations.
        self_cpu_memory_usage (int): CPU memory usage excluding child operations.
        self_device_memory_usage (int): Device memory usage excluding child operations.
        cpu_time (float): Average CPU time per call.
        device_time (float): Average device time per call.
        key (str): Key used for grouping events (usually same as name).

    See Also:
        - :class:`torch.profiler.profile`: Context manager for profiling
        - :class:`EventList`: List container for FunctionEvent objects with helper methods
        - :class:`FunctionEventAvg`: Averaged statistics over multiple FunctionEvent objects
    Nr   FrN   c                    || _         || _        || _        || _        || _        t        ||      | _        || _        || _        g | _	        d| _
        g | _        d | _        || _        || _        || _        |	| _        |
| _        || _        || _        || _        || _        || _        || _        || _        || _        ||n|| _        || _        || _        || _        d| _        d| _        d| _         || _!        y )Nr/   rN   )"idrF   r4   r   r   r   rK   rB   ri   r6   r   r5   r3   r   concrete_inputskwinputsrg   re   r   cpu_memory_usagedevice_memory_usagerO   r   rh   rP   device_indexdevice_resource_idr   flopsr   self_cpu_percenttotal_cpu_percenttotal_device_percentmetadata_json)r   r  r4   rB   start_usend_usr   ri   r   rg   re   r   r  r  rO   r   rh   rF   rP   r  r  r   r  r   r  r  r   r  s                               r"   r   zFunctionEvent.__init__d  s   < #	"/)$,Xv$>!)3%'
1337-9*9(0 

)3%5(; &( +'2!-(0F6H 	  )$)
2D "!#$&!*r#   c                     | j                   t        j                  k7  rt        d      | j                  j                  t        |||             y )NExpected device_type to be CPU)rP   r	   rQ   rV   r6   rX   r   )r   r4   r   r
  s       r"   append_kernelzFunctionEvent.append_kernel  s;    z~~- !ABBF4:;r#   c                    | j                   t        j                  k7  rt        d      t	        |t
              st        d      |j                   t        j                  k7  rt        d      | j                  j                  |       y)zAppend a CPU child of type FunctionEvent.

        One is supposed to append only direct children to the event to have
        correct self cpu time being reported.
        r  z$Expected child to be a FunctionEventz$Expected child device_type to be CPUN)rP   r	   rQ   rV   
isinstancer   r5   rX   )r   childs     r"   rU   zFunctionEvent.append_cpu_child  sj     z~~- !ABB%/ !GHH
. !GHH  'r#   c                     | j                   t        j                  k7  rt        d      t	        |t
              st        d      |j                   t        j                  k7  rt        d      || _        y)a$  Set the immediate CPU parent of type FunctionEvent.

        One profiling FunctionEvent should have only one CPU parent such that
        the child's range interval is completely inside the parent's. We use
        this connection to determine the event is from top-level op or not.
        r  z%Expected parent to be a FunctionEventz%Expected parent device_type to be CPUN)rP   r	   rQ   rV   r  r   r3   )r   ra   s     r"   rW   zFunctionEvent.set_cpu_parent  s]     z~~- !ABB&-0 !HII/ !HII r#   c                     | j                   s| j                  t        j                  k7  ry| j                  t        d | j                  D              z
  S )Nr   c              3   4   K   | ]  }|j                     y wr+   )r  rr   r   s     r"   rs   z6FunctionEvent.self_cpu_memory_usage.<locals>.<genexpr>  s      +
',E""+
rt   )rO   rP   r	   rQ   r  ru   r5   r(   s    r"   self_cpu_memory_usagez#FunctionEvent.self_cpu_memory_usage  sJ    ==D,,
>$$s +
040A0A+
 (
 
 	
r#   c                     | j                   s| j                  t        j                  k7  ry| j                  t        d | j                  D              z
  S )Nr   c              3   4   K   | ]  }|j                     y wr+   )r  r$  s     r"   rs   z9FunctionEvent.self_device_memory_usage.<locals>.<genexpr>  s      .
*/E%%.
rt   )rO   rP   r	   rQ   r  ru   r5   r(   s    r"   self_device_memory_usagez&FunctionEvent.self_device_memory_usage  sJ    ==D,,
>''# .
373D3D.
 +
 
 	
r#   zO`self_cuda_memory_usage` is deprecated. Use `self_device_memory_usage` instead.r   c                     | j                   S r+   r(  r(   s    r"   self_cuda_memory_usagez$FunctionEvent.self_cuda_memory_usage  s     ,,,r#   c                 r    | j                   t        j                  k(  r| j                  j	                         S yr   )rP   r	   rQ   rK   r   r(   s    r"   r   zFunctionEvent.cpu_time_total  s*    z~~-??--//r#   c                     | j                   s| j                  t        j                  k7  ry| j                  t        d | j                  D              z
  S )Nr   c              3   4   K   | ]  }|j                     y wr+   )r   r$  s     r"   rs   z4FunctionEvent.self_cpu_time_total.<locals>.<genexpr>  s      )
%*E  )
rt   )rO   rP   r	   rQ   r   ru   r5   r(   s    r"   rp   z!FunctionEvent.self_cpu_time_total  sJ    ==D,,
>""S )
.2.?.?)
 &
 
 	
r#   c                 (   | j                   s| j                  sy| j                  t        j                  k(  ra| j
                  s9t        d | j                  D              t        d | j                  D              z   S t        d | j                  D              S | j                  t        j                  t        j                  t        j                  t        j                  fvrt        d| j                         | j                  j                         S )Nr   c              3   4   K   | ]  }|j                     y wr+   r
  rr   kinfos     r"   rs   z2FunctionEvent.device_time_total.<locals>.<genexpr>       De5>>Drt   c              3   4   K   | ]  }|j                     y wr+   r   )rr   r=   s     r"   rs   z2FunctionEvent.device_time_total.<locals>.<genexpr>  s      K-/B((Krt   c              3   4   K   | ]  }|j                     y wr+   r1  r2  s     r"   rs   z2FunctionEvent.device_time_total.<locals>.<genexpr>  r4  rt   DExpected device_type to be CUDA, PrivateUse1, MTIA, or HPU, but got )rO   r   rP   r	   rQ   r   ru   r6   r5   CUDAPrivateUse1MTIAHPUrV   rK   r   r(   s    r"   r   zFunctionEvent.device_time_total  s    ==z~~->>Dt||DDs K373D3DK H  
 Dt||DDD&&	(  %Z[_[k[kZlm  ??--//r#   zA`cuda_time_total` is deprecated. Use `device_time_total` instead.c                     | j                   S r+   r6  r(   s    r"   cuda_time_totalzFunctionEvent.cuda_time_total  s     %%%r#   c                    | j                   s| j                  sy| j                  t        j                  k(  r)| j
                  t        d | j                  D              z
  S | j                  t        j                  t        j                  t        j                  t        j                  fvrt        d| j                         | j
                  S )Nr   c              3   4   K   | ]  }|j                     y wr+   r6  r$  s     r"   rs   z7FunctionEvent.self_device_time_total.<locals>.<genexpr>  s      0,1''0rt   r8  )rO   r   rP   r	   rQ   r   ru   r5   r9  r:  r;  r<  rV   r(   s    r"   r   z$FunctionEvent.self_device_time_total  s    ==z~~-))C 0595F5F0 -   &&	(  %Z[_[k[kZlm  )))r#   zK`self_cuda_time_total` is deprecated. Use `self_device_time_total` instead.c                     | j                   S r+   r   r(   s    r"   r   z"FunctionEvent.self_cuda_time_total,  s     ***r#   c                     | j                   S r+   r   r(   s    r"   rD   zFunctionEvent.key4  s    yyr#   c           	         | j                   }| j                  }| j                  }dj                  g d| j                   d| j
                   d| j                   d| j                   d| j                   d| j                   d| j                  j                   d	| j                  j                   d
t        | j                  D cg c]  }|j                   c}       d| d| d| j
                   d| j                   dt        | j                          d| j"                   d| d| d| j$                   d| j&                   d| j(                   d| j*                   d      S c c}w )Nr   z<FunctionEvent id=z name=z overload_name=z device_type=z	 node_id=
 cpu_time=z
 start_us=z end_us=z cpu_children=r   _time=z thread= input_shapes= cpu_memory_usage=_memory_usage=z
 is_async=z is_remote=z seq_nr=z is_legacy=>)r   r  r  joinr  r4   r   rP   rF   r   rK   rL   rM   r   r5   rB   r   r  rO   r   rh   r   )r   r   r   r  r   s        r"   __repr__zFunctionEvent.__repr__8  s   oo**"66y y  y	 y y		{ y/ y$J\J\I] y ^ y++,y,5y6:ll^yCMyNRN_N_M`yay--.y.6y7;7J7J6KyLy  t7H7H Ie IJKy LMy NYMy Z`y al_lymy II;	y '	y (,{{m	y 4B	y CFdFWFWBXAY	yZ 	y
 !% 5 56y
 78y
 9D}y
 ESy
 TgRgy
hy y '2y 37..1Ay BJy KOJZJZI[y \gy hlgugufvy wxy	
 !Js   F	)r   r   r   r   r	   rQ   r   r  rU   rW   r   r%  r(  r   r  r+  r   rp   r   r>  r   r   rD   rL  r   r#   r"   r   r   (  sl   9D NN 9F+P<
(!" 
 
 
 
 Y-	 
-   
 
 0 00 K&	 
& * *& U+	 
+  
r#   r   c                   *    e Zd ZdZddZd Zd Zd Zy)r   a
  Averaged profiling statistics over multiple FunctionEvent objects.

    FunctionEventAvg aggregates statistics from multiple FunctionEvent objects
    with the same key (typically same operation name). This is useful for getting
    average performance metrics across multiple invocations of the same operation.

    This class is typically created by calling :meth:`EventList.key_averages()` on
    a profiler's event list.

    Attributes:
        key (str): Grouping key for the events (typically operation name).
        count (int): Total number of events aggregated.
        node_id (int): Node identifier for distributed profiling (-1 if not applicable).
        is_async (bool): Whether the operations are asynchronous.
        is_remote (bool): Whether the operations occurred on a remote node.
        use_device (str): Device type being profiled ("cuda", "xpu", etc.).
        cpu_time_total (int): Accumulated total CPU time in microseconds.
        device_time_total (int): Accumulated total device time in microseconds.
        self_cpu_time_total (int): Accumulated self CPU time (excluding children) in microseconds.
        self_device_time_total (int): Accumulated self device time (excluding children) in microseconds.
        input_shapes (List[List[int]]): Input tensor shapes (requires record_shapes=true).
        overload_name (str): Operator overload name (requires _ExperimentalConfig(capture_overload_names=True) set).
        stack (List[str]): Python stack trace where the operation was called (requires with_stack=true).
        scope (int): at::RecordScope identifier (0=forward, 1=backward, etc.).
        cpu_memory_usage (int): Accumulated CPU memory usage in bytes.
        device_memory_usage (int): Accumulated device memory usage in bytes.
        self_cpu_memory_usage (int): Accumulated self CPU memory usage in bytes.
        self_device_memory_usage (int): Accumulated self device memory usage in bytes.
        cpu_children (List[FunctionEvent]): CPU child events.
        cpu_parent (FunctionEvent): CPU parent event.
        device_type (DeviceType): Type of device (CPU, CUDA, XPU, PrivateUse1, etc.).
        is_legacy (bool): Whether from legacy profiler.
        flops (int): Total floating point operations.
        is_user_annotation (bool): Whether this is a user-annotated region.

    Properties:
        cpu_time (float): Average CPU time per invocation.
        device_time (float): Average device time per invocation.

    See Also:
        - :class:`EventList.key_averages`: Method that creates FunctionEventAvg objects
        - :class:`FunctionEvent`: Individual profiling event
        - :class:`EventList`: Container for profiling events
    Nc                 b   d | _         d| _        d| _        d| _        d| _        d | _        d| _        d| _        d| _        d| _	        d | _
        d | _        d | _        d | _        d| _        d| _        d| _        d| _        d | _        d | _        t(        j*                  | _        d| _        d| _        y )Nr   F)rD   r   rF   rO   r   r   r   r   rp   r   r   r   rg   re   r  r  r%  r(  r5   r3   r	   rQ   rP   r   r  r(   s    r"   r   zFunctionEventAvg.__init__u  s    "&
#$)-#$&'() +,#7;,0%)
$(
%&() *+"-.%;?37'1~~$
r#   c                 z   | j                   |j                   | _         |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  | _	        |j                  | _
        |j                  | _        |j                  | _        |j                  | _        t        |t        t         f      st#        d      |j                   | j                   k7  r%t#        d|j                    d| j                          | xj$                  |j$                  z  c_        | xj&                  |j&                  z  c_        | xj(                  |j(                  z  c_        | xj*                  |j*                  z  c_        | xj,                  |j,                  z  c_        | xj.                  |j.                  z  c_        | xj0                  |j0                  z  c_        | xj2                  |j2                  z  c_        | xj4                  |j4                  z  c_        | j6                  |j6                  | _        | S |j6                  | xj6                  |j6                  z  c_        | S )Nz8Expected other to be a FunctionEvent or FunctionEventAvgz Expected keys to match, but got z vs )rD   rF   rO   r   r3   r5   r   r   rg   re   rP   r   r   r   r  r   r   rV   r   r   rp   r   r  r  r%  r(  r   r  r   others     r"   r7   zFunctionEventAvg.add  s   88 yyDH ==DL!NNDM"__DN#..DO % 2 2D!&!4!4D % 2 2DDJDJ$00D"__DN#..DO&+&>&>D#%-1A!BC J  99  2599+T$((L  	u333%"9"99  E$=$== ##u'C'CC#!7!77  E$=$== ""e&A&AA"%%)G)GG%

ekk!
::DJ  [[$JJ%++%Jr#   c                 $    | j                  |      S r+   )r7   rP  s     r"   __iadd__zFunctionEventAvg.__iadd__  s    xxr#   c                 <   | j                   sdn| j                   }| j                  }| j                  }| j                  }d| j                   d| j
                   d| j                   d| d| d| d| dt        | j                         d	| j                   d| d
| dS )Nr   z<FunctionEventAvg key=z self_cpu_time=rE  z  self_rF  r   rG  rH  rI  rJ  )
r   r  r  r  rD   r  r   r   r   r  )r   r   self_device_timer   device_memorys        r"   rL  zFunctionEventAvg.__repr__  s    $(OOf::**00$TXXJod>Z>Z=[[efjfwfwex y M(8'9;-vk]Zhilmqm~m~i  iA A  $ 5 56a}NS`Raabd	
r#   )r   N)r   r   r   r   r   r7   rS  rL  r   r#   r"   r   r   G  s    +Z2+Z	
r#   r   c                       e Zd Zd Zy)r   c                 p    t        |      dkD  rt        j                  j                  |      n|| |<   | |   S rd   )r2   torch_C	_demangle)r   rD   s     r"   __missing__zStringTable.__missing__  s2     033x!|EHH&&s+S	Cyr#   N)r   r   r   r\  r   r#   r"   r   r     s    r#   r   c                       e Zd ZdZd Zd Zy)r   z=Acceleration structure for accessing mem_records in interval.c                     || _         g | _        g | _        t        |      dkD  rPt	        t        |      D cg c]  \  }}|d   j                         |f c}}      }t        | \  | _        | _        y y c c}}w r   )_mem_records_start_nses_indicesr2   rR   r8   start_nszip)r   mem_recordsirtmps        r"   r   zMemRecordsAcc.__init__  sn    '&(#%{a9[;QR41a1Q4==?A.RSC.13i+Ddm  Rs   A5
c              #      K   t        j                  | j                  |dz        }t        j                  | j                  |dz        }t	        ||      D ]   }| j
                  | j                  |       " yw)z
        Return all records in the given interval
        To maintain backward compatibility, convert us to ns in function
        i  N)bisectbisect_leftr`  bisect_rightr1   r_  ra  )r   r  r  	start_idxend_idxre  s         r"   in_intervalzMemRecordsAcc.in_interval  sp     
 &&t'7'7DI	%%d&6&6Fy'* 	6A##DMM!$455	6s   A7A9N)r   r   r   r   r   rn  r   r#   r"   r   r     s    G86r#   r   c                 4     g d}t         fd|D              S )N))autograd/__init___make_grads)rp  backward)ztorch/tensorrr  )_internal/common_utilsprof_callable)rs  prof_func_call)rs  prof_meth_callc              3   @   K   | ]  }|d    v xr |d   v    yw)r   r/   Nr   )rr   r   r   s     r"   rs   z&_filter_stack_entry.<locals>.<genexpr>  s)     OAaDEM3adem4Os   )all)r   filtered_entriess   ` r"   _filter_stack_entryrz    s     O>NOOOr#   z[memory]z[OutOfMemory]c                 .    t         t        ddddddg}| |v S )Nz profiler::_record_function_enterz$profiler::_record_function_enter_newzprofiler::_record_function_exitzaten::is_leafzaten::output_nrzaten::_version)MEMORY_EVENT_NAMEOUT_OF_MEMORY_EVENT_NAME)r4   filtered_out_namess     r"   _filter_namer    s2     	 *.)	 %%%r#   c                 N    t               }||    } |r| j                  d      rd} | S )NzProfilerStep#zProfilerStep*)r   
startswith)r4   with_wildcardstring_tables      r"   _rewrite_namer    s,    =LD???+"DKr#   c                 j  01234 t        |       dk(  ryt        d | D              }t        d | D              }| d   j                  }|s|rt        d      t        d | D              }t        d | D              }t	        t        | fd	d
      |||      } t        d | D              dz   }|t        ||      }t        d | D              dz   }|t        ||      }d}|}d}| D cg c]4  }|j                  t        |j                        dkD  s)|j                  6 }}t        |      dkD  }|r#t        d |D              dz   }|t        ||      }dg}|r|j                  d       |g dz  }||j                         nd}|r"|j                  d| d| d| d| dg       |r1|j                  ddg       |r|r|j                  | dd| dg       |j                  d       t        d | D              }|r|j                  d       d 0dg4dg10 g2d50124fd!	}d" } ||       |r ||       |d#|z   d D ]
  } ||        |r|j                  d$        ||       |r|j                  d%        ||d&'       |rj| D cg c]  }|j                  dkD  s|j                    }}t        |      dk7  r1 |t        |            \  }} |j                  d(|          ||       nd)}4d   }!1d   }"2d   }#d}g 33fd*}$d}%d}&| D ]  }|%|j                  z  }%|j                  t        j                   k(  r|j"                  r|&|j$                  z  }&K|j                  t        j&                  t        j(                  t        j*                  fv s|j,                  r|&|j$                  z  }& | |$d+|#z          |$|       |	r |$d+|#z          |$d,        |$|"        |$ |!j.                  |         |$|"       d- }'d. }(d})| D ]  }|)|k(  r n|	r|j0                  |)d#z  })|j2                  }*|t        |*      |d/z
  k\  r|*d|d/z
   d0z   }*t5        |j                  |%      |_        |j8                  st5        |j:                  |%      nd|_        |*g}+|r0|j>                  },|t        |,      |d/z
  k\  r|,d|d/z
   d0z   },|+|,gz  }+|+|j6                   |(|j                  |j@                  |
      |j<                   |(|j:                  |jB                  |
       |(|jD                  |jF                  |
      gz  }+|rt5        |j$                  |&      |_$        |+j                   |(|j$                  |jJ                  |
      |jH                   |(|jL                  |jN                  |
       |(|jP                  |jR                  |
      g       |rv|+j                  tU        |jV                        tU        |jX                        g       |r;|r9|+j                  tU        |jZ                        tU        |j\                        g       |+j                  |j^                         |r|+j                  |j`                         |r'|+j                  tc        |jd                        d|        |rA|j                  dk  r|+j                  d1       n |+j                  |j                  z  d2       |rAd}-t        |j                        dkD  r |'|j                  d   |      }-|+j                  |-        |$ |!j.                  |+        |s=dgt        |      d#z
  z  }.|j                  d#d D ]"  }/ |$ |!j.                  |. |'|/|      gz           $ |.j                  d        |$ |!j.                  |.          |$|"        |$d3 |(|%tg        |%      |
              |r1 |$d||j                         nd d4 |(|&tg        |&      |
              dji                  3      S c c}w c c}w )6zUPrint a summary of events (which can be a list of FunctionEvent or FunctionEventAvg).r   r   c              3   :   K   | ]  }|j                   d kD    ywr   NrB  rq   s     r"   rs   z_build_table.<locals>.<genexpr>&  s     Ou%66:O   c              3   :   K   | ]  }|j                   d kD    ywr  r*  rq   s     r"   rs   z_build_table.<locals>.<genexpr>'  s     P77!;Pr  z9use_device is None, but there is device performance data.c              3   l   K   | ],  }|j                   d uxr t        |j                         dkD   . y wr   )r   r2   rq   s     r"   rs   z_build_table.<locals>.<genexpr>/  s;       
		4	'	GC0B0B,Ca,G	G   24c              3   l   K   | ],  }|j                   d uxr t        |j                         dkD   . y wr   )r   r2   rq   s     r"   rs   z_build_table.<locals>.<genexpr>4  s;       
		D	(	IS1D1D-E-I	Ir  Nc                 |    t        | j                  dd      j                  dd      j                  dd            S )Nr   r   r   r   )r   r   )rY   rw   s    r"   rI   z_build_table.<locals>.<lambda>=  s5    OOFH5WUH-W]H5	! r#   T)rD   reverser   c              3   F   K   | ]  }t        |j                          y wr+   )r2   rD   rr   rY   s     r"   rs   z_build_table.<locals>.<genexpr>J  s     ;SCL;s   !   c              3   X   K   | ]"  }t        t        |j                               $ y wr+   )r2   r   r   r  s     r"   rs   z_build_table.<locals>.<genexpr>N  s      KSc#c&6&6"78Ks   (*   c              3   @   K   | ]  }t        d  |D                yw)c              3   2   K   | ]  }t        |        y wr+   r2   )rr   r   s     r"   rs   z)_build_table.<locals>.<genexpr>.<genexpr>\  s     25CJ2s   N)max)rr   rg   s     r"   rs   z_build_table.<locals>.<genexpr>\  s     Gu2E22Gs   NamezOverload Name)z
Self CPU %zSelf CPUzCPU total %z	CPU totalzCPU time avgNonezSelf z %z totalz	 time avgzCPU MemzSelf CPU Memz Memz
# of Callsc              3   :   K   | ]  }|j                   d k7    yw)rN   N)rF   r  s     r"   rs   z_build_table.<locals>.<genexpr>  s     =s*=r  zNode IDr   c                     dxx   d|z   t        |       z   dz   dz  z   z  cc<   dxx   d| z  dz  z   z  cc<   dxx   | z   z  cc<   y )Nr   z{: }r   -)r   )paddingtext_dirSPACING_SIZEheader_sep_lstline_length_lstrow_format_lsts     r"   
add_columnz _build_table.<locals>.add_column  sh    qHs7|+c1S<5GH	
 	qS7]cL.@AAg44r#   c                 l   g d}| dk  rt        d|        t        dt        t        j                  |       dz  t        t        |      dz
                    }|dk\  r|t        |      k  st        dt        |       d|       t        dt        j                  |      d	z        |t        |         fS )
N)FLOPsKFLOPsMFLOPsGFLOPsTFLOPsPFLOPsr   z'Expected flops to be positive, but got    r/   z&Expected log_flops to be in range [0, z), but got 
   g      )
rV   r  minmathlog10floatr2   powfloorr   )r  flop_headers	log_flopss      r"   auto_scale_flopsz&_build_table.<locals>.auto_scale_flops  s    
 A: #J5'!RSS3tzz%014eC<MPQ<Q6RST	Q9s</@#@ 8\9J8K;W`Vab  BI.57c)n9UVVr#   r/   zInput ShapeszSource Location<)r  zTotal Fc                 J    j                  |        j                  d       y )Nr   )rX   )r   results    r"   rX   z_build_table.<locals>.append  s    adr#   =z1This report only display top-level ops statisticsc                 t    t        |       |kD  r)t        |       |z
  }| |d  } t        |       dkD  rd| dd  z   } | S )Nr  ...r  )r   src_column_widthoffsets      r"   	trim_pathz_build_table.<locals>.trim_path  sI    t9''Y!11F=D4y1}tABx'r#   c                 \    d}d}|dk(  r	| |z  ddS |dk(  r	| |z  ddS |dk(  r| ddS |S )Nr   r   r   r   r   r   r   )r   default_strr~   r   r   s        r"   override_time_unitz(_build_table.<locals>.override_time_unit  sc    &,S133$(-R00$c]"%%r#   r  r  z--z8.3fzSelf CPU time total: z time total: )rJ  )5r2   anyr   RuntimeErrorr
   rR   r  r  rg   rX   upperr:   r  rp   rP   r	   rQ   r   r   r9  r:  r;  r   r   r3   rD   r   r  rO   r   r  r   r  r  r   r   r  r  r   r  r   r  r   r  r%  r  r(  r   rF   r   r   r   rK  )5r[   rw   r|   rx   ry   rz   r{   r   r   r}   r~   has_device_timehas_device_memr   has_input_shapeshas_overload_namesname_column_widthshapes_column_widthDEFAULT_COLUMN_WIDTHflops_column_widthr  rY   stacks	has_stackheadersr   append_node_idr  r  r   	raw_flopsflops_scaleflops_header
row_format
header_sepline_lengthrX   sum_self_cpu_time_totalsum_self_device_time_totalr  r  event_limitr4   
row_valuesr   	src_fieldempty_headersr   r  r  r  r  r  s5    `                                              @@@@@r"   r   r     s2	    6{aOOOOPPPN%%J /VWW  
   
  	 ")!
  ;F;;a?( 13HIKFKKaO*!"57NO-#syy'<SYYRSAS		F  FaIGGG!K 	  +"#35IJhG'  G )3(>*""$FK}%}B'-v&-y)		
 	
 .NN"m4(K=- NN< =f==Ny! LTNTN$}oO5 5W&  !$%Q++-. )'() ~&&'()#c2*0B3CIIMSYYB	By>Q*:3y>*J'[,NNVL>23)*J"J"J!!$KJ F  !" E3#:#::??jnn,&#*D*DD&OO&& ** '#*D*DD&E$ s[ !vs[ !BC
:
:g&'
:
 K p6)# S^^%?1Kww ,T>SVW>W1W50146>D1##%< 

 << s113JK 	 V
--M%1&*?!*CC -.K1F1J Lu T=/)J  '')D)Di !!""C$:$:I c..	
 	

 '9**,F(C$ &2266! ,,&--s/H/H) ')<)<i"  #3#7#78"3#<#<=	 n!! 's'>'>?&s'C'CD	 	II	
 ckk*c#"2"234H5HIJyyA~!!$'!!SYY%<T$BDI399~!%ciil4DE	i( z  *-.DCL1$45M12 %J%%'9U<L+M*NN   $$:$$m45ap6d :

 23JLYpLqs| }~ **@J$$&fM N-.H,WqJrt}~A	
 776?O	H Cs   ,`+>`+`+`0*`0c           
         g }| D ]  }|j                  dd      }|d   j                  dd      }|d   j                  dd      }|j                  d      D cg c]#  }|j                         s|j                         % }}|r|d   nd}|j                  |dd	 |||j                  d
d      d        |j	                  d        g }|D ]%  }|j                  d|d    d|d    d|d           ' dj                  |      S c c}w )zk
    Extract and format all events with stack traces in a canonical way
    for deterministic testing.
    r4   r   r   	node_namestack_tracer   rN   N   tsr   )
event_namer  r  
start_timec                     | d   S )Nr  r   )xs    r"   rI   z/_canonicalize_profiler_events.<locals>.<lambda>  s
    !L/ r#   rC   zevent=r  z node=z stack_trace=)rj   splitstriprX   sortrK  )	r[   events_with_tracesrH   r  r  r  r   linesrY   s	            r"   _canonicalize_profiler_eventsr    s6   
  
YYvr*
&M%%k26	Fm''r: %0$5$5d$;IqqwwyII#(eBib!!("o&*#iia0		

(  9: E! 
S&'vc+.>-?}SQ^M_L`a	


 99U- Js   D-D)F)
NNr   r   r   r   FFFN)%ri  rS   r  collectionsr   r   operatorr   typingr   r   typing_extensionsr   rY  torch.autogradr	   __all__listr
   r   r   r   r   r   r   r   r   r   r   r   rz  r|  r}  r  r  r   r  r   r#   r"   <module>r     s       /    (  %	w wt5"D   <	% 	% 
H<	=\
' \
~@
* @
F+ 6 6,	P  * && hX$r#   