
    i5              
          d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlm	Z	 d dl
Z
d dlmc mc mc mZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZmZ d d
lmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' g dZ( e%e)      Z*e G d d             Z+ G d d      Z,dee-z  dz  de.e	   de-fdZ/de!de0e-dz  e1dz  f   fdZ2de+dee-z  dz  de.e	   de3e1e	f   fdZ4y)    N)Callable)	dataclassfield)Any)get_default_numa_options)eventsmetrics)
WorkerSpec)LocalElasticAgent)DefaultLogsSpecs	LogsSpecsSignalException)ChildFailedError)RendezvousParameters)parse_rendezvous_endpoint)
get_logger)NumaOptions)LaunchConfigelastic_launchlaunch_agentc                      e Zd ZU dZeed<   eed<   eed<   dZedz  ed<   dZe	ed<   d	Z
e	ed
<   dZe	ed<   dZe	ed<    ee      Zee	ef   ed<   dZeed<   dZeed<   dZeed<   dZe	ed<   dZe	dz  ed<    ee      Zee	e	f   ed<   dZe	dz  ed<   dZe	ed<   dZedz  ed<   dZe	ed<   dZee	   dz  ed <   dZee	   dz  ed!<   d"Z e!ed#<   d$ Z"y)%r   a  
    Creates a rendezvous config.

    Args:
        min_nodes: Minimum amount of nodes that the user function will
                        be launched on. Elastic agent ensures that the user
                        function start only when the min_nodes amount enters
                        the rendezvous.
        max_nodes: Maximum amount of nodes that the user function
                        will be launched on.
        nproc_per_node: On each node the elastic agent will launch
                            this amount of workers that will execute user
                            defined function.
        rdzv_backend: rdzv_backend to use in the rendezvous (zeus-adapter, etcd).
        rdzv_endpoint: The endpoint of the rdzv sync. storage.
        rdzv_configs: Key, value pair that specifies rendezvous specific configuration.
        rdzv_timeout: Legacy argument that specifies timeout for the rendezvous. It is going
            to be removed in future versions, see the note below. The default timeout is 900 seconds.
        run_id: The unique run id of the job (if not passed a unique one will be
                deduced from run environment - flow workflow id in flow - or auto generated).
        role: User defined role of the worker (defaults to "trainer").
        max_restarts: The maximum amount of restarts that elastic agent will conduct
                    on workers before failure.
        monitor_interval: The interval in seconds that is used by the elastic_agent
                        as a period of monitoring workers.
        start_method: The method is used by the elastic agent to start the
                    workers (spawn, fork, forkserver).
        metrics_cfg: configuration to initialize metrics.
        local_addr: address of the local node if any. If not set, a lookup on the local
                machine's FQDN will be performed.
        local_ranks_filter: ranks for which to show logs in console. If not set, show from all.
        event_log_handler: name of the event logging handler as registered in
          `elastic/events/handlers.py <https://docs.pytorch.org/docs/stable/elastic/events.html>`_.
        duplicate_stdout_filters: If non-empty, duplicates stdout to a file containing only lines
                                that match _any_ of the filter strings.
        duplicate_stderr_filters: If non-empty, duplicates stderr to a file containing only lines
                                that match _any_ of the filter strings.
        virtual_local_rank: Enable virtual local rank mode for workers (defaults to False).
                           When enabled, LOCAL_RANK is set to 0 for all workers and
                           CUDA_VISIBLE_DEVICES is adjusted so each worker accesses its
                           assigned GPU at device index 0.


    .. note::
        `rdzv_timeout` is a legacy argument that will be removed in future.
        Set the timeout via `rdzv_configs['timeout']`

    	min_nodes	max_nodesnproc_per_nodeN
logs_specs run_iddefault_rolerolerdzv_endpointetcdrdzv_backend)default_factoryrdzv_configsrdzv_timeout   max_restartsg?monitor_intervalspawnstart_methodlog_line_prefix_templatemetrics_cfg
local_addrnullevent_log_handlernuma_optionszSIGTERM,SIGINT,SIGHUP,SIGQUITsignals_to_handleduplicate_stdout_filtersduplicate_stderr_filtersFvirtual_local_rankc                    d}| j                   dk7  r| j                   | j                  d<   nd| j                  vr|| j                  d<   | j                  t               | _        | j                  {t
        j                  j                         r\t
        j                  j                         | j                  k(  r0t               | _        t        j                  d| j                         y y y y )Ni  r%   timeoutzUsing default numa options = %r)r&   r$   r   r   r1   torchcudais_availabledevice_countr   r   loggerinfo)selfdefault_timeouts     X/var/www/html/engine/venv/lib/python3.12/site-packages/torch/distributed/launcher/api.py__post_init__zLaunchConfig.__post_init__q   s    "+/+<+<Di(d///+:Di( ??".0DO %

'')

'')T-@-@@ 8 :DKK94;L;LM A * &    )#__name__
__module____qualname____doc__int__annotations__r   r   r   strr   r    r"   r   dictr$   r   r&   r(   r)   floatr+   r,   r-   r.   r0   r1   r   r2   r3   listr4   r5   boolrA    rB   r@   r   r   '   s#   /b NN#'J	D 'FCD#M3L##(#>L$sCx.>L#L#!e!L#+/cDj/"'"=Kc3h=!Jd
!#s#'+L+$+<s<15d3i$.515d3i$.5$$NrB   r   c                   2    e Zd ZdZdedeez  dz  fdZd Zy)r   a  
    Launches an torchelastic agent on the container that invoked the entrypoint.

        1. Pass the ``entrypoint`` arguments as non ``kwargs`` (e.g. no named parameters)/
           ``entrypoint`` can be a function or a command.
        2. The return value is a map of each worker's output mapped
           by their respective global rank.

    Usage

    ::

    def worker_fn(foo):
        # ...

    def main():
        # entrypoint is a function.
        outputs = elastic_launch(LaunchConfig, worker_fn)(foo)
        # return rank 0's output
        return outputs[0]

        # entrypoint is a command and ``script.py`` is the python module.
        outputs = elastic_launch(LaunchConfig, "script.py")(args)
        outputs = elastic_launch(LaunchConfig, "python")("script.py")
    config
entrypointNc                      || _         || _        y N)_config_entrypoint)r>   rP   rQ   s      r@   __init__zelastic_launch.__init__   s    
 %rB   c                 V    t        | j                  | j                  t        |            S rS   )r   rT   rU   rL   )r>   argss     r@   __call__zelastic_launch.__call__   s    DLL$*:*:DJGGrB   )	rC   rD   rE   rF   r   r   rI   rV   rY   rN   rB   r@   r   r      s-    4&& sNT)&HrB   r   rQ   rX   returnc                     t        | t              r| j                  S t        | t              r(| t        j
                  k(  rt        d |D        d      S | S y)a  Retrieve entrypoint name with the rule:
    1. If entrypoint is a function, use ``entrypoint.__qualname__``.
    2. If entrypoint is a string, check its value:
        2.1 if entrypoint equals to ``sys.executable`` (like "python"), use the first element from ``args``
            which does not start with hifen letter (for example, "-u" will be skipped).
        2.2 otherwise, use ``entrypoint`` value.
    3. Otherwise, return empty string.
    c              3   2   K   | ]  }|d    dk7  s|  yw)r   -NrN   ).0args     r@   	<genexpr>z'_get_entrypoint_name.<locals>.<genexpr>   s     >A#>s   r   )
isinstancer   rC   rI   sys
executablenext)rQ   rX   s     r@   _get_entrypoint_namere      sL     *h'"""	J	$'>>CCrB   rdzv_parametersc                     | j                   dk7  ry| j                  }|j                         }|st        d      t	        |d      \  }}|dk(  rt        d| d      ||fS )Nstatic)NNzKEndpoint is missing in endpoint. Try to add --master-addr and --master-portr%   )default_portzport is missing in endpoint: z. Try to specify --master-port)backendendpointstrip
ValueErrorr   )rf   rk   master_addrmaster_ports       r@   _get_addr_and_portrp      s     (*''H~~HY
 	
  9PRSKb+H:5ST
 	
 %%rB   rP   c                    | j                   sDt        t        j                         j                        }t
        j                  d|       || _         t        ||      }t
        j                  di d|d| j                  d| j                  d| j                  d| j                   d| j                  d	| j                  d
| j                  d| j                  d| j                   d| j"                  j$                  d| j&                  d| j(                  d| j*                  d| j,                  d| j.                  d| j0                         t3        d| j                  | j                  | j                   | j                  | j                  | j4                  d| j                  }t7        |      \  }}| j,                  t8        j:                  d<   t=        | j>                  | j                  |tA        |      tC        jD                  |      | j                  | j                   ||| j4                  | j(                  | j*                  | j.                  | j0                  | jF                        }tI        || j"                  | jJ                  | jL                        }	d}
	 tO        jP                  tO        jR                  | j&                               |	jU                         }tW        jX                  |	j[                         | j(                         |j]                         rt_        ||j`                        |jb                  |
r|jd                  jg                          S S # t^        $ r  th        $ r2 d}
tW        jX                  |	jk                         | j(                          tl        $ r0 tW        jX                  |	jk                         | j(                          w xY w# |
r|jd                  jg                          w w xY w)Nz3config has no run_id, generated a random run_id: %saR  Starting elastic_operator with launch configs:
  entrypoint               : %(entrypoint)s
  min_nodes                : %(min_nodes)s
  max_nodes                : %(max_nodes)s
  nproc_per_node           : %(nproc_per_node)s
  run_id                   : %(run_id)s
  rdzv_backend             : %(rdzv_backend)s
  rdzv_endpoint            : %(rdzv_endpoint)s
  rdzv_configs             : %(rdzv_configs)s
  max_restarts             : %(max_restarts)s
  monitor_interval         : %(monitor_interval)s
  log_dir                  : %(log_dir)s
  metrics_cfg              : %(metrics_cfg)s
  event_log_handler        : %(event_log_handler)s
  numa_options             : %(numa_options)s
  signals_to_handle        : %(signals_to_handle)s
  duplicate_stdout_filters : %(duplicate_stdout_filters)s
  duplicate_stderr_filters : %(duplicate_stderr_filters)s
rQ   r   r   r   r   r"   r    r$   r(   r)   log_dirr-   r0   r1   r2   r3   r4   )rj   rk   r   r   r   r.   TORCHELASTIC_SIGNALS_TO_HANDLE)r   local_world_sizerQ   rX   rdzv_handlerr(   r)   rn   ro   r.   r0   r1   r3   r4   r5   )specr   r+   r,   T)namefailuresFrN   )7r   rI   uuiduuid4rG   r<   warningre   r=   r   r   r   r"   r    r$   r(   r)   r   root_log_dirr-   r0   r1   r2   r3   r4   r   r.   rp   osenvironr
   r   tuplerdzv_registryget_rendezvous_handlerr5   r   r+   r,   r	   initialize_metricsMetricsConfigrunr   recordget_event_succeeded	is_failedr   rx   return_valuesru   shutdownr   get_event_failed	Exception)rP   rQ   rX   r   entrypoint_namerf   rn   ro   rv   agentshutdown_rdzvresults               r@   r   r      s   
 ==TZZ\%%&LfU*:t<O
KK	F$	
/	
))	
 ))	
 f33		

 fmm	
 F//	
 V11	
 F//	
 F//	
  7 7	
 v((55	
 6--	
  !9!9	
 F//	
  !9!9	
  '(G(G!	
" '(G(G#	
'&P + ##%%}}""""$$ 

O  2/BK 4:3K3KBJJ/0[[..4["99/J((00$$ 22((!'!@!@!'!@!@!44D$ $$((!'!@!@	E M )""7#8#89K9K#LMe//163K3KL
 #$ 
 ## &&(     e,,.0H0HI e,,.0H0HI &&( s   B"M A;OO O:)5r}   rb   ry   collections.abcr   dataclassesr   r   typingr   r8   -torch.distributed.elastic.rendezvous.registrydistributedelastic
rendezvousregistryr   torch._utils_internalr   torch.distributed.elasticr   r	   *torch.distributed.elastic.agent.server.apir
   :torch.distributed.elastic.agent.server.local_elastic_agentr   )torch.distributed.elastic.multiprocessingr   r   r   0torch.distributed.elastic.multiprocessing.errorsr   $torch.distributed.elastic.rendezvousr   *torch.distributed.elastic.rendezvous.utilsr   'torch.distributed.elastic.utils.loggingr   torch.numa.bindingr   __all__rC   r<   r   r   rI   rL   re   r   rG   rp   rJ   r   rN   rB   r@   <module>r      s   
 
  $ (   E E : 5 A X 
 N E P > * =	H	 [N [N [N|$H $HNX^d%: $s) PS (&)&
3:sTz!"&&})})3%}) s)}) 
#s(^	})rB   