
    i                        U d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZ d dlZd dlmZ d dlmZ d dlmZmZmZ d dlmZ d	d
gZ e       s:d dlZ G d d      Zd Zeej6                  d   _        eej6                  d   _        yd dlmZ  d dl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-  e j\                  e/      Z0e
r	 d dl1m2Z2 e5e6dz  e jn                  dz  f   Z8ejr                  ju                  eg        G d dejv                        Z< e<       Z=e<e>d<   d"de6fdZ? G d d
      Z	 d#de@eAe6z  e6e jn                  z  e5e6e jn                  f   z  f   deAde5e6df   dz  dee8   fdZBdddde6d e5eAdf   de5e6df   dz  de@eAe6z  e6e jn                  z  e5e6e jn                  f   z  f   dz  def
d!Zy# e3$ r e0ji                  d       Y w xY w)$    N)Iterator)zip_longest)OptionalTYPE_CHECKINGUnion)is_available)_MeshLayout)IntTupleis_intsuffix_product)not_noneinit_device_mesh
DeviceMeshc                       e Zd Zy)_DeviceMeshStubN)__name__
__module____qualname__     W/var/www/html/engine/venv/lib/python3.12/site-packages/torch/distributed/device_mesh.pyr   r      s    r   r   c                       y Nr   r   r   r   _init_device_mesh_stubr       s    r   ztorch.distributed.device_mesh)Backend)_get_default_group_resolve_process_groupget_backendget_process_group_ranksget_rankget_world_size	GroupNameinit_process_groupis_initialized	new_groupProcessGroupsplit_group)	ArrayLikezCDeviceMesh requires numpy >= 1.21 to be installed for type checkingc                   r    e Zd ZddZddZddZededefd	       Z	ededefd
       Z
dddeded   fdZy)_MeshEnvreturnNc                     g | _         y r   )
mesh_stackselfs    r   __init__z_MeshEnv.__init__I   s	    02DOr   r   c                 f    t        | j                        dk(  rt        d      | j                  d   S )Nr   z#No device mesh is currently active!)lenr-   RuntimeErrorr.   s    r   get_current_meshz_MeshEnv.get_current_meshL   s.    4??#q("#HII??2&&r   device_meshc                 X    t        j                  dd       |s|S |j                         S )NzdThis get_root_mesh API will be deprecated soon.Please use `get_root_mesh` inside DeviceMesh instead.   
stacklevel)warningswarn_get_root_mesh)r/   r6   s     r   get_root_meshz_MeshEnv.get_root_meshR   s3    
 MMH
 ""--//r   device_typec                 4    t        |       j                         S r   )_get_device_handledevice_countr?   s    r   num_devices_per_hostz_MeshEnv.num_devices_per_host`   s    %k2??AAr   c                 B    t               t        j                  |       z  S r   )r!   r*   rD   rC   s    r   	num_hostsz_MeshEnv.num_hostsd   s     "#x'D'D['QQQr   mesh_dim_namec                 R    t        j                  dd       |j                  |      S )NznThis _get_all_submeshes API will be deprecated soon.Please use `_get_all_submeshes` inside DeviceMesh instead.r8   r9   )r;   r<   _get_all_submeshes)r/   r6   rG   s      r   rI   z_MeshEnv._get_all_submeshesl   s,     MMM
 11-@@r   r+   Nr+   r   )r6   r   r+   r   )r   r   r   r0   r5   r>   staticmethodstrintrD   rF   listrI   r   r   r   r*   r*   H   s    	3	'	0 
	Bc 	Bc 	B 
	B 
	R3 	R3 	R 
	R	A+	A<?	A,	Ar   r*   _mesh_resourcesr?   c                 $    t        t        | d      S )a:  
        Get the module corresponding to the device_type which is cuda or cuda-like device.
        For example, when the device_type is cuda, the module `torch.cuda` is returned.
        Return None when there is no corresponding module for device_type, otherwise
        return the corresponding module.
        N)getattrtorchrC   s    r   rA   rA   x   s     uk400r   c                      e Zd ZU dZeed<   ej                  ed<   eedf   dz  ed<   e	ed<   dZ
ed    ed<   eed f   ed	<   	 dCddd
ddddddedeej                  df   dz  deedf   dz  deedf   dz  dededz  de	dz  dej                  dz  ded    ddfdZedefd       Zedej                  fd       Zedeedf   dz  fd       Zd Zede	dej                  dedededz  f
d       Zede	dej                  deedf   dz  deedf   dee   f
d       ZdDdZdDd ZdEd!Zdefd"Zd# Z d$e!defd%Z"deeedf   z  dd fd&Z#dCd'eez  dz  de$fd(Z%dee$   fd)Z&de	d*eedf   dd fd+Z'	 	 dFd-edz  dedd fd.Z(dedz  fd/Z)d-edefd0Z*deedf   de	fd1Z+d-eded    fd2Z,e	 dCdd3d4e$ee$   z  dedeej                  df   dz  deedf   dz  dd f
d5       Z-dCd'edz  defd6Z.edefd7       Z/edeedf   fd8       Z0defd9Z1dCd'eez  dz  defd:Z2dee   dz  fd;Z3	 	 d,d-edz  ddez  e4jj                  z  eee4jj                  f   z  dd fd<Z6	 dGd=ed>eedf   deedf   deeedz  e4jj                  dz  f   df   dd f
d?Z7	 dCd=eez  d>eedf   deedf   deeee4jj                  z  eee4jj                  f   z  f   dz  dd f
d@Z8edAed    dd fdB       Z9y)Hr   aX
  
        DeviceMesh represents a mesh of devices, where layout of devices could be
        represented as a n-d dimension array, and each value of the n-d dimensional
        array is the global id of the default process group ranks.

        DeviceMesh could be used to setup the N dimensional device connections across the cluster,
        and manage the ProcessGroups for N dimensional parallelisms. Communications could happen on
        each dimension of the DeviceMesh separately. DeviceMesh respects the device that user selects
        already (i.e. if user call `torch.cuda.set_device` before the DeviceMesh initialization),
        and will select/set the device for the current process if user does not set the device
        beforehand. Note that manual device selection should happen BEFORE the DeviceMesh initialization.

        DeviceMesh can also be used as a context manager when using together with DTensor APIs.

        .. note::
            DeviceMesh follows SPMD programming model, which means the same PyTorch Python program
            is running on all processes/ranks in the cluster. Therefore, users need to make sure the
            `mesh` array (which describes the layout of devices) should be identical across all ranks.
            Inconsistent `mesh` will lead to silent hang.

        Args:
            device_type (str): The device type of the mesh. Currently supports: "cpu", "cuda/cuda-like".
            mesh (ndarray): A multi-dimensional array or an integer tensor describing the layout
                of devices, where the IDs are global IDs of the default process group.
            _rank (int): (experimental/internal)
                The global rank of the current process. If not provided, it will
                be inferred from the default process group.

        Returns:
            DeviceMesh: A :class:`DeviceMesh` object representing the device layout.

        The following program runs on each process/rank in an SPMD manner. In this example, we have 2
        hosts with 4 GPUs each.
        A reduction over the first dimension of mesh will reduce across
        columns (0, 4), .. and (3, 7), a reduction over the second dimension
        of mesh reduces across rows (0, 1, 2, 3) and (4, 5, 6, 7).

        Example::

            >>> # xdoctest: +SKIP("no rank")
            >>> from torch.distributed.device_mesh import DeviceMesh
            >>>
            >>> # Initialize device mesh as (2, 4) to represent the topology
            >>> # of cross-host(dim 0), and within-host (dim 1).
            >>> mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3],[4, 5, 6, 7]])
        _device_type	_rank_map.N_mesh_dim_names_layout
_root_mesh_flatten_mappingT)mesh_dim_namesbackend_override_init_backend_rankrX   rV   rY   r?   meshr(   r[   r\   r]   r^   r+   c                   t         j                  j                  d       |||t        d      t	        |t         j
                        r'|j                  j                  dk7  rt        d|       t	        |t         j
                        r<|j                         j                  t         j                        j                         n%t        j                  |dt         j                        }
t        |
j                         |
j!                               }|
j#                         }n||t        d      |j%                         sJ d       |j&                  d	k(  sJ d
       |j)                         sJ d       |j+                         |j-                         k\  sJ d|j+                          d|        || _        || _        || _        |rt5        |      nd | _        |	| _        |dt;        | j0                        z  }nOt;        |      t;        | j0                        k7  r.t        dt;        |       dt;        | j0                         d      |r|n;t        | j<                  j                         | j<                  j!                               | _        | j0                  j%                         st?        d      | j0                  j+                         | j<                  j+                         k7  r4t?        d| j0                   d| j<                  j                          d      t5        | j2                  jA                               | _!        d | _"        i | _#        |dk7  r|rG| jI                          | jK                  | j0                  | j2                  | j6                  |      | _&        tO               r&tQ               dk(  rtS        jT                         | _"        |
tW               }| j<                  |k(  jY                         }|j                  d      dvrt?        d|j                  d             |j                  d      dkD  r|d   jA                         nd | _-        y y )Nz1torch.distributed.device_mesh.DeviceMesh.__init__z@Cannot provide _layout and/or _rank_map if passing explicit meshcpuz!`mesh` must be a CPU tensor, got dtypedevicerc   z<The mesh argument is required except for PRIVATE USAGE ONLY!z?Please use a non-overlapping layout when creating a DeviceMesh.   z"The rank map must be 1-dimensionalzThe rank map must be contiguouszThe rank map contains z. element, which isn't large enough for layout NNzWbackend_override should have the same length as the number of mesh dimensions, but got  and .z@Please use a valid layout when creating a DeviceMesh.The layout z& is not consistent with the mesh size xlathreadedr   )r   rf   z(rank_coords.size(0) must be 0 or 1, got ).rS   _C_log_api_usage_once	TypeError
isinstanceTensorre   type
ValueErrordetachtorN   
contiguoustensorr	   sizestrideflattencheck_non_overlapndimis_contiguousnumelcosizerU   rX   rV   tuplerW   rY   r3   r_   AssertionErrortolist_flatten_rank_map
_thread_idrZ   _setup_world_group_and_device_init_process_groups_dim_group_namesr$   r   	threading	get_identr    nonzero_coordinate_on_dim)r/   r?   r_   r[   r\   r]   r^   rX   rV   rY   mesh_tensorrank_coordss               r   r0   zDeviceMesh.__init__   s    HH((C &)*?#Z  dELL1dkk6F6F%6O$'H%OPP "$5 KKM$$599$5@@Bd5		J 
 &k&6&6&8+:L:L:NO'//1	?i&7#V  ,,. Q. >>Q&L(LL&**,O.OO,??$(88 ():(; <77>iA8
 !,D"DL&DN<J5#8PTD (DO'#2S5F#F %&#dll*;; "#345U3t||;L:MQP    !14993C3C3EF L
 <<113$U  ||!!#tyy'88$""&,,/UVZV_V_VdVdVfUgghj  &+4>>+@+@+B%CD""DO$&D! e# !668,0,E,E,,(	-D) "#(C&/&9&9&;DO=$JE  $yyE1::<##A&f4(B;CSCSTUCVBWX  0;/?/?/BQ/FKN))+D '5 $r   c                     | j                   S )z$Returns the device type of the mesh.)rU   r.   s    r   r?   zDeviceMesh.device_type,  s     $$$r   c                    | j                   j                  | j                        }|j                  d      dk(  r|d   S |t	               k(  j                         }|j                  d      dkD  r||d      S t        d      )z6Returns the tensor representing the layout of devices.r   rf   )r   r   zIn order to get the mesh Tensor of a DeviceMesh it needs to either have all its original dimensions (e.g., no slicing) or it needs to contain the local rank)rX   remap_to_tensorrV   rx   r    r   r4   )r/   	full_mesh	my_coordss      r   r_   zDeviceMesh.mesh1  s~     44T^^DI~~a A% |#"hj099;I~~a 1$ 4118 r   c                     | j                   S )z%Returns the names of mesh dimensions.)rW   r.   s    r   r[   zDeviceMesh.mesh_dim_names@  s     '''r   c           	         t               }|s
t                t               }| j                  j	                         |kD  r*t        d| d| j                  j	                          d      t        | j                        }|r|j                         sdt        j                  v rMt        t        j                  d         }t        j                  d|       |j                  |       t%               S t        j                  dd       |j!                         }||kD  r'||z  d	k7  rt        d
| d| d| j                   d      |j                  t#               |z         t%               S )Nz2Mesh should not be bigger than default world size z, but found z ranks!
LOCAL_RANKzESetting default device for the current process based on LOCAL_RANK=%sa=  It seems like you did not set/select the default device for the current process before the DeviceMesh initialization or use a launcher (i.e. torchrun) which populates `LOCAL_RANK` environment variable. It is recommended to set the current device for the process BEFORE the DeviceMesh initialization so that the underlying communicator (i.e. NCCL) can be initialized properly. Given that the current process has no default device selected, DeviceMesh will use a heuristic to set the device_id via `global_rank % num_devices_per_host`, assuming homogeneous hardware cluster. r8   r9   r   z8DeviceMesh only support homogeneous hardware, but found z ranks and  z	 devices!)r$   r#   r!   rX   r~   r4   rA   rU   osenvironrN   loggerinfo
set_devicer;   r<   rB   r    r   )r/   default_initialized
world_sizedevice_handle
local_rankrD   s         r   r   z(DeviceMesh._setup_world_group_and_deviceE  sh   "0"2 '"$')J||!!#j0"HT`aeamamasasau`vv}~  /t/@/@AM]%A%A%C  2::-!$RZZ%=!>JKK_" ",,Z80 &''- MMv $% ,9+E+E+G("%99&)==B*V)l+6J5K1TM^M^L__hj  ",,XZ:N-NO%''r   
sub_layoutrank_mapdim_namec                    | j                         j                  |      }|\  }}|r|j                  nd }d| }d }	t               }
| j	                         t               k(  ru|dk(  rpt        t        t                           }t        j                  j                         r(t        |
      dk(  rt        d|d      }	|	j                  S |
}	|	j                  S t        |
dd       t        j                  j                         ra|5|
j                  t        j                   d            j#                         |k(  r*t%        |
|||j'                         |	      }	|	j                  S d }|D ]W  }|j'                         }t        |||||
      }	t)               |v s0|t+        dt)                d| d      |	j                  }Y |S )Nmesh_rh   gloozcpu:gloo,cuda:ncclmesh_default)backendranks
group_descbound_device_idcuda)	parent_pgtimeout
pg_optionssplit_ranksr   )r   r   r   r   r   zFEach device mesh dimension should get only one process group, but got z in !)nestr   _timeoutr   r~   r!   rO   rangerS   r   r   r   r%   
group_namerR   _get_backendre   namer'   r   r    r4   )r   r   r   r\   pg_ranks_by_dimr   r   r   r   	dim_groupdefault_groupr   pg_namedim_meshsubgroup_rankss                  r   _init_one_process_groupz"DeviceMesh._init_one_process_groupx  s    )oo/??IO"2GZ .8j))TG !
+JI.0M !^%55:J O ; U>#345 zz..0#M2f<  4##1  !+++ '  !+++ '8$?KJJ++-O$11%,,v2FGLLN (+#) / 6 6 8)	 !+++ G+ 3!)!2%(##))	 :/**demeodp q""0!14  (22G#3$ Nr   layoutc           
         g }t        t        |             D ];  }|r||   nd| }|j                  t        j	                  | |   ||||                = |D cg c]  }||	 }}|rt        |      t        |      k(  sJ |S c c}w )Ndim_)r   r3   appendr   r   )	r   r   r[   r\   dim_group_namesdimr   ndim_non_none_group_namess	            r   r   zDeviceMesh._init_process_groups  s     79OS[) 2@>#.SEl&&66sXx9I#9N 4C'Taam'T$'T/37O3PTWU 4   ,+	 (Us   B!Bc                 6    | j                   r| j                   S | S r   )rY   r.   s    r   r=   zDeviceMesh._get_root_mesh  s    &*oo4???4?r   c                 D    t         j                  j                  |        | S r   )rP   r-   r   r.   s    r   	__enter__zDeviceMesh.__enter__  s    &&--d3Kr   c                 @    t         j                  j                          y r   )rP   r-   pop)r/   exc_type	exc_valueexc_tracebacks       r   __exit__zDeviceMesh.__exit__  s    &&**,r   c                    | j                   rDddj                  d t        | j                   | j                  j                        D               dn| j                  j                   }d| d| j
                   d| j                  j                   }t        j                  j                  dd	      d
k(  r |d| j                  j                          z  }| dS )N(z, c              3   0   K   | ]  \  }}| d |   yw)=Nr   ).0kvs      r   	<genexpr>z&DeviceMesh.__repr__.<locals>.<genexpr>  s     mTQ1QCjms   )zDeviceMesh(z, 'z
', stride=TORCH_DISTRIBUTED_DEBUG DETAILz, Mesh: )rW   joinziprX   top_level_sizesr?   stridesr   r   getr_   r   )r/   device_mesh_reprs     r   __repr__zDeviceMesh.__repr__  s     '' DIImS9M9Mt||OkOk5lmmnnop445 
 "--=,>c$BRBRASS]^b^j^j^r^r]stzz~~7<H htyy/?/?/A.B$CC &'q))r   c                     t        | dd       | _        | j                  sGt        | j                  | j                  | j
                  | j                  | j                  f      | _        | j                  S )N_hash)rR   r   hashr   rX   rU   rW   r   r.   s    r   __hash__zDeviceMesh.__hash__	  s_     w5DJ::!..)),,
 ::r   otherc                 8   | |u ryt        |t              sy| j                  |j                  k(  xrj | j                  |j                  k(  xrO | j                  |j                  k(  xr4 | j
                  |j
                  k(  xr | j                  |j                  k(  S )NTF)rp   r   r   rX   rU   rW   r   )r/   r   s     r   __eq__zDeviceMesh.__eq__  s    u}eZ0&&%*A*AA 8LLEMM18%%););;8 ((E,A,AA8 OOu'7'77r   c                 B   | j                   st        d      t        |t              r|fn|}|| j                   k(  r| S | j	                  |      }t
        j                  j                  j                         5  | j                  ||      }ddd       |S # 1 sw Y   S xY w)a[
  
            Slice the current DeviceMesh based on the mesh_dim_names given to create a submesh.
            The submesh created consists of the dimensions and the communicators indicated by
            ``mesh_dim_names``

            Args:
                mesh_dim_names (Union[str, tuple[str, ...]]): the name or the tuple of names of the
                mesh dimension of the DeviceMesh to create the submesh for.
            Returns:
                A :class:`DeviceMesh` object

            The following program runs on each process/rank in an SPMD manner in a world size of 8.
            In the first example:
                Calling mesh_2d["tp"] on rank 0, 1, 2, 3 returns a 1D submesh of DeviceMesh:([0, 1, 2, 3]).
                Calling mesh_2d["tp"] on rank 4, 5, 6, 7 returns a 1D submesh of  DeviceMesh:([4, 5, 6, 7]).
                Calling mesh_2d["dp"] on rank 0, 4 returns a 1D submesh of  DeviceMesh:([0, 4]).
                Calling mesh_2d["dp"] on rank 1, 5 returns a 1D submesh of  DeviceMesh:([1, 5]).
                Calling mesh_2d["dp"] on rank 2, 6 returns a 1D submesh of  DeviceMesh:([2, 6]).
                Calling mesh_2d["dp"] on rank 3, 7 returns a 1D submesh of  DeviceMesh:([3, 7]).

            In the second example:
                Calling mesh_3d["dp", "cp"] on rank 0, 1, 4, 5 returns a 2D submesh of DeviceMesh:([[0, 1], [4, 5]]).
                Calling mesh_3d["dp", "cp"] on rank 2, 3, 6, 7 returns a 2D submesh of DeviceMesh:([[2, 3], [6, 7]]).
                Calling mesh_3d["cp", "dp"] on rank 0, 1, 4, 5 returns a 2D submesh of DeviceMesh:([[0, 4], [1, 5]]).
                Calling mesh_3d["cp", "dp"] on rank 2, 3, 6, 7 returns a 2D submesh of DeviceMesh:([[2, 6], [3, 7]]).

            Example::

                >>> # xdoctest: +SKIP("no rank")
                >>> from torch.distributed.device_mesh import DeviceMesh
                >>>
                >>> # Initialize a 2D device mesh as (2, 4) to represent the topology
                >>> # of cross-host(dim 0), and within-host (dim 1).
                >>> mesh_2d = init_device_mesh(device_type="cuda", (2,4), mesh_dim_names=("dp", "tp"))
                >>> tp_mesh = mesh_2d["tp"]
                >>> dp_mesh = mesh_2d["dp"]
                >>>
                >>> # Initialize a 3D mesh.
                >>> mesh_3d = init_device_mesh(device_type="cuda", (2,2,2), mesh_dim_names=("dp", "pp", "cp"))
                >>> # The order of the mesh_dim_names provided deteremines the order of dimensions in the submesh.
                >>> dp_cp_mesh = mesh_3d["dp", "cp"]
                >>> cp_dp_mesh = mesh_3d["cp", "dp"]
            z1Cannot slice a DeviceMesh without mesh_dim_names!N)
rW   r4   rp   rM   _get_slice_mesh_layoutrS   _subclassesfake_tensorunset_fake_temporarily_create_sub_mesh)r/   r[   sliced_mesh_layoutsubmeshs       r   __getitem__zDeviceMesh.__getitem__%  s    X ''"#VWW &0%D!.  !5!55%)%@%@%P" &&22IIK X"334FWGXXs   7BBmesh_dimc                    t        | d      st        d      t        | j                        dkD  r&|$t        dt        | j                         ddd      t        | j                        dk(  r#|!t	        t        | j                  d               S | j                         }|j                  }|r*||v r&||   j                  d   }t	        t        |            S t        |t              r| j                  |      n|}t        |t              st        d	t        |             t	        t        | j                  |               S )
a  
            Returns the single ProcessGroup specified by mesh_dim, or, if mesh_dim is not specified and the
            DeviceMesh is 1-dimensional, returns the only ProcessGroup in the mesh.

            Args:
                mesh_dim (str/int, optional): it can be the name of the mesh dimension or the index
                of the mesh dimension. Default is None.

            Returns:
                A :class:`ProcessGroup` object.
            r   z*DeviceMesh process groups not initialized!rf   Found the DeviceMesh have  dimensionsJOptional kwarg `mesh_dim` needs to be specified when device_mesh.ndim > 1.zmIf you want to get the list of all the ProcessGroups in the DeviceMesh,please use `get_all_groups()` instead.r   zmesh_dim must be an int, got )hasattrr4   r3   rX   r   r   r   r=   rZ   rp   rM   _get_mesh_dim_by_namerN   r   rr   )r/   r   	root_meshroot_to_flatten_mappingdim_group_names        r   	get_groupzDeviceMesh.get_groupi  sL    4!34"#OPP4<< 1$)9"0T\\1B0C;O`=  4<< A%(*: 6t7L7LQ7O PQQ++-I&/&@&@#&87N+N!8"""1"&   6~ FGG "(C0 ..x8! 
 "(C0(7X7GH    6t7L7LX7V WXXr   c                     t        t        | j                              D cg c]  }| j                  |       c}S c c}w )z
            Returns a list of ProcessGroups for all mesh dimensions.

            Returns:
                A list of :class:`ProcessGroup` object.
            )r   r3   rX   r   )r/   is     r   get_all_groupszDeviceMesh.get_all_groups  s/     05S5F/GH!DNN1%HHHs   <submesh_dim_namesc                    | j                         }g }|D ]  }|t        | j                        v rA|j                  | j                  t        | j                        j                  |                [| j                  |   }|j                  |j                  t        |j                        j                  |                 t        | j                  ||j                  ||d      }||_        |S )NFrX   rV   r[   rY   r]   )
r=   r   rW   r   r   indexrZ   r   rU   rV   )r/   r   r   r   slice_dim_group_namer   flatten_meshres_submeshs           r   r   zDeviceMesh._create_sub_mesh  s    
 ++-I#% ) 8D$8$899(//--$T%9%9:@@F $(#8#8#>L(//$55$\%A%ABHHN" %!!#--0$#K ,@K(r   rh   rG   c                    | j                         }|s$dj                  t        | j                              }| j                  dk(  r|t        | j                        v r| S t        |j                        }||v rt        | d| dd| d      | j                  j                         }t        |      dkD  r|j                         }||j                  v r:||j                  |   j                  k(  r|j                  |   S t        d| d      t        |j                  ||j                  |f||f	      }||j                  |<   |S )
N_rf   z# already exists for submesh of the . z5The mesh_dim_names of submesh and flattened mesh are z-. Please specify another valid mesh_dim_name.z Flatten mesh with mesh_dim_name zE has been created before, Please specify another valid mesh_dim_name.)rX   rV   r[   rY   r\   )r=   r   r   rW   r|   rs   rX   coalescer3   r   rZ   r   rU   rV   )r/   rG   r\   r   invalid_dim_namesflattened_mesh_layoutres_flattened_meshs          r   _create_flatten_meshzDeviceMesh._create_flatten_mesh  s~   
 ++-I  #$2F2F)G H yyA~-8D<P<P3Q"Q !))B)B C 11 $o%HSUVKL]K^ _B C  %)LL$9$9$;!()A-(=(B(B(D%	 : ::) 11-@HHI %55mDD$:=/ JF G 
 ",&&-#-- -/$"2!4" 9KI&&}5%%r   c                     | j                         }| j                  }|r1|r/t        |      dk7  rt        d      |d   }|j	                  |      S y)z
            Returns the index of the mesh dim in the root mesh.
            The device_mesh passed in needs to be sliced out from the root mesh
            or submesh of the root mesh.
            rf   z"The submesh can only be a 1D mesh.r   N)r=   rW   r3   r   r   )r/   r   child_mesh_dim_nameschild_mesh_dim_names       r   _get_root_mesh_dimzDeviceMesh._get_root_mesh_dim  s^     ++-I#'#7#7 1+,1()MNN&:1&=# 667JKKr   c                     | j                   t        | j                         dk(  rt        d      || j                   vrt        d| dd| j                          t        | j                   j	                  |            S )Nr   zNo `mesh_dim_names` found.zMesh dimension 'z' does not exist.z.Available mesh dimensions are: mesh_dim_names=)rW   r3   KeyErrorr   r  )r/   rG   s     r   r   z DeviceMesh._get_mesh_dim_by_name  s    ##+s43G3G/HA/M0  D$8$88&}o5FGDTEYEYDZ[  D0066}EFFr   c                    d}| | j                         k7  rd}|rF| j                         j                  j                         D ci c]  \  }}||j                   c}}ni }g t	        | j
                        |t        fd|D              st        d| d d      g }|D ]  }|t	        | j
                        v rA|j                  | j                  t	        | j
                        j                  |                [||v s`t        j                  dd	       |j                  ||           t        d
 |D              }t        d |D              }	d}
t        |	      D ].  }t        |      st        d      ||
k  rt        d| d      |}
0 t!        ||	      }|j#                         st%        d| d      |S c c}}w )z
            Validate whether the mesh_dim_names is valid for slicing the given device_mesh.
            If valid, return dim indexes of the slice mesh in the device mesh.
            TFc              3   &   K   | ]  }|v  
 y wr   r   )r   rG   valid_mesh_dim_namess     r   r   z4DeviceMesh._get_slice_mesh_layout.<locals>.<genexpr>+  s      ! !55s   zInvalid mesh_dim_names z% specified. Valid mesh_dim_names are rj   zzSlicing a flattened dim from root mesh will be deprecated in PT 2.11. Users need to bookkeep the flattened mesh directly. r8   r9   c              3   4   K   | ]  }|j                     y wr   )sizesr   ls     r   r   z4DeviceMesh._get_slice_mesh_layout.<locals>.<genexpr>B  s      @Q @   c              3   4   K   | ]  }|j                     y wr   )r   r  s     r   r   z4DeviceMesh._get_slice_mesh_layout.<locals>.<genexpr>C  s     "D199"Dr  r2   zCCurrently, this only allows slicing out a contiguous flattened dim.z: specified. Mesh dim indices should be in ascending order.zSlicing overlapping dim_names z is not allowed.)r=   rZ   itemsrX   r   rW   allr  r   r  r;   r<   r   reversedr   NotImplementedErrorr	   r{   r4   )r/   r[   slice_from_rootkeyr_   flatten_name_to_root_layoutlayout_slicedr   sliced_sizessliced_strides
pre_stridery   r  s               @r   r   z!DeviceMesh._get_slice_mesh_layout  s-    #Ot**,,"' # &*%8%8%:%K%K%Q%Q%S!T %
  ($$../$,$ 
  %3  -n-= >00D/EQH 
 M& L8D$8$899!((Xd.B.B%C%I%I$%OP 88MMO#$
 "(()DT)JKL ! @- @@L""Dm"DDN J">2 $ f~-]  J&"1.1A BI I  $
$$ (nEM 224"4^4DDTU  ! Is   Gc                 :   | j                  |      }| j                  |   }|j                  | j                        }| j	                         }g }|D ]H  }t        | j                  ||fd      }||v r| j                  |   gng |_        |j                  |       J |S )z`
            Return all the submeshes of a given mesh dimension of the device mesh.
            Fr[   r]   )	r   rX   r   rV   r    r   rU   r   r   )	r/   rG   r   r   r   cur_rankres_submeshesmesh_1dr   s	            r   rI   zDeviceMesh._get_all_submeshesf  s     11-@H\\(+F$44T^^DO}}HM* .$%%$1#3"'	  7* **845 (
 $$W-. ! r   )r[   groupc                   t        | t              rt        |       }t        |t        j                        r|j                         |k7  s!|9t        |t        j                        s||k7  rt        dt        |       d|       t        j                  |dt        j                        }t        |||d      }| j                  g|_        |S t        |       }t        |      dk(  rt        d      |t        d	      |t        d
      t        |t        j                        r/|j                         j!                  t        j                  d      n%t        j                  |dt        j                        }|j"                  t        |      k7  r)t        d|j                          dt        |       d      t        |||d      }|D  cg c]  } | j                   c} |_        |S c c} w )ak  
            Constructs a :class:`DeviceMesh` with ``device_type`` from an
            existing :class:`ProcessGroup` or a list of existing :class:`ProcessGroup`.

            The constructed device mesh has number of dimensions equal to the
            number of groups passed. For example, if a single process group is passed in,
            the resulted DeviceMesh is a 1D mesh. If a list of 2 process groups is passed in,
            the resulted DeviceMesh is a 2D mesh.

            If more than one group is passed, then the ``mesh`` and ``mesh_dim_names`` arguments
            are required. The order of the process groups passed in determines the topology of
            the mesh. For example, the first process group will be the 0th dimension of the DeviceMesh.
            The `mesh` tensor passed in must have the same number of dimensions as the number of process
            groups passed in, and the order of the dimensions in the `mesh` tensor must match the order
            in the process groups passed in.

            Args:
                group (ProcessGroup or list[ProcessGroup]): the existing ProcessGroup
                    or a list of existing ProcessGroups.
                device_type (str): The device type of the mesh. Currently supports: "cpu",
                    "cuda/cuda-like". Passing in a device type with a GPU index, such as "cuda:0",
                    is not allowed.
                mesh (torch.Tensor or ArrayLike, optional): A multi-dimensional array or an
                    integer tensor describing the layout of devices, where the IDs are global IDs
                    of the default process group. Default is None.
                mesh_dim_names (tuple[str, ...], optional): A tuple of mesh dimension names to assign
                    to each dimension of the multi-dimensional array describing the layout of devices.
                    Its length must match the length of `mesh_shape`. Each string in `mesh_dim_names`
                    must be unique. Default is None.

            Returns:
                DeviceMesh: A :class:`DeviceMesh` object representing the device layout.
            zInvalid mesh z for ProcessGroup with ranks ra   rd   Fr*  r   z.Expects at least one ProcessGroup to be passedz0Must pass mesh if passing multiple ProcessGroupsz:Must pass mesh_dim_names if passing multiple ProcessGroups)rc   re   zEExpects mesh with ndim equal to number of ProcessGroups but got mesh ri   z ProcessGroups)rp   r&   r   rS   rq   r   rs   rM   rw   rN   r   r   r   rO   r3   rt   ru   r|   )r.  r?   r_   r[   group_ranksr6   groupss          r   
from_groupzDeviceMesh.from_group  s   V %.5e<tU\\2t{{}7S$&tU\\:+$'D	{2OP[}]  ||KUYYO(#1"'	 160@0@/A,"" %[F6{a !QRR| !STT% P  dELL1   uyy ?\\$uEIIF 
 yyCK'  KKM?%F}NL  %T.PUK KQ+QE,<,<+QK( ,Rs   G+c                 t    || j                   |   j                         S | j                   j                         S r   )rX   r~   )r/   r   s     r   rx   zDeviceMesh.size  s3    #||H-3355<<%%''r   c                 ,    t        | j                        S r   )r3   rX   r.   s    r   r|   zDeviceMesh.ndim  s    t||$$r   c                 .    | j                   j                  S r   )rX   r   r.   s    r   shapezDeviceMesh.shape  s    <<///r   c                     t               S )z:
            Returns the current global rank.
            )r    r.   s    r   r    zDeviceMesh.get_rank  s     :r   c                    | j                   dkD  r%|#t        dt        | j                         dd      |d}t	        | j                  |            }t        |t              st        d      t	        t        |            S )a|  
            Returns the local rank of the given mesh_dim of the DeviceMesh.

            Args:
                mesh_dim (str/int, optional): it can be the name of the mesh dimension or the index
                of the mesh dimension. Default is None.

            Returns:
                An integer denotes the local rank.

            The following program runs on each process/rank in an SPMD manner. In this example, we have 2
            hosts with 4 GPUs each.
            Calling mesh_2d.get_local_rank(mesh_dim=0) on rank 0, 1, 2, 3 would return 0.
            Calling mesh_2d.get_local_rank(mesh_dim=0) on rank 4, 5, 6, 7 would return 1.
            Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 0, 4 would return 0.
            Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 1, 5 would return 1.
            Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 2, 6 would return 2.
            Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 3, 7 would return 3.

            Example::

                >>> # xdoctest: +SKIP("no rank")
                >>> from torch.distributed.device_mesh import DeviceMesh
                >>>
                >>> # Initialize device mesh as (2, 4) to represent the topology
                >>> # of cross-host(dim 0), and within-host (dim 1).
                >>> mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3],[4, 5, 6, 7]])
            rf   r   r   r   r   z1We expect ProcessGroup before calling `get_rank`!)
r|   r4   r3   rX   r   r   rp   r&   r   r    )r/   r   mesh_dim_groups      r   get_local_rankzDeviceMesh.get_local_rank  s    : yy1}!1"0T\\1B0C;O`  !%dnnX&>?Nnl;$G  H^455r   c                 6    | j                   r| j                   S dS )z
            Return the relative indices of this rank relative to all
            dimensions of the mesh. If this rank is not part of the mesh, return None.
            N)r   r.   s    r   get_coordinatezDeviceMesh.get_coordinate  s    
 /3.E.E4**O4Or   c                 ~    | j                   st        d      |t        d|id      \  }nd}| j                  ||      S )a\  
            Returns a 1D DeviceMesh by flattening the current DeviceMesh.

            If no mesh_dim_name is provided, the default is a string concatenating the mesh_dim_names of the
            given submesh with each mesh_dim_name separated by "_". For example, if we have a 3D mesh
            DeviceMesh([[[0, 1], [2, 3]], [[4, 5], [6, 7]]], mesh_dim_names=("dp", "cp", "tp")), calling
            mesh_3d["dp", "cp"]._flatten() will create a 1D submesh DeviceMesh([0, 2, 4, 6], mesh_dim_names=("dp_cp",))
            on rank 0, 2, 4, 6 and a 1D submesh DeviceMesh([1, 3, 5, 7], mesh_dim_names=("dp_cp",)) on rank 1, 3, 5, 7.

            After the flattened dimension is created, to access the flattened dimension in mesh_3d, one can use the
            existing slicing method to obtain the flattened mesh through calling mesh_3d["dp_cp"].
            z3Cannot flatten a DeviceMesh without mesh_dim_names!r   rf   rh   )rW   r4   _normalize_backend_overrider  )r/   rG   r\   backend_override_tuples       r   _flattenzDeviceMesh._flatten$  sZ    ( ''"I   +,G()1-)' *6&,,]<RSSr   r   
mesh_sizesc                    t        t        |      t        |            }|j                         | j                  |   j                         k7  rAt        d|d|j                          d| d| j                  |   j                          d	      | j                  |   j                  |      }| j                  j                  ||dz   |      }t        t        | j                              }t        |      |||dz    | j                         }	t        | j                  ||	j                  t        |      |	d      }
t        | d	      rE| j                   j#                         }| j%                  ||	j                  ||      |||dz    ||
_        |
S )
NzThe product of mesh_sizes=z is z$, but the original dimension at dim=z
 has size z6. These must be equal for unflatten to work correctly.rf   Fr  r   )r	   r   r   r~   rX   rs   compositionsplicerO   r   r[   r=   r   r?   rV   r   r   copyr   )r/   r   rA  r[   r\   inner_layoutpartial_layoutunflattened_layoutunflattened_mesh_dim_namesr   res_meshr   s               r   _create_unflatten_meshz!DeviceMesh._create_unflatten_meshF  s    'uZ'8.:TUL!!#t||C'8'>'>'@@ 1j]$|7I7I7K6L M99<ZUXHYH_H_HaGb cKL  "\\#.::<HN!%!4!4S#'>!R)-ht7J7J.K)L&8<^8L&sS1W5++-I!  *#--$%?@$#H t/0"&"7"7"<"<">151J1J"''"$	2cAg. -<)Or   c                 .   t        |t              r*|| j                  k\  rt        d| d| j                         t        |t              r2|t        | j                        v rt        d| d| j                         t        |      t        |      k7  rt        d      t        |t              r$t        | j                        j                  |      }| t        t        |t        |      |            }ndt        |      z  }| j                  ||||      S )aL  
            Returns a DeviceMesh by unflatten the current DeviceMesh.

            This api can be used to unflatten a N-D DeviceMesh into N-1+len(mesh_sizes)-D meshes or submeshes.
            The dim is the dimension to be unflattened which can be either a string or an integer.

            The mesh_sizes is a tuple which specifies the shape of the mesh unflatten into for the given dim.
            The mesh_dim_names is a list of strings which specifies the names of the dimensions of the mesh unflatten into.
            Its length must match the length of mesh_sizes.

            For example, if we have a 1D mesh DeviceMesh([0, 1, 2, 3, 4, 5, 6, 7], mesh_dim_names=("world")),
            calling mesh_1d._unflatten(0, (2, 2, 4), ["dp", "pp", "tp"]) will create a 3D mesh
            DeviceMesh([[[0, 1], [2, 3]], [[4, 5], [6, 7]]], mesh_dim_names=("dp", "cp", "tp")).

            Note that after calling the unflatten, there is no access to the unflattened dimension in mesh_1d, one can only
            use the newly unflattened mesh to slice out the unflattened mesh dims.
            zdim z+ specified in `_unflatten` is out of range z% specified in `_unflatten` is not in zAmesh_dim_names must have same length as mesh_sizes in _unflatten!rg   )rp   rN   r|   rs   rM   r   r[   r3   r4   r  r   r>  rK  )r/   r   rA  r[   r\   r?  s         r   
_unflattenzDeviceMesh._unflattenw  s   6 #s#tyy(8 3%J499+V  C%#$:M:M1N*N 3%DTEXEXDYZ  :#n"55"W  #s#t22399#>+)./(J&*& *93~;N)N&..&	 r   device_mesh_listc                    g }g }g }g }| d   j                   }| D ]  }t        t        |j                              D ]R  }|j	                  |j                  |   j
                         |j	                  |j                  |   j                         T |j                  t        |j                               |j                  t        |j                               |j                   |k7  st        d       t        t        |      t        |            }|j                         st        d|        t        | d   j                   || d   j"                  t        |      | d   j%                         d      }	||	_
        |	S )Nr   zCCannot concatenate DeviceMeshes derived from different device meshsz'Cannot concatenate overlapping meshes: Fr  )r   r   r3   rX   r   r  r   extendr   r[   r   r4   r	   r   r{   r   r?   rV   r=   )
rN  concat_dim_namesconcat_sizesconcat_stridesconcat_dim_group_nameflatten_rank_mapdmr   concat_mesh_layoutrJ  s
             r   _concatenatezDeviceMesh._concatenate  sx   *,+-L-/N57!/2DD& s2::/ AA ''

1(;(;<"))"**Q-*?*?@A !''1B1B(CD%,,Xb6I6I-JK ''+;;&]  "-U<-@%BW!X%779"=>N=OP  " #//**1-77$%56+A.==?#H )>H%Or   r   rK   rJ   )Nrh   )rg   ):r   r   r   __doc__rM   __annotations__rS   rq   r   r	   rY   r   dictr   BackendConfigboolrN   r0   propertyr?   r_   r[   r   rL   r"   r   rO   r   r=   r   r   r   r   objectr   r   r&   r   r   r   r  r  r   r   rI   r2  rx   r|   r6  r    r:  r<  C10dBackendOptionsr@  rK  rM  rX  r   r   r   r   r      s(   -	^ <<sCx4//-1
H\*1sL011
 =Aq	
 6:AE"& $*.-115q	q	 k12T9q	
 "#s(Od2q	 $M3$67$>q	  q	 :q	 !4'q	 ||d*q	 !.q	 q	f 
	% 	% 
	% 
	%,, 	 
	 
	(E#s(Od$: 	( 
	(1	(f 
\	#\	ll\	 \	 ,	\	
 \	 
\	| 
	,	,ll	, "#s(Od2	, $M3$67		,
 )_	, 
	,0	@		-
	*c 
	*		 	4 	B	cE#s(O.C B	 B	H,	YcCi$&6 ,	Y, ,	Y\	ID$6 	I!	!	  %S#X!	 	!	J )-.:1	&:1	& ,1	& 	1	&f	d
 	
	Gs 
	Gs 
	GR	!"'S/R	!R	!j	!C 	!D<N 	!2 
 =A\	
 6:\	$|"44\	\	 k12T9\	
 "#s(Od2\	 \	 
\	|	(t 	(s 	(
 
	%# 	% 
	% 
	05c? 	0 
	0	c 	*	639t+; *	6s *	6X	PDI$4 	P )- 15 	T: 	T #!!" C,,,-. 	T  	TR  /	/	 c3h/	 "#s(O	/	
 $cDj+"5"5"<<=sB/	 /	r <	s<	 c3h<	 "#s(O	<	
 #S;...sK<O<O7O1PPP <	 <	| 
 	4+=  	,  	 
 	r   r\   r|   r[   .r+   c              #     K   |d}t        t        |      |      D ]  \  }}|,|| v r(|| v rt        d| d| d      | j                  |      }n|| v r| j                  |      }nd Ot	        |t
              r|d f ft	        |t        j                        rd |f |  | r+t        dt        | j                                d| d|       y w)	Nr   zFound redundant dim index z
 and name z in backend_overriderh   z,Found invalid keys in backend_override: got z!, expected integers in range [0, z) or one of )
r   r   r4   r   rp   rM   r`  ra  rO   keys)r\   r|   r[   dim_idxr   vals         r   r>  r>    s     !N!,U4[.!I 	GX#4D(D..&4WI >  (z)=?  '**84,,&**73""#s#Dk!C!4!45Sk!	'	* >tDTDYDYD[?\>] ^226|NCSU  s   CC)r[   r\   
mesh_shapec          	         |kt        t        |            t        |      k7  rt        dd|       t        |      t        |      k7  r%t        ddt        |       dt        |       d      | t        t	        |t        |      |            }nd}| r | j                         st        d|  d	d
      t        t        |      t        t        |                  }t        j                  d      5  t        j                  |j                         t        j                        }ddd       t        | |||      }|S # 1 sw Y   xY w)a	  
        Initializes a `DeviceMesh` based on `device_type`, `mesh_shape`, and `mesh_dim_names` parameters.

        This creates a DeviceMesh with an n-dimensional array layout, where `n` is the length of `mesh_shape`.
        If `mesh_dim_names` is provided, each dimension is labeled as `mesh_dim_names[i]`.

        .. note::
            `init_device_mesh` follows SPMD programming model, meaning the same PyTorch Python program
            runs on all processes/ranks in the cluster. Ensure `mesh_shape` (the dimensions of the nD array
            describing device layout) is identical across all ranks. Inconsistent `mesh_shape` may lead to hanging.

        .. note::
            If no process group is found, init_device_mesh will initialize distributed process group/groups
            required for distributed communications behind the scene.

        Args:
            device_type (str): The device type of the mesh. Currently supports: "cpu", "cuda/cuda-like", "xpu".
                Passing in a device type with a GPU index, such as "cuda:0", is not allowed.
            mesh_shape (Tuple[int]): A tuple defining the dimensions of the multi-dimensional array
                describing the layout of devices.
            mesh_dim_names (tuple[str, ...], optional): A tuple of mesh dimension names to assign to each dimension
                of the multi-dimensional array describing the layout of devices. Its length must match the length
                of `mesh_shape`. Each string in `mesh_dim_names` must be unique.
            backend_override (Dict[int | str, tuple[str, Options] | str | Options], optional): Overrides for some or all of
                the ProcessGroups that will be created for each mesh dimension. Each key can be either the index of a
                dimension or its name (if mesh_dim_names is provided). Each value can be a tuple containing the name
                of the backend and its options, or just one of these two components (in which case the other will be
                set to its default value).

        Returns:
            DeviceMesh: A :class:`DeviceMesh` object representing the device layout.

        Example::

            >>> # xdoctest: +SKIP("no rank")
            >>> from torch.distributed.device_mesh import init_device_mesh
            >>>
            >>> mesh_1d = init_device_mesh("cuda", mesh_shape=(8,))
            >>> mesh_2d = init_device_mesh("cuda", mesh_shape=(2, 8), mesh_dim_names=("dp", "tp"))

        Nz"Each mesh_dim_name must be unique.z/Found repeated mesh_dim_name in mesh_dim_names z6mesh_shape and mesh_dim_names should have same length!zFound len(mesh_dim_names): z and len(mesh_shape):rj   z0Device type with index is not supported but got r	  zUIf you maintained a 'torch.device' object, it's recommended to pass in 'device.type'.ra   rb   )r?   rX   rV   r[   r\   )r3   setr4   r   r>  isalphar	   r   rS   re   aranger~   rN   r   )r?   rf  r[   r\   r?  r   r   r6   s           r   r   r     s[   f %3~&'3~+>>"8EnEUV 
 :#n"55"L1#n2E1FF[\_`j\k[llmn 
 '%*+$c*o~&" &*" {224B;-rRg 
 U:.uZ?P0QR \\%  	E||FLLN%))DH	E #)3
 	E 	Es   04D>>E)r   r   )Cloggingr   r   r;   collections.abcr   	itertoolsr   typingr   r   r   rS   torch.distributedr   torch.distributed._mesh_layoutr	   torch.distributed._pycuter
   r   r   torch.utils._typing_utilsr   __all__sysr   r   modulesr   r   torch._C._distributed_c10dr   r`  "torch.distributed.distributed_c10dr   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   	getLoggerr   r   numpy.typingr(   ImportErrorwarningr   rM   ra  r\  serializationadd_safe_globalslocalr*   rP   rZ  rA   r[  rN   r>  r   r   r   <module>r     s[    	   $ ! 1 1  * 6 F F . |
, ~  ?NCKK/0; 0 KK' B    Wx(F 	. #*k&9&9D&@@AM	((+7,A9?? ,A\ !)
OX*1 1U Uz" 26##I+%%%c;3F3F.F(GGI
#
 # c3h$.# 
-	 #R 26 ]]#s(O] c3h$.	]
 #Is[0005k>Q>Q9Q3RRR
 ] 
]{%  	NNU	s   F9 9GG