
    i0                         d Z ddlZddlmZ ddlmZ ddlmZ ddlZddl	m
Z
mZmZmZmZmZmZmZmZmZ  edd       G d	 d
e             Zy)zj
Definition of CuTe inspired Layouts for DeviceMesh internal bookkeeping and functions to manipulate them
    N)Iterator)	dataclass)product)
as_tuplecoalesce
complementcompositionflattenIntTupleis_intis_tupleLayoutmatch_structureT)frozeninitc                   z    e Zd ZU dZeed<   eed<   ddZedefd       Zedefd       Z	ede
eeef      fd       Zedeed	f   fd
       ZdefdZdedd f fdZddZddZddZdedd fdZdededd dd fdZdee   fdZdedeee      fdZdefdZdej8                  dej8                  fdZ xZS )_MeshLayoutax  
    Utility class for representing an integer layout by borrowing ideas from CuTe Layout Algebra.
    See https://docs.nvidia.com/cutlass/media/docs/cpp/cute/02_layout_algebra.html for more details.

    Each layout is represented as a list of sizes and strides. We use it as a way for mechanical bookkeeping
    of the integers such as ranks in a SPMD mesh, and the transformation on top of it.

    Lots of methods of layout like coalesce, composition, complement, etc. are borrowed from pycute.
    https://github.com/NVIDIA/cutlass/blob/6dd13d42784ee5bfa232d2441e6b9a021c5c6290/python/pycute/layout.py#L137,L257

    Note this is a CuTe-inspired layout, because CuTe uses co-lexicographic way in linearization while PyTorch
    is using lexicographic. So even though the CuTe documentation can still be referenced, the implementation will be
    different from that of PyCute's.
    shapestridereturnc                    t        | j                        s6t        | j                        s!t        dt	        | j                               t        | j
                        s6t        | j
                        s!t        dt	        | j
                               t        | j                  | j
                        s&t        d| j                   d| j
                   d      y )Nz"shape must be a tuple or int, got z#stride must be a tuple or int, got zsizes z and strides z don't match)r   r   r   	TypeErrortyper   r   
ValueErrorselfs    X/var/www/html/engine/venv/lib/python3.12/site-packages/torch/distributed/_mesh_layout.py__post_init__z_MeshLayout.__post_init__/   s    

#F4::,>@djjAQ@RSTT$VDKK-@A$t{{BSATUVVtzz4;;7M$++lK  8    c                     | j                   S N)r   r   s    r   sizesz_MeshLayout.sizes9   s    zzr   c                     | j                   S r!   )r   r   s    r   stridesz_MeshLayout.strides=   s    {{r   c                 f    t        t        | j                        t        | j                              S r!   )zipr
   r   r   r   s    r   sizes_and_stridesz_MeshLayout.sizes_and_stridesA   s!    74::&(<==r   .c                 P     t         fdt        t                     D              S )Nc              3   D   K   | ]  }|   j                           y wr!   )numel).0ir   s     r   	<genexpr>z._MeshLayout.top_level_sizes.<locals>.<genexpr>G   s     ?T!W]]_?s    )tuplerangelenr   s   `r   top_level_sizesz_MeshLayout.top_level_sizesE   s    ?eCI.>???r   c                 R    t        j                  t        | j                              S r!   )mathprodr
   r   r   s    r   r*   z_MeshLayout.numelI   s    yy,--r   r,   c                 
   |t        |        k  s|t        |       k\  r7t        d| dt        |        dt        |         dt        |       dz
   d	      t        |   |      }t	        |j
                  |j                        S )NzDim z! is out of range for layout with z* dimensions. Expected dim to be in range [z,    z].)r0   
IndexErrorsuper__getitem__r   r   r   )r   r,   layout	__class__s      r   r9   z_MeshLayout.__getitem__M   s    D	z>Q#d)^qc:3t9+ F014T
|2c$i!m_BP  $Q'6<<77r   c                 F    t        | j                  f| j                  f      S r!   )r   r   r   r   s    r   nestz_MeshLayout.nestV   s    DJJ=4;;.99r   c                 X    t        |       }t        |j                  |j                        S )u  
        A layout is represented by (sizes):(strides), e.g. (3,2):(4,2).
        Two consecutive dimensions can be "merged" into one if their
        strides are contiguous/multiplicative (i.e., the inner stride * inner size
        equals the next stride), we perform this kind of merge inside coalesce.

        Example 1 (simple): (3,2):(2,1)
        - inner dimension: has stride=1, size=2
        - outer dimension: stride = inner_stride * inner_size = 2
        → coalesced = (6:1)    # acts like a flat 1D array of length 6

        Example 2 (non-coalescible): (3,2):(4,1)
        - inner dimension: stride=1, size=2 → 2*1 = 2
        - outer dimension: stride=4, mismatch (≠ 2)
        → cannot merge; result stays (3,2):(4,1)
        )r   r   r   r   )r   r:   s     r   r   z_MeshLayout.coalesceY   s"    " $6<<77r   r:   c                 Z    t        | |      }t        |j                  |j                        S )u  
        By-dimension composition allows one layout to "select from" or "filter through" another layout.
        Think of it as function composition: (self ∘ layout)(input) = self(layout(input))
        between two layouts. This function is a wrapper of pycute's composition.

        Mental model about how to understand the composition logic:
        - The LEFT layout (self) defines the "output space" - what indices are possible
        - The RIGHT layout (layout parameter) acts as a "selector" - which specific indices to pick
        - The composition only generates indices that the left layout could originally produce,
          but the right layout determines which indices to be picked.
        - The stride of the composition layout will not be smaller than the stride of the right layout,
          because when picking the indices the composition will at least follow the the right layout's stride
          to move forward.

        Example:
          self = (6,2):(2,1)      # sizes=(6,2), strides=(2,1)
          layout = (3:2)          # sizes=(3,), stride=(2,)
          self o layout = (3:2)

        Returns:
          Layout being composed.
        )r	   r   r   r   )r   r:   results      r   r	   z_MeshLayout.compositionm   s%    . T6*6<<77r   
world_sizec                 Z    t        | |      }t        |j                  |j                        S )uG  
        Compute the "complement layout" relative to a given world_size.
        A complement layout fills in the "missing" factor so that: self repeat a layout of complement(self, world_size)
        will get a complete world_size. We use ⊗ to denote the repeat operation.

        Example:
          self = (4:1)   # size=4, stride=1
          world_size = 8
          Then:
            complete needed factor = 8 / 4 = 2
            complement(self, 8) = (2:1)

          Together they form:
            (4:1) ⊗ (2:1) = (4,2):(2,1)
          which has world_size = 4 * 2 = 8, as required.

        In distributed terms, complement() is often used to derive the "other"
        rank grouping when splitting processes into 2D meshes.

        For a visualized explanation, see https://x.com/ezyang/status/1962364978393981433/
        )r   r   r   r   )r   rA   r:   s      r   r   z_MeshLayout.complement   s%    , D*-6<<77r   startendc                 :   t        t        | j                              }t        t        | j                              }t        t        |j                              ||| t        t        |j                              ||| t	        t        |      t        |            S r!   )listr   r"   r$   r   r.   )r   rC   rD   r:   r"   r$   s         r   splicez_MeshLayout.splice   sq    Xdjj)*x-. 67eC!(6>>":;c5<w88r   c                     t        d t        | j                        D         D cg c]1  }t        d t	        |t        | j
                              D              3 c}S c c}w )a  
        This function computes the all ranks specified by the layout staring from zero.

        How it works:
        1. we enumerates every possible coordinate (like a nested for-loop).
        If sizes = (2, 3), we get the following coordinates:
            (0,0), (0,1), (0,2), (1,0), (1,1), (1,2)

        2. For each coordinate, we compute a linear rank index as:
            all_ranks_from_zero = sum(coord[i] * strides[i] for i in range(ndim))

        Example A:
        sizes = (2, 3)        # 2 rows, 3 cols
        strides = (3, 1)        # row-major layout
        coords = (0,0) -> 0*3 + 0*1 = 0
                 (0,1) -> 0*3 + 1*1 = 1
                 (0,2) -> 0*3 + 2*1 = 2
                 (1,0) -> 1*3 + 0*1 = 3
                 (1,1) -> 1*3 + 1*1 = 4
                 (1,2) -> 1*3 + 2*1 = 5
        result = [0, 1, 2, 3, 4, 5]

        Example B:
        sizes = (2, 3)
        strides = (1, 2)        # non-standard / strided layout
        coords = (0,0) -> 0*1 + 0*2 = 0
                 (0,1) -> 0*1 + 1*2 = 2
                 (0,2) -> 0*1 + 2*2 = 4
                 (1,0) -> 1*1 + 0*2 = 1
                 (1,1) -> 1*1 + 1*2 = 3
                 (1,2) -> 1*1 + 2*2 = 5
        result = [0, 2, 4, 1, 3, 5]
        c              3   2   K   | ]  }t        |        y wr!   )r/   )r+   ss     r   r-   z2_MeshLayout.all_ranks_from_zero.<locals>.<genexpr>   s     "I58"Is   c              3   ,   K   | ]  \  }}||z    y wr!    )r+   crJ   s      r   r-   z2_MeshLayout.all_ranks_from_zero.<locals>.<genexpr>   s     D$!QADs   )r   r
   r"   sumr&   r$   )r   coords     r   all_ranks_from_zeroz_MeshLayout.all_ranks_from_zero   sT    H !"IWTZZ5H"IJ
 D#eWT\\-B"CDD
 	
 
s   6Ac           	          | j                  |      j                         D cg c]#  }| j                         D cg c]  }||z   	 c}% c}}S c c}w c c}}w )us  
        Build global ranks specified by the layout via two-level ranks composition.

        The nested list forms the Cartesian product of all ranks for one layout and offset
        regarding filling up the world_size with the layout.
        The final global ranks are the addition of these two. The result is a
        list of lists: one sublist per layout. This rank list will be used to build
        the communicator underlying the layout and the given `world_size`.

        Example:
        world_size = 16
        self.size = 4
        self.stride = 1
        ranks = [0, 1, 2, 3]
        offsets = [0, 4, 8, 12]
        result = [
            [0+0, 0+1, 0+2, 0+3],  # → [0, 1, 2, 3]
            [4+0, 4+1, 4+2, 4+3],  # → [4, 5, 6, 7]
            [8+0, 8+1, 8+2, 8+3],  # → [8, 9, 10,11]
            [12+0, 12+1, 12+2, 12+3],  # → [12,13,14,15]
        ]
        )r   rP   )r   rA   offsetranks       r   global_ranksz_MeshLayout.global_ranks   sQ    2 //*5IIK
 (,'?'?'ABtVd]B
 	
B
s   AAAAc                 b    | j                         }t        |      t        t        |            k(  S )u  
        Check if the layout has any overlap between the ranks it generates. If there is overlap,
        we return False, otherwise True.

        The layout is supposed to be injective i.e, aside from indice 0, indices from each
        dim of the layout must be non-overlapping.

        Example 1 - Valid (no overlap):
        Layout: sizes=(2,3), strides=(6,1)
        - Dim 1: stride=1, span=3*1=3, covers indices [0,1,2]
        - Dim 0: stride=6, span=2*6=12, covers indices [0,6]
        → No overlap since 6 > 3

        Example 2 - Invalid (overlap):
        Layout: sizes=(2,3), strides=(2,1)
        - Dim 1: stride=1, span=3*1=3, covers indices [0,1,2]
        - Dim 0: stride=2, span=2*2=4, covers indices [0,2]
        → Overlap! stride=2 < span=3, so indices [0,2] are duplicated

        Example 3 - Invalid (overlap):
        Layout: sizes=(4,2), strides=(1,1)
        - Dim 1: stride=1, span=4, covers indices [0,1,2,3]
        - Dim 0: stride=1, span=2, covers indices [0,1]
        → Overlap! stride is same for two dims, so indices [0,2] are duplicated

        Returns:
            bool: True if no overlap, False if overlap detected
        )rP   r0   set)r   rankss     r   check_non_overlapz_MeshLayout.check_non_overlap   s)    : ((*5zSU_,,r   rank_mapc                    |j                   dk(  sJ |j                         sJ |j                         | j                         k\  sJ | j	                  |j                               } |j                  t        |j                        t        | j                        z   t        |j                        t        | j                        z         j                  dg| j                   S )a  
        Leverage layout as an index for mesh tensor that re-maps the indexes after layout
        transformation to actual device ranks.

        With this method, the cute layout serves as the backend of indices bookkeeping for the
        mesh tensor when it comes to flatten, unflatten and slicing operations. The actual mesh
        tensor still represents the actual device assignment and ranks. We need this function
        to specify device allocation and create backend for a mesh. Although any transform of mesh tensors
        can be treated as a view or subset of mesh tensor, we do need to use the actual view or
        sub-tensor for DeviceMesh and its backend creation.

        The shape of the `rank_map` must be 1D and contiguous.

        Examples:

        Case 1 - Consecutive ranks, full world:
            original_mesh_tensor = [[0,1],[2,3]]  # 2x2 mesh, ranks 0-3
            world_size = 4
            layout = Layout(2:2)
            Return: [[0,2],[1,3]]

        Case 2 - Non-consecutive ranks:
            original_mesh_tensor = [[10,20],[30,40]]  # custom rank assignment
            world_size = 4
            layout = Layout(2:2)
            Return: [[[10,30],[20,40]]]

        Args:
            rank_map: The concrete mesh tensor with actual device ranks

        Returns:
            torch.Tensor: A tensor representing the actual device allocation from rank_map
        r6   )ndimis_contiguousr*   cosizer   
as_stridedr
   r"   r$   reshaper1   )r   rY   complement_layouts      r   remap_to_tensorz_MeshLayout.remap_to_tensor
  s    D }}!!!%%'''~~4;;=000 OOHNN,<=x""%++,wtzz/BB%--.1FF
 '"- ++- 	-r   )r   N)r   r   )r:   r   r   r   )__name__
__module____qualname____doc__r   __annotations__r   propertyr"   r$   r   r.   intr'   r1   r*   r9   r=   r   r	   r   rG   rF   rP   rT   boolrX   torchTensorrb   __classcell__)r;   s   @r   r   r      sT     O x      >8E#s(O#< > > @sCx @ @.s .8S 8] 8:8(848S 8] 829C 9c 9= 9] 9%
T#Y %
N
s 
tDI 
8-4 -@+- +- +-r   r   )rf   r3   collections.abcr   dataclassesr   	itertoolsr   rk   torch.distributed._pycuter   r   r   r	   r
   r   r   r   r   r   r   rL   r   r   <module>rr      sO     $ !     $T"[-& [- #[-r   