
    i)                    V   d dl Z d dlZd dlZd dlZd dlZd dlmZmZ d dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d dlZd dlmZ d dlmZ d d	lmZmZ d d
lm Z  d dl!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+ g dZ, ejZ                  e.      Z/ G d de      Z0e0jb                  Z1e0jd                  Z2e0jf                  Z3e0jh                  Z4e0jj                  Z5e0jl                  Z6e0jn                  Z7e0jp                  Z8e0jr                  Z9e0jt                  Z:e0jv                  Z;e0jx                  Z<e1Z=e2Z>e3Z?e:Z@ ej                  d      ZB G d de      ZCedeCdeDfd       ZE	 dTdeFeGeHeCdz     f   deGdz  deDfdZI G d de      ZJdTdeHej                     deDdz  deHej                     fd ZM	 dTdeHej                     deDdz  deFeGeHej                     f   fd!ZNd"eHej                     fd#ZO G d$ d%eJ      ZP G d& d'eP      ZQ G d( d)eP      ZR G d* d+eP      ZSd,e0deTfd-ZUd.eHeCdz     d/eGdeHeCdz     fd0ZV	 dUd1eHeCdz     d2eGdeHeC   fd3ZWd1eHeCdz     deHeC   fd4ZXd1eFeGeHeC   f   d5eeGgeGf   d6eGdeFeGeHeC   f   fd7ZYd.eFeGeHeCdz     f   d8eGd6eGd9eGdeFeGeGf   f
d:ZZ G d; d<eJ      Z[ G d= d>      Z\ G d? d@e      Z] G dA dBe[      Z^ G dC dDe^      Z_	 	 dVdEZ` G dF dGe^      Za G dH dIe^      Zb G dJ dKe^      Zc G dL dMe^      ZddNeDfdOZed5eeGgeGf   d6eGfdPZfdQ ZgdReHe+   dNeDfdSZhy)W    N)ABCabstractmethod)Counterdefaultdict)Callable)Enum)	lru_cache)Anycast
NamedTupleProtocol)OptimizedModule)
FSDPModuleUnshardHandle)_Loss)record_function   )generate_rank_to_stage_mappinggenerate_stage_to_rank_mapping)merge_chunkssplit_args_kwargs_into_chunksTensorChunkSpec)_PipelineStageBase)
get_schedule_classPipelineScheduleSinglePipelineScheduleMultiSchedule1F1BScheduleGPipeScheduleInterleaved1F1BScheduleLoopedBFSScheduleInterleavedZeroBubbleScheduleZBVZeroBubbleScheduleDualPipeVc                   R    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZd Zed        Zy)_ComputationTyper                        	   
         c                    t         j                  dt         j                  dt         j                  dt         j                  dt         j
                  dt         j                  dt         j                  dt         j                  dt         j                  d	t         j                  d
t         j                  dt         j                  di}||    S )NFIWUNSHARDRESHARDSEND_FRECV_FSEND_BRECV_BBOVERLAP_F_BREDUCE_GRAD)r%   FORWARDBACKWARD_INPUTBACKWARD_WEIGHTr5   r6   r7   r8   r9   r:   FULL_BACKWARDr<   r=   )selfstr_maps     `/var/www/html/engine/venv/lib/python3.12/site-packages/torch/distributed/pipelining/schedules.py__str__z_ComputationType.__str__;   s    $$c++S,,c$$i$$i##X##X##X##X**C((-((-
 t}    c                    | dk(  rt         j                  S | dk(  rt         j                  S | dk(  rt         j                  S | dk(  rt         j                  S | dk(  rt         j
                  S | dk(  rt         j                  S | dk(  rt         j                  S | dk(  rt         j                  S | d	k(  rt         j                  S | d
k(  rt         j                  S | dk(  rt         j                  S | dk(  rt         j                  S t        d|        )Nr2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   Invalid computation type )r%   r>   r?   r@   r5   r6   r7   r8   r9   r:   rA   r<   r=   RuntimeErroractions    rD   from_strz_ComputationType.from_strL   s   S=#+++s]#222s]#333y #+++y #+++x#***x#***x#***x#***s]#111}$#///}$#///!:6(CDDrF   N)__name__
__module____qualname__r>   r?   r@   r5   r6   r7   r8   r9   r:   rA   r<   r=   rE   staticmethodrL    rF   rD   r%   r%   ,   sY    GNOGGFFFFMKK" E ErF   r%   zK(\d+)(F|I|B|W|UNSHARD|RESHARD|REDUCE_GRAD|SEND_F|RECV_F|SEND_B|RECV_B)(\d*)c                       e Zd ZU eed<   eed<   dZedz  ed<   dZed   dz  ed<   d Z	d Z
ed	efd
       Zedefd       Zy)_Actionstage_indexcomputation_typeNmicrobatch_index)rS   .sub_actionsc                 "    | j                         S N)__repr__rB   s    rD   rE   z_Action.__str__   s    }}rF   c                 P   | j                   C| j                   D cg c]  }t        |       }}ddj                  |       d| j                   S t	        | j
                        }|t	        | j                        z  }| j                  |t	        | j                        z  }|S c c}w )N(;))rW   reprjoinrU   strrT   rV   )rB   
sub_actionsub_action_reprsrepr_strs       rD   rZ   z_Action.__repr__   s    'CGCSCSTZZ 0TTsxx 012!D4I4I3JKK4++,HD1122H$$0C 5 566O  Us   B#returnc                 P    | j                   t        t        t        t        t
        fv S rY   )rU   r>   rA   r?   r@   r<   r[   s    rD   is_compute_opz_Action.is_compute_op   s)    $$)
 
 	
rF   action_stringc                    | j                         } | dk(  ry| j                  d      rd| v r| j                  d      }| d| }| |dz   d }g }|j                         rM|j                  d      D ]9  }t        j                  |j                               }|)|j                  |       ; t	        dt        j                  |      d|rt        |            S d      S t        j                  |       x}rW|j                         \  }}	}
t	        t        |      t        j                  |	      t        |
      rt        |
            S d      S | dk(  ryt        d	|  d
      )z
        Reverse of __repr__

        String should be formatted as [stage][action type][(microbatch)]
            e.g. `2F0`, `1UNSHARD`, `3SEND_F1`
         Nr]   r_   r   r^   )rT   rU   rV   rW   zInvalid action string: zD, should be formatted as [stage][action type][(microbatch)] e.g. 2F0)strip
startswithfindsplitrS   rL   appendr%   tuple_action_regexmatchgroupsintlenrI   )ri   bracket_endsub_partcomputation_type_partrW   sub_strrc   rt   rT   rU   rV   s              rD   rL   z_Action.from_str   s    &++-B ##C(SM-A',,S1K$+H %2a!%!
 K~~'~~c2 7G!(!1!1'--/!BJ!-#**:67 !1!:!:;P!Q!%2=E+.	  DH	  "''6656>Clln;K)+;K  ))*:;),-=)>$%  EI 
 b %m_4xy
 	
rF   )rM   rN   rO   rv   __annotations__r%   rV   rW   rr   rE   rZ   propertyboolrh   rP   rb   rL   rQ   rF   rD   rS   rS      sq    &&#'cDj'04K~&-4
 
t 
 
 3
 3
 3
rF   rS   rK   rf   c                     dt        |        S )NzPP:rb   rJ   s    rD   _get_profiler_function_namer      s    VrF   pipeline_ordererror_step_numberc                 ^   t        j                  |       } | D ]/  }t        t        | |               D ]  }| |   |   d| |   |<    1 t	        d | j                         D              }t        |      D cg c]4  }dt        |      j                  t        t        |dz
                    z   6 }}t        |       D cg c]  }| j                  |dg|z         }}t        t        j                  |ddi      }t        |       }	t        |	      D cg c]  }dt        |      z    }
}t        |g| D cg c]  }t	        d |D               c}dt        |d	         d
z   z  dj                  fdt        |
      D              z   }t        ||      D cg c]T  \  }}| ddj                  fdt        |      D              z   |!t!        |j#                         d         |k(  rdndz   V }}}|dz   dj                  |      z   dz   }|S c c}w c c}w c c}w c c}w c c}}w )z
    Formats the pipeline order in a timestep (row) x rank (column) grid of actions
    and returns the formatted string.

    If `error_step_number` is passed in, an additional label will be added to signify which step
    that it is erroring on.
    rk   c              3   2   K   | ]  }t        |        y wrY   )rw   ).0actionss     rD   	<genexpr>z)_format_pipeline_order.<locals>.<genexpr>   s     HWCLHs   zStep r   	fillvalueRank c              3   L   K   | ]  }|t        t        |            nd  y w)Nr   )rw   rb   )r   items     rD   r   z)_format_pipeline_order.<locals>.<genexpr>  s"     F$d.CD	NA5Fs   "$ r   r&   c              3   :   K   | ]  \  }}|d |    d  yw<rk   NrQ   )r   ilabelmax_lengthss      rD   r   z)_format_pipeline_order.<locals>.<genexpr>	  s0      <)1E5;q>" "
#$<s   z: c              3   L   K   | ]  \  }}t        |      d |    d  ywr   r   )r   r   r   r   s      rD   r   z)_format_pipeline_order.<locals>.<genexpr>  s/     R4c$i+a.!1123Rs   !$z <-- ERROR HERE
)copydeepcopyrangerw   maxvaluesrb   zfillsortedgetlist	itertoolszip_longestzipra   	enumeraterv   rp   )r   r   rankr   	num_stepsstep_labelskeyrank_actionstransposed_actions	num_ranksrank_labelscol
header_rowr   rowformatted_rowsformatted_tabler   s                    @rD   _format_pipeline_orderr      sc    ]]>2N  -s>$/01 	-Ad#A&.*,t$Q'	-- H0E0E0GHHIAFyAQ<=#a&,,s3y1}#5677K 
 >DN=S693y 01L 
 i33\PRPQN#I-29-=>7SV#>K> {8%78 	F#FFK
 KN+a/0388 <5>{5K< 4 J k+=>
 E3 '
((R9S>R
R	S !,EKKM!$%)::  	

N 
 !4'$))N*CCdJOI ?
s   89H HH:H$AH)c                      e Zd Z	 	 	 	 	 d dededej                  f   dz  deedf   dz  de	e
ef   dz  de	e
ef   ee   z  dz  d	efd
Zd Zd Zd Ze	 	 	 	 	 d dedz  dedz  dedz  dedz  def
d       Zedddddedz  fd       Zddddedz  fdZ	 	 	 	 d!dedz  dedz  dedz  dedz  deeef   f
dZd Z	 d"deedf   de	e
ef   dz  fdZdee   defdZy)#_PipelineScheduleNTn_microbatchesloss_fn.args_chunk_speckwargs_chunk_specoutput_merge_specscale_gradsc                     || _         || _        || _        || _        || _        || _        	 | j                  d u| _        g | _        t        j                  d| j                  j                         y )NzUsing %s)_n_microbatches_loss_fnr   _args_chunk_spec_kwargs_chunk_spec_output_merge_spec_has_backward_internal_lossesloggerinfo	__class__rM   )rB   r   r   r   r   r   r   s          rD   __init__z_PipelineSchedule.__init__  sq      . ' !0"3"3	 "]]$6 57J 7 78rF   c                     |j                   r>| j                  1| j                  |||         }| j                  j	                  |       y y y rY   )is_lastr   _compute_lossr   rq   )rB   stageoutput
target_mbsmb_indexlosss         rD   _maybe_compute_lossz%_PipelineSchedule._maybe_compute_loss@  sD    ==T]]6%%fj.BCD!!((. 7=rF   c                    d|cxk  xr t        | j                        k  nc }|j                  r| j                  |r| j                  |   S t        | j                        dk7  r|st	        d| d| j                         y )Nr   zLoss for microbatch z6 is not available. Available losses for microbatches: )rw   r   r   r   rI   )rB   r   r   valid_indexs       rD   _maybe_get_lossz!_PipelineSchedule._maybe_get_lossE  s    8@c$*?*?&@@==T]]6;((22&&'1,[&xj 166:6K6K5LN 
 rF   c                    t        |t              s|g}t        d |D              }|r}|{t        | j                        | j
                  k7  r.t        d| j
                   dt        | j                               |j                          |j                  | j                         | j                  j                          y)zB
        Update the losses to those in the internal state
        c              3   4   K   | ]  }|j                     y wrY   r   )r   r   s     rD   r   z3_PipelineSchedule._update_losses.<locals>.<genexpr>X  s     !DE%--!Ds   N
Expecting z losses but got )	
isinstancer   anyrw   r   r   rI   clearextend)rB   stageslossescontains_last_stages       rD   _update_lossesz _PipelineSchedule._update_lossesQ  s    
 &$'XF!!DV!DD 6#54(()T-A-AA" !5!5 66Fs4K`K`GaFbc 
 LLNMM$//0##%rF   arg_mbs	kwarg_mbsr   r   return_outputsc                     t         )aG  
        Run one iteration of the pipeline schedule with list of microbatches.
        Will go through all the microbatches according to the schedule
        implementation.

        Args:
            microbatches: list of microbatch args.
            return_outputs: whether to return the outputs from the last stage.
        NotImplementedError)rB   r   r   r   r   r   s         rD   _step_microbatchesz$_PipelineSchedule._step_microbatchesh  s
    $ "!rF   targetr   r   c                    t         )5  
        Run one iteration of the pipeline schedule with *whole-batch* input.
        Will chunk the input into microbatches automatically, and go through the
        microbatches according to the schedule implementation.

        args: positional arguments to the model (as in non-pipeline case).
        kwargs: keyword arguments to the model (as in non-pipeline case).
        target: target for the loss function.
        losses: a list to store the losses for each microbatch.
        return_outputs: whether to return the outputs from the last stage.
        r   )rB   r   r   r   argskwargss         rD   stepz_PipelineSchedule.step|  s
    ( "!rF   r   r   c                z    | j                   }	 d| _          | j                  |||d||| _         S # || _         w xY w)a  
        Run one iteration of the pipeline schedule with *whole-batch* input.
        Will chunk the input into microbatches automatically, and go through the
        microbatches, calling forward only.

        args: positional arguments to the model (as in non-pipeline case).
        kwargs: keyword arguments to the model (as in non-pipeline case).
        target: target values for the loss function.
        losses: a list to store the losses for each microbatch.
        Fr   )r   r   )rB   r   r   r   r   original_has_backwards         rD   evalz_PipelineSchedule.eval  sH     !% 2 2	7!&D499d6&KFK "7D!6Ds   1 	:rf   c                      dt         f fd}|
 ||d       ndg j                  z  }|
 ||d       ni g j                  z  }|	 ||d       |'t        |t              st	        dt        |             ||fS )z*
        Pre-process/check inputs
        namec           
          t        | t              st        | dt        |              t	        |       j
                  k7  r't        dj
                   d| dt	        |              y )Nz must be a list but got a r   r   z	 but got )r   r   	TypeErrortyperw   r   
ValueError)mbsr   rB   s     rD   check_type_and_lenz;_PipelineSchedule._check_inputs.<locals>.check_type_and_len  sm    c4(4&(B49+ NOO3x4///  !5!5 6avYs3xjQ  0rF   r   rQ   r   r   z losses must be a list but got a )rb   r   r   r   r   r   )rB   r   r   r   r   r   s   `     rD   _check_inputsz_PipelineSchedule._check_inputs  s    	# 	 w	2dT111G y+6t333I!z<8fd+"B4<. QRR	!!rF   c                 &    | j                  ||      S rY   )r   )rB   r   r   s      rD   r   z_PipelineSchedule._compute_loss  s    }}VV,,rF   r   r   c                     |s|r4t        ||| j                  | j                  | j                        \  }}||fS dg| j                  z  i g| j                  z  fS )zj
        Splits a full-batch input into chunks (i.e. microbatches) and returns
        the chunks
        rQ   )r   r   r   r   )rB   r   r   
args_splitkwargs_splits        rD   _split_inputsz_PipelineSchedule._split_inputs  sm     6'D$$%%''($J |++ 4$...t7K7K0KKKrF   output_chunksc                 .    t        || j                        S )z
        Merge output chunks back to a batch state.
        If output_merge_spec is None, the utility will merge output chunks by dimension 0 (batch dim).
        )r   r   )rB   r   s     rD   _merge_outputsz _PipelineSchedule._merge_outputs  s    
 ##
 	
rF   NNNNTNNNNrY   )rM   rN   rO   rv   r   torchTensorrr   r   dictrb   r
   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   rQ   rF   rD   r   r     s    7;>B?C@D  9 9 #u||+,t3 9 34t;	 9
  _ 45< 9  S>E#J6= 9  9D/

&.   $!%"&"#"" $;" 4K	"
 t" " "&  "" t	" "* "&T 7td{ 7,  $!%"&"$"$" $;$" 4K	$"
 t$" 
tTz	$"L- )-LCHoL S#X%L.
DI 
# 
rF   r   p2p_opsdescc                     t        |       dk(  rg S |r| dnd}t        j                  d||        t        j                  |       S )zt
    Simple wrapper over batch_isend_irecv from torch.distributed, which just adds a descriptive logger on top.
    r   z, rk   zbatch_p2p %s%s)rw   r   debugdistbatch_isend_irecv)r  r  desc_strs      rD   
_batch_p2pr    sF     7|q	"$r{H
LL!8W5!!'**rF   c                     t        t              }i }t        |       dk(  r|S | D ]   }||j                     j	                  |       " t        |j                               D ]  \  }}t        ||      ||<    |S )z
    Sorts the list of P2P ops by the peer rank, and then calls
    batch_isend_irecv. Return a dictionary of works by peer rank. This function
    helps us avoid hangs in case of skip connections.
    r   r  )r   r   rw   peerrq   r   itemsr  )r  r  ops_by_peerwork_by_peeropr  opss          rD   _sorted_batch_p2pr    s     0;4/@K/1L
7|q  (BGG##B'( K--/0 8	c'$7T8 rF   workc                 2    | D ]  }|j                           y)zX
    Waits for a list of dist.Work (typically from _batch_p2p / _sorted_batch_p2p).
    N)wait)r  ws     rD   _wait_batch_p2pr    s      	rF   c                        e Zd ZdZ	 	 	 	 	 ddedededz  deedf   dz  d	e	e
ef   dz  d
e	e
ef   ee   z  dz  def fdZd Zdddddedz  defdZde	eeedz     f   dz  fdZ xZS )r   a  
    Base class for single-stage schedules.
    Implements the `step` method.
    Derived classes should implement `_step_microbatches`.

    Gradients are scaled by num_microbatches depending on the `scale_grads` argument, defaulting to True.  This setting
    should match the configuration of your loss_fn, which may either average losses (scale_grads=True)
    or sum losses (scale_grads=False).
    NTr   r   r   r   .r   r   r   c                     t         |   ||||||       || _        |j                  | _        d| _        d| _        || j                  k  rt        d| d| j                   d      | j                         | _	        y )Nr   r   r   r   r   r   FzNumber of microbatches (z9) must be greater than or equal to the number of stages (z).)
superr   _stage
num_stages_num_stages_stage_forward_initialized_stage_backward_initializedr   _get_pipeline_orderr   )	rB   r   r   r   r   r   r   r   r   s	           rD   r   zPipelineScheduleSingle.__init__*  s     	)+//# 	 	
  ++*/'+0(D,,,*>*: ;##'#3#3"4B8  $$& 	rF   c                    | j                   smg }|j                  | j                  j                                t	        t        |             | j                  j                  | j                  ||       d| _         | j                  r:| j                  s-| j                  j                  | j                         d| _	        y y y NT)r"  r   r  _get_init_p2p_neighbors_opsr  r  _prepare_forward_infrar   r   r#  _prepare_backward_infra)rB   r   r   all_opss       rD   _initialize_stagez(PipelineScheduleSingle._initialize_stageM  s    .. )+GNN4;;BBDEJw/0KK..t/C/CT6R.2D+d&F&FKK//0D0DE/3D, 'GrF   r   r   r   c                   | j                   rt        j                         st        d      | j                   | j                  _        | j                  j                          | j                  ||      \  }}|*t        t        j                  || j                              }nd}| j                  |||||       | j                  j                  r'|r%| j                  | j                  j                        S yr   zstep() requires gradients to be enabled for backward computation; it should not be used under torch.no_grad() context. Please call eval() instead.N)r   r  is_grad_enabledrI   r  has_backwardclear_runtime_statesr   r   tensor_splitr   r   r   r   r   )	rB   r   r   r   r   r   r   r   targets_splits	            rD   r   zPipelineScheduleSingle.step]  s    & e&;&;&=.  $(#5#5  	((* $(#5#5dF#C 
L  !3!3FD<P<P!QRM M 	mV^	

 ;;>&&t{{'@'@AArF   rf   c                      y)a  
        Returns the pipeline execution order as a schedule IR.

        The returned IR is a dictionary mapping rank IDs to lists of actions.
        Each action is either an _Action object representing computation to perform,
        or None representing a deliberate idle step.

        The None values are used to represent pipeline bubbles where a rank
        must wait for dependencies from other ranks before proceeding. However
        during execution, with  the _PipelineScheduleRuntime, these Nones are
        skipped since the relevant communication (send/recv) will be scheduled and waited on.

        Returns:
            A dictionary mapping rank -> list of actions
        NrQ   r[   s    rD   r$  z*PipelineScheduleSingle._get_pipeline_order  s      rF   r   )rM   rN   rO   __doc__r   rv   r   rr   r   r  rb   r
   r~   r   r+  r   r   rS   r$  __classcell__r   s   @rD   r   r     s     $(>B?C@D !
!!
 !
 D	!

 34t;!
  _ 45<!
  S>E#J6=!
 !
F4& "#2 t	2
 2hT#tGdN/C*C%Dt%K rF   r   c                   P    e Zd ZdZ	 	 	 	 	 d	dedz  dedz  dedz  dedz  def
dZy)
_ScheduleForwardOnlyzo
    The forward-only schedule.
    Will go through all the microbatches and perform only the forward pass
    Nr   r   r   r   r   c                    ||t        d      | j                  ||||      \  }}| j                  |d   |d          g }t        | j                        D ]  }t        d|       5  | j                  j                  |      }t        |d      }	|	j                         D ]  }
t        |
        | j                  j                  |||   ||          | j                  j                  |      }t        |d      }	|j                  |	j                                ddd       t        j                  d| j                  j                   |        |D ]  }
t        |
        y# 1 sw Y   IxY w)	z<
        Run one iteration of the pipeline schedule
        Nz7Forward-only schedule does not support loss computationr   Forward fwd_recvr  fwd_send[%s] Forwarded microbatch %s)rI   r   r+  r   r   r   r  get_fwd_recv_opsr  r   r  forward_one_chunkget_fwd_send_opsr   r   r  rT   )rB   r   r   r   r   r   fwd_sends_to_waitr   r  worksr  s              rD   r   z'_ScheduleForwardOnly._step_microbatches  sf    !V%7I  "//JPVWwqz9Q<8 46 t++, 	UA 8A30 
9kk2215)#J?!LLN *D#D)* --aYq\Jkk2215)#J?!((8
9 LL79P9PRST	U$ & 	"DD!	"#
9 
9s   'B3E""E+	r   )rM   rN   rO   r4  r   r~   r   rQ   rF   rD   r8  r8    s]      $!%"&"#*"*" $;*" 4K	*"
 t*" *"rF   r8  c                   x    e Zd ZdZ	 	 	 	 	 ddedz  dedz  dedz  dedz  def
dZd	eeee	dz     f   dz  fd
Z
y)r   z^
    The GPipe schedule.
    Will go through all the microbatches in a fill-drain manner.
    Nr   r   r   r   r   c           	      Z   | j                  ||||      \  }}| j                  |d   |d          g }t        | j                        D ]  }t	        d|       5  | j
                  j                  |      }t        |d      }	|	j                         D ]  }
t        |
        | j
                  j                  |||   ||   |      }| j
                  j                  |      }t        |d      }	|j                  |	j                                ddd       t        j                  d| j
                  j                  |       | j!                  | j
                  ||        |D ]  }
t        |
        g }t        | j                        D ]  }t	        d	|       5  | j
                  j#                  |      }t        |d
      }	|	j                         D ]  }
t        |
        | j%                  | j
                  |      }| j
                  j'                  |||| j                  dz
  k(         | j
                  j)                  |      }t        |d      }	|j                  |	j                                ddd       t        j                  d| j
                  j                  |       ! |D ]  }
t        |
        | j+                  | j
                  |       | j
                  j-                  | j.                  r| j                         yd       y# 1 sw Y   xY w# 1 sw Y   xY w)a6  
        Run one iteration of the pipeline schedule with list of microbatches.
        Will go through all the microbatches according to the GPipe schedule.

        Args:
            microbatches: list of microbatch args.
            return_outputs: whether to return the outputs from the last stage.
        r   r:  r;  r  save_forward_outputr<  Nr=  z	Backward bwd_recvr   r   last_backwardbwd_sendz[%s] Backwarded microbatch %s)r   r+  r   r   r   r  r>  r  r   r  r?  r@  r   r   r  rT   r   get_bwd_recv_opsr   backward_one_chunkget_bwd_send_opsr   perform_reduce_gradr   )rB   r   r   r   r   r   rA  r   r  rB  r  r   bwd_sends_to_waitr   s                 rD   r   z ScheduleGPipe._step_microbatches  s     "//JPVWwqz9Q<8 46 t++, 	IA 8A30 9kk2215)#J?!LLN *D#D)* 66wqz9Q<^ 7  kk2215)#J?!((89 LL79P9PRST$$T[[&*aH#	I, & 	"DD!	"
 46t++, 	VA 9QC1 9kk2215)#J?!LLN *D#D)* ++DKK;.."#t';';a'?"? /  kk2215)#J?!((89" LL8$++:Q:QSTU%	V* & 	"DD!	" 	DKK0''@P@P(<(<XVWXm9 989 9s   B5LCL!L	!L*	rf   c           	         i }| j                   }t        |      D ]  }g }|}|j                  dg|z         t        | j                        D ],  }|j	                  t        |t        j                  |             . d|dz
  |z
  z  }|j                  dg|z         t        | j                        D ],  }|j	                  t        |t        j                  |             . t        || j                        ||<    |S )z
        Returns the pipeline order for GPipe schedule.

        See base method in PipelineScheduleSingle for details on the schedule IR format.
        Nr'   r   )
r!  r   r   r   rq   rS   r%   r>   rA   _add_reduce_grad)rB   r   pp_group_sizer   r   warmup_delaymb_idxbackward_delays           rD   r$  z!ScheduleGPipe._get_pipeline_order-  s    ((-( 	SD,.G  LNND6L01   4 45 Pwt-=-E-EvNOP -!"3d":;NNND6N23   4 45 Vwt-=-K-KVTUV $4GT=Q=Q#RN4 '	S* rF   r   rM   rN   rO   r4  r   r~   r   r  rv   rS   r$  rQ   rF   rD   r   r     s      $!%"&"#NYNY $;NY 4K	NY
 tNY NY`T#tGdN/C*C%Dt%K rF   r   c                   x    e Zd ZdZ	 	 	 	 	 ddedz  dedz  dedz  dedz  def
dZd	eeee	dz     f   dz  fd
Z
y)r   zo
    The 1F1B schedule.
    Will perform one forward and one backward on the microbatches in steady state.
    Nr   r   r   r   r   c                 t   | j                  ||||      \  }}| j                  |d   |d          t        | j                  | j                  | j
                  j                  z
        }d}d}g }	g }
t        |      D ]  }| j
                  j                  |      }t        t        |d             | j
                  j                  |||   ||   |      }t        |	       | j
                  j                  |      }
||dz
  k7  rt        |
d      }	| j                  | j
                  |||       |dz  } 	 | j
                  j                  |      }t        t        |
|z   d             | j                  | j
                  |      }| j
                  j!                  |||| j                  dz
  k(         | j
                  j#                  |      }|dz  }|| j                  k(  rn| j
                  j                  |      }t        t        ||z   d	             | j
                  j                  |||   ||   |      }| j                  | j
                  |||       | j
                  j                  |      }
|dz  }Gt        |d
      }	|| j                  k  r| j
                  j                  |      }t        t        |d             | j                  | j
                  |      }| j
                  j!                  |||| j                  dz
  k(         t        |	       | j
                  j#                  |      }t        |d
      }	|dz  }|| j                  k  rt        |	       | j%                  | j
                  |       | j
                  j'                  | j(                  r| j                         yd       y)a5  
        Run one iteration of the pipeline schedule with list of microbatches.
        Will go through all the microbatches according to the 1F1B schedule.

        Args:
            microbatches: list of microbatch args.
            return_outputs: whether to return the outputs from the last stage.
        r   r;  r  rE  r   r<  fwd_send_bwd_recvrH  bwd_send_fwd_recvrJ  rG  N)r   r+  minr   r!  r  rT   r   r>  r  r  r?  r@  r   rK  r   rL  rM  r   rN  r   )rB   r   r   r   r   r   warmup_chunksfwd_mb_indexbwd_mb_index	send_work	fwd_sends_	fwd_recvsr   	bwd_recvsr   	bwd_sendss                    rD   r   zSchedule1F1B._step_microbatchesT  s     "//JPVWwqz9Q<8   t{{666
  &(		}% 	A44\BIJyzBC [[22%,'$2	 3 F I& 44\BI}q00&yzB	
 $$T[[&*lSAL;	D 44\BI Jy9'<CVWX ''\BDKK***d.B.BQ.FF +  44\BIALt333 44\BI Jy9'<CVWX [[22%,'$2	 3 F $$T[[&*lS 44\BIALU Z yz:	 T11144\BIJyzBC ''\BDKK***d.B.BQ.FF +  I& 44\BI"9:>IAL' T111, 		" 	DKK0''@P@P(<(<XVWXrF   rf   c           	         i }| j                   }t        |      D ]  }g }|j                  dg|z         |dz
  |z
  }d}t        |      D ].  }|j                  t	        |t
        j                  |             |}0 t        dd|dz
  |z
  z        }|j                  dg|z         d}	| j                  |z
  }
|
dkD  ri|dz  }|j                  t	        |t
        j                  |             |
dz  }
|j                  t	        |t
        j                  |	             |	dz  }	|
dkD  ri| j                  |	z
  }|dkD  r||z
  dkD  rK|j                  d       |dkD  ri|j                  t	        |t
        j                  |	             |	dz  }	|dz  }n4|j                  t	        |t
        j                  |	             |	dz  }	|dz  }|dkD  rt        || j                        ||<    |S )z
        Returns the pipeline order for 1F1B schedule.

        See base method in PipelineScheduleSingle for details on the schedule IR format.
        Nr   r   r&   )r!  r   r   rq   rS   r%   r>   r   r   rA   rQ  )rB   r   rR  r   r   num_forward
forward_mbr   wait_for_1f1bbackward_mbremaining_forwardremaining_backwards               rD   r$  z Schedule1F1B._get_pipeline_order  s    ((-( 8	SD,.G NND6D=) )1,4KJ;' wt-=-E-EqIJ

  1(9D(@#ABMNND6M12 K $ 4 4{ B#a'a
wt-=-E-EzRS!Q&! D"2"@"@+N q  $a' "&!5!5!C$q( "D(A-NN4()A-#D*:*H*H+V $q(*a/* NN&6&D&DkR  1$K&!+&% %q(( $4GT=Q=Q#RN4 q8	Sr rF   r   rV  rQ   rF   rD   r   r   N  s      $!%"&"#NYNY $;NY 4K	NY
 tNY NY`BT#tGdN/C*C%Dt%K BrF   r   action_typec                     | t         t        fv S rY   )r4   r;   )rl  s    rD   _requires_reduce_gradrn  )  s    1a&  rF   r   r   c           	      F   g }t        t              fd}| D ]  }||j                  |       g }|j                  t        k(  r'|j
                  |j
                  D ]  } |||        n	 |||       |D ]"  }|j                  t        |t        d             $  |S )z
    REDUCE_GRAD refers to joint across minibatches grad reduction.
    reduce_grad frees memory and we want to schedule it just after the last "backward"-like stage.
    c                     t        | j                        r4| j                  }|xx   dz  cc<   |   k(  r|j                  |       y y y Nr   )rn  rU   rT   rq   )ato_schedulerT   cntr   s      rD   _leaf_actionz&_add_reduce_grad.<locals>._leaf_action7  sO     !3!34--K!;>1"";/ 2 5rF   N)r   rv   rq   rU   r<   rW   rS   r=   )	r   r   actions_with_reduce_gradru  rr  schedule_reduce_grad_stage_idxsrc   	stage_idxrt  s	    `      @rD   rQ  rQ  -  s     68%c*C0  S9 ''*57',1Jmm J
Z)HIJ ;<8 	SI$++GI{D,QR	SS $#rF   compute_actionsmax_active_stagesc                   
 dt         dt        t        dz     dt        t            fd}t               
g dt         f
fd}dt         f
fd}t	        |       D ]s  \  }}|	 ||| |d       t        t        
fd	            }t        t        fd

            }|D ]
  }	 ||	        |D ]
  }	 ||	        j                  |       u t        
      D ]
  }	 ||	        S )a]  Given a basic schedule involving only compute actions (F,B,W,OVERLAP_F_B), add UNSHARD/RESHARD actions for FSDP.

    UNSHARD refers to fetching the full contents of an FSDP-sharded layer, requiring an all-gather operation.
    RESHARD does the opposite, releasing memory (but doing no communication)

    We abandon the "timestep lock"  during lowering

    max_active_stages controls how many prefetches we allow. It should be measured in mb and tuneable but in practice
    3 stages is probably the thing we want?
    (to account for having one f and one b active, and something else prefetching?)
    countnext_actionsNrf   c                    t               }g }|D ]  }||j                  t        k(  rt|j                  h|j                  D ]G  }|j                  |vs|j                  |j                         |j                  |j                         I t        |      | k\  s |S |j                  |vs|j                  |j                         |j                  |j                         t        |      | k(  s |S  |S )zdRemove duplicates (same stage, different microbatch), find next 'count' stages that will do compute.)setrU   r<   rW   rT   addrq   rw   )r|  r}  seenretrr  rc   s         rD   next_stage_indicesz0_add_unshard_reshard.<locals>.next_stage_indices^  s     	"A}%%49R&'mm ?
%11= HHZ%;%;<JJz'='=>? 3x5( 
 }}D0/

1==1s8u,!
#	"" 
rF   rT   c                 h    j                  |        j                  t        | t        d              y rY   )r  rq   rS   r5   rT   active_stagesfsdp_aware_actionss    rD   _unshardz&_add_unshard_reshard.<locals>._unshardy  s(    +&!!'+w"EFrF   c                 h    j                  |        j                  t        | t        d              y rY   )removerq   rS   r6   r  s    rD   _reshardz&_add_unshard_reshard.<locals>._reshard}  s(    [)!!'+w"EFrF   c                     | vS rY   rQ   )sr  s    rD   <lambda>z&_add_unshard_reshard.<locals>.<lambda>  s    a}&< rF   c                     | vS rY   rQ   )r  next_ns    rD   r  z&_add_unshard_reshard.<locals>.<lambda>  s    avo rF   )rv   r   rS   r  r   filterrq   )ry  rz  r  r  r  r   rK   fetchevictr   r  r  r  s             @@@rD   _add_unshard_reshardr  N  s    # T'D.5I dSVi 0 "eM(*Gc GGc G / *	6> $$5qr7JKV<fEFV5}EF  	EUO	 	EUO	!!&)/*4 m$  rF   c                 0   g }| r| j                  d      }|t        |       r'| d   "| j                  d       t        |       r| d   "t        |       dkD  r| d   nd}|j                  t        k(  r||j                  t        k(  rx|j
                  |j
                  k(  r_|j                  |j                  k(  rF|j                  t        |j
                  t        |j                               | j                  d       n|j                  |       | r|S )a9  Given a basic schedule involving only compute actions (F,I,W), merge adjacent I and W ops into B ops.
    (note: I = BACKWARD_INPUT, W = BACKWARD_WEIGHT, B = FULL_BACKWARD)

    B refers to running the whole backward (not separating grad_input and grad_weight), which can be more efficient
    in some cases.
    r   N)
poprw   rU   r?   r@   rT   rV   rq   rS   rA   )ry  merged_actionsrK   next_actions       rD   	_merge_bwr    s    N
 $$Q'> /"q'9'A" /"q'9'A -0,@1,Doa($ ##~5',,?""k&=&=='';+G+GG!!**M6;R;RS "!!&)1 2 rF   stage_to_rankr   c           	         | D ci c]  }|g  }}| D ci c]  }|t                c}dt        dt        ffddt        dt        t        t        f   ffd}dt        dz  dt         t           dt        ffd| rgd}t	        |       D ]I  t        |          d	kD  sJ d
dt        |                 |    d	   }||j                  |j                  }n|f}t        fd|D              sk||   j                  |       |D ]  }	   j                  |	        |	      s  ||	      \  }
}|   j                  |
          j                  |
       | |j                           j                  |        |j                           j                  |        |    j                  d	       t        |          d	k(  r| = d}L |sJ d       | rg|S c c}w c c}w )z
    Transforms a compute-only schedule into a complete schedule with communication actions.

    For actions with sub-actions (OVERLAP_F_B) we ensure that all the subactions have been
    computed and the communication is ready
    rK   rf   c                 F   | j                   t        k(  r<| j                  dz
  k7  xr(  | j                  dz          | j                        k7  S | j                   t        t        fv r9| j                  dk7  xr(  | j                  dz
         | j                        k7  S y)Nr   r   F)rU   r2   rT   r?   rA   )rK   r   r  s    rD   
_has_commsz"_add_send_recv.<locals>._has_comms  s    ""a'%%a7 3M""Q&=v112=3 3 $$(GG%%* 3}""Q&0v11203 3 rF   c                     |       s
J |  d       | j                   }| j                  }| j                  }t        ||t        k(  rt
        nt        |      }|t        k(  r|dz   n|dz
  }t        ||t        k(  rt        nt        |      }||fS )Nz is not a valid comm actionr   )	rT   rU   rV   rS   r2   r7   r9   r8   r:   )rK   rx  ctyperT  sendrecv_stage_idxrecvr  s          rD   
_get_commsz"_add_send_recv.<locals>._get_comms  s    &!IfX-H#II!&&	''((yEQJ&FFK*/1*Q)a-~!vPTzrF   Nprev_actionsc                 J   | y| j                   t        k(  rc| j                  dk7  rTt        | j                  t        | j
                        |v ryt        | j                  dz
  t        | j
                        |v ryy| j                   t        t        fv r| j                  dz
  k7  rt        | j                  t        | j
                        |v ryt        | j                  dz   t        | j
                        |v ryt        | j                  dz   t        | j
                        |v ryyy)a  We don't put our own recv ops in the schedule, we let a sender on another rank put our recv ops in place.
        This helps ensure a sane (non-hanging) ordering of sends and recvs.
        But it also means we might not be able to schedule our next compute action yet.
        Tr   r   F)	rU   r2   rT   rS   r8   rV   r?   rA   r:   )rK   r  r   s     rD   _ready_to_schedulez*_add_send_recv.<locals>._ready_to_schedule  s    
 >$$)f.@.@A.E**FF4K4KL  **Q.63J3JK  ##'FF""j1n4 **FF4K4KL  **Q.@W@WX  **Q.v?V?VW  rF   Fr   rank=z, len(compute_actions[rank])=c              3   6   K   | ]  } |           y wrY   rQ   )r   rr  r  r  r   s     rD   r   z!_add_send_recv.<locals>.<genexpr>"  s     VQ)!\$-?@Vs   Tz6Malformed compute schedule, can't schedule sends/recvs)r  rS   r~   rr   r   rw   rW   allrq   r  rT   r  )ry  r  r   r   comm_actionsr  progressrK   all_actionsrr  r  r  r  r  r  s    ```        @@@rD   _add_send_recvr    s;    DS-S4dBh-SL-SET,UTT35[,UL	7 	t 	7 uWg-='> (7T> (W (RV (T ?+ 	Dt,-1 4'7C 56891 %T*1-F!f&8&8&D$00%iV+VV !T"))&1$ 	PA &**1-!!}%/]
d %T*11$7$T*..t4$]43C3C%DELLTR$]43C3C%DEII$O	P D!%%a(?4()Q.#D)H?	@ QQQxG H M .T,Us
   
G0G5rR  num_microbatchesc                     t               |k(  sJ d| dt                       t        |      D ]  }| v rJ d|         t        |      D ci c]=  }|t        t               t        t               t
        t               t        t               i? c}i dt        dt        dt        f fd} D ]n  }t         |         D ][  \  }}|	t        |t              sJ d| d	| d
| d       |j                  |j                  D ]  }	 ||	||        R ||||       ] p D ]  }
t        |
   t                 }t        |
   t                 }t        |
   t
                 }t        |
   t                 }||k(  sJ d| dt         d|
 d|        ||k(  sJ d|
 d| d|        |||z   dz  z   |k(  rJ d|
 d| d| d| d| 
        S c c}w )Nz2Schedule has incorrect number of ranks - expected z	, actual z%Schedule is missing actions for rank rK   r   r   c                 2   | j                   }| j                  }| j                  }|t        k(  r|   t           j	                  |       n"|t
        k(  rX||   t           vr.d| d| d| d| d	}t        
|      }| d| }t        |      |   t
           j	                  |       n|t        k(  rX||   t           vr.d| d| d| d| d	}t        
|      }| d| }t        |      |   t           j	                  |       n`|t        k(  rW||   t           vr.d| d| d	| d| d
	}t        
|      }| d| }t        |      |   t           j	                  |       |vr||<   y|   }	||	k(  sJ d| d| d| d| d|	 
       y)zPProcess a single action and update stage_actions and stage_index_to_rank_mappingr   , step z": Running Full Backward for stage z, microbatch z without first running Forwardr   z

Full pipeline schedule:
z#: Running Backward Input for stage z$: Running Backward Weight for stage z% without first running Backward Inputz: Stage z is assigned to both rank z
 and rank N)
rT   rU   rV   r2   r  r;   r   AssertionErrorr3   r4   )rK   r   r   s_idr  mb_id	error_msgformatted_schedulefull_error_msgexisting_rankr   stage_actionsstage_index_to_rank_mappings             rD   _process_actionz+_validate_schedule.<locals>._process_actionT  sc   !!''''A:$"&&u-aZM$/22D6.PQUPV W""'(FH  &<t&" !k!>?Q>RS  %^44$"&&u-aZM$/22D6.QRVQW X""'(FH  &<t&" !k!>?Q>RS  %^44$"&&u-aZM$/22D6.RSWRX Y""'(MO  &<t&" !k!>?Q>RS  %^44$"&&u-2204'-7=M=( vWTF(4&8RSWRXXbcpbqr(rF   r   r  z: Got an invalid action: z, expected instance of _ActionzGot r   z microbatches for stage z, expected z(Invalid backward microbatches for stage z8: I and W must have equal counts,             but got I=z, W=r&   z: expected z( total backwards,             but got B=z, I=)rw   r   r2   r  r;   r3   r4   rS   rv   r   r   rW   )r   rR  r   r  r   stage_idr  r   rK   rc   r  f_mbb_mbi_mbw_mbr  r  s   `              @@rD   _validate_scheduler  ;  s    w<=( 
<]O9UXY`UaTbc( m$ OwN"Gv NNO j)=  	susususu	
 	
=M #%9 9s 9# 9v  4%gdm4 	4LD&~fg. vWTF*CF8Kij.
 !!-"("4"4 <J#Jd;<  d3	44"  
=&q)*=&q)*=&q)*=&q)*'' 	
4&!4TF+FVEWX	
' t| 	
6tf =fD(	
|
 td{q((,<< 	
6tfKHXGY ZfDd4&2	
<
& '&U=s   AGc                   P    e Zd ZdZ	 	 	 	 	 	 	 ddee   dededz  dee	df   dz  d	e
ee	f   dz  d
e
eef   ee   z  dz  dedz  dedef fdZdeedf   fdZde
eeedz     f   ddfdZd ZddZdddddedz  defdZ	 	 	 	 	 d dedz  dedz  dedz  dedz  def
dZ xZS )!r   aX  
    Base class for multi-stage schedules.
    Implements the `step` method.

    Gradients are scaled by num_microbatches depending on the `scale_grads` argument, defaulting to True.  This setting
    should match the configuration of your loss_fn, which may either average losses (scale_grads=True)
    or sum losses (scale_grads=False).
    NTr   r   r   r   .r   r   use_full_backwardr   backward_requires_autogradc
                    t         |   ||||||       || _        |d   j                  | _        |d   j
                  | _        |d   j                  | _        t        | j                  | j                        | _
        | j                  D ]  }
| j                  |
_
         d| _        d| _        | j                  d ufd| _        i | _        |	| _        |t"        j%                  d       y y )Nr  r   Fc                 $    | j                   xr S rY   r   )r   has_losss    rD   r  z0PipelineScheduleMulti.__init__.<locals>.<lambda>  s    %--2LH rF   zDeprecation warning: 'use_full_backward' is no longer supported. Simply stop passing it, and everything should still work fine.)r  r   _stagesr   r!  
group_sizerR  
group_rankr   r   stage_index_to_group_rank_stages_forward_initialized_stages_backward_initializedr   _should_compute_lossr   _backward_requires_autogradr   warning)rB   r   r   r   r   r   r   r  r   r  r   r  r   s              @rD   r   zPipelineScheduleMulti.__init__  s    	)+//# 	 	
 !!9//#AY111I((	)G 0 0*
& \\ 	ME.2.L.LE+	M ,1(,1) d2$L! @B
 ,F((NNQ )rF   r   c                    | j                   sg }| j                  D ]!  }|j                  |j                                # t	        t        |             t               }| j                  D ]I  }|j                  r|j                  | j                  ||      }-|j                  | j                  ||      }K d| _         | j                  rA| j                  s4| j                  D ]  }|j                  | j                          d| _        y y y r&  )r  r  r   r'  r  r  rr   is_firstr(  r   r   r  r)  )rB   r   r   r*  r   next_stage_argss         rD   _initialize_stagesz(PipelineScheduleMulti._initialize_stages  s    // )+G Du@@BCDJw/0 05wO >>&+&B&B,,dF'O ',&B&B,,ov'O 04D,d&G&G D--d.B.BCD04D- 'HrF   r   rf   c                     t        || j                  | j                  | j                        | _        | j
                  D ]  }| j                  |_         y)z]
        Allocates the stage index to rank mapping which is needed for communication
        N)r  rR  r!  r   r  r  )rB   r   r   s      rD   _validate_and_set_stage_mappingz5PipelineScheduleMulti._validate_and_set_stage_mapping  sT     *<  	*
& \\ 	ME.2.L.LE+	MrF   c                     t        |dd      5 }t        j                  |      }| j                  D ]   }|j	                  | j                  |          " 	 ddd       y# 1 sw Y   yxY w)QDump a CSV representation of the schedule into a file with the provided filename.r  rk   newlineN)opencsvwriterr   writerow)rB   filenamecsvfiler  r   s        rD   	_dump_csvzPipelineScheduleMulti._dump_csv"  s_    (C, 	;ZZ(F++ ; 3 3D 9:;	; 	; 	;s   AAA'c                 H   |dk(  sJ t        |d      5 }t        j                  |      }t        |      D ]5  \  }}|D cg c]  }t        j                  |       c}| j                  |<   7 	 ddd       | j                  | j                         yc c}w # 1 sw Y   *xY w)zLoad a CSV representation of the schedule from a file with the provided filename.
        This API will most likely get renamed/refactored so is marked as internal for now.

        format must be "compute_only" for PipelineScheduleMulti.
        compute_onlyrk   r  N)r  r  readerr   rS   rL   r   r  )rB   r  formatr  r  r   r   r  s           rD   	_load_csvzPipelineScheduleMulti._load_csv)  s     '''(B' 	O7ZZ(F&v. O	cJM,NQW-=-=a-@,N##D)O	O 	,,T-@-@A	 -O	O 	Os   +B BBBB!r   r   r   c                "   | j                   r+| j                  rt        j                         st	        d      | j
                  D ]  }| j                   |_         | j
                  D ]  }|j                           | j                  ||      \  }}|*t        t        j                  || j                              }	nd}	| j                  |||	||       | j
                  D ].  }|j                  s|s| j                  |j                        c S  yr-  )r   r  r  r.  rI   r  r/  r0  r   r   r1  r   r   r   r   r   )
rB   r   r   r   r   r   r   r   r   r2  s
             rD   r   zPipelineScheduleMulti.step9  s   ( 00))+.  \\ 	4E!%!3!3E	4 \\ 	)E&&(	) $(#5#5dF#C 
L  !3!3FD<P<P!QRM M 	mV^	

 \\ 	@E}}**5+>+>??	@ rF   r   r   r   c           
      
   | j                  ||||      \  }}| j                  |d   |d          | j                  D ci c]  }|j                  | }}t	               }t	               }	|D ]\  }
|
dkD  r!|j                  | j                  |
dz
            |
| j                  dz
  k  s<|	j                  | j                  |
dz             ^ t               }t        | j                  | j                           D ]  \  }}	 g }|(|j                  }|j                  }|j                  }
|J d       |t        j                  k(  rV||
   }|j!                  |||   ||   |      }| j#                  ||||       |j%                  |j'                  |             n|t        j(                  k(  r||
   }| j+                  ||      }||
xx   dz  cc<   ||
   | j,                  k(  }| j.                  r| j,                  nd}|j1                  ||d|       |r|j/                  |       |j%                  |j3                  |             n|t        j4                  k(  rM||
   }| j+                  ||      }|j1                  ||dd       |j%                  |j3                  |             n|t        j6                  k(  re||
   }||
xx   dz  cc<   ||
   | j,                  k(  }| j.                  r| j,                  nd}|j9                  ||	       |r |j/                  |       nt;        d
|       |D ]  }| j                  |   }d}|t=        |      k  r||   }|*|j                  }|j                  }|j                  }
|J d       |t        j                  k(  r1|
dz   |v sr||
dz      }|j%                  |j?                  |             |t(        t4        t6        fv rt;        d
|        |	D ]  }| j                  |   }d}|t=        |      k  r||   }|*|j                  }|j                  }|j                  }
|J d       |t        t6        fv rf|t4        t(        fv r1|
dz
  |v s|||
dz
     }|j%                  |jA                  |             t;        d
|        tC        tE        |              | jU                  | j                  |       yc c}w # tF        $ rs}tH        jK                  d| j                  | jL                  jN                  tQ        |      ||       tH        jK                  dtS        | j                  |             |d}~ww xY w)
        Operate on the microbatches for looped schedules (multiple stages on each rank).

        TODO: Does not use sorted_batch_isend_irecv(). As a result, this schedule does
        not support models with skip connections.
        r   r   NzCAll currently supported action types require valid microbatch_indexrE  Tr   full_backwardrI  FrI  zUnknown computation type zi[Rank %s] pipeline schedule %s caught the following exception '%s' at time_step %s when running action %sz%sr  )+r   r  r  rT   r  r  r  r!  r   r   r   r   rU   rV   r%   r>   r?  r   r   r@  rA   r   r   r   rL  rM  r?   r@   backward_weight_one_chunkr   rw   r>  rK  r  r  	Exceptionr   errorr   rM   rb   r   r   )rB   r   r   r   r   r   r   stage_index_to_stageall_prev_ranksall_next_ranksrT   backward_counter	time_steprK   r  rU   r   r   r   rI  grad_scale_factor	prev_rankprev_rank_opsprev_rank_action	next_ranknext_rank_opsnext_rank_actiones                               rD   r   z(PipelineScheduleMulti._step_microbatchest  s    "//JPVW
IaL9
 37,,?
).Eu$?
 ?
 $'5#&5/ 	TKQ""4#A#A+PQ/#RST--11""4#A#A+PQ/#RS	T *1!*4+>+>tyy+I!J W	IvV(*%'-'>'>$%66H"("4"4K#/ ]/ (+;+C+CC 4[ A!&!8!8$#H-%h/0>	 "9 " 00
HU

5#9#9(#CD)-=-K-KK 4[ A#33E8D(5:5,[9T=Q=QQ & 594D4DD00! * 00$!%*.*7	 1  )!--.?@

5#9#9(#CD)-=-L-LL 4[ A#33E8D00$!%*/*/	 1  

5#9#9(#CD)-=-M-MM 4[ A(5:5,[9T=Q=QQ & 594D4DD00! * 77$*7 8  )!--.?@(+DEUDV)WXX "0 I$($7$7	$BM'+$ 3}#55+8+C('3+;+L+L(#3#D#D&6&B&B'3 a3 ,/?/G/GG*Q2FF )=[1_(M #

5+A+A(+K L-)*+2  !",";<L;M N# 9> "0 I$($7$7	$BM'+$ 3}#55+8+C('3+;+L+L(#3#D#D&6&B&B'3 a3 ,/II -.-1PP*Q2FF )=[1_(M #

5+A+A(+K L",";<L;M N# 1:  
30MW	r 	DLL&1S?
n  (IINN++F *++y !s;   S=ISASA7SAS(AS	U"A.UU)NNNNNTTr  r   )rM   rN   rO   r4  r   r   rv   r   rr   r   r  rb   r
   r~   r   r  rS   r  r  r  r   r   r5  r6  s   @rD   r   r     s    $(>B?C@D)- +/4'(4 4 D	4
 34t;4  _ 45<4  S>E#J6=4  $;4 4 %)4l5uS#X 5:MCgn!556M	M;B& "#9 t	9
 9z  $!%"&"#}2}2 $;}2 4K	}2
 t}2 }2rF   r   c                   V    e Zd Z	 	 	 	 ddedee   dz  dee   dz  dedz  dedz  f
dZy)	_PipelineContextNschedule_refr   r   r   r   c                 J    || _         || _        || _        || _        || _        y rY   )r  r   r   r   r   )rB   r  r   r   r   r   s         rD   r   z_PipelineContext.__init__5  s(     )"$rF   r  )rM   rN   rO   r   r   rr   r  r   rQ   rF   rD   r   r   4  s_     '+'+"&"' et# :$	
 4K trF   r   c                        e Zd ZdededdfdZy)_CustomFunctionProtocolrK   ctxrf   Nc                      y rY   rQ   )rB   rK   r  s      rD   __call__z _CustomFunctionProtocol.__call__E  s    rF   )rM   rN   rO   rS   r   r  rQ   rF   rD   r  r  D  s    KwK-=K$KrF   r  c                        e Zd ZdZ fdZdededdfdZ	 ddee	e
edz     f   d	ef fd
Zdded	ef fdZdded	efdZd ZdefdZ	 	 	 	 	 dde
dz  de
dz  de
dz  de
dz  def
dZ xZS )_PipelineScheduleRuntimea%  
    Provides a simple runtime that requires a 'schedule IR' including specified communication operations.

    Can be instantiated directly by creating _PipelineScheduleRuntime and calling load_csv, or can be
    subclassed and the subclass can be responsible for creating a schedule IR.
    c                     t        |   |i | i | _        t               | _        i | _        i | _        t        t              | _	        t               | _        y rY   )r  r   _comp_type_to_function_mapr   r  bwd_recv_opsfwd_recv_opsr   r   unshard_opsr  unsharded_stages)rB   r   r   r   s      rD   r   z!_PipelineScheduleRuntime.__init__P  sU    $)&)LN'.5i EGDF <Gt;L #rF   rU   custom_functionrf   Nc           	          |t         t        t        t        t        t
        t        t        fvrt        d| d      || j                  v rt        j                  d|       || j                  |<   y)a~  
        Register a custom function to be executed for a specific computation type.

        Args:
            computation_type: The computation type for which to register the custom function
            custom_function: The function to execute when this computation type is encountered.
                Must have signature: (action: _Action, ctx: _PipelineContext) -> None
        rH   z. Only FORWARD, FULL_BACKWARD,                 BACKWARD_INPUT, BACKWARD_WEIGHT, OVERLAP_F_B, UNSHARD, RESHARD and REDUCE_GRAD are supported.zTComputation type %s is already registered. Overwriting the existing custom function.N)r>   rA   r?   r@   r<   r5   r6   r=   r   r  r   r  )rB   rU   r  s      rD   register_custom_functionz1_PipelineScheduleRuntime.register_custom_function_  s     	$
 	
 +,<+= >n o  t>>>NN<  =L''(89rF   r   r  c                     t            |       i  _        |dk(  rC|D ]=  }g  j                  |<   ||   D ]$  }|J  j                  |   j                  |       & ? y|dk(  r|j	                         D ]<  \  }}t        |      D ])  \  }}|	|j                  rt        d| d| d| d       > |D ]M  }t        ||          j                  |<   t         j                  |    j                         j                  |<   O t         j                   fd j                  	       _        yt        d
|d      )z
        Given an in-memory representation for a simple compute-only schedule, lower it to a complex schedule including
        communication actions.  Stores the schedule in self, and must be called before running step_mo()
        compute_commsNr  z?Expected compute-only schedule but found communication action 'z
' at rank z, position ze. Communication actions (e.g. SEND_F, RECV_F, etc.) should not be present when format='compute_only'.c                 "    j                   |    S rY   r  r  rB   s    rD   r  zG_PipelineScheduleRuntime._prepare_schedule_with_comms.<locals>.<lambda>  s    (F(Fq(I rF   )r  r   format= is not implemented)r  r  pipeline_order_with_commsrq   r  r   rh   r   r  rQ  r   r  r!  r   )rB   r   r  r   rK   action_listr   r   s   `      rD   _prepare_schedule_with_commsz5_PipelineScheduleRuntime._prepare_schedule_with_comms  s    	/8CE&_$ H79..t4%dm HF!---2248??GHH ~%%,]]_ !k!*;!7 IAv)&2F2F(  &xz${1# FPQ    7KDM8..t4 8H2248((8..t4	 .<..I++.D* &	1D&EFFrF   r  c                    |dk(  r+t         	|   |       | j                  | j                         y|dk(  rzi }t	        |d      5 }t        j                  |      }t        |      D ]+  \  }}|D cg c]  }t        j                  |       c}||<   - | j                  ||       ddd       yt        d|d      c c}w # 1 sw Y   yxY w)	a	  Loads a csv in simple format and then lowers it to include communication actions

        format must be either "compute_only" or "compute_comms".  If compute_only, the lowering passes
        will automatically be run to generate a compute_comms schedule.
        r  r  rk   r  )r  Nr  r  )r  r  r  r   r  r  r  r   rS   rL   r   )
rB   r  r  r   r  r  r   r   r  r   s
            rD   r  z"_PipelineScheduleRuntime._load_csv  s     ^#Gh'--d.A.AB&Gh+ JwG,!*6!2 GID#BE$FQW%5%5a%8$FGDMG11'&1I	J J &	1D&EFF %GJ Js   +C1C C CCc                    |dk(  rp| j                   J d       t        |dd      5 }t        j                  |      }| j                   D ]   }|j	                  | j                   |          " 	 ddd       y|dk(  rp| j
                  J d       t        |dd      5 }t        j                  |      }| j
                  D ]   }|j	                  | j
                  |          " 	 ddd       yy# 1 sw Y   yxY w# 1 sw Y   yxY w)	r  r  Nz'Compute only schedule must be availabler  rk   r  r  z6Must initialize compute_comms schedule before dump_csv)r   r  r  r  r  r  )rB   r  r  r  r  r   s         rD   r  z"_PipelineScheduleRuntime._dump_csv  s    ^#&&2 92 hR0 ?GG, // ?DOOD$7$7$=>?? ? &11= H= hR0 JGG, :: JDOOD$B$B4$HIJJ J	 '	? ?J Js   AC,AC8,C58Dc                 L     t         j                   fd j                        S )Nc                 "    j                   |    S rY   r  r  s    rD   r  z4_PipelineScheduleRuntime._simulate.<locals>.<lambda>  s    d44Q7 rF   )_simulate_comms_computer  r!  r[   s   `rD   	_simulatez"_PipelineScheduleRuntime._simulate  s%    &**7
 	
rF   r   c                 :   t        |j                  t              }|r|j                  }|| j                  v rL| j                  |   D ]  }|j                           | j                  |= | j                  j                  |       || j                  v s
J d|       yy)zQIf an unshard is active for `stage_idx`, wait() it and mark `stage_idx` unshared.z*Attempted to compute on sharded stage_idx=N)r   submodr   rT   r  r  r  r  )rB   r   stage_uses_fsdprx  r  s        rD   _assert_unshardedz*_PipelineScheduleRuntime._assert_unsharded  s    $U\\:>))ID,,,**95 BGGI$$Y/%%)))4 5 55 =9,?5 rF   r   r   r   r   r   c           	           j                  |      \   j                  d   d           j                  D ci c]  }|j                  | c} j                  J d       g dt
        ddf fd} j                  j                          t         j                   j                           D ]  \  }}	t        j                  d||	       	 t        t        |	            5  |	j                   j                  v r0t!         |      }
  j                  |	j                     |	|
       nH|	j                  t"        k(  r-|	j$                  J d       |	j$                  D ]
  } ||        n ||	       ddd        rt-        j/                                rt1         j2                        dk(  sJ d        j5                   j                  |       yc c}w # 1 sw Y   jxY w# t&        $ rH}t        j)                  d	||	       t        j)                  t+         j                  |
             |d}~ww xY w)r  r   NzLMust call _prepare_schedule_with_comms() before calling _step_microbatches()rK   rf   c                    | j                   }| j                  | j                  nd}|dk\  s|t        t        t        fv sJ d| d       | j
                  }|   }t        |j                  t              }|dz   v }|dz
  v }|t        k(  r*j                  t        |j                  |                   y |t        k(  r*j                  t        |j                  |                   y |t        k(  rH||fj                   vsJ d|d|d       t        |j#                  |            j                   ||f<   y |t$        k(  rH||fj&                  vsJ d|d|d	       t        |j)                  |            j&                  ||f<   y |t        k(  r|r|j*                  vr|j,                  vsJ d
|d       |j                  j/                         D ]Q  }t        |t              st1        t2        |j5                  d            }	j,                  |   j                  |	       S y y |t        k(  r|r|j*                  v sJ d|d       |j,                  vsJ d|d       |j                  j/                         D ]#  }t        |t              s|j7                          % j*                  j9                  |       y y |t:        k(  rj=                  |       |j>                  sC|sA||fj                   v sJ d| d       tA        j                   jC                  ||f             |jE                  ||   |         }
jG                  ||
|       |r|dz      jI                  |
|       y y |tJ        k(  rj=                  |       |jL                  sC|sA||fj&                  v sJ d| d       tA        j&                  jC                  ||f             jO                  ||      }jP                  |xx   dz  cc<   jP                  |   jR                  k(  }|jU                  ||d|       |r(|dz
     jW                  |jY                  |      |       y y |tZ        k(  rj=                  |       |jL                  sC|sA||fj&                  v sJ d| d       tA        j&                  jC                  ||f             jO                  ||      }|jU                  ||dd       |r(|dz
     jW                  |jY                  |      |       y y |t\        k(  rXj=                  |       jP                  |xx   dz  cc<   jP                  |   jR                  k(  }|j_                  ||       y |t        k(  r,j`                  rjR                  nd}|jc                  |       y te        d| d      )Nrl   r   zaction=z missing mb_indexr   zRecv twice for stage_idx=z
 mb_index=z without executing forwardz without executing backwardzUnsharding the same stage_idx=z twiceT)async_opzResharding stage_idx=z without unshardingz before finishing unshardzComputing action=z before receiving inputrE  z Attempted to run compute action=r  Fr  z is unknown or unsupported)3rU   rV   r5   r6   r=   rT   r   r$  r   r7   rq   r  r@  r9   rM  r8   r  r>  r:   r  rK  r  r  modulesr   r   unshardreshardr  r>   r&  r  r  r  r?  r   set_local_fwd_inputrA   r   r   r  r   rL  set_local_bwd_inputget_local_bwd_outputr?   r@   r  r   rN  r   )rK   	comp_typer   rx  r   r%  is_next_stage_on_this_rankis_prev_stage_on_this_rank	submodulehandler   r   rI  r  r   r   r   rB   send_opsr  r   s                 rD   _perform_actionzD_PipelineScheduleRuntime._step_microbatches.<locals>._perform_action  s   //I+1+B+B+N''TV  q=I2 % - &+,	- 
 **I(3E(zBO)2Q:N)N&)2Q:N)N& F"
5+A+A(+K LMf$
5+A+A(+K LMf$ **+  1i\8+=WX	 + <F**84<!!9h"78 f$ **+  1i\8+=XY	 + <F**84<!!9h"78 g%"!)>)>>%T-=-==A 9i\@A> &+\\%9%9%; C	))Z@$!%mY5F5FPT5F5U!V((3::6B	C # g%"$(=(== 0i\1DE= %D,<,<< 0i\1JK< &+\\%9%9%; ,	))Z@$!))+, ))00; # g%&&u- 6 "  **+  -VI-DE	 + $D$5$5$9$99h:O$PQ00H%h'(6	 1  ((
HM .(Q7KK .
 m+&&u- 6 "  **+  <F9<ST	 + $D$5$5$9$99h:O$PQ++E8<%%i0A50 $ 5 5i @DDXDX X(("&"/	 )  .(Q7KK228<h . n,&&u-}}-G!  **+  <F9<ST	 + $D$5$5$9$99h:O$PQ++E8<(("'"'	 )  .(Q7KK228<h . o-&&u-%%i0A50 $ 5 5i @DDXDX X//"/ 0  k)<@<L<LD$8$8RS!))*;< GF9,F!GHHrF   z8_PipelineScheduleRuntime running time_step %d, action %szsub_actions must be setz\_PipelineScheduleRuntime caught exception at step %s when running action %s.  Full Schedule:r  zUnused unshard operations)r   r  r  rT   r  rS   r  r   r   r   r   r  r   r   rU   r  r   r<   rW   r  r  r   r  r  rw   r  r   )rB   r   r   r   r   r   r   r6  r  rK   r  sub_ar  r5  r  s   ```` `       @@rD   r   z+_PipelineScheduleRuntime._step_microbatches  se    "//JPVW
IaL9
 37,,?
).Eu$?
 --9 	
Z	
9
 +-g	IG g	I g	I g	IT 	##%!*4+I+I$))+T!U %	IvLLJ
$%@%HI 0..$2Q2QQ. #%&" Q778O8OP"C  00K?%11=X?XX=%+%7%7 3E+E23 (/#0%	P HLLN+  4##$)F+FF) 	DLL&1I?
z0 0$  r
 *66*3 s8   G(G93BG-G9-G6	2G99	I
AII
r  )r  r   )rM   rN   rO   r4  r   r%   r  r  r  rv   r   rS   rb   r  r  r  r"  r   r&  r~   r   r5  r6  s   @rD   r	  r	  H  s    &%L*%L 1%L 
	%LT %0Gc4$//00G 0GdG# Gs G*J# Js J&
'9    $!%"&"#w2w2 $;w2 4K	w2
 tw2 w2rF   r	  c                   z     e Zd ZdZ	 	 	 	 ddee   dedeez  dz  de	e
ef   ee   z  dz  dedef fd	Zd
 Z xZS )r    ai  
    Breadth-First Pipeline Parallelism.
    See https://arxiv.org/abs/2211.05953 for details.
    Similar to Interleaved 1F1B, Looped BFS supports multiple stages per rank.
    What is different is that when microbatches are ready for multiple local
    stages, Loops BFS will prioritizes the earlier stage, running all available
    microbatches at once.
    Nr   r   r   r   r   r  c                     t         	|   ||||||       i | _        t        | j                        D ]"  }| j                  |      }|| j                  |<   $ | j                  | j                         y )N)r   r   r   r   r   r  )r  r   r   r   rR  !_calculate_single_rank_operationsr  )
rB   r   r   r   r   r   r  r   rank_opsr   s
            rD   r   zScheduleLoopedBFS.__init__  s     	)/#'A 	 	
 @B$,,- 	1D==dCH(0D%	1
 	))$*=*=>rF   c           	         t        | j                        }t        || j                  |z  | j                        }t        |      D cg c]  }d  }}|D ]/  |j	                  fdt        | j
                        D               1 d| j                  dz
  |z
  z  }|j	                  d g|z         t        |      D ]8  |j	                  fdt        t        | j
                              D               : |S c c}w )Nc              3   T   K   | ]  }t        t        j                  |       ! y wrY   )rS   r%   r>   r   r   rT   s     rD   r   zFScheduleLoopedBFS._calculate_single_rank_operations.<locals>.<genexpr>"	  s)       %5%=%=xH   %(r&   r   c              3   T   K   | ]  }t        t        j                  |       ! y wrY   )rS   r%   rA   r>  s     rD   r   zFScheduleLoopedBFS._calculate_single_rank_operations.<locals>.<genexpr>-	  s)       %5%C%CXNr?  )rw   r  r   rR  r   r   reversed)rB   r   n_local_stagesstage_indicesra  r;  post_warmup_opsrT   s          @rD   r:  z3ScheduleLoopedBFS._calculate_single_rank_operations	  s    T\\*$$$~5t7I7I
 9>d)D1$)D)D( 	KOO  %d&:&: ; 	 t11A5<=01#M2 	KOO  (t/C/C)D E 	
 % *Es   	C9)NNTT)rM   rN   rO   r4  r   r   rv   r   r   r  rb   r
   rr   r~   r   r:  r5  r6  s   @rD   r    r      s     ,0@D +/?'(? ? E!D(	?
  S>E#J6=? ? %)?<rF   r    c
                    t        t              }
t        t              }t        t              }t        |      D cg c]  }d  }}| |z  d|dz
  |z
  z  z   ||z   z
  }|	r||z
  dz
  }||z   |z   }g }d}|	rt        nt        }t        |      D ]  }||k  r^ ||      }|
|   x}dz   |
|<   |j                  t        |t        j                  |             ||dz
  k(  sQ|j                  d g|z         g||cxk  r||z   k  rn n ||      }|
|   x}dz   |
|<   |j                  t        |t        j                  |              ||      }||   x}dz   ||<   |j                  t        |||             |j                  |       |	s||z
  |k\  s |||         }||   x}dz   ||<   |j                  t        |t        j                  |             |dz  }O|	s|j                  d         ||      }||   x}dz   ||<   |j                  t        |||             |j                  |       |	s||z
  |k\  s |||         }||   x}dz   ||<   |j                  t        |t        j                  |             |dz  } |	rf|t        |      k  rX |||         }||   x}dz   ||<   |j                  t        |t        j                  |             |dz  }|	r|t        |      k  rX|S c c}w )Nr&   r   r   )r   rv   r   r?   rA   rq   rS   r%   r>   r   r@   rw   )rB  rR  
warmup_opsfwd_bwd_opscooldown_opsr   forward_stage_indexbackward_stage_indexnum_1f1b_microbatchesenable_zero_bubblefwd_stage_mb_indexbwd_stage_mb_indexweight_stage_mb_indexra  r;  rD  	total_opsbackward_op_idsweight_op_countFULL_BACKWARD_OR_BACKWARD_INPUTr  fwd_stage_indexr   r]  bwd_stage_indexr^  weight_stage_indexweight_mb_indexs                               rD   _get_1f1b_rank_opsrX  4	  s    *5S)9)4S)9,7,< 5:$K%@qd%@H%@ 	&ma.?$.F)GG	d	O '$.2[(<7IOO -- $ I M%
?1"5O /??3/ OO)9)A)A8L Z!^# 8928
[ 881"5O 2? CC3/ OO)9)A)A<P 326O 2? CC3/ OO)H,W ""2&!b:o9N&N%9#O4&" (==O'PPO=%&89 *(88'  1$
 &%226O 2? CC3/ OO)H,W ""2&!b:o9N&N%9#O4&" (==O'PPO=%&89 *(88'  1$[M%^ 33G!G1//2RS45GHHO501 	"$4$D$Do	

 	1 33G!G Og &As   	K2c                        e Zd ZdZ	 	 	 	 	 	 ddee   dededz  dee	df   dz  de
ee	f   dz  d	e
eef   ee   z  dz  d
edef fdZdeedz     fdZ xZS )r   a  
    The Interleaved 1F1B schedule.
    See https://arxiv.org/pdf/2104.04473 for details.
    Will perform one forward and one backward on the microbatches in steady
    state and supports multiple stages per rank. When microbatches are ready for
    multiple local stages, Interleaved 1F1B prioritizes the earlier microbatch
    (also called "depth first").

    This schedule is mostly similar to the original paper.
    It differs by being relaxing the requirement of num_microbatch % pp_size == 0.
    Using the flex_pp schedule, we will have num_rounds = max(1, n_microbatches // pp_group_size) and
    it works as long as n_microbatches % num_rounds is 0. As a few examples, support

    1. pp_group_size = 4, n_microbatches = 10. We will have num_rounds = 2 and n_microbatches % 2 is 0.
    2. pp_group_size = 4, n_microbatches = 3. We will have num_rounds = 1 and n_microbatches % 1 is 0.
    Nr   r   r   r   .r   r   r   r  c	           
         |d   j                   | _        t        |   ||||||||       t	        |      | _        |d   j                  | _        t        d|| j                  z        | _	        || j                  z  | _
        || j                  z  dk7  rt        d| j                   d| d      i | _        t        | j                        D ]"  }	| j                  |	      }
|
| j                  |	<   $ | j                  | j                         y )Nr   r   r   r   r   r   r   r   r  r   z_Interleaved 1F1B requires the number of microbatches to be a multiple of the number of rounds (), but got .)r  rR  r  r   rw   rB  r  r   r   number_of_roundsmicrobatches_per_roundr   r   r   r:  r  rB   r   r   r   r   r   r   r   r  r   r;  r   s              rD   r   z ScheduleInterleaved1F1B.__init__	  s-    $AY11)+//#'A 	 		
 "&k1I((	 #A~9K9K'K L&48M8M&M#D111Q65595J5J4K L)*!-  @B$,,- 	1D==dCH(0D%	1
 	))$*=*=>rF   rf   c           
         	  fd} |      	 j                    j                  z  }|	z
  }||z
  }	|z   |z   }t        j                  d	|||        fd} 	fd}t	         j                    j
                  	||||      S )Nc                     j                   dz
  j                  z  }d}||j                  dz
  | z
  z  z   }t        |j                  j                   z        S )Nr   r&   rB  r_  rR  r[  r   r   warmups_ops_last_stagemultiply_factorrF  rB   s       rD   get_rank_warmup_opszVScheduleInterleaved1F1B._calculate_single_rank_operations.<locals>.get_rank_warmup_ops	  o     ##a'++&,"  O//##a'4/3 J
 z4#7#7$:M:M#MNNrF   =rank %s, warmup_ops %s, 1f1b %s, cooldown_ops %s total_ops %sc                 `    | j                   z  j                  z  }|j                  z  z   S rY   r_  rB  rR  r   local_indexr   rB   s     rD   rI  zVScheduleInterleaved1F1B._calculate_single_rank_operations.<locals>.forward_stage_index
  4    4#>#>>$BUBUUK$"4"44<<rF   c                     j                   dz
  | z
  j                  z  j                   z  z
  }|j                  z  z   S rq  rB  r_  rR  r   rm  r   rB   rF  s     rD   rJ  zWScheduleInterleaved1F1B._calculate_single_rank_operations.<locals>.backward_stage_index!
  W    ##:%$*E*EE%%&&   $"4"44<<rF   rB  r   r   r  rX  rR  )
rB   r   rg  microbatch_opsrG  rH  rP  rI  rJ  rF  s
   ``       @rD   r:  z9ScheduleInterleaved1F1B._calculate_single_rank_operations	  s    	O ).
,,t/C/CC$z1%3,|;	K	
	=
	= " 	
 		
rF   NNNNTTrM   rN   rO   r4  r   r   rv   r   rr   r   r  rb   r
   r~   r   rS   r:  r5  r6  s   @rD   r   r   	  s    * $(>B?C@D +/)?'()? )? D	)?
 34t;)?  _ 45<)?  S>E#J6=)? )? %))?V9
gn9M 9
rF   r   c                        e Zd ZdZ	 	 	 	 	 	 ddee   dededz  dee	df   dz  de
ee	f   dz  d	e
eef   ee   z  dz  d
edef fdZdeedz     fdZd Z xZS )r!   aw  
    The Interleaved Zero Bubble schedule.
    See https://arxiv.org/pdf/2401.10241 for details.
    Will perform one forward and one backward on inputs for the microbatches in steady
    state and supports multiple stages per rank. Uses the backward for weights to fill in
    the pipeline bubble.

    In particular this is implementing the ZB1P schedule in the paper.
    Nr   r   r   r   .r   r   r   r  c	           
         t        || j                  j                         |d   j                  | _        t
        |   ||||||||       t        |      | _        |d   j                  | _
        t        d|| j                  z        | _        || j                  z  | _        || j                  z  dk7  rt        d| j                   d| d      i | _        t!        | j                        D ]"  }	| j#                  |	      }
|
| j                  |	<   $ | j%                  | j                  | j                  z        | _        | j'                  | j                         y )Nr   r[  r   zZZero bubble requires the number of microbatches to be a multiple of the number of rounds (r\  r]  )"_check_torch_compile_compatibilityr   rM   r  rR  r  r   rw   rB  r  r   r   r^  r_  r   r   r   r:  _add_bubbles_to_actionsr  r`  s              rD   r   z&ScheduleInterleavedZeroBubble.__init__A
  sj    	+64>>3J3JK#AY11)+//#'A 	 		
 "&k1I((	 #A~9K9K'K L&48M8M&M#D111Q65595J5J4K L)*!-  @B$,,- 	1D==dCH(0D%	1 #::$"4"44

 	))$*=*=>rF   rf   c                    
  fd} |      
 j                    j                  z  }|
z
  }||z
  }
|z   |z   }t        j                  d
|||        fd} 
fd}}	t	         j                    j
                  
|||||	d
      S )Nc                     j                   dz
  j                  z  }d}||j                  dz
  | z
  z  z   }t        |j                  j                   z        S rq  rc  rd  s       rD   rg  z\ScheduleInterleavedZeroBubble._calculate_single_rank_operations.<locals>.get_rank_warmup_opsv
  rh  rF   ri  c                 `    | j                   z  j                  z  }|j                  z  z   S rY   rk  rl  s     rD   rI  z\ScheduleInterleavedZeroBubble._calculate_single_rank_operations.<locals>.forward_stage_index
  rn  rF   c                     j                   dz
  | z
  j                  z  j                   z  z
  }|j                  z  z   S rq  rp  rq  s     rD   rJ  z]ScheduleInterleavedZeroBubble._calculate_single_rank_operations.<locals>.backward_stage_index
  rr  rF   T)rL  rs  )rB   r   rg  rt  rG  rH  rP  rI  rJ  rK  rF  s   ``        @rD   r:  z?ScheduleInterleavedZeroBubble._calculate_single_rank_operationsu
  s    	O ).
,,t/C/CC$z1%3,|;	K	
	=
	= !%! !#
 	
rF   c                    | j                   }d }t               }i }i }i }d}t        | j                        D ]  }	g ||	<   d||	<   d||	<    	 d}
t               }t        | j                        D ]  }	||	   }|t	        ||	         k\  rd}
||	   |   ||	   |   }|J |\  }}}} ||||||      s>||	   j                  ||	   |          ||j                  |||f       ||	xx   dz  cc<   ||	   j                  d        ||	xx   dz  cc<   ||	xx   dz  cc<   ||	   j                  d         |j                  |       |
rn|dkD  rt        j                  d||       |S )Nc                     |t         j                  k(  r| dk7  r| dz
  ||f|vryy|t         j                  k(  r'| |dz
  k(  r| t         j                  |f|vS | dz   ||f|vS y)Nr   r   TF)r%   r>   rA   )r   r  
microbatchnum_stages_globalseen_opss        rD   need_bubblezJScheduleInterleavedZeroBubble._add_bubbles_to_actions.<locals>.need_bubble
  s    %---A:519b*"=X"M
 	 '555-11!#3#;#;ZHPXXX	2z2(BBrF   r   TFr   z?Non zero bubbles added: total_bubbles_added=%s bubbles_added=%s)
r   r  r   rR  rw   rq   r  updater   r  )rB   r  r   r  r  resultnext_pointerbubbles_addedtotal_bubbles_addedr   should_stoptemp_seen_ops	timestamptemp_actionrT   r  r  ra  s                     rD   rz  z5ScheduleInterleavedZeroBubble._add_bubbles_to_actions
  s   %%	 <?524')(*$,,- 	$DF4L!"L"#M$	$
 KDGEMd001 .(.	GDM 22#4=+7")$-	":K&2225@2KZ&#R5F t++GDM),DE%1)--{B
.KL$T*a/*t++D1%d+q0+ &!+&4L''-/.2 OOM*A D "NNQ#
 rF   ru  )rM   rN   rO   r4  r   r   rv   r   rr   r   r  rb   r
   r~   r   rS   r:  rz  r5  r6  s   @rD   r!   r!   6
  s     $(>B?C@D +/2?'(2? 2? D	2?
 34t;2?  _ 45<2?  S>E#J6=2? 2? %)2?h>
gn9M >
@@rF   r!   c                        e Zd ZdZ	 	 	 	 	 	 ddee   dededz  dee	df   dz  de
ee	f   dz  d	e
eef   ee   z  dz  d
edef fdZdeedz     fdZ xZS )r"   a  
    The Zero Bubble schedule (ZBV variant).
    See https://arxiv.org/pdf/2401.10241 Section 6 for details.

    This schedules requires exactly two stages per rank.

    This schedule will perform one forward and one backward on inputs for the microbatches in steady
    state and supports multiple stages per rank. Uses backward with respect to weights to fill in
    the pipeline bubble.

    This ZB-V schedule would have the "zero bubble" property only if time forward == time backward input == time backward weights.
    In practice, this is not likely true for real models so alternatively
    a greedy scheduler could be implemented for unequal/unbalanced time.
    Nr   r   r   r   .r   r   r   r  c	           
         t        || j                  j                         |d   j                  | _        t
        |   ||||||||       t        | j                  | j                  d      | _	        | j                  D ]  }	| j                  |	_	         t        |      | _        | j                  dk7  rt        d| j                   d      |d   j                  | _        |d   j                   | _        i | _        t%        | j                        D ]"  }
| j'                  |
      }|| j"                  |
<   $ | j)                  | j"                         y )Nr   r[  vstyler&   0ZBV requires exactly 2 stages per rank, but got r]  ry  r   rM   r  rR  r  r   r   r!  r  r  rw   rB  r   r  r   r   r   r   r:  r  rB   r   r   r   r   r   r   r   r  r   r   r;  r   s               rD   r   zScheduleZBVZeroBubble.__init__  s\    	+64>>3J3JK#AY11)+//#'A 	 		
 *H 0 0*
& \\ 	ME.2.L.LE+	M "&k!#B&&'q* 
 1I((	 )..
 @B$,,- 	1D==dCH(0D%	1
 	))$*=*=>rF   rf   c                    t        d| j                  z  dz
  | j                        }t        |      D cg c]  }d  }}d\  }}}}d| j                  |z
  z  dz
  }	|}
| j                  dz
  |z
  }t        |	      D ](  }|j                  t        |
t        |             |dz  }* |}t        |      D ]N  }|j                  t        |t        |             |dz  }|j                  t        |
t        |             |dz  }P | j                  |z
  }t        |      D ]o  }|j                  t        |t        |             |dz  }|j                  t        |t        |             |j                  t        |t        |             |dz  }q ||k  s||k  r||k  r&|j                  t        |
t        |             |dz  }|j                  t        |
t        |             |j                  t        |
t        |             |dz  }|j                  t        |t        |             |dz  }|j                  t        |t        |             |j                  t        |t        |             |dz  }||k  r||k  r||}}|}t        |      D ]N  }|j                  t        |
t        |             |dz  }|j                  t        |t        |             |dz  }P | j                  |z
  }t        |      D ]N  }|j                  t        |
t        |             |dz  }|j                  t        |
t        |             |dz  }P ||k  r,|j                  t        |t        |             |dz  }||k  r,||k  r,|j                  t        |
t        |             |dz  }||k  r,||k(  r||k(  sJ ||k(  r||k(  sJ |D cg c]-  }|'|j                  |j                  | j                  k  r|nd / }}|S c c}w c c}w )Nr&   r   )r   r   r   r   )rU   rV   )r   rR  r   r   r   rq   rS   r2   r3   r4   rV   )rB   r   n_microra  r;  f0_cntf1_cntb0_cntb1_cnt	warmup_n1stage_id_chunk0stage_id_chunk1	warmup_n2	warmup_n3w0_cntw1_cntcooldown_n1cooldown_n2rK   s                      rD   r:  z7ScheduleZBVZeroBubble._calculate_single_rank_operations;  s[    a$,,,q0$2F2FG8=d)D1$)D)D *4&++d23a7	//A-4y! 	AOO!fU aKF		
 	y! 	AOO!fU aKFOO!fU aKF	 &&-	y! 	AOO!fU aKFOO!fU OO!fU aKF	 vo'!1'!f
 !OO!fU OO!fU aKFOO!fU aKFOO!fU OO!fU aKF5 vo'!18  {# 	AOO!fU aKFOO!fU aKF	 ((4/{# 	AOO!fU aKFOO!fU aKF	 voOO!fU aKF	 vo
 voOO!fU aKF	 vo Ff$444Ff$444 #	
  %++7++d.B.BB  		
 	
 a *EL	
s   	P2P
ru  rv  r6  s   @rD   r"   r"   
  s    & $(>B?C@D +/1?'(1? 1? D	1?
 34t;1?  _ 45<1?  S>E#J6=1? 1? %)1?ftgn9M trF   r"   c                        e Zd ZdZ	 	 	 	 	 	 ddee   dededz  dee	df   dz  de
ee	f   dz  d	e
eef   ee   z  dz  d
edef fdZdeedz     fdZ xZS )r#   z
    The DualPipeV schedule. A more efficient schedule variant based on the
    DualPipe schedule introduced by DeepSeek in https://arxiv.org/pdf/2412.19437

    Based on the open sourced code from https://github.com/deepseek-ai/DualPipe
    Nr   r   r   r   .r   r   r   r  c	           
         t        || j                  j                         |d   j                  | _        t
        |   ||||||||       t        | j                  | j                  d      | _	        | j                  D ]  }	| j                  |	_	         t        |      | _        | j                  dk7  rt        d| j                   d      || j                  k  rt        d| d	| j                   d
      |d   j                  | _        |d   j                   | _        i | _        t%        | j                        D ]"  }
| j'                  |
      }|| j"                  |
<   $ | j)                  | j"                         y )Nr   r[  r  r  r&   r  r]  zDDualPipeV requires at least as many microbatches as stages, but got z microbatches and z stages.r  r  s               rD   r   zScheduleDualPipeV.__init__  s    	+64>>3J3JK#AY11)+//#'A 	 		
 *H 0 0*
& \\ 	ME.2.L.LE+	M "&k!#B&&'q*  D,,,V!""4T5E5E4FhP 
 1I((	 )..
 @B$,,- 	1D==dCH(0D%	1
 	))$*=*=>rF   rf   c                    g }i g | j                   }| j                  }t        ||dz  d      }||   \  }}dt        ffddt        dt        dt        ffd	}dt        dt        d
t
        ffd}	dt        ffd}
||z
  dz
  dz  }t        |      D ]  } |	||t                |dz   }t        |      D ]  } |	||t                |	||t                 ||z
  dz
  }t        |      D ]&  } |	||t                |
|        |	||t               ( ||dz  z
  |z   dz   }t        |      D ]B  }|dk(  r%||dz
  k(  r |	||t                |	||t               n ||||        ||||       D ||z
  dz
  }t        |      D ]  } |	||t                ||||        |dz   }d}t        |      D ]V  }||dz  k(  r
|dz  dk(  rd}|rt        nt        } |	|||       ||dz  k(  r
|dz  dk(  rd}|rt        nt        } |	|||       X ||z
  dz
  }t        |      D ]"  } |
|       |rt        nt        } |	|||       $ |dz   }t        |      D ]
  } |
|        |S )Nr&   r  r  rT   c                     | t         f}| t        f}j                  |d      dz   |<   j                  |d      dz   |<   y)zbHelper method to increment BACKWARD_INPUT and BACKWARD_WEIGHT counters when FULL_BACKWARD is used.r   r   N)r?   r@   r   )rT   	input_key
weight_keycounterss      rD   increment_backward_countszVScheduleDualPipeV._calculate_single_rank_operations.<locals>.increment_backward_counts  sK    $n5I%7J"*,,y!"<q"@HY#+<<
A#>#BHZ rF   r   forward_stagebackward_stagec                    |t         f}|t        f}j                  |d      }j                  |d      }t        |t         |      t        |t        |      f}| j                  t        dt        d|             |dz   |<    	|       y)zYHelper method to add an overlapped forward+backward action which tracks microbatch index.r   rl   Nr   )r>   r?   r   rS   rA   rq   r<   )
r   r  r  forward_keybackward_keyrg  ri  rW   r  r  s
           rD   add_overlap_f_bzLScheduleDualPipeV._calculate_single_rank_operations.<locals>.add_overlap_f_b  s     )'2K*N;L!k15J",,|Q7K w
;{CK NN72{D+FG %/NH[!%n5rF   rU   c                     |t         k7  r||fn|t        f}j                  |d      }| j                  t	        |||             |t         k(  r	 |       y |t        k(  rj                  ||f       |dz   |<   y )Nr   r   )rA   r?   r   rq   rS   )r   rT   rU   r   r   r  r  weight_queues        rD   
add_actionzGScheduleDualPipeV._calculate_single_rank_operations.<locals>.add_action  s     $}4 ./!>2 
  ||C+HNN7;0@(KL  =0)+6 $~5 ''h(?@ (1rF   c                     syj                  d      \  }}| j                  t        |t        |             |t        f}j	                  |d      dz   |<   y)z4Helper method to add a weight action from the queue.Nr   r   )r  rq   rS   r@   r   )r   actual_stage_indexrW  r  r  r  s       rD   add_weight_action_if_pendingzYScheduleDualPipeV._calculate_single_rank_operations.<locals>.add_weight_action_if_pending7  sb    2>2B2B12E/NN&## -o>J#+<<
A#>#BHZ rF   r   r   )r  r  FT)
rR  r   r   rv   r   r%   r   r>   r?   rA   )rB   r   r   r   
num_chunksrank_to_stagesstage0_indexstage1_indexr  r  r  step_1ra  step_2step_3step_4r   step_5step_6	enable_zbr0  step_7step_8r  r  r  s                          @@@rD   r:  z3ScheduleDualPipeV._calculate_single_rank_operations  s   (*  	 &&	))
7y1}C
 &4D%9"l	C3 	C	6	6	6  	6.	-	-	- /	-0	C$ 	C$ d"Q&!+v 	7Awg6	7 v 	7Awg6wg6	7
 T!A%v 	7Awn=(1wg6	7 i!m+d2Q6v 	AAv$)a-/7L':7L-@".#/
 *+	$ T!A%v 	Awm<*+	 	v 	9AFaKD1HM 	*3Iwi8FaKD1HM 	*3Iwi8	9 T!A%v 	9A(1*3Iwi8	9 v 	2A(1	2 rF   ru  rv  r6  s   @rD   r#   r#     s     $(>B?C@D +/6?'(6? 6? D	6?
 34t;6?  _ 45<6?  S>E#J6=6? 6? %)6?p^gn9M ^rF   r#   schedule_namec           
      0   t         t        t        t        t        t
        t        t        t        d	}|D ci c]  }|j                         | }}| j                         }||vr(t        d|  dt        |j                                      |||      S c c}w )z
    Maps a schedule name (case insensitive) to its corresponding class object.

    Args:
        schedule_name (str): The name of the schedule.
    )	1F1BInterleaved1F1BGPipe	LoopedBFSInterleavedZeroBubbler   r   ZBVZeroBubble	DualPipeVzUnknown schedule name 'z'. The valid options are )r   r   r   r    r!   r   r   r"   r#   lowerr   r   keys)r  schedule_mapklowercase_keyslowercase_schedule_names        rD   r   r     s     2&!>"8!6.&
L -99qaggil9N9+113n4%m_4MdS_SdSdSfNgMhi
 	
 '>?@@ :s   Bc           	        
 t        |       D ci c]  }|| |   D cg c]  }||	 c} } }}t        |       D ci c]  }|g  c}D ci c]  }|t                c}
dt        dt        dz  f
fd}dt        dz  dt        f
fd}| r_d}t        |       D ]O  }t        | |         dk(  r| |   d   } ||      r"|	 |||       | |   j                  d       d	}G ||d       Q t        | d	
      D ]  }	t        | |	         dk(  s| |	=  t        |       D ]`  }t        | |         dk(  r|   d   | |   d   } ||      s/|||   d<   
|   j                  |       | |   j                  d       b t        | d	
      D ]  }	t        | |	         dk(  s| |	=  |s>t        dt                     | D ]  }t        d|d| |   d            t        d      | r_S c c}w c c}}w c c}w c c}w )a  This function dry-run simulates the actions in the schedule from the perspective of all ranks, and flags
    any deadlocks caused by missing or misordered communications.  It also simulates any bubbles in time where a rank
    can not execute any action due to waiting for unmet dependencies.  The total number of simulator steps can be used
    as a metric for unit tests involving IR optimization passes as reordering and merging of IR can reduce the number
    of simulated steps.

    The simulation is not high-fidelity and does not model overlapping of compute and communication, or cuda streams.
    Future work may be to enhance this and model the compute time, comms overlap, and even memory.
    Nr   rK   c                 \    |    j                  |       ||    j                  |       y y rY   )rq   r  )r   rK   _prev_ops_rank	_schedules     rD   add_to_schedulez0_simulate_comms_compute.<locals>.add_to_schedule  s3    $v&4 $$V, rF   rf   c                    | y| j                   } 
|         }| j                  t        k(  rd| j                   dk(  ryt        | j                   t        | j
                        |v ryt        | j                   dz
  t        | j
                        |v ryy| j                  t        t        fv r| j                   	dz
  k(  ryt        | j                   t        | j
                        |v ryt        | j                   dz   t        | j
                        |v ryt        | j                   dz   t        | j
                        |v ryy| j                  t        k(  ry| j                  t        k(  r)t        | j                   t        | j
                        }||v S | j                  t        k(  r-|dz
  }t        |t        | j
                        }| 
|         v S | j                  t        k(  rTt        | j                   t        | j
                        }t        | j                   t        | j
                        }||v xs ||v S | j                  t        k(  r-|dz   }t        |t        | j
                        }| 
|         v S t        d|        )NTr   r   FzUnsupported action type )rT   rU   r2   rS   r8   rV   r?   rA   r:   r@   r7   r9   r   )rK   rx  prev_ops
expected_fpeer_stage_idxexpected_send
expected_bexpected_bwr  r   r  s           rD   r  z3_simulate_comms_compute.<locals>._ready_to_schedule  sd   >&&	!-	":;""a'!!Q&**FF4K4KLPXX**Q.63J3JKxW$$(GG!!Z!^3v))663J3JKxW**Q.@W@WX **Q.v?V?VW $$7$$. !3!3Q8O8OPJ))$$.&]N#NFF<S<STM N=3P$QQQ$$. ""NF4K4KJ """M63J3JK )D[H-DD$$.&]N#NFF<S<STM N=3P$QQQ7x@AArF   Fr   T)reverserl   zWIP comms schedule:
r  z next action= zSchedule is not progressing)r   r  rv   rS   r~   rw   r  r  printr   r   )r   r  r   r   rr  r  r  r  rK   r   r  r  s    ``       @@rD   r!  r!    s    >* 	..@Q!-q@@N 
 $N32b2I HQ.QttSU{.QN-c -7T> -
8B7T> 8Bd 8Bt >* 	,D>$'(A-#D)!,F!&)%#D&1t$((+d+	, 5 	&A>!$%*"1%	& >* 	,D>$'(A-r".#D)!,F!&)%*0IdOB'"4(,,V4t$((+	, 5 	&A>!$%*"1%	& )+A)+LM& I~nT.B1.E-FGHI:;;W Z k A2 /Rs&   G=G8G8G=
HH8G=c                 N   g }t        |       D ]\  }t        | |         D ]I  \  }}|	|j                  t        |      |j                  t
        t        t        fv rdndd|||dd       K ^ ddl}t        |d      5 }|j                  d	|i|       ddd       y# 1 sw Y   yxY w)
a  
    This function dumps a schedule IR into a chrometrace format so it can be visualized.

    It is currently very basic and only serves as a graphical alternative to dumping the schedule IR as text.

    As future work we may extend this to include more accurate heuristics for durations, or let users input durations,
    add 'flow events' to let the UI show the connection between sends and recvs, and model cuda streams for comm/compute
    as separate streams on the chrometrace view.
    NcomputationcommunicationXr   )r   catphpidtidtsdurr   r  traceEvents)r   r   rq   rb   rU   r2   r;   r4   jsonr  dump)scheduler  eventsr   timesteprK   r  fs           rD   _dump_chrometracer  3  s     Fx   )(4. 9 	Hf~MMK "22q!Qi? &,"	& 	h	 .		=&)1-. . .s   =BB$r   c           
         | D ]  }t        |j                  t        j                  j                        s2|j                  j                         D ]6  }t        |t              st        d| dt        |      j                           y)z
    Check if the schedule is compatible with torch.compile.

    Args:
        stages: List of pipeline stages to check
        schedule_name: Name of the schedule for error message

    Raises:
        RuntimeError: If any stage uses torch.compile
    zThe ze schedule is not supported with stage modules that have used torch.compile. Found OptimizedModule in N)
r   r$  r  nnModuler*  r   rI   r   rM   )r   r  r   modules       rD   ry  ry  W  s      
%,,8ll**, 	F&/2"=/ *004V0E0E/FH 		
rF   rY   )r'   )r   F)ir   r  r   loggingreabcr   r   collectionsr   r   collections.abcr   enumr   	functoolsr	   typingr
   r   r   r   r  torch.distributeddistributedr	  torch._dynamor   torch.distributed.fsdpr   r   torch.nn.modules.lossr   torch.profilerr   _utilsr   r   r  r   r   r   r   r   __all__	getLoggerrM   r   r%   r>   r?   r@   r5   r6   r7   r8   r9   r:   rA   r<   r=   r2   r3   r4   r;   compilers   rS   rb   r   r  rv   r   r   r   P2POpWorkr  r  r  r   r8  r   r   r~   rn  rQ  r  r  r  r  r   r   r  r	  r    rX  r   r!   r"   r#   r   r!  r  ry  rQ   rF   rD   <module>r     s    
   	 # , $   2 2    ) < ' * R T T % 
		8	$;Et ;E| 
"
"!00"22

"
"

"
"		 	 		 	 		 	 		 	  ..****  

R
S
j S
l  C   %)<d7T>223<Tz< 	<~Q
 Q
h+TZZ( +d
 +d499o + 37$**%(4Z	#tDII
6$tyy/ B. BJ0"1 0"ft* tnX) Xv!'7 !D !$'D.!$36$	'D.$F P'D.)PP 
']Pf#'D.)#	']#Lq#tG},-qSE3J'q q 
#tG}
	qhx'#tGdN++,x'x' x' 	x'
 
#s(^x'v{2- {2|  Lh Ld24 d2NB0 B\ FRv
6 v
r$< Dw4 wt^0 ^BAc A6B#+SE3J#7BEHBJ!.H#$58rF   