
    i                     B   U d dl Z d dlZd dlZd dlZd dlmZmZmZ d dlm	Z	m
Z
mZ d dlmZ d dlmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZmZmZm Z m!Z!m"Z" d dl#m$Z$ d d	l%m&Z&m'Z'm(Z)m*Z*m+Z+m,Z,m-Z-m.Z. d d
l/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z8 d dl9m:Z: g dZ;dZ<dZ=dZ>dZ?e@eA   ZBee3eej                  eDeEeAf   ZFeeFeGeF   eHeF   eIeAdf   f   ZJeIeAeJf   ZKeGeK   ZLeIeAeeKeLf   f   ZM e@       ZNe@e   eOd<   e j                  d        ZQe
 G d d             ZRe
 G d deR             ZS	 	 	 dIdej                  deAdeAdeUdeUd eBfd!ZV G d" d#      ZWdJd$ZXddd%dej                  d&eHej                  j                  d'f   d(eUd)ee@ej                        d*eeR   d eSfd+Z[d,eIeAeJf   d-eMd.eSd dfd/Z\d0eej                  ej                  j                  f   d1eAd efd2Z]d3eIeAef   d.eSd eIeAef   fd4Z^ ej                         dej                  d.eSd eIeAeJf   fd5       Z` ej                         dej                  d3eIeAeJf   d.eSd e5fd6       Zad7ej                  j                  d dfd8Zbd3eMd eIeAeJf   fd9Zcd7ej                  j                  d3eIeAeJf   d.eSd eMfd:Zd ej                         dej                  d;eHej                  j                  d'f   d.eSd eMfd<       Zedej                  d7ej                  j                  d-eMd.eSd eMf
d=Zf ej                         dej                  d;eHej                  j                  d'f   d3eMd.eSd df
d>       Zgddd%dej                  d)ee@ej                        d*eeR   d eIeAeJf   fd?Zhddd%dej                  d;eej                  j                  eej                  j                     f   d)ee@ej                        d*eeR   d eMf
d@Ziddd%dej                  d;eej                  j                  eej                  j                     f   d)ee@ej                        d*eeR   d eHeIeAeJf   eMf   f
dAZjdej                  d3eeIej                  eIeAeJf   f   eIeAeJf   f   d eIeAeJf   fdBZkddCdej                  d,eIeAeJf   d*eeR   d e5fdDZlddCdej                  d;eej                  j                  eej                  j                     f   d-eMd*eeR   d df
dEZmddCdej                  d;eej                  j                  eej                  j                     f   d,eIeAeJf   d-eMd*eeR   d e5fdFZneddCdej                  d*eeR   d dfdG       ZoeddCdej                  d;eHej                  j                  d'f   d*eeR   d dfdH       Zpy)K    N)Callable	GeneratorIterable)asdict	dataclassfield)chain)Anycastno_type_checkOptionalUnion)ShardedTensor)_broadcast_state_dict_distribute_state_dict_flatten_state_dict_gather_state_dict_offload_state_dict_to_cpu_unflatten_state_dict)_CHECKPOINT_PREFIX)FullOptimStateDictConfigFullStateDictConfigFullyShardedDataParallelOptimStateDictConfigShardedOptimStateDictConfigShardedStateDictConfigStateDictConfigStateDictType)._get_module_fsdp_state_if_fully_sharded_moduleFSDP_WRAPPED_MODULE)DTensor)_IncompatibleKeys)DistributedDataParallel)tree_map_only)FQNS_TPrimitiveType	ValueTypeDictValueTypeListDictValueTypeOptimizerStateTypeStateDictOptionsget_model_state_dictget_optimizer_state_dictget_state_dictset_model_state_dictset_optimizer_state_dictset_state_dict_flat_paramparam_groupsparamsstater'   _patched_state_dictc               #      K   t        j                         } t        j                          	 d  | rt        j                          y y # | rt        j                          w w xY wwN)gc	isenableddisableenable)
is_enableds    a/var/www/html/engine/venv/lib/python3.12/site-packages/torch/distributed/checkpoint/state_dict.py_gc_contextr?   Q   sD     JJJLIIK :IIK s   )A$A A$A!!A$c                       e Zd ZU dZdZeed<   dZeed<   dZeed<   dZ	eed<   dZ
eed<   dZeed	<   dZeed
<   dZeed<   y)r+   ap  
    This dataclass specifies how get_state_dict/set_state_dict will work.

    - ``full_state_dict``: if this is set to True, all the tensors in the
      returned state_dict will be gathered. No ShardedTensor and DTensor
      will be in the returned state_dict.

    - ``cpu_offload``: offload all the tensors to cpu. To prevent CPU OOM, if
      ``full_state_dict`` is also true, then only the rank0 will get the
      state_dict and all other ranks will get empty state_dict.

    - ``ignore_frozen_params``: if the value is True, the returned state_dict
      won't contain any frozen parameters -- the ``requires_grad`` is False.
      The default value is False.

    - ``keep_submodule_prefixes`` (deprecated): when ``submodules`` is not None, this option
      indicates whether to keep the submodule prefixes from the state_dict keys.
      or example, if the submodule is ``module.pretrain`` and the full FQN of
      the parameter is ``pretrain.layer1.weight`` of the param. When this option
      is True, the parameter's key in the returned state_dict will be
      ``pretrain.layer1.weight``. If the options is False, the key will be
      ``layer1.weight``.
      Note that if ``keep_submodule_prefixes`` is False, there may be conflicted
      FQNs, hence there should be only one submodule in ``submodules``.

    - ``strict``: the ``strict`` option when ``set_state_dict`` calls
      model.load_state_dict().

    - ``broadcast_from_rank0``: when the option is True, rank0 should receive a
       full state_dict and will broadcast the tensors in the state_dict/
       optim_state_dict one by one to other ranks. Other ranks will receive
       the tensors and shard according to the local shards in the model and
       optimizer. ``full_state_dict`` must be set to True when using this option.
       This option currently only supports DTensor, not the legacy ShardedTensor.
    Ffull_state_dictcpu_offloadignore_frozen_paramsTkeep_submodule_prefixesstrictbroadcast_from_rank0flatten_optimizer_state_dict_fqn_modifiersdsd_fqn_modifiersN)__name__
__module____qualname____doc__rA   bool__annotations__rB   rC   rD   rE   rF   rG   rI   str     r>   r+   r+   \   s_    "H "OT!K!&$&$(T(FD!&$&). $.-s-rR   r+   c                   h   e Zd ZU  ee      Zeeeej                  f   ee
ej                  f   f   ed<    ee      Zeeeej                  f   ee
ej                  f   f   ed<    ee      Zee   ed<   dZeed<   dZeed<   ej&                  Zeed<    ee      Zeej2                     ed	<   y
)_StateDictInfo)default_factoryfqn_param_mappingshared_params_mappingsubmodule_prefixesThandle_modelhandle_optimfsdp_contextfsdp_modulesN)rJ   rK   rL   r   dictrV   r   rP   torchTensorr%   rO   rW   setrX   rY   rN   rZ   
contextlibnullcontextr[   r   listr\   nnModulerQ   rR   r>   rT   rT      s    
 	d# tc5<< fell"#	% $ 	d# 4c5<< fell"#	% $ $)#=C=L$L$'33L(3$)$$?L$ryy/?rR   rT   modelnamerI   skip_ddp_prefixskip_compiler_prefixreturnc                    |j                  t        d      }d|vr|hS |j                  d      }g }| }t        |      D ]  \  }}	t	        |t
              r5|	dk7  rt        d|	 d      |j                  }|r:|j                  |	       Lt	        |t              r|t        |      dz
  k  rW||dz      t        k(  rHdj                  |      }
t        |t              }|
r|
 d}
|j                  D ch c]  }|
 | 
 c}c S t        |t              }|	t        k7  s|j                  |	       t        ||	      }t	        |t         j"                  j$                  j&                        r7|	dk7  rt        d|	 d      |j(                  }|rN|j                  |	       at+        ||      r: t        ||             j-                  |	      x}rt+        ||      rt        ||      }|j                  |	       |	t.        j0                  j                  j2                  k(  r|t        |      dz
  k7  st5        d	      t        ||	      } dj                  |      j                  t        d      hS c c}w )
a  
    This API is used to convert the name of a parameter to the FQNs. For FSDP
    without `use_orig_params`, the name of FlatParameter can be mapped to
    multiple original parameters. As a result, the return type of this function
    is `set[str]`.

    Args:
        module (nn.Module): the root model.
        name (str): the name
        skip_ddp_prefix (bool): whether to skip DDP's `module` prefix

    Returns:
        The canonical FQNs based on the model traversal.
     .modulezExpected 'module', got ''   	_orig_modzExpected '_orig_mod', got 'z-Expect `_extra_state` to be the last obj name)replacer   split	enumerate
isinstanceDDPAssertionErrorrn   appendFSDPlen_FLAT_PARAMjoingetattr_fqnsr    r^   _dynamo
eval_frameOptimizedModulerq   hasattrgetrd   modules_EXTRA_STATE_KEY_SUFFIXRuntimeError)rf   rg   rI   rh   ri   	obj_namesfqn_obj_namescurr_objicurr_obj_nameprefix
flat_paramfqnremoved_fqns                 r>   	_get_fqnsr      sS   . <<*B/D
$v

3IMH%i0 (<=h$($'?a%PQQH"$$]3$'3y>A%%)AE*:k*I-0$X{;
 &xq\F4>4D4DES6(3%(EEx)<=H 33$$]3"8];%--":":"J"JK+$'B=/QR%STT))H'$$]3 x!23"F'(4E"F"H"L"L!# ;  x5#*8[#A  /

 1 1 I III**&'VWW"8];Q(<T HH]#++,>CDD; Fs   0I0c                       e Zd Zy)_EXTRA_STATEN)rJ   rK   rL   rQ   rR   r>   r   r      s    rR   r   c              #      K   t               dt        j                  dt        dt        ffd | d      E d {    y 7 w)Nrn   curr_fqnrj   c              3     K   j                  |        |r| dnd}| j                         D ]T  \  }}|v rt        |       r'| t        |              j	                         v r|d d }n| | } ||      E d {    V t        | j                  d      | j                  d            D ]   \  }}|| j                  v r| | }||f " t        | j                  dt        j                  j                        t        j                  j                  k7  r7| t        j                  j                  j                   }|t!               f y y 7 ׭w)Nrm   rl   F)recurseget_extra_state)addnamed_childrenr   r}   valuesr	   named_buffersnamed_parameters_non_persistent_buffers_set	__class__rd   re   r   r   rn   r   r   )	rn   r   rg   	submodulenew_fqnobjrI   r   visited_moduless	         r>   r   z+_iterate_valid_model_state.<locals>.recurse   sr    F#%-hZq>2%446 	3OD)O+  12>GF,=>@GGII #3B-%Jtf-y'222	3    /1H1HQV1H1W
 	ID# v999!
4&)G3,	 F$$&79R9RSyy(() "
2::#4#4#L#L"MNG<>))	) 3s   A;E>E?CErl   )r`   rd   re   rP   r   )rf   rI   r   r   s    `@@r>   _iterate_valid_model_stater      s?     &)eO *		  *S  *Y  *D ub!!!s   ;A AA)
submodulesoptionsoptims.
optim_onlyr   r   c                   |rt        j                  dt        d       |r|st        d      |xs
 t	               }i }i }t        |       D ]  \  }}t        |t              rt        | |      }	|j                  |      }
|
2t        t        t           ||         j                  |	       ||   ||<   n|	j                         ||<   |	D ]  }
t        |t              r|||
<     t        |j!                               D ])  \  }}|D ]  }
t        t"        j$                  |      ||
<   ! + t               }|ret        |      }| j'                         D ]G  \  }}||vrt        | |      }	t)        |	      dk7  rt+        d      |j                  d |	D               I |j,                  r|j.                  st1        d      t3        j4                  |       }|r|j.                  rat7        |j8                  |j8                  	      }t;        |j8                  |j8                  xs |j,                  	      }t<        j>                  }n<tA        |j8                  
      }tC        |j8                  
      }t<        jD                  }tF        jH                  d        }tK        jL                  || |||      }ntF        jN                  }tQ        di tS        |      ||||t        t        tT        jV                     |      | t)        |      dkD  dS )zW
    Verify the model and options passed by the user and generates _StateDictInfo.
    zGetting submodules only model/optim state_dict is deprecated and will be removed in 2.5. This feature can be achieved by manually filtering out the state_dict returned from get_state_dict.   
stacklevelz;Optimizers are not passed in but optim_only is set to True.rp   z)Submodule FQN should only have 1 instancec              3   &   K   | ]	  }| d   yw)rm   NrQ   ).0r   s     r>   	<genexpr>z"_verify_options.<locals>.<genexpr>Q  s     %@CQi%@s   z?full_state_dict must be True when broadcast_from_rank0 is True.)offload_to_cpu
rank0_only)r   c              3      K   t        j                         5  t        j                  ddt               t	        j
                  | |||      5  d  d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY ww)NignorezFSDP.state_dict_type)messagecategoryrn   state_dict_typestate_dict_configoptim_state_dict_config)warningscatch_warningsfilterwarningsFutureWarningry   r   r   s       r>   $fsdp_state_dict_type_without_warningz=_verify_options.<locals>.fsdp_state_dict_type_without_warningo  sy      ((* 
''&<} ))!$3&7,C	  	
 
 	
 
s4   A;6A/A#A/	A;#A,	(A//A84A;r   r   )rV   rW   rX   r[   r\   rY   rZ   rQ   ),r   warnr   r   r+   r   ru   r   r   r   r   r`   rP   updatecopyrc   itemsr^   r_   named_modulesrz   rw   rF   rA   
ValueErrorry   r\   r   rB   r   r   FULL_STATE_DICTr   r   SHARDED_STATE_DICTra   contextmanager	functoolspartialrb   rT   r   rd   re   )rf   r   r   r   r   rV   rW   rg   paramfqnsr   param_fqns_rX   rn   r\   r   r   r   r   r[   s                        r>   _verify_optionsr     s-    I 	
 &I
 	
 +)+G 	 
 	  2%8 /ee\*%##E*?S,U34;;DA+<U+C!%( (,yy{e$ 	/Ce\2).!#&	//  399;< D 	DC)-ellF)C!#&	DD $'5_
!//1 	ALD&Z'UD)D4yA~$%PQQ%%%@4%@@	A ##G,C,CM
 	
 $$U+L "" 3&22w?R?R! '?&22#//O73O3O'# ,;;O 6&22! 'B&22'# ,>>O		"	"	 
#	$ !((0+/$;
 "-- 	
/	+3-!$ryy/<8#^&kAo	 	rR   model_state_dictoptim_state_dictinfoc                    |j                   D ]  }t        |      }|t        d       |j                  rk| si|j                  s]|j
                  sQ|j                  r|j                  s9|j                  r-|j                  s!t        dt        j                         d      |j                  r4|s2|j                  r|j                  s|j                  st        d|       | D ]  }t        |v st        | dt         d       y )Nz)Expected a fsdp_state with a fsdp module.z}The option indicates that model state_dict is required to save or load, but model state_dict is empty.rank = dist.get_rank()=rm   zgThe option indicates that model state_dict is required to save, or load but optim state_dict is empty. z
 contains z6. This can happen if the model is not the root module.)r\   r   rw   rY   rX   rC   rB   rA   rE   rF   r   distget_rankrZ   r{   )r   r   r   rn   
fsdp_statekeys         r>   _verify_state_dictr     s   
 ## NCFK
 !LMMN 	 ''))!!d&:&:KK))'mmo'q*
 	
  %%$*>*>..::J9KM 
   #%z+ /* * rR   r   apic                     t        | |      }|t        v r+t        j                  t        | j                  |      |       }|S )N)self)r}   r6   r   r   r   )r   r   calls      r>   _state_dict_fnr     s9    3D""  !<3GKrR   
state_dictc                     |j                   rF|j                  rt        j                  j	                         sdnd}t        | |j                  |      S |j                  rt        |       S | S )NrQ   )r   )rB   
ranks_only)rA   rB   r^   distributedis_initializedr   r   )r   r   r   s      r>   _maybe_full_or_cpu_state_dictr     sn      $$E,=,=,L,L,N  	
 "D$4$4
 	
 
		)*55rR   c           	      f   |j                   si S |j                         5   t        | d             }d d d        t        j	                               D ]  }t        | |      }t        |      dk7  rt        d| dt        |       d|       t        t        |            }||k7  sTdt        fd} |||      st        d| d	|       |j                  |      ||<    |j                  rYi }|D ]P  }|j                  D ]?  }|j                  |      s|j                  r	||   ||<   *|t        |      d  }	||   ||	<   A R |}|j                   rI| j#                         D ]6  \  }}
|
j$                  rt        | |      }|D ]  }|j                  |        8 t'        ||      S # 1 sw Y   wxY w)
Nr   rp   Expected 1 FQN for key '', got z: rj   c                    t        |      t        |       k\  ry|j                  d      }| j                  d      }d}t        |      D ]:  \  }}|||   k(  r'|dz  }|t        |      k(  s"|t        |      dz
  k(  c S |dv r: y y)NFrm   r   rp   )rn   rq   T)rz   rs   rt   )r   r   	fqn_split	key_splitfqn_idxkey_idxkey_names          r>   verifyz%_get_model_state_dict.<locals>.verify  s    s8s3x' IIcN	IIcN	)29)= %%GX9W#551"c)n4#*c)nq.@#@@!%<< $% rR   zAn unexpected key, z, exists. FQN is )rY   r[   r   rc   keysr   rz   rw   nextiterrN   r   poprX   
startswithrD   rC   r   requires_gradr   )rf   r   r   r   r   r   r   new_state_dictr   r   r   s              r>   _get_model_state_dictr     s    					 ;8^E<8:
; JOO%& 2$t9> *3%ws4ykD6J  4:#:D " #s#"%8=Nse#TUU(nnS1JsO=2@ /1 	>C11 >~~f-//*4S/N3'!#f+-0G.8oN7+>	> $
  002 	$JC""UC(D $s#$		$ )T::s; ;s   F&&F0c           	         |j                   r|s|j                  st        i i       S i }t        | |j                        D ]  \  }}t        | ||j                        }t        | ||j                  dd      }t        ||      D ]f  \  }}	|j                  rt        j                         dk(  r9||	k7  r4|j                  |d       }
|
|j                  rt        d| d      |
||	<   |||	<   h  d}|j                  s|j                  rnt               }|j                         D ]G  }t        j                   |      s|j#                         dkD  s-|j%                  |j&                         I t        j&                  d      |v r&|j)                  t        j&                  d             d}t+        |      dk(  r.|j%                  t        j,                  j/                                nt+        |      dkD  rt1        d	      |j                  r3t3        |||j                         |j                  |j4                  
       n(|j                  rt7        |||j                                |j9                  |       |j;                         5  t=        t         t?        | d      ||j                  |            cd d d        S # 1 sw Y   y xY w)NF)rh   ri   r   zMissing key: rm   metaTrp   zMultiple devices found)devicerE   rB   r   load_state_dict)r   rE   assign) rY   rF   r"   r   rI   r   zipr   r   r   rE   r   rA   r`   r   r^   	is_tensordimr   r   removerz   distributed_c10d_get_pg_default_devicer   r   rB   r   r   r[   r   r   )rf   r   r   local_state_dictr   valuer   fqns_with_prefixr   fqn_with_prefix
load_valuer   devicess                r>   _load_model_state_dictr  "  sq    Z8Q8Q R((08N8NO 6
UT%;%;<$""!!&
 %(.>$? 
	6 C--A1E('^^C6
%{{*]3%q+ABB2<J/05_-
	66, F  D$8$8%%,,. 	*Eu%%))+/ELL)	*
 <<7*NN5<</0Fw<1KK--DDFG\A566$$! {{}{{ ,, !!":/?V*+				 
4N5"34%dkk&

 
 
s   -J==Koptimc                 h   | j                   ry| j                  D ]  }|t           D ]  }|j                    y ! | j                  D ]7  }|t           D ])  }|j                  st        j                  |      |_        + 9 g }| j                  D ]R  }d|v s|j                  |d          t        |d   t
        j                        rt        j                  d      nd|d<   T | j                  d       | j                  D ]  }d|v s|j                  d      |d<    | j                  d       y)zH
    Initialize optim states by calling the step() with zero grads.
    Nlrg        )closurer   T)set_to_none)r5   r3   _PARAMSgradr   r^   
zeros_likerx   ru   r_   tensorstepr   	zero_grad)r  param_groupr   lrss       r>   _init_optim_stater  g  sB    {{ ))  ) 	Ezz%	
 )) 5 ) 	5E"""--e4
	55 C)) ;JJ{4() k$/> S!  
JJtJ )) +; #
K+ 
OOO%rR   c           
      <  	
 dt         t        t        f   dt        dt         t        t        f   f	
fd	d 
i }t	        t
        | t                 j                         D ]C  \  }}t         d| }|j                   	t	        t         t        t        f   |      |             E t	        t        | t                 D ]\  }|j                  t              }t	        t        t           |      D ]+  }|j                         D ]  \  }}||t         d| d| <    - ^ |S )a3  
    This API flattens the optimizer state_dict to support optimizer resharding for
    MPMD, e.g., pipeline parallelism.

    Without the API, the original optimizer state_dict looks like:
    {
        "state": {
            "layer1.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
            "layer2.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
        },
        "param_groups": [
            {
                "lr": 0.0,
                "betas": (0.9, 0.95), ...,
                "params": ["layer1.weight", "layer2.weight"]
            }
        ]
    }

    With this API, the optimizer state_dict looks like:
    {
        "state.layer1.weight.step": 10,
        "state.layer2.weight.step": 10,
        "state.layer1.weight.exp_avg": SomeTensor,
        "state.layer2.weight.exp_avg": SomeTensor,
        "state.layer1.weight.exp_avg_sq": SomeTensor,
        "state.layer2.weight.exp_avg_sq": SomeTensor,
        "param_groups.layer1.weight.lr": 0.1,
        "param_groups.layer2.weight.lr": 0.1,
        "param_groups.layer1.weight.betas": (0.9, 0.95),
        "param_groups.layer2.weight.betas": (0.9, 0.95),
    }

    The "state" section supports arbitrary levels of nesting for optimizers like Shampoo.
    nested_dictr   rj   c                     i }| j                         D ]Q  \  }}t        |      }|r| d| n|}t        |t              r|j	                   ||             E |       |||<   S |S )a  
        Recursively flatten a nested dictionary with dot-separated keys.

        Args:
            nested_dict: The dictionary to flatten
            prefix: The prefix to prepend to all keys

        Returns:
            Flattened dictionary with dot-separated keys
        rm   )r   rP   ru   r]   r   )	r  r   	flattenedr   r  str_keyfull_key_flatten_state_nested_dict_raise_if_type_not_supporteds	          r>   r  z=_flatten_optim_state_dict.<locals>._flatten_state_nested_dict  s     +-	%++- 	,JC#hG06&7),GH%&  !;E8!LM -U3&+	(#	, rR   c                     t        | t        j                  t        t        t
        f      st        dt        |        d      y )Nz[Flattening optimizer state_dict only supports tensor, int, float, dict states now. Type is rm   )ru   r^   r_   intfloatr]   NotImplementedErrortype)vs    r>   r  z?_flatten_optim_state_dict.<locals>._raise_if_type_not_supported  s@    !ellC=>%7)1&  ?rR   rm   )r]   rP   r
   r'   r   r(   _STATEr   r   r)   _PGr   r  rc   )r   retr   r5   state_prefixr  r   kr#  r  r  s            @@r>   _flatten_optim_state_dictr)    s    R#s(^-0	c9n	< !#C =*V*<=CCE 

U 3%(

&tDcNE'BLQ	

 -z#? ,w'S	4( 	,C#))+ ,1*+se1SE1#&',	,, JrR   c                    dt         dt        t         t        f   dt        t         t        f   fd}i }g }t        |t        |i}| j
                  D ]  }|j                  t        g i       |t           D ]  }|j                  |   D ]  }	|	|j                  v r)d}
|D ]!  }|t        k(  rt         d|	 d| }||v rd}
 n nd}
|
s?|d   t           }t        |t              st        d	t        |             |j                  |	       |j                  si ||	<   | j                  |   D ]M  }t         d|	 d| }||vr  |||      }|t!        t"        ||	         |<   5||   t!        t"        ||	         |<   O   t!        t        t            |d   t                 d
   }|D ]V  }|t        k(  r|t         d| d|    }||d   vr	||d   |<   .|d   |   |k7  s:t%        d| d| d| d|d   |    d	        |S )z
    This API unflattens the state_dict generated by _flatten_optim_state_dict().
    Supports arbitrary levels of nesting in the state section through recursive reconstruction.

    See the docstring of _flatten_optim_state_dict() for more detail.
    flattened_keyflattened_dictrj   c                    |  d}i }|j                         D ]m  \  }}|j                  |      s|t        |      d }|j                  d      }|}|dd D ]%  }	|	|vri ||	<   t	        ||	   t
              sJ ||	   }' |||d   <   o |S )z
        Reconstructs a potentially nested value from flattened keys.
        For non-nested values, returns the value directly.
        For nested values, reconstructs the nested structure with string keys.
        rm   Nr   )r   r   rz   rs   ru   r]   )
r+  r,  r   r  r   r  remaining_keypartscurrentparts
             r>   _reconstruct_nested_dictz=_unflatten_optim_state_dict.<locals>._reconstruct_nested_dict  s     "?!$&( )..0 	'JC >>&)  F.M "'',E!G cr
 (w&$&GDM!'$-666!$-( "'GE"I9	'> rR   Frm   Tr   Expected list, got r   zaAll the parameters in the same parameter group should have the same saved param_group value. But z is z while other(s) is )rP   r]   r'   r$  r%  r3   rx   r  rV   rW   ru   rc   rw   r"  r   r5   r   r(   r   )r  r   r   r2  r5   pg_state
return_osdr  r   r   	in_paramsr(  flatten_keyr4   
state_nameflattened_state_keyreconstructed_valuefirst_param_fqnr  s                      r>   _unflatten_optim_state_dictr<    s   //,0i,@/	c9n	/b E"$H&,eS(%CJ)) >"& ) .	E--e4 - $444 %I( <$),Qse1QC&8&*4(,I !%I !"g.!&$/(+>tF|n)MNNc" **  c
"'++e"4 J-3HAcU!J<*H'**<.F//+ 0 ]E#J7
C
 GQ/G]E#J7
C=-.	` tCy(2,w*?@C 	AG|#a'8!=>E$"'Q"aE)"==L<MQqc R 3HRLO3DAG 	g>@ rR   
optimizersc                    |j                   si S t        i t        g i}|D ]  }t        |        t	        |d             }|j
                  r|j                         5  t        j                  | ||      }d d d        |s_t        |t           j                               D ]9  }d|v s|t           j                  |      |t           |j                  dd      <   ; |t           D ]1  }|t           D cg c]  }|j                  dd       }}||t        <   3 nGt        t        j                  d |j                   D                    }t#        t%        |t'        t)        |                        }	i }
| j+                         D ]a  \  }}t-        | |      }t)        |      dk7  rt/        d| dt)        |             t1        t3        |            }||	vrS|	|   }||
|<   ||
|<   c t        |t           j                               D ])  }|
|   }|t           j                  |      |t           |<   + |t           D ]#  }|t           D cg c]  }|
|   	 c}|t        <   % |sEt5        t6        |t                 j9                  |t                  t5        t:        |t                 j=                  |t                   |j>                  rt5        t@        tC        |            }tE        ||      S # 1 sw Y   xY wc c}w c c}w )	Nr   rq   z
_orig_mod.rl   c              3   .   K   | ]  }|t              y wr8   )r  )r   gs     r>   r   z(_get_optim_state_dict.<locals>.<genexpr>  s     -UQaj-Us   rp   r   r   )#rZ   r$  r%  r  r   r\   r[   ry   r   rc   r   r   rr   r  r	   from_iterabler3   r]   r   rangerz   r   r   rw   r   r   r   r(   r   r)   extendrG   r*   r)  r   )rf   r=  r   r   r  osdr(  r@  r4   param_pid_mappingfqn_pid_mappingr   r   r   r   pidgroups                    r>   _get_optim_state_dictrI  w  s    	,2BR+@ 2H% 1nUL13""$ ?++E5#>? #f+**,- R!#?B6{q?QCK		, ;<R X $?@zJ!!))L"5JJ#'
$ %---U%BTBT-UUVF $Ss6{1C%D E O#446 +
U ,t9>(23%ws4ykJ  4:& 11'.'*$'*$+ CK,,./ 8%c* $'v;??3#7FC 	8 S RBG.!Q3/#"6!QgR ],V45<<S[I 0 56==c#hGe2Hh (( 9:J K
 ))94@@k? ? K6 "Rs   K21K?L2K<	c           
      V   i }g }t         |t        |i}i }t        d t        t        |t                  D              r|S |j
                  D ]t  }|j                  t        g i       |t           D ]o  }	|j                  |	   D ]Y  }
|
|j                  v rCd}t        t        |t                 D ]&  }|
t        t        t           |t                 v s$d} n nd}|sZ|d   t           }t        |t              st        dt        |             |j                  |
       |	j                   rR|
t        t        |t                  v rt        t        |t                  |
   ||
<   n|j"                  rt%        d|
 d      t        t        |t                 D ]D  }|
t        t        t           |t                 v s$t'        |t                 dz
  |t)        |      <   F \ r t'        |t                 d	k(  sg }t        t        |t                 D ]>  }t'        t        t        t           |t                       d	k(  s.|j                  |       @ t'        |      dk7  rt+        d
      t'        |t                 t'        |j
                        k7  rt+        d      t'        |t                 dz
  |t)              <   w t        t        |t                 D ]M  }|j-                  t)        |      d      }|dk(  r$|j/                         D ]  \  }}|t        k(  r|||   |<    O |S )a  
    Extract the corresponding optim state_dict from ``optim_state_dict`` for
    ``optim`` and return the result optim state_dict.

    Args:
        model (nn.Module): the root model.
        optim (torch.optim.Optimizer): the optimizer.
        optim_state_dict (Dict[str, ValueType]): the superset optim state_dict that
            contains the optim state_dict of ``optim``.
        info (_StateDictInfo): state dict information.

    Returns:
        The optim state_dict of ``optim``.
    c              3   <   K   | ]  }t        |t                y wr8   )ru   r  )r   r(  s     r>   r   z*_split_optim_state_dict.<locals>.<genexpr>  s     
U!:a
Us   FTr   r3  z'Missing optimizer state for parameter 'z' in checkpoint. The parameter requires gradients but has no saved optimizer state. To load anyway, use StateDictOptions(strict=False).rp   r   zThere are param groups that have zero parameters. In such a case, DSD only support exactly one param group with zero parameters.But the loaded state_dict has zero or more than one param groups that have zero parameters.z`When there is a parameter group that has zero parameters, multiple optimizers are not supported.)r$  r%  allr   r(   r3   rx   r  rV   rW   r)   rc   rP   ru   rw   r"  r   rE   r   rz   idr   r   r   )rf   r  r   r   r5   r4  r5  
pg_mappingr  r   r   r6  loaded_param_groupr4   r&  pg_idxr   r  s                     r>   _split_optim_state_dictrQ    sI   * E"$H&,eS(%CJ!#J

UtM;KF;S'T
UU)) 7J"& )  	VE--e4 V$444 %I.2)+;C+@/ "* $tCy2DW2M"NN(,I!" !%I !"g.!&$/(+>tF|n)MNNc"&&d=2B62JKK%)-9I&9Q%RSV%Wc
*EcU KR R 
 +/%'7'<+ V& d49.@.IJJ=@C=QTU=U
2&8#9:	V7V 	VD {7#$)C&*+<>Ns>S&T 3"tDI'9''BCDIJJ123 3x1} 1  #C()S1C1C-DD =  25Z_1E1IJr,-.o7Jr -/?/DE 	*;4R<%++- 	*JCg~$)HVS!		*	* rR   c           
      0   |j                   sy |D ]  }t        |       |r@t        |v rt        | |||      }n+t	        |t        t        t        t        f   |      |      }ni }|j                  rl| j                         D ]&  \  }}t        | |      }t        | |d      }	||	k(  r't        |      dk7  rt        d| dt        |             |j                         }
|	j                         }|t           D ]N  }t        t        t        t         f   |      }|t"           D cg c]  }|j%                  |
|       }}||t"        <   P t        t&        |t                 }t)        |j+                               D ]+  }|
|v s|j                  |      ||j%                  |
|      <   - ) |j-                         5  t/        j0                  | ||      }d d d        n|j2                  rd|_        t5        | |f|      }d|_        d fd}t7        t8        j:                  ||      }t        d      t=        |      \  }}t=        |      \  }}|j>                  rtA        ||	       ntC        ||	       |D ]*  }||vs||vrt        d
| d      ||   ||<   ||   ||<   , tE        ||      }|t           D ]/  }t"        |vsg t        t        t        t        f   |      t"        <   1  tG        |d      |        y c c}w # 1 sw Y   %xY w)NF)ri   rp   zExpected 1 FQN for 'r   Tc                     | j                         dkD  r*| j                  | S | j                  k7  rt        d      | S )Nr   zDevice mismatch)r   r   r   )tr   s    r>   _devicez'_load_optim_state_dict.<locals>._device^  sD    557Q;~!"   188+():;;rR   zExpected device to be setr   zExpected key 'z' in osd_mappingr   )r   )$rZ   r  r$  rQ  r<  r   r]   rP   r'   r\   r   r   rz   rw   r   r%  r
   r  rr   r(   rc   r   r[   ry   optim_state_dict_to_loadrA   rI  r$   r^   r_   r   rF   r   r   r   r   )rf   r=  r   r   r  r   original_fqn_r   fqns_with_compilerr   fqn_with_compilerr@  valr   r4   	osd_stater(  r  rU  flatten_osdosd_mappingflatten_local_osdlocal_osd_mapping	optim_keypgr   s                             @r>   _load_optim_state_dictrc  !  st     [N% ##:5*d$  $?4S)^ 4jA4$   " $)#9#9#; Xa 5%.<e&" --t9>(.|nGCI;O  hhj$6$:$:$<!)#. *AtCH~q1CGJ7|@CC):;F  $*CL* !0@0HI	inn./ XAaxGP}}UVGW	!))C1B"CDX+X2 ""$ #'#@#@5"2$   !!#(D 4UUHdK#'D F ellG5EFA~$%@AA':;K'L$K3FGW3X00((%k3DVT&{4EfU
 ) J	$55 3,,YK7GH  4?y3I%i03>y3I%i0J  5!#4  's+ A"$>@Dc9n-r27;A 	1u/0<LMw[N@ s   $L
;LL	c                    t               5  t        | dd||      }t        | |      }t        |i |       |cddd       S # 1 sw Y   yxY w)aH  
    Return the model state_dict of ``model``.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``model``.

    :rtype: typing.Dict[str, ValueType]
    rQ   Fr   r   r   N)r?   r   r   r   )rf   r   r   r   r   s        r>   r,   r,     sV    0 
 
 !
 1=+R6
  
  
 s   +A  A	c                    t               5  t        |t        j                  j                        r|fn
t        |      }t        | |d||      }t        | ||      }t        i ||       |cddd       S # 1 sw Y   yxY w)a  
    Return the combined state_dict for optimizers.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``optimizers``.

    :rtype: OptimizerStateType
    Tre  N)	r?   ru   r^   r  	Optimizertupler   rI  r   )rf   r=  r   r   r   r   s         r>   r-   r-     s    6 
   *ekk&;&;< Mz" 	
 !
 1
DI2/6     s   AA33A<c                   t               5  t        |t        j                  j                        r|fn
t        |      }t        | |d||      }t        | |      }t        | ||      }t        |||       ||fcddd       S # 1 sw Y   yxY w)a  
    Return the model state_dict and optimizers state_dict.

    ``get_state_dict`` can process any module that is parallelized by PyTorch
    FSDP/fully_shard, DDP/replicate, tensor_parallel/parallelize_module, and any
    combination of these parallelisms. The main functions of ``get_state_dict``
    are: 1.) returning a model and optimizer state_dict that can be resharded
    with a different number of trainers and/or different parallelisms.
    2.) hiding the parallelism-specific state_dict APIs. Users don't have to call
    these APIs.
    3.) sanity checking the result state_dict.

    The keys of the result state dictionary are the canonical FQNs (Fully
    Qualified Names).  A canonical FQN refers to the FQN based on a parameter's
    position in an nn.Module hierarchy. More specifically, a canonical FQN to a
    parameter is the FQN returned by ``module.named_parameters()`` or
    ``module.named_buffers()`` when the module is not distributed by any
    parallelisms. Since the optimizer internally uses parameter IDs to represent
    a parameter, there will be a conversion from the parameter IDs to the
    canonical FQNs when calling this API.

    ``get_state_dict`` can also process a module that is not parallelized. In
    such a case, ``get_state_dict`` only performs one function -- converting the
    optimizer parameter IDs to the canonical FQNs.

    Example:
        >>> # xdoctest: +SKIP
        >>> import torch
        >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        >>> from torch.nn.parallel import DistributedDataParallel as DDP
        >>> from torch.distributed.checkpoint.state_dict import get_state_dict

        >>> fsdp_model = FSDP(copy.deepcopy(model))
        >>> fsdp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)
        >>> ddp_model = DDP(copy.deepcopy(model))
        >>> ddp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)


        >>> ddp_state_dict, ddp_optim_state_dict = get_state_dict(ddp_model, ddp_optim)
        >>> fsdp_state_dict, fsdp_optim_state_dict = get_state_dict(
        ...     fsdp_model, fsdp_optim
        ... )

        >>> # if we simply call ddp_model.state_dict() and fsdp_model.state_dict(),
        >>> # the asserts will fail.
        >>> assert ddp_state_dict == fsdp_state_dict
        >>> assert ddp_optim_state == fsdp_optim_state_dict


    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        ``Tuple`` that contain model state_dict and optimizer state_dict.

    :rtype: typing.Tuple[typing.Dict[str, ValueType], OptimizerStateType]
    Fre  N)
r?   ru   r^   r  rg  rh  r   r   rI  r   )rf   r=  r   r   r   r   r   s          r>   r.   r.     s    P 
 2 *ekk&;&;< Mz" 	
 !
 1=0
DI+-=tD!11!2 2 2s   A,BB
c           
         |si S t        t        t        |j                                     t        j
                        rt        j                  dt        d       t        t        t        j
                  t        t        t        f   f   |      }i }|j                         D ]  \  }}| j                         D ]}  \  }}||k7  rt        | |      }t!        |      dk7  rt#        d      t        t        |             d}	|j%                  |j                         D 
ci c]  \  }
}|	|
z   | c}}
         |S t        t        t        t        f   |      S c c}}
w )NzPassing model_state_dict as a ``Dict[nn.Module, Dict[str, Any]]``is deprecated and will be removed in 2.5. If you need this feature, please preprocessing the model_state_dict to achieve the same functionality.r   r   rp   z/FQNs for a submodule should only have 1 elementrm   )ru   r   r   r   rd   re   r   r   r   r   r]   rP   r'   r   r   r   rz   rw   r   )rf   r   cast_state_dictr   r   sub_state_dictrg   mr   r   subfqnr  s               r>   _unflatten_model_state_dictro  6  sF    	$tJOO-./;" 	
 tBIItCN/C$CDjQ/1)8)>)>)@ 	%I~ ..0 a	> -t9>(I  !d,-Q/%%AOAUAUAWXVf_e+X	 Di(*55	 Ys   E)r   c                    t        | |      }t               5  t        | dd|      }t        |i |       t	        | ||      cddd       S # 1 sw Y   yxY w)a=  Load the model state_dict.

    The counterpart of ``get_model_state_dict`` to set the state_dict to the
    model. See ``set_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        model_state_dict: (Dict[str, ValueType]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys
            * **unexpected_keys** is a list of str containing the unexpected keys

    :type model_state_dict: typing.Dict[str, ValueType]
    rQ   Fr   r   N)ro  r?   r   r   r  )rf   r   r   r   s       r>   r/   r/   [  s`    : .I. 
 EubUGL+R6%e-=tD	E E Es   )A

Ac                    t               5  t        |t        j                  j                        r|fn
t        |      }t        | |d|      }t        i ||       t        | |||       ddd       y# 1 sw Y   yxY w)a  Load the optimizers state_dict.

    The counterpart of ``get_optimizer_state_dict`` to set the state_dict to the
    optimizers. See ``set_state_dict`` for the detail usage.

    WARN: ``set_optimizer_state_dict`` can only be called before ``backward()`` or after
        ``step()`` is called on the optimizers. Otherwise, the optimizer states won't be
        initialized correctly.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        None

    :type optim_state_dict: typing.OptimizerStateType
    Trq  N)	r?   ru   r^   r  rg  rh  r   r   rc  )rf   r=  r   r   r   s        r>   r0   r0     sz    > 
 	J *ekk&;&;< Mz" 	
 ujT7S2/6uj2BDI	J 	J 	Js   AA11A:c                .   t        | |      }t               5  t        |t        j                  j
                        r|fn
t        |      }t        | || |      }t        |||       t        | |||       t        | ||      cddd       S # 1 sw Y   yxY w)a  Load the model state_dict and optimizers state_dict.

    The counterpart of ``get_state_dict`` to set the state_dict to the model and
    optimizers.  The given ``model_state_dict`` and ``optim_state_dict`` do not
    have to be returned by ``get_state_dict`` but must meet the following
    requirements: 1) all FQNs are canonical FQNs as defined in ``get_state_dict``,
    2) if a tensor is sharded, it must be either a ShardedTensor or DTensor,
    3) optimizer state_dict cannot contain the parameter IDs; the keys should be
    the canonical FQNs.

    WARN: ``set_state_dict`` can only be called before ``backward()`` or after ``step()``
        is called on the optimizers. Otherwise, the optimizer states won't be initialized
        correctly.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        model_state_dict: (Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys of the model state_dict.
            * **unexpected_keys** is a list of str containing the unexpected keys of the model state_dict.

    :type model_state_dict: typing.Dict[str, ValueType]
    :type optim_state_dict: typing.OptimizerStateType
    rq  N)ro  r?   ru   r^   r  rg  rh  r   r   rc  r  )rf   r=  r   r   r   r   s         r>   r1   r1     s    \ .I. 
 E *ekk&;&;< Mz" 	
 :.>*>
 	+-=tDuj2BDI%e-=tDE E Es   A*BBc                $   t        j                  t        | |      fd}|| _        t        j                  t        | |      dt
        t        t        f   ffd}|| _        t        j                  |       t        j                  |       y)a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )rf   r   c                               S r8   rQ   _state_dict_calls   r>   state_dict_callz0_patch_model_state_dict.<locals>.state_dict_call      !!rR   r   c                      |        y )N)r   rQ   r   _load_state_dict_calls    r>   load_state_dict_callz5_patch_model_state_dict.<locals>.load_state_dict_call      z:rR   N)r   r   r,   r   r/   r]   rP   r
   r   r6   r   )rf   r   rx  r}  r|  rw  s       @@r>   _patch_model_state_dictr    s    6 !((" 'E%--;c3h ; 1EO,01rR   c                   t        j                  t        | ||      fd}t        j                  t        | ||      dt        t
        t        f   ffd}t        j                  |       t        j                  |       t        |t        j                  j                        r|fn
t        |      }|D ]  }||_        ||_         y)a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Note that if there are multiple optimizers, all of the optimizers will be patched.
    So users only need to call one of the state_dict() to get the full result.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )rf   r=  r   c                               S r8   rQ   rv  s   r>   rx  z4_patch_optimizer_state_dict.<locals>.state_dict_callL  ry  rR   r   c                      |        y )N)r   rQ   r{  s    r>   r}  z9_patch_optimizer_state_dict.<locals>.load_state_dict_callV  r~  rR   N)r   r   r-   r0   r]   rP   r
   r6   r   ru   r^   r  rg  rh  r   r   )rf   r=  r   rx  r}  r  r|  rw  s         @@r>   _patch_optimizer_state_dictr  &  s    > !(( 	" &-- 	;c3h ; O,01 j%++"7"78 
: 
  5* 45rR   )rH   TT)rH   )qra   r   r9   r   collections.abcr   r   r   dataclassesr   r   r   	itertoolsr	   typingr
   r   r   r   r   r^   torch.distributedr   r   torch.nnrd   'torch.distributed._shard.sharded_tensorr   #torch.distributed._state_dict_utilsr   r   r   r   r   r   ;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr   torch.distributed.fsdpr   r   r   ry   r   r   r   r   r   $torch.distributed.fsdp._common_utilsr   r    torch.distributed.tensorr!   torch.nn.modules.moduler"   torch.nn.parallelr#   rv   torch.utils._pytreer$   __all__r{   r%  r  r$  r`   rP   r%   r_   r  r   r&   rc   rh  r]   r'   r(   r)   r*   r6   rO   r   r?   r+   rT   re   rN   r   r   r   r  rg  r   r   r   r   no_gradr   r  r  r)  r<  rI  rQ  rc  r,   r-   r.   ro  r/   r0   r1   r  r  rQ   rR   r>   <module>r     s     	  9 9 0 0  < <     A 	 	 	 - 5 < -" 
		Sg}ellCKL4&m(<d3CS>TT	 S)^$' #u]4E%EFFG  &)U S] *   ,. ,. ,.^ @% @ @& . !%HE99HE
HE HE 	HE
 HE HEV	 	%"Z ,0*.A99A%++'',-A A
 RYY(A &'A AH+3	>*+(+ + 
	+\bii)>)>>? c h S#X&4	#s(^$ ?;99?;*?;	#y.?; ?;D A
99A
S)^$A
 A
 	A
 A
H'&U[[22 '&t '&T_*< _c9nAU _DA;;  AS)^$A A 	AH BA99BAekk++S01BA BA 	BA BAJa99a;;  a )a 	a
 aH dN99dNekk++S01dN #dN 	dN
 
dN dNT ,0*.	" 99"  RYY("  &'	" 
 
#y." R ,0*.* 99* ekk++Xekk6K6K-LLM*  RYY(	* 
 &'*  * b ,0*.X299X2ekk++Xekk6K6K-LLMX2 RYY(	X2
 &'X2 4Y!334X2v"699"6d299d3	>&::;T#y.=QQR"6 
#y."6R +/	$E99$E3	>*$E &'	$E
 $EX +/(J99(Jekk++Xekk6K6K-LLM(J )(J
 &'(J 
(Jb +/=E99=Eekk++Xekk6K6K-LLM=E 3	>*	=E
 )=E &'=E =ED  +/129912 &'12 
	12 12l 
 +/	;599;5 ekk++S01;5 &'	;5
 
;5 ;5rR   