
    iZh              (       N   d Z ddlmZ ddlZddlmZ ddlmZmZmZm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZ ddgZ G d	 de      Zd
de de
 de de de dz   e_         dee   dee   dee   dee   dee   dee   dededededededededededed df$d!Zdee   dee   dee   dee   dee   dee   dededededededededededed df$d"Z e	e#      	 	 	 	 	 	 d&dee   dee   dee   dee   dee   dee   ded$edz  dedededededededededed df&d%       Zy)'z'Implementation for the NAdam algorithm.    )castN)Tensor   )_capturable_doc_default_to_fused_or_foreach_differentiable_doc_disable_dynamo_if_unsupported_foreach_doc!_get_capturable_supported_devices_get_scalar_dtype
_get_value_maximize_doc_params_doc_stack_if_compiling
_to_scalar_use_grad_for_differentiable_view_as_real	OptimizerParamsTNAdamnadamc                        e Zd Z	 	 	 	 	 	 dddddddedeez  deeef   deded	ed
ededz  dedededdf fdZ fdZ	d Z
edd       Z xZS )r   FN)foreachmaximize
capturabledifferentiableparamslrbetasepsweight_decaymomentum_decaydecoupled_weight_decayr   r   r   r   returnc                   t        |t              r|j                         dk7  rt        d      d|k  st        d|       d|k  st        d|       d|d   cxk  rdk  sn t        d|d          d|d   cxk  rdk  sn t        d	|d          d|k  st        d
|       d|k  st        d|       |||||||	||
|d
}t        |   ||       y )Nr   zTensor lr must be 1-element        zInvalid learning rate: zInvalid epsilon value: r         ?z#Invalid beta parameter at index 0: z#Invalid beta parameter at index 1: zInvalid weight_decay value: zInvalid momentum_decay value: )
r   r   r    r!   r"   r#   r   r   r   r   )
isinstancer   numel
ValueErrorsuper__init__)selfr   r   r   r    r!   r"   r#   r   r   r   r   defaults	__class__s                K/var/www/html/engine/venv/lib/python3.12/site-packages/torch/optim/nadam.pyr,   zNAdam.__init__!   s    b&!bhhjAo:;;by6rd;<<cz6se<==eAh$$B58*MNNeAh$$B58*MNNl";L>JKKn$=n=MNOO(,&< $,
 	*    c                 8   t         |   |       | j                  D ]z  }|j                  dd       |j                  dd        |j                  dd       |j                  dd       |j                  dd       |d   D ]  }| j                  j                  |g       }t        |      dk7  s/t        j                  |d	         s_t        |d	         }|d   r*t        j                  |t               |j                  
      nt        j                  |t                     |d	<   t        j                  |d         r|d   }|d   r*t        j                  |t               |j                  
      nt        j                  |t                     |d<    } y )Nr   Fr   r   r   r#   r   r   stepdtypedevicer5   
mu_product)r+   __setstate__param_groups
setdefaultstategetlentorch	is_tensorfloattensorr   r6   )r-   r<   grouppp_statestep_valmu_prod_valr/   s          r0   r9   zNAdam.__setstate__L   sn   U#&& 	EZ/Y-\51-u55u=8_ **..B/w<1$ ??76?;#(#9
  %\2 "LL (0A0CAHH "'h>O>Q!R   !??7<+@A&-l&;
  %\2 "LL +3D3Fqxx "'kARAT!U  -	r1   c                    d}|d   D ]  }	|	j                   |t        j                  |	      z  }|j                  |	       |	j                   j                  rt        d      |j                  |	j                          | j                  |	   }
t        |
      dk(  r|d   r*t        j                  dt               |	j                        nt        j                  dt               	      |
d
<   |d   r*t        j                  dt               |	j                        nt        j                  dt               	      |
d<   t        j                  |	t        j                        |
d<   t        j                  |	t        j                        |
d<   |j                  |
d          |j                  |
d          |j                  |
d          |j                  |
d
           |S )NFr   z'NAdam does not support sparse gradientsr   r    r4   r&   r7   r3   r'   r8   )memory_formatexp_avg
exp_avg_sq)gradr?   
is_complexappend	is_sparseRuntimeErrorr<   r>   zerosr   r6   rB   ones
zeros_likepreserve_format)r-   rC   params_with_gradgradsexp_avgsexp_avg_sqsmu_productsstate_stepshas_complexrD   r<   s              r0   _init_groupzNAdam._init_groupj   s    x $	2Avv!u//22 ''*66##&'PQQQVV$

1u:? !. B.?.A!((S"\\#5F5HI &M !. 

2->-@R"\\#5F5HI ,' (-'7'7)>)>(E)$ +0*:*:)>)>+E,' i 01""5#67""5#67""5=1I$	2J r1   c                    | j                          d}|$t        j                         5   |       }ddd       | j                  D ]  }g }g }g }g }g }g }	t	        t
        t        t        f   |d         \  }
}| j                  |||||||	      }t        ||||||	|
||d   |d   |d   |d   |d   |d   |d	   |d
   |d   |        |S # 1 sw Y   xY w)zPerform a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr   r   r!   r"   r    r   r#   r   r   r   )beta1beta2r   r!   r"   r    r   r#   r   r   r   r\   )	 _cuda_graph_capture_health_checkr?   enable_gradr:   r   tuplerA   r]   r   )r-   closurelossrC   rV   rW   rX   rY   rZ   r[   r_   r`   r\   s                r0   r3   z
NAdam.step   s2    	--/""$ !y! && &	E-/"$E%'H(*K(*K(*KeUl 3U7^DLE5** K  ;">2$%56%Lz*',-E'Fi( .$%56'%'&	P W! !s   C

C)gMb`?)g?g+?g:0yE>r   gMbp?FN)__name__
__module____qualname__r   rA   r   rc   boolr,   r9   r]   r   r3   __classcell__)r/   s   @r0   r   r       s     "%1 $',)+  $ $)+)+ FN)+ UE\"	)+
 )+ )+ )+ !%)+ )+ )+ )+ )+ 
)+V<0d "6 "6r1   a  Implements NAdam algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma_t \text{ (lr)}, \: \beta_1,\beta_2 \text{ (betas)},
                \: \theta_0 \text{ (params)}, \: f(\theta) \text{ (objective)}                   \\
            &\hspace{13mm} \: \lambda \text{ (weight decay)}, \:\psi \text{ (momentum decay)}    \\
            &\hspace{13mm} \: \textit{decoupled\_weight\_decay}, \:\textit{maximize}             \\
            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ ( first moment)},
                v_0 \leftarrow 0 \text{ ( second moment)}                                 \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}\textbf{if} \: \textit{maximize}:                                       \\
            &\hspace{10mm}g_t           \leftarrow   -\nabla_{\theta} f_t (\theta_{t-1})         \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})          \\
            &\hspace{5mm} \theta_t \leftarrow \theta_{t-1}                                       \\
            &\hspace{5mm} \textbf{if} \: \lambda \neq 0                                          \\
            &\hspace{10mm}\textbf{if} \: \textit{decoupled\_weight\_decay}                       \\
            &\hspace{15mm} \theta_t \leftarrow \theta_{t-1} - \gamma \lambda \theta_{t-1}                    \\
            &\hspace{10mm}\textbf{else}                                                          \\
            &\hspace{15mm} g_t \leftarrow g_t + \lambda \theta_{t-1}                             \\
            &\hspace{5mm} \mu_t \leftarrow \beta_1 \big(1 - \frac{1}{2}  0.96^{t \psi} \big)     \\
            &\hspace{5mm} \mu_{t+1} \leftarrow \beta_1 \big(1 - \frac{1}{2} 0.96^{(t+1)\psi}\big)\\
            &\hspace{5mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
            &\hspace{5mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
            &\hspace{5mm}\widehat{m_t} \leftarrow \mu_{t+1} m_t/(1-\prod_{i=1}^{t+1}\mu_i)\\[-1.ex]
            & \hspace{11mm} + (1-\mu_t) g_t /(1-\prod_{i=1}^{t} \mu_{i})                         \\
            &\hspace{5mm}\widehat{v_t} \leftarrow   v_t/\big(1-\beta_2^t \big)                   \\
            &\hspace{5mm}\theta_t \leftarrow \theta_t - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}} + \epsilon \big)                                       \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Incorporating Nesterov Momentum into Adam`_.
    z
    Args:
        a  
        lr (float, Tensor, optional): learning rate (default: 2e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        momentum_decay (float, optional): momentum momentum_decay (default: 4e-3)
        decoupled_weight_decay (bool, optional): whether to decouple the weight
            decay as in AdamW to obtain NAdamW. If True, the algorithm does not
            accumulate weight decay in the momentum nor variance. (default: False)
        z	
        z

    .. _Incorporating Nesterov Momentum into Adam:
        https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ
    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101

    r   rW   rX   rY   rZ   r[   r_   r`   r   r!   r"   r    r#   r   r   r   r\   r$   c                   t         j                  j                         st        |      }t	        |       D ]  \  }}|s||   n||    }||   }||   }||   }||   }t        j
                  |      rTt        j                  |      }t        j                  |      }t        j                  |      }t        j                  |      }t         j                  j                         s}|r{t               }|j                  j                  |j                  j                  cxk(  r|j                  j                  k(  rn n|j                  j                  |v st        d| d      |dz  }|r|}nt        |      }d||z  z
  }|	dk7  r-|r|j                  d||	z  z
         n|j                  ||	      }|ddd||
z  z  z  z
  z  }|ddd|dz   |
z  z  z  z
  z  }||z  }|j!                  |d|z
         |j                  |      j#                  ||d|z
  	       |j%                  |      j'                         }|s|r]|j                  |      }||z  }|| d|z
  z  d|z
  z  z  }|| |z  d|z
  z  z  }|j)                  ||       |j)                  ||       `t        |      |z  }|j+                  |       |j)                  ||| d|z
  z  dt        |      z
  z  	       |j)                  ||t-        t.        | |z  d|z
  z        	        y )
NzVIf capturable=True, params, mu_products and state_steps must be on supported devices: .r   r   alphar'         ?Q?)value)r?   jitis_scriptingr   	enumeraterN   view_as_realcompileris_compilingr   r6   typeAssertionErrorr   mul_addlerp_addcmul_divsqrtaddcdiv_add_r   rA   )r   rW   rX   rY   rZ   r[   r_   r`   r   r!   r"   r    r#   r   r   r   r\   iparamrM   rK   rL   r8   step_tcapturable_supported_devicesr3   bias_correction2mumu_nextdenommu_product_nexts                                  r0   _single_tensor_nadamr     s&   ( 99!!#^f% J5'uQxeAhY1+ ^
 ^
QE"&&u-E%%d+D((1G++J7J ~~**,+L+N(!!Z%6%6%;%;Qv}}?Q?QQLL%%)EE$--I,J!M  	!Df%Dud{?1%

1rL001xx\x: cC4D>,A#BCCD3$(n1L(M!NNO 	b
 	dAI&''d!e)'D/0557ZIIcNE )72OB3#(+sZ/?@AD"w#2G!HIGNN4'NN7E*(4w>OJJsONNeRC38$4j>T8T$U   NN5B3=S?5J"KL  MJr1   c                  , t        |       dk(  ry |rt        d      t        j                  j	                         s>|r<t        d      ,t        ,fdt        | ||d      D              st        d, d	      t        |      }t        j                  | |||||g      }|j                         D ]  \  \  }}}}}}}t        t        t           |      }t        t        t           |      }t        t        t           |      }t        t        t           |      }t        t        t           |      }t        t        t           |      }|rt        ||||       |rt        j                   |      }t        j                  j	                         s=|d   j"                  r.t        j$                  |t        j&                  d
d      d
       nt        j$                  |d       |	dk7  rR|rt        j(                  |d||	z  z
         n3|rt        j$                  |||	       nt        j*                  |||	      }t        j,                  ||d|z
         t        j(                  ||       t        j.                  |||d|z
         t        j0                  |      }|r4t        j2                  ||
      } t        j4                  d|       }!t        j(                  |!d       t        j$                  |!d
       t        j(                  |!|       t        j$                  | |
       t        j4                  d|       }"t        j(                  |"d       t        j$                  |"d
       t        j(                  |"|       ~ t        j4                  ||      }#t        j6                  |#d
       t        j8                  |#       t        j:                  |#       nr|D $cg c]  }$d|t=        |$      z  z
  dz   }#}$|D $cg c]  }$|d
ddt=        |$      |
z  z  z  z
  z   }!}$|D $cg c]  }$|d
ddt=        |$      dz   |
z  z  z  z
  z  ! }"}$t        j(                  ||!       t        j>                  ||#       t        j$                  ||       ~#|rt        j6                  |!d
       t        j(                  |!|       t        j@                  |d
      }%t        j8                  |%       t        j>                  |!|%       |!}&~%t        j2                  ||"      }%t        j(                  |"|       t        j6                  |%d
       t        j>                  |"|%       |"}'~%t        j2                  |&|      }(t        j.                  |(|'|       t        jB                  ||(|       tE        t        ||!d      D )*cg c](  \  })}*t=        |      d
|*z
  z  d
t=        |)      z
  z  dz  * c}*})      }&tE        t        ||"d      D )+cg c](  \  })}+t=        |      |+z  d
t=        |)      |+z  z
  z  dz  * c}+})      }'t        jB                  ||||&       t        jB                  ||||'        y c c}$w c c}$w c c}$w c c}*})w c c}+})w )Nr   z#_foreach ops don't support autogradF)supports_xlac              3      K   | ]n  \  }}}|j                   j                  |j                   j                  cxk(  xr |j                   j                  k(  nc xr |j                   j                  v  p y wrf   )r6   ry   ).0rD   mpr3   r   s       r0   	<genexpr>z&_multi_tensor_nadam.<locals>.<genexpr>  s`      
 2t HHMMRYY^^?t{{/?/?? >!==>
s   A4A7T)strictzWIf capturable=True, params, mu_products, and state_steps must be on supported devices: rm   r'   cpu)r6   rn   r   rq   g      rp   )#r>   rz   r?   rw   rx   r   allzipr   r   "_group_tensors_by_device_and_dtypevaluesr   listr   r   _foreach_negis_cpu_foreach_add_rB   _foreach_mul__foreach_add_foreach_lerp__foreach_addcmul__foreach_sqrt_foreach_mul_foreach_pow_foreach_sub__foreach_neg__foreach_sqrt_r   _foreach_div__foreach_sub_foreach_addcdiv_r   )-r   rW   rX   rY   rZ   r[   r_   r`   r   r!   r"   r    r#   r   r   r   r\   grouped_tensorsgrouped_params_grouped_grads_grouped_exp_avgs_grouped_exp_avg_sqs_grouped_mu_products_grouped_state_steps__grouped_paramsgrouped_gradsgrouped_exp_avgsgrouped_exp_avg_sqsgrouped_mu_productsgrouped_state_stepsexp_avg_sq_sqrtexponentmusmu_nextsbias_correction_sqrtr3   r   step_size_gradsstep_size_expavg	numeratorr8   r   r   r   s-                                               @r0   _multi_tensor_nadamr   }  sa   ( 6{aBCC >>&&(Z'H(
$  
  #6;DQ
 

 !V/03  
BBBB	+{KHO ""$m 		 	d6lO<T&\>:V.?@"4<1EF"4<1EF"4<1EF /?AT !..}=M ~~**,1DQ1G1N1N#U\\#e%DC  3Q71%##NA\8I4IJ ''%~\ %*$6$6%~\%M
 	-}a%iH/7q5y	
  --.AB
 ))*=~NH$$T84CT*S)U+ .9))$9H$/#.%0 #(#5#5e=P#Q  4c: 45  !56 DW$;?Uj...36$  $
 0 sdz$/?./P&QRRSC  0  *T*:Q*>.)P QRRTH  	/5O-ABOS1 !S)R(&&':C@E&U+!O &&':HEE"- s+%0' **?MJI##I/?AQR ##NIO1 +..A3t*T&
B  ^sRx0C*Z:P4PQUWWO  3 03+Xd0
 ,
G #2!"J!7'!AAC 
  ##	 ##  	Qmn$b
s   Y!3!Y&$Y+--Y08-Y6)single_tensor_fnr   c                   t        d |D              st        d      t        d |D              st        d      |t        | |	d      \  }}|r)t        j                  j                         rt        d      |r%t        j                  j                         st        }nt        } || |||||||||||||||	|
	       y)
zpFunctional API that performs NAdam algorithm computation.

    See :class:`~torch.optim.NAdam` for details.
    c              3   P   K   | ]  }t        |t        j                           y wrf   r(   r?   r   r   ts     r0   r   znadam.<locals>.<genexpr>x       @qz!U\\*@   $&zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsc              3   P   K   | ]  }t        |t        j                           y wrf   r   r   s     r0   r   znadam.<locals>.<genexpr>}  r   r   zPAPI has changed, `mu_products` argument must contain a list of singleton tensorsNF)	use_fusedz6torch.jit.script not supported with foreach optimizers)r_   r`   r   r!   r"   r   r#   r    r   r   r\   )r   rQ   r   r?   rs   rt   r   r   )r   rW   rX   rY   rZ   r[   r#   r   r   r   r\   r   r_   r`   r   r!   r"   r    r   funcs                       r0   r   r   \  s    8 @K@@^
 	
 @K@@^
 	
 1Ne

7 599))+STTuyy--/"#!%5%#r1   )FNFFFF)__doc__typingr   r?   r   	optimizerr   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   __all__r   r   rA   rj   r   r   r   rI   r1   r0   <module>r      su   .       ( G
sI sn&N		 	 
 		 		 		 !O> FaLa<a 6la f	a
 fa fa a a 	a a a 
a !a a  !a" #a$ %a& 
'aH\L\<\ 6l\ f	\
 f\ f\ \ \ 	\ \ \ 
\ !\ \  !\" #\$ %\& 
'\~  1EF $) DLD<D 6lD f	D
 fD fD !D D[D D D D D  !D" #D$ 	%D& 'D( )D* 
+D, 
-D GDr1   