
    kh,;              !          d Z ddlZddlmZmZmZ ddlZddlmZ ddlm	Z	  G d dej                  j                        Z	 	 dd	ee   d
ee   dee   dee   dee   dee   dedededededededededdf dZd	ee   d
ee   dee   dee   dee   dededededededededee   fdZd	ee   d
ee   dee   dee   dee   dededededededededee   fdZy)ad   NAdamW Optimizer

Based on simplified algorithm in https://github.com/mlcommons/algorithmic-efficiency/tree/main/baselines/nadamw

Added multi-tensor (foreach) path.

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285
    N)ListOptionalTuple)Tensor   )ParamsTc                        e Zd ZdZ	 	 	 	 	 	 	 	 	 ddededeeef   dedededed	ed
ee   def fdZ	 fdZ
 ej                         dd       Z xZS )NAdamWa   Implements NAdamW algorithm.

    See Table 1 in https://arxiv.org/abs/1910.05446 for the implementation of
    the NAdam algorithm (there is also a comment in the code which highlights
    the only difference of NAdamW and AdamW).

    For further details regarding the algorithm we refer to
        - Decoupled Weight Decay Regularization: https://arxiv.org/abs/1711.05101
        - On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ

    Args:
        params: iterable of parameters to optimize or dicts defining parameter groups
        lr: learning rate
        betas: coefficients used for computing running averages of gradient and its square
        eps: term added to the denominator to improve numerical stability
        weight_decay: weight decay coefficient
        caution: enable caution
        corrected_weight_decay: apply corrected weight decay (lr**2 / max_lr)
    paramslrbetasepsweight_decaycautioncorrected_weight_decaymaximizeforeach
capturablec                 D   d|k  st        d|       d|k  st        d|       d|d   cxk  rdk  sn t        d|d          d|d   cxk  rdk  sn t        d|d          d|k  st        d	|       t        |||||||	||

	      }t        |   ||       y )N        zInvalid learning rate: zInvalid epsilon value: r         ?z#Invalid beta parameter at index 0: r   z#Invalid beta parameter at index 1: zInvalid weight_decay value: )	r   r   r   r   r   r   r   r   r   )
ValueErrordictsuper__init__)selfr   r   r   r   r   r   r   r   r   r   defaults	__class__s               M/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/optim/nadamw.pyr   zNAdamW.__init__*   s     by6rd;<<cz6se<==eAh$$B58*MNNeAh$$B58*MNNl";L>JKK%#9!

 	*    c                    t         |   |       t        | j                  j	                               }t        |      dk7  xr t        j                  |d   d         }|s+|D ]&  }t        j                  t        |d               |d<   ( | j                  D ]&  }|j                  dd       |j                  dd       ( y )Nr   stepr   Fr   )r   __setstate__liststatevalueslentorch	is_tensortensorfloatparam_groups
setdefault)r   r%   state_valuesstep_is_tensorsgroupr   s         r   r#   zNAdamW.__setstate__N   s    U#DJJ--/0l+q0^eoolSToV\F]6^! ;!LLqy)9:&	;&& 	>EY.5u=	>r    c                    | j                          d}|$t        j                         5   |       }ddd       | j                  D ]{  }g }g }g }g }g }|d   \  }	}
|d   D ]  }|j                  |j                  |       |j                  j                  rt        d      |j                  |j                         | j                  |   }t        |      dk(  rht        j                  d      |d<   t        j                  |t        j                        |d	<   t        j                  |t        j                        |d
<   |j                  |d	          |j                  |d
          |j                  |d          ! t        ||||||	|
|d   |d   |d   |d   |d   |d   |d   r| j                  d   nd       ~ |S # 1 sw Y   xY w)zPerforms a single optimization step.

            Args:
              closure (callable, optional): A closure that reevaluates the model
                  and returns the loss.
        Nr   r   z(NAdamW does not support sparse gradientsr   r   r"   )memory_formatexp_avg
exp_avg_sqr   r   r   r   r   r   r   	beta1beta2r   r   r   r   r   r   max_lr) _cuda_graph_capture_health_checkr(   enable_gradr,   gradappend	is_sparseRuntimeErrorr%   r'   r*   
zeros_likepreserve_formatnadamwr   )r   closurelossr1   params_with_gradgradsexp_avgsexp_avg_sqsstate_stepsr7   r8   pr%   s                r   r"   zNAdamW.stepY   s    	--/""$ !y! && -	E!EHKK >LE58_ 266> ''*66##&'QRRQVV$

1 u:?$)LL$4E&M','7'7I^I^'_E)$*/*:*:1ELaLa*bE,'i 01""5#67""5=1)2,  ;">2%Li(z* ..34L.Mt}}T*SW=-	^ e! !s   GG)	MbP?)g?g+?g:0yE>g{Gz?FFFNFN)__name__
__module____qualname____doc__r   r+   r   boolr   r   r#   r(   no_gradr"   __classcell__)r   s   @r   r
   r
      s    . )5"&!+0"&*$"+"+ "+ &	"+
 "+  "+ "+ %)"+ "+ d^"+ "+H	> U]]_= =r    r
   r   rF   rG   rH   rI   r   r   r7   r8   r   r   r   r   r   r9   returnc                N   t        d |D              st        d      |:	 | xs4 dt        j                  j                  j
                  j                         v }|r%t        j                  j                         st        }nt        } || |||||||	|
|||||       y#  d}Y JxY w)zcFunctional API that performs NAdamW algorithm computation.
      See NAdamW class for details.
    c              3   P   K   | ]  }t        |t        j                           y wrL   )
isinstancer(   r   ).0ts     r   	<genexpr>znadamw.<locals>.<genexpr>   s     @qz!U\\*@s   $&zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsNScalarFr6   )allr?   r(   opsaten_foreach_maximum_	overloadsjitis_scripting_multi_tensor_nadamw_single_tensor_nadamw)r   rF   rG   rH   rI   r   r   r7   r8   r   r   r   r   r   r9   funcs                   r   rB   rB      s    , @K@@!" 	" 	!k]X1Q1Q1[1[1]%]G uyy--/#$!	Gs   9B B$c       	         ~   t        |       D ]  \  }}|s||   n||    }||   }||   }||   }|dz  }||n|dz  |z  }|j                  d||z  z
         |j                  |      j                  |d|z
         |j                  |      j                  ||d|z
         |r|}dt	        j
                  ||      z
  }dt	        j
                  ||      z
  }||z  }|j                         }|j                         }|j                  |      j                  |d|z
        }|j                         ||z  z  j                  |	|z        }|
ra||z  dkD  j                  |j                        }|j                  |j                         j                  d             |j                  |       |j                  ||       |j                         }d||z  z
  }d||z  z
  }||z  }t!        j                  |      }|j                  |      j                  |d|z
        }|j                         |z  j                  |	      }|
ra||z  dkD  j                  |j                        }|j                  |j                         j                  d             |j                  |       |j                  |||         y )	Nr      r   alpha)valuer   rK   )min)	enumeratemul_add_addcmul_r(   pownegsqrtmultodtypediv_meanclamp_addcdiv_itemmath)r   rF   rG   rH   rI   r7   r8   r   r   r   r   r   r   r9   iparamr<   r4   r5   step_twd_scaler"   bias_correction1bias_correction2	step_sizestep_size_negbias_correction2_sqrtdenommasks                                r   rd   rd      s   $ f% >=5'uQxeAhY1+ ^
Q 	!  2R1Wv-=

2<//0 	U  QY 7''d!e)'DD  !599UD#99 599UD#99--I%MMOM$4$9$9$;! kk%(--d!e)-DG__&*?-*OPVVWZ]jWjkE  $*..tzz:		$))+,,,67T"NN7E*;;=D 5D=0 5D=0--I$(II.>$?! kk%(--d!e)-DG__&)>>DDSIE$*..tzz:		$))+,,,67T"NN7E)N<}>=r    c       	            t        |       dk(  ry |r#t        d t        | |      D              sJ d       |rt        j                  t        |            }|D cg c].  }t        j                  |      rt        j                  |      n|0 }}|D cg c].  }t        j                  |      rt        j                  |      n|0 }}|D cg c].  }t        j                  |      rt        j                  |      n|0 }}| D cg c].  }t        j                  |      rt        j                  |      n|0 } }t        j                  |d       ||n|dz  |z  }t        j                  | d||z  z
         t        j                  ||       t        j                  ||d|z
         t        j                  ||       t        j                  |||d|z
         |rk|D cg c]  }t        j                  ||       }}|D cg c]  }t        j                  ||       }}t        j                  |d       t        j                  |d       t        j                  |       t        j                  |       t        j                  ||      }t        j                  |       t        j                  |       t        j                   |      }t        j"                  ||      }t        j                  ||d|z
         t        j                   |      }t        j$                  |t        j"                  ||             t        j                  ||	      }t        j                  |       t        j&                  ||      }|
rt        j"                  ||      }t        ||      D cg c]#  \  }}|dkD  j)                  |j*                        % }}}|D cg c]  }|j-                          }}t        j.                  |d       t        j$                  ||       t        j                  ||       t        j0                  | ||       y |D cg c]  }d||j3                         z  z
   }}|D cg c]  }d||j3                         z  z
   }}|D cg c]
  }||z  dz   }}|D cg c]  }t5        j6                  |       }}t        j"                  ||      }t        j                  ||d|z
         t        j                   |      }t        j$                  ||       t        j&                  ||	      }|
rt        j"                  ||      }t        ||      D cg c]#  \  }}|dkD  j)                  |j*                        % }}}|D cg c]  }|j-                          }}t        j.                  |d       t        j$                  ||       t        j                  ||       t        j0                  | |||       y c c}w c c}w c c}w c c}w c c}w c c}w c c}}w c c}w c c}w c c}w c c}w c c}w c c}}w c c}w )	Nr   c              3   V   K   | ]!  \  }}|j                   xr |j                    # y wrL   )is_cuda)rX   rJ   r"   s      r   rZ   z'_multi_tensor_nadamw.<locals>.<genexpr>;  s)      
+21dAII&$,,&
s   ')z@If capturable=True, params and state_steps must be CUDA tensors.r   rg   rh   rK   )r'   r\   zipr(   _foreach_negtuple
is_complexview_as_real_foreach_add__foreach_mul__foreach_addcmul_rp   _foreach_sub__foreach_neg__foreach_div_foreach_reciprocal__foreach_sqrt_foreach_mul_foreach_div__foreach_addrt   ru   rw   r_   _foreach_addcdiv_rz   r{   rr   )r   rF   rG   rH   rI   r7   r8   r   r   r   r   r   r   r9   xr   r"   r   r   r   r   exp_avg_sq_sqrteps_over_step_sizer   masksmg
mask_scalebcs                                r   rc   rc   &  s^   " 6{a 
69&+6N
 
 	NM	N 
 ""5<0JOPQe&6&6q&9U"q@PEPMUV)9)9!)<""1%!CVHVP[\1E,<,<Q,?5%%a(QF\K\KQRau'7'7':e  #ARFR 
Q' ^rq6)9H	X%< <= 
%(	%q5y9	U+	Kq5yA?JKtEIIeT2KK?JKtEIIeT2KK,a0,a0,-,- &&'7<	""9-I& % 3 34D E %%h6He1u9=--k:4i@	
 #//	3?""#56""?4FG&&x7E585FGTQa!eZZ(GEG,12q!&&(2J2##J5z2%0%8ALMA 44MMALMA 44MM.>?b2g^?	?9I J22 J J %%h6He1u9=--k:O-BC""?C8&&x7E585FGTQa!eZZ(GEG,12q!&&(2J2##J5z2%0%Cq QV\R$ LK> H2 NM? J H2sT   3X 3X3X
3X XX%(XX$X).X.X3&X8(X=Y)NF)rP   r{   typingr   r   r   r(   r   _typesr   optim	Optimizerr
   rQ   r+   rB   rd   rc    r    r   <module>r      sq  	  ( (   BU[["" BV #' 6V6F|6 v,6 &\	6
 &\6 $6 6 6 6 6 6 6 6 6  !6" 
#6rP=VP=F|P= v,P= &\	P=
 &\P= P= P= P= P= P= P= P= P= P=ftDVtDF|tD v,tD &\	tD
 &\tD tD tD tD tD tD tD tD tD tDr    