
    khA              %          d Z ddlZddlmZmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ  G d d	e	      Z	 	 dd
ee   dee   dee   dee   dee   dee   dee   dedededededededededee   ddf$dZd
ee   dee   dee   dee   dee   dee   dedededededededededee   f dZd
ee   dee   dee   dee   dee   dee   dedededededededededee   f dZy) aM   AdamW Optimizer
Impl copied from PyTorch master

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285

NOTE: This impl has been deprecated in favour of torch.optim.AdamW and remains as a reference
    N)ListOptionalTuple)Tensor)	Optimizer   )ParamsTc                        e Zd ZdZ	 	 	 	 	 	 	 	 	 	 ddededeeef   dedededed	ed
edee   def fdZ	 fdZ
 ej                         dd       Z xZS )AdamWLegacya  Implements AdamW algorithm.

    NOTE: This impl has been deprecated in favour of torch.optim.AdamW and remains as a reference

    References:
        - Adam: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980
        - Decoupled Weight Decay Regularization: https://arxiv.org/abs/1711.05101
        - On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ

    Args:
        params: iterable of parameters to optimize or dicts defining parameter groups
        lr: learning rate
        betas: coefficients used for computing running averages of gradient and its square
        eps: term added to the denominator to improve numerical stability
        weight_decay: weight decay coefficient
        amsgrad: whether to use the AMSGrad variant of this algorithm
            from the paper `On the Convergence of Adam and Beyond`
        caution: apply caution when using AdamW
        corrected_weight_decay: apply corrected weight decay (lr**2 / max_lr)
        maximize: maximize the params based on the objective, instead of minimizing
        foreach: whether foreach implementation of optimizer is used.
            If unspecified by the user (so foreach is None), we will try to use
            foreach over for-loop implementation on CUDA, since it is faster in general.
        capturable: whether this instance is safe to capture in a CUDA graph.
            Passing True can impair ungraphed performance, so if you don't intend to
            graph capture this instance, leave it False
    paramslrbetasepsweight_decayamsgradcautioncorrected_weight_decaymaximizeforeach
capturablec                    d|k  st        dj                  |            d|k  st        dj                  |            d|d   cxk  rdk  sn t        dj                  |d               d|d   cxk  rdk  sn t        dj                  |d               t        ||||||||
|	|	
      }t        t        |   ||       y )
N        zInvalid learning rate: {}zInvalid epsilon value: {}r         ?z%Invalid beta parameter at index 0: {}r   z%Invalid beta parameter at index 1: {})
r   r   r   r   r   r   r   r   r   r   )
ValueErrorformatdictsuperr   __init__)selfr   r   r   r   r   r   r   r   r   r   r   defaults	__class__s                L/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/optim/adamw.pyr   zAdamWLegacy.__init__1   s     by8??CDDcz8??DEEeAh$$DKKERSHUVVeAh$$DKKERSHUVV%#9!
 	k4)&(;    c                    t         t        |   |       t        | j                  j                               }t        |      dk7  xr t        j                  |d   d         }|s+|D ]&  }t        j                  t        |d               |d<   ( | j                  D ]n  }|j                  dd       |j                  dd       |j                  dd       |j                  dd        |j                  dd       |j                  d	d       p y )
Nr   stepr   Fr   r   r   r   r   )r   r   __setstate__liststatevalueslentorch	is_tensortensorfloatparam_groups
setdefault)r   r(   state_valuesstep_is_tensorsgroupr!   s         r"   r&   zAdamWLegacy.__setstate__U   s    k4-e4DJJ--/0l+q0^eoolSToV\F]6^! ;!LLqy)9:&	;&& 	2EY.Y.5u=Y-Z/\51	2r#   c                 Z   | j                          d}|$t        j                         5   |       }ddd       | j                  D ]  }g }g }g }g }g }g }	|d   \  }
}|d   }|d   D ]k  }|j                  |j                  |       |j                  j                  rt        d      |j                  |j                         | j                  |   }t        |      dk(  rt        j                  d      |d<   t        j                  |t        j                  	      |d
<   t        j                  |t        j                  	      |d<   |r(t        j                  |t        j                  	      |d<   |j                  |d
          |j                  |d          |r!|j                  |j                  dd             |	j                  |d          n t        ||||||	|d   ||
||d   |d   |d   |d   |d   |d   |d   r| j                  d   nd        |S # 1 sw Y   xY w)zPerforms a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr   r   r   z'AdamW does not support sparse gradientsr   r   r%   )memory_formatexp_avg
exp_avg_sqmax_exp_avg_sqr   r   r   r   r   r   r   r   )r   r   beta1beta2r   r   r   r   r   r   max_lr) _cuda_graph_capture_health_checkr+   enable_gradr/   gradappend	is_sparseRuntimeErrorr(   r*   r-   
zeros_likepreserve_formatgetadamwr    )r   closurelossr4   params_with_gradgradsexp_avgsexp_avg_sqsmax_exp_avg_sqsstate_stepsr:   r;   r   pr(   s                  r"   r%   zAdamWLegacy.stepd   s=    	--/""$ !y! && 7	E!EHK OK >LE5I&G8_ 266> ''*66##&'PQQQVV$

1 u:?$)LL$4E&M','7'7I^I^'_E)$*/*:*:1ELaLa*bE,'272B2B1TYTiTi2j./i 01""5#67#**5995Et+LM""5=1326  i(;">2%Li(z* ..34L.Mt}}T*SW#K7	r y! !s   H  H*)
MbP?)g?g+?g:0yE>g{Gz?FFFFNFN)__name__
__module____qualname____doc__r	   r.   r   boolr   r   r&   r+   no_gradr%   __classcell__)r!   s   @r"   r   r      s    > )5"&!!+0"&*$"<"< "< &	"<
 "<  "< "< "< %)"< "< d^"< "<H2 U]]_G Gr#   r   r   rJ   rK   rL   rM   rN   r   r   r   r:   r;   r   r   r   r   r   r<   returnc       	         R   t        d |D              st        d      |:	 | xs4 dt        j                  j                  j
                  j                         v }|r%t        j                  j                         st        }nt        } || |||||||	|
|||||||       y#  d}Y LxY w)zgFunctional API that performs AdamW algorithm computation.
      See AdamWLegacy class for details.
    c              3   P   K   | ]  }t        |t        j                           y wrQ   )
isinstancer+   r   ).0ts     r"   	<genexpr>zadamw.<locals>.<genexpr>   s     @qz!U\\*@s   $&zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsNScalarF)
r   r:   r;   r   r   r   r   r   r   r<   )allrB   r+   opsaten_foreach_maximum_	overloadsjitis_scripting_multi_tensor_adamw_single_tensor_adamw)r   rJ   rK   rL   rM   rN   r   r   r   r:   r;   r   r   r   r   r   r<   funcs                     r"   rF   rF      s    0 @K@@!" 	" 	!k]X1Q1Q1[1[1]%]G uyy--/"#!!	Gs   9B   B&c       
             t        |       D ]o  \  }}|s||   n||    }||   }||   }||   }|dz  }||	n|	dz  |z  }|j                  d||
z  z
         |j                  |      j                  |d|z
         |j                  |      j                  ||d|z
         |r ||   }t	        j
                  |||       |}n|}|r|}dt	        j                  ||      z
  }dt	        j                  ||      z
  }|	|z  }|j                         }|j                         }|j                         ||z  z  j                  ||z        }|rU||z  dkD  j                  |j                        } | j                  | j                         j                  d	             || z  }|j                  ||       |j                         }d||z  z
  }d||z  z
  }|	|z  }t!        j                  |      }|j                         |z  j                  |      }|rU||z  dkD  j                  |j                        } | j                  | j                         j                  d	             || z  }|j                  |||        r y )
Nr      r   alpha)value)outr   rP   )min)	enumeratemul_add_addcmul_r+   maxpownegsqrttodtypediv_meanclamp_addcdiv_itemmath)!r   rJ   rK   rL   rM   rN   r   r:   r;   r   r   r   r   r   r   r<   iparamr?   r7   r8   step_twd_scaler9   
denom_baser%   bias_correction1bias_correction2	step_sizestep_size_negbias_correction2_sqrtdenommasks!                                    r"   ri   ri      s   ( f% ?=5'uQxeAhY1+ ^
Q 	!  2R1Wv-=

2<//0 	U  QY 7''d!e)'D,Q/NIInjnE'J#JD  !599UD#99 599UD#99--I%MMOM$4$9$9$;!__&*?-*OPVVWZ]jWjkE  $*..tzz:		$))+,,,67!D.NN7E*;;=D 5D=0 5D=0--I$(II.>$?!__&)>>DDSIE$*..tzz:		$))+,,,67!D.NN7E)N<?=r#   c       
         |   t        |       dk(  ry |r#t        d t        | |      D              sJ d       |rt        j                  t        |            }|D cg c].  }t        j                  |      rt        j                  |      n|0 }}|D cg c].  }t        j                  |      rt        j                  |      n|0 }}|D cg c].  }t        j                  |      rt        j                  |      n|0 }}| D cg c].  }t        j                  |      rt        j                  |      n|0 } }t        j                  |d       ||	n|	dz  |z  }t        j                  | d||
z  z
         t        j                  ||       t        j                  ||d|z
         t        j                  ||       t        j                  |||d|z
         |r|D cg c]  }t        j                  ||       }}|D cg c]  }t        j                  ||       }}t        j                  |d       t        j                  |d       t        j                  |       t        j                  |       t        j                  ||	      }t        j                  |       t        j                  |       t        j                   |      }|re|D cg c].  }t        j                  |      rt        j                  |      n|0 }}t        j"                  ||       t        j                   |      }nt        j                   |      }t        j$                  |t        j&                  ||             t        j                  ||      }t        j                  |       t        j(                  ||      }|rt        j&                  ||      }t        ||      D cg c]#  \  }}|dkD  j+                  |j,                        % }}}|D cg c]  }|j/                          }}t        j"                  |d       t        j$                  ||       t        j&                  ||      }t        j0                  | ||       y |D cg c]  }d||j3                         z  z
   }}|D cg c]  }d||j3                         z  z
   }}|D cg c]
  }|	|z  dz   }}|D cg c]  }t5        j6                  |       }}|re|D cg c].  }t        j                  |      rt        j                  |      n|0 }}t        j"                  ||       t        j                   |      }nt        j                   |      }t        j$                  ||       t        j                  ||       |rt        j&                  ||      }t        ||      D cg c]#  \  }}|dkD  j+                  |j,                        % }}}|D cg c]  }|j/                          }}t        j"                  |d       t        j$                  ||       t        j&                  ||      }t        j0                  | |||       y c c}w c c}w c c}w c c}w c c}w c c}w c c}w c c}}w c c}w c c}w c c}w c c}w c c}w c c}w c c}}w c c}w )	Nr   c              3   V   K   | ]!  \  }}|j                   xr |j                    # y wrQ   )is_cuda)r]   rO   r%   s      r"   r_   z&_multi_tensor_adamw.<locals>.<genexpr>Y  s)      
+21dAII&$,,&
s   ')z@If capturable=True, params and state_steps must be CUDA tensors.r   rl   rm   rP   )r*   ra   zipr+   _foreach_negtuple
is_complexview_as_real_foreach_add__foreach_mul__foreach_addcmul_rw   _foreach_sub__foreach_neg__foreach_div_foreach_reciprocal__foreach_sqrtrd   _foreach_div__foreach_mul_foreach_addrz   r{   r}   _foreach_addcdiv_r   r   ry   )r   rJ   rK   rL   rM   rN   r   r:   r;   r   r   r   r   r   r   r<   xr   r%   r   r   r   r   r   eps_over_step_sizer   masksmg
mask_scalebcs                                  r"   rh   rh   B  s   & 6{a 
69&+6N
 
 	NM	N 
 ""5<0JOPQe&6&6q&9U"q@PEPMUV)9)9!)<""1%!CVHVP[\1E,<,<Q,?5%%a(QF\K\KQRau'7'7':e  #ARFR 
Q' ^rq6)9H	X%< <= 
%(	%q5y9	U+	Kq5yA?JKtEIIeT2KK?JKtEIIeT2KK,a0,a0,-,- &&'7<	""9-I& % 3 34D E\klWX8H8H8Ku11!4QRRlOl##O[A,,_=J,,[9J4i@	
 #//	3?""#56"":/AB&&x7E585FGTQa!eZZ(GEG,12q!&&(2J2##J5z2))(E:H%8ALMA 44MMALMA 44MM.>?b2g^?	?9I J22 J J\klWX8H8H8Ku11!4QRRlOl##O[A''8E''4EE#89E3'&&x7E585FGTQa!eZZ(GEG,12q!&&(2J2##J5z2))(E:H%C{ QV\R& LK  m" H2 NM? J m H2s`   3Y,3Y13Y63Y; Z Z3Z
(Z
ZZ$ZZ$Z) 3Z.	(Z38Z9)NF)rU   r   typingr   r   r   r+   r   torch.optim.optimizerr   _typesr	   r   rV   r.   rF   ri   rh    r#   r"   <module>r      s    ( (   + X) XD #' :V:F|: v,: &\	:
 f: &\: $: : : : : : : :  !:" #:$ %:& 
':zS=VS=F|S= v,S= &\	S=
 fS= &\S= S= S= S= S= S= S= S= S=  !S=" #S=l{DV{DF|{D v,{D &\	{D
 f{D &\{D {D {D {D {D {D {D {D {D  !{D" #{Dr#   