
    kh6L              +       h   d Z ddlmZmZmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZ dd	gZd
 Zd'dZd Zd Z G d de
      Zdee   dee   dee   dee   dee   dee   dee   dedededeeef   dedee   dee   dededed ed!ed"ef(d#Zdee   dee   dee   dee   dee   dee   dee   dedededeeef   dedee   dee   dededed ed!ed"ef(d$Z	 	 	 	 	 	 d(dee   dee   dee   dee   dee   d%ee   d!ed"edee   dee   dedededeeef   dedee   dee   dededed ef*d&Zy))u@   ADOPT PyTorch Optimizer

ADOPT: Modified Adam Can Converge with Any β2 with the Optimal Rate: https://arxiv.org/abs/2411.02853

Modified for reduced dependencies on PyTorch internals from original at: https://github.com/iShohei220/adopt

@inproceedings{taniguchi2024adopt,
 author={Taniguchi, Shohei and Harada, Keno and Minegishi, Gouki and Oshima, Yuta and Jeong, Seong Cheol and Nagahara, Go and Iiyama, Tomoshi and Suzuki, Masahiro and Iwasawa, Yusuke and Matsuo, Yutaka},
 booktitle = {Advances in Neural Information Processing Systems},
 title = {ADOPT: Modified Adam Can Converge with Any β2 with the Optimal Rate},
 year = {2024}
}

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285
    )castListOptionalTupleUnionN)Tensor)	Optimizer   )ParamsTAdoptadoptc                     t        |       D ]X  \  }}t        j                  |      st        j                  | |         | |<   |D ]  }t        j                  ||         ||<    Z y N)	enumeratetorch
is_complexview_as_real)paramsstate_and_gradsipss        L/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/optim/adopt.py_view_as_realr      sg    &! 01A**6!95F1I$ 0))!A$/!00    c                     | rt         j                  S t        j                         t         j                  k(  rt         j                  S t         j                  S r   )r   float32get_default_dtypefloat64)is_fuseds    r   _get_scalar_dtyper!   $   s;    }}002emmCINr   c                      t        t        d      r8t        t        j                  d      rt        j                  j                         S y)Ncompileris_compilingF)hasattrr   r#   r$    r   r   _is_compilingr'   ,   s/    uj!gennn&M~~**,,r   c                     t         j                  j                         st               r| S t	        | t         j
                        r| j                         S | S r   )r   jitis_scriptingr'   
isinstancer   item)xs    r   
_get_valuer.   3   s9    99!!#%a6qvvx=A=r   c                        e Zd ZdZ	 	 	 	 	 	 	 ddddddddedeeef   deeef   dede	e   d	ed
e
de
de
de	e
   de
de
de
f fdZ fdZd Z ej                         dd       Z xZS )r   uq   
    ADOPT: Modified Adam Can Converge with Any β2 with the Optimal Rate: https://arxiv.org/abs/2411.02853

    F)cautionforeachmaximize
capturabledifferentiabler   lrbetasepsclip_expweight_decay	decoupledcorrected_weight_decayr0   r1   r2   r3   r4   c	                   t        |t              r-|
r|st        d      |j                         dk7  rt        d      d|k  st        d|       d|k  st        d|       d|d   cxk  rdk  sn t        d	|d          d|d   cxk  rdk  sn t        d
|d          d|k  st        d|       t	        ||||||||	||
||      }t
        |   ||       y )NElr as a Tensor is not supported for capturable=False and foreach=Truer
   zTensor lr must be 1-element        zInvalid learning rate: zInvalid epsilon value: r         ?z#Invalid beta parameter at index 0: z#Invalid beta parameter at index 1: zInvalid weight_decay value: )r5   r6   r7   r9   r8   r:   r;   r0   r2   r1   r3   r4   )r+   r   
ValueErrornumeldictsuper__init__)selfr   r5   r6   r7   r8   r9   r:   r;   r0   r1   r2   r3   r4   defaults	__class__s                  r   rD   zAdopt.__init__@   s   " b&!z [  xxzQ !>??by6rd;<<cz6se<==eAh$$B58*MNNeAh$$B58*MNNl";L>JKK%#9!)
 	*r   c                    t         |   |       | j                  D ].  }|j                  dd       |j                  dd        |j                  dd       |j                  dd       |j                  dd        |j                  dd       |j                  dd       |d	   D ]  }| j                  j                  |g       }t        |      d
k7  s.t        j                  |d         rGt        |d         }|d   r*t        j                  |t               |j                        nt        j                  |t                     |d<    1 y )Nr2   Fr1   r3   r4   r8   r0   r;   r   r   stepdtypedevicerK   )rC   __setstate__param_groups
setdefaultstategetlenr   	is_tensorfloattensorr!   rL   )rE   rQ   groupr   p_statestep_valrG   s         r   rN   zAdopt.__setstate__s   s,   U#&& 	EZ/Y-\51-u5Z.Y.5u=8_ **..B/w<1$U__WV_-M$WV_5H !. $"3"5#$88 #\\(:K:MN FO		r   c                    d}|d   D ]  }|j                   |t        j                  |      z  }|j                  |       |j                   j                  rt        d      |j                  |j                          | j                  |   }	t        |	      dk(  r|d   r4t        j                  dt               |j                   j                        nt        j                  dt               	      |	d
<   t        j                  |j                   t        j                        |	d<   t        j                  |j                   t        j                        |	d<   |j                  |	d          |j                  |	d          |d   r|	d
   j                  rt        d      |d   r(t        j                  |d         r|d   st        d      |j                  |	d
           |S )NFr   z'ADOPT does not support sparse gradientsr   r3   r&   rJ   r>   rM   rI   )memory_formatexp_avg
exp_avg_sqr4   zB`requires_grad` is not supported for `step` in differentiable moder1   r5   r=   )gradr   r   append	is_sparseRuntimeErrorrQ   rS   zerosr!   rL   rV   
zeros_likepreserve_formatrequires_gradrT   )
rE   rW   params_with_gradgradsexp_avgsexp_avg_sqsstate_stepshas_complexr   rQ   s
             r   _init_groupzAdopt._init_group   s    x #	.Avv~5++A..K##A&vv"#LMMLL JJqME5zQ \* KK*;*=affmmTc1B1DE f $)#3#3AFF%J_J_#`i &+&6&6qvvUMbMb&cl#OOE),-u\23%&5=+F+F"#ghh YEOOE$K$@|I\"#jkkuV}-G#	.H r   c                    | j                          d}|$t        j                         5   |       }ddd       | j                  D ]  }g }g }g }g }g }|d   \  }	}
| j	                  ||||||      }t        |||||fi d|d|	d|
d|d   d|d   d|d   d	|d
   r| j                  d   ndd|d   d|d   d|d   d|d   d|d   d|d   d|d   dt        | dd      dt        | dd        |S # 1 sw Y   xY w)zPerform a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr6   rk   beta1beta2r5   r9   r8   max_lrr;   r:   r7   r0   r2   r1   r3   r4   
grad_scale	found_inf) _cuda_graph_capture_health_checkr   enable_gradrO   rl   r   rF   getattr)rE   closurelossrW   rf   rg   rh   ri   rj   rn   ro   rk   s               r   rI   z
Adopt.step   s    	--/""$ !y! && '	E-/"$E%'H(*K(*K >LE5** K   (   ; #>2 z* /44L.Mt}}T*SW  , %L i(  z*!" i(#$ !.%&  %%56'( #4t<)* "$T:+#'	R Y! !s   C::D)MbP?)g?gH.?gư>gZd;O?r>   FFr   )__name__
__module____qualname____doc__r   r   rU   r   r   r   boolrD   rN   rl   r   no_gradrI   __classcell__)rG   s   @r   r   r   ;   s     (,)6(-"%#+01+ "&+"$#(1+1+ eVm$1+ &	1+
 1+ uo1+  1+ 1+ %)1+ 1+ d^1+ 1+ 1+ !1+f0.b U]]_7 7r   r   rg   rh   ri   rj   rq   rr   rk   rn   ro   r5   r9   r8   rp   r:   r7   r0   r2   r3   r4   c                   ||J t         j                  j                         rt        |
t              sJ t        |       D ]  \  }}|s||   n||    }||   }||   }||   }|rgt               s]ddlm}  |       }|j                  j                  |j                  j                  k(  r|j                  j                  |v sJ d| d       |dz  }t        j                  |      rXt        j                  |      }|t        j                  |      }|t        j                  |      }t        j                  |      }|dk7  r|s|j                  ||      }|s|r|n
t        |      }|dk(  r"|j                  ||j!                                P|dk7  r%|r#||
dz  |z  n|
}|j#                  || |z         t        j$                  |j'                         |      }|j)                  |      }||dz
  |z  } |j+                  |  |        |j-                  |d|z
         |rU||z  dkD  j/                  |j0                        }!|!j3                  |!j5                         j+                  d	             ||!z  }|j#                  ||
        |j7                  |	      j                  ||j!                         d|	z
  
        y )Nr   !_get_capturable_supported_devicesIIf capturable=True, params and state_steps must be on supported devices: .r
   alpha   rx   )minvalue)r   r)   r*   r+   rU   r   r'   torch.optim.optimizerr   rL   typer   r   addr.   addcmul_conjadd_clampsqrtdivclamp_lerp_torK   div_meanmul_)"r   rg   rh   ri   rj   rq   rr   rk   rn   ro   r5   r9   r8   rp   r:   r7   r0   r2   r3   r4   r   paramr^   r\   r]   step_tr   capturable_supported_devicesrI   wd_scaledenomnormed_gradclip_valmasks"                                     r   _single_tensor_adoptr      s   . )"333yy "e$$$f% 5L5'uQxeAhY1+ ^
Q moO+L+N(<<$$(:(::u||?P?PTp?p |[\x[yyz{|p 	!E"%%d+D",,W5%"//
;
&&u-E1Y88E86D#~v:f;M19diik21+1+=rQw'2HJJuXI$<J=JOO-s3hhuoqX-Hy(3k1u9-dNQ&**4::6DIIdiik((T(23nG

72#
&''diikU'Kk5Lr   c                  ) t        |       dk(  ry t        |
t              r|st        d      |rBt	               s8ddlm}  |d      )t        )fdt        | |      D              sJ d) d       ||J |rJ d	       t        j                  | ||||g      }|j                         D ]  \  \  }}}}}}t        t        t           |      }t        t        t           |      }t        t        t           |      }t        t        t           |      }t        t        t           |      } |rt        ||||       |rt        j                   |      }t	               s=| d   j"                  r.t        j$                  | t        j&                  d
d      d
       nt        j$                  | d       |dk7  r5|s3|rt        j$                  |||       nt        j(                  |||      }| d   dk(  rt        j*                  |||       _|dk7  r*|r(||
dz  |z  n|
}!t        j$                  |||! |z         t        j,                  |      }"t        j.                  |"|       t        j0                  ||"      }#|8| d   dz
  |z  }$t        j.                  |#|$        t        j2                  |#|$       t        j4                  ||#d|z
         |rt        j6                  ||      }%t        |%|      D &'cg c]#  \  }&}'|&dkD  j9                  |'j:                        % }%}&}'|%D &cg c]  }&|&j=                          }(}&t        j.                  |(d       t        j>                  |%|(       t        j6                  ||%      }t        j$                  |||
        t        j@                  ||	       t        j*                  |||d|	z
         ! y c c}'}&w c c}&w )Nr   r=   r   F)supports_xlac              3      K   | ]N  \  }}|j                   j                  |j                   j                  k(  xr |j                   j                  v  P y wr   )rL   r   ).0r   rI   r   s      r   	<genexpr>z&_multi_tensor_adopt.<locals>.<genexpr>s  sJ      
4 HHMMT[[---_!((--C_2__
s   AAr   r   z#_foreach ops don't support autogradr?   cpu)rL   r   r
   r   rx   r   )!rS   r+   r   ra   r'   r   r   allzipr	   "_group_tensors_by_device_and_dtypevaluesr   r   r   r   _foreach_negis_cpu_foreach_add_rV   _foreach_add_foreach_addcmul__foreach_sqrt_foreach_maximum__foreach_div_foreach_minimum__foreach_lerp__foreach_mulr   rK   r   _foreach_div__foreach_mul_)*r   rg   rh   ri   rj   rq   rr   rk   rn   ro   r5   r9   r8   rp   r:   r7   r0   r2   r3   r4   r   grouped_tensorsdevice_params_device_grads_device_exp_avgs_device_exp_avg_sqs_device_state_steps__device_paramsdevice_gradsdevice_exp_avgsdevice_exp_avg_sqsdevice_state_stepsr   exp_avg_sq_sqrtr   r   masksmg
mask_scaler   s*                                            @r   _multi_tensor_adoptr   N  s   . 6{a"fjS
 	

 -/K'H(
$  
v{3
 
 	w WWsVttuv	w 

 )"333DDDBB	+{;O ""$Da 		 	T&\>:DL-8tF|-=>!$v,0CD!$v,0CD -HZ[ --l;L #5a#8#?#? 2ELLU4S[^_ 2A61Y##L-|T$11,Uaba A%##$6lS1+1+=rQw'2H}XIP\D\]--.@A5((G*1-1h>H##K(;##K:_k1u9E&&EE585MNTQa!eZZ(NEN,12q!&&(2J2##J5z2#00%HOM?2#F.6 2L,VWZ_V_`IDav O2s   5(O$Or1   c       
         @   |d}t               st        d |D              st        d      |r)t        j                  j                         rt        d      |r%t        j                  j                         st        }nt        } || ||||f|
||||||||||||||	d y)z?Functional API that performs ADOPT algorithm computation.

    NFc              3   P   K   | ]  }t        |t        j                           y wr   )r+   r   r   )r   ts     r   r   zadopt.<locals>.<genexpr>  s     &Xqz!U\\'B&Xs   $&zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsz6torch.jit.script not supported with foreach optimizers)rk   rn   ro   r5   r9   r8   rp   r:   r7   r0   r2   r3   r4   rq   rr   )r'   r   ra   r   r)   r*   r   r   )r   rg   rh   ri   rj   r1   r3   r4   rq   rr   rk   rn   ro   r5   r9   r8   rp   r:   r7   r0   r2   funcs                         r   r   r     s    :  ?3&XK&X#X^
 	
 599))+STTuyy--/"#  !%)r   r   )NFFNNF)r|   typingr   r   r   r   r   r   r   r   r	   _typesr   __all__r   r!   r'   r.   r   r}   rU   r   r   r   r&   r   r   <module>r      s  " 6 5   + G
0>yI yxTLVTLF|TL v,TL &\	TL
 &\TL V$TL F#TL TL TL TL %- TL TL 5/TL TL  !TL" #TL$ %TL& 'TL( )TL* +TLnuaVuaF|ua v,ua &\	ua
 &\ua V$ua F#ua ua ua ua %- ua ua 5/ua ua  !ua" #ua$ %ua& 'ua( )ua* +uaB #' $'+&*!DVDF|D v,D &\	D
 &\D $D D D V$D F#D D D  !D" %- #D$ %D& 5/'D( )D* +D, -D. /D0 1Dr   