""" PyTorch impl of LaProp optimizer

Code simplified from https://github.com/Z-T-WANG/LaProp-Optimizer, MIT License

Paper: LaProp: Separating Momentum and Adaptivity in Adam, https://arxiv.org/abs/2002.04839

@article{ziyin2020laprop,
  title={LaProp: a Better Way to Combine Momentum with Adaptive Gradient},
  author={Ziyin, Liu and Wang, Zhikang T and Ueda, Masahito},
  journal={arXiv preprint arXiv:2002.04839},
  year={2020}
}

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285
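
Example (a minimal usage sketch; ``model``, ``criterion``, ``x``, and ``y`` are
assumed to be a standard PyTorch model, loss fn, and batch, not part of this module):

    optimizer = LaProp(model.parameters(), lr=4e-4, betas=(0.9, 0.999))

    optimizer.zero_grad()
    loss = criterion(model(x), y)
    loss.backward()
    optimizer.step()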

"""
from typing import Tuple

from torch.optim import Optimizer
import torch

from ._types import ParamsT


class LaProp(Optimizer):
    """ LaProp Optimizer

    Paper: LaProp: Separating Momentum and Adaptivity in Adam, https://arxiv.org/abs/2002.04839
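
    The core update, sketched from step() below (bias correction and weight
    decay omitted for brevity):

        exp_avg_sq = beta2 * exp_avg_sq + (1 - beta2) * grad * grad
        exp_avg    = beta1 * exp_avg + (1 - beta1) * lr * grad / (exp_avg_sq.sqrt() + eps)
        param      = param - exp_avg

    Unlike Adam, each gradient is normalized by the second-moment estimate
    *before* momentum accumulation, separating adaptivity from momentum.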
    paramslrbetasepsweight_decaycautioncorrected_weight_decayc                    d|k  st        dj                  |            d|k  st        dj                  |            d|d   cxk  rdk  sn t        dj                  |d               d|d   cxk  rdk  sn t        dj                  |d               t        ||||||	      }t        t        |   ||       y )
N        zInvalid learning rate: {}zInvalid epsilon value: {}r         ?z%Invalid beta parameter at index 0: {}r   z%Invalid beta parameter at index 1: {})r
   r   r   r   r   r   )
ValueErrorformatdictsuperr   __init__)
selfr	   r
   r   r   r   r   r   defaults	__class__s
            M/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/optim/laprop.pyr   zLaProp.__init__    s     by8??CDDcz8??DEEeAh$$DKKERSHUVVeAh$$DKKERSHUVV%#9
 	fd$VX6    c                     t         |   |       | j                  D ]&  }|j                  dd       |j                  dd       ( y )Nr   Fr   )r   __setstate__param_groups
setdefault)r   stategroupr   s      r   r   zLaProp.__setstate__<   sF    U#&& 	>EY.5u=	>r   c                    d}|$t        j                         5   |       }ddd       | j                  D ];  }|d   D ]/  }|j                  |j                  }|j                  rt        d      | j                  |   }t        |      dk(  r?d|d<   t        j                  |      |d<   d|d<   d|d	<   t        j                  |      |d
<   |d   |d
   }}|d   \  }	}
|dxx   dz  cc<   d|
z
  }d|	z
  }|j                  |
      j                  |||       |d   |	z  ||d   z  z   |d<   |d	   |
z  |z   |d	<   |d   dk7  r|d   |d   z  nd}|d	   }d|z  }|j                  |      j                         j                  |d         }||z  }|j                  |	      j                  ||d   |z         |d   rU||z  dkD  j                  |j                        }|j!                  |j#                         j%                  d             ||z  }|j                  ||        |d   dk7  s|d   r|d   dz  | j&                  d   z  }n|d   }|j                  || |d   z         2 > |S # 1 sw Y   WxY w)zPerforms a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr	   z(LaProp does not support sparse gradientsr   stepexp_avgr   exp_avg_lr_1exp_avg_lr_2
exp_avg_sqr   r   )valuer
   r   r   )alphar   gMbP?)minr   r      )torchenable_gradr   grad	is_sparseRuntimeErrorr!   len
zeros_likemul_addcmul_divsqrt_add_todtypediv_meanclamp_r   )r   closurelossr"   pr/   r!   r%   r(   beta1beta2one_minus_beta2one_minus_beta1bias_correction1bias_correction2	step_sizedenomstep_of_this_gradmaskwd_scales                       r   r$   zLaProp.stepB   s    ""$ !y! && 8	GE8_ 7G66>vv>>&'QRR

1 u:?$%E&M','7'7':E)$,.E.),.E.)*/*:*:1*=E,'&+I&6l8K$W~uf""#e)"#e) &//d//R(-n(=(EZ_`dZeHe(en%(-n(=(E(Wn% KPPT+Y[J[5#85;#Fac #(#8  00	"'78>>@EEeElS$(5L!U#(():%+P_B_(`##dNQ.224::>DIIdiik00T0:;%nGwyj1(A-56#(;!#3dmmD6I#I#(;FF1XIn0E$EFFo7G8	Gt {! !s   I66J )g-C6:?)g?g+?gV瞯<r   FF)N)__name__
__module____qualname____doc__r   floatr   boolr   r   r-   no_gradr$   __classcell__)r   s   @r   r   r      s     )5"$!+077 7 &	7
 7  7 7 %)78> U]]_F Fr   r   )	rO   typingr   torch.optimr   r-   _typesr   r    r   r   <module>rX      s&   $  !  nY nr   