""" RMSProp modified to behave like Tensorflow impl

Originally cut & paste from PyTorch RMSProp
https://github.com/pytorch/pytorch/blob/063946d2b3f3f1e953a2a3b54e0b34f1393de295/torch/optim/rmsprop.py
Licensed under BSD-Clause 3 (ish), https://github.com/pytorch/pytorch/blob/master/LICENSE

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285

Modifications Copyright 2021 Ross Wightman
"""
import torch
from torch.optim import Optimizer

from ._types import ParamsT


class RMSpropTF(Optimizer):
    """Implements RMSprop algorithm (TensorFlow style epsilon)


    NOTE: This is a direct cut-and-paste of PyTorch RMSprop with eps applied before sqrt
    and a few other modifications to more closely match Tensorflow behaviour for matching hyper-params.

    Noteworthy changes include:
    1. Epsilon applied inside square-root (see the comparison sketch after this list)
    2. square_avg initialized to ones
    3. LR scaling of update accumulated in momentum buffer
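
    As an illustration of changes (1) and (3), the update denominator and the
    momentum accumulation differ from stock PyTorch RMSprop roughly as follows
    (a simplified sketch of the corresponding lines in ``step`` below, with
    ``group['eps']`` etc. shortened to bare names)::

        # This class (TF-style): eps inside the sqrt, lr folded into the buffer
        avg = square_avg.add(eps).sqrt_()
        buf.addcdiv_(grad, avg, value=lr)

        # Stock PyTorch RMSprop: eps outside the sqrt, lr applied to the update
        avg = square_avg.sqrt().add_(eps)
        buf.addcdiv_(grad, avg)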

    Proposed by G. Hinton in his
    `course <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.

    The centered version first appears in `Generating Sequences
    With Recurrent Neural Networks <https://arxiv.org/pdf/1308.0850v5.pdf>`_.

    Args:
        params: iterable of parameters to optimize or dicts defining parameter groups
        lr: learning rate
        momentum: momentum factor
        alpha: smoothing (decay) constant
        eps: term added to the denominator to improve numerical stability
        centered: if ``True``, compute the centered RMSProp, where the gradient is normalized by an estimate of its variance
        weight_decay: weight decay (L2 penalty) (default: 0)
        decoupled_decay: decoupled weight decay as per https://arxiv.org/abs/1711.05101
        corrected_weight_decay: apply corrected weight decay (lr**2 / max_lr) when decoupled_decay is True
        lr_in_momentum: learning rate scaling is included in the momentum buffer update as per defaults in Tensorflow
        caution: apply the caution update rule from 'Cautious Optimizers' (https://arxiv.org/abs/2411.16085)
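
    Example:
        A minimal usage sketch with direct construction (within timm this
        optimizer is normally created through the optimizer factory)::

            model = torch.nn.Linear(10, 2)
            optimizer = RMSpropTF(model.parameters(), lr=1e-2, alpha=0.9, momentum=0.9)

            loss = model(torch.randn(4, 10)).sum()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()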
    """

    def __init__(
            self,
            params: ParamsT,
            lr: float = 1e-2,
            alpha: float = 0.9,
            eps: float = 1e-10,
            weight_decay: float = 0,
            momentum: float = 0.,
            centered: bool = False,
            decoupled_decay: bool = False,
            corrected_weight_decay: bool = False,
            lr_in_momentum: bool = True,
            caution: bool = False,
    ):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= momentum:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= alpha:
            raise ValueError("Invalid alpha value: {}".format(alpha))

        defaults = dict(
            lr=lr,
            momentum=momentum,
            alpha=alpha,
            eps=eps,
            centered=centered,
            weight_decay=weight_decay,
            decoupled_decay=decoupled_decay,
            corrected_weight_decay=corrected_weight_decay,
            lr_in_momentum=lr_in_momentum,
            caution=caution,
        )
        super(RMSpropTF, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(RMSpropTF, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('momentum', 0)
            group.setdefault('centered', False)
            group.setdefault('caution', False)
            group.setdefault('corrected_weight_decay', False)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError('RMSprop does not support sparse gradients')
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['square_avg'] = torch.ones_like(p)  # PyTorch initializes to zeros
                    if group['momentum'] > 0:
                        state['momentum_buffer'] = torch.zeros_like(p)
                    if group['centered']:
                        state['grad_avg'] = torch.zeros_like(p)

                square_avg = state['square_avg']
                one_minus_alpha = 1. - group['alpha']

                state['step'] += 1

                if group['weight_decay'] != 0:
                    if group['decoupled_decay']:
                        if group['corrected_weight_decay']:
                            wd_scale = group['lr'] ** 2 / self.defaults['lr']
                        else:
                            wd_scale = group['lr']
                        p.mul_(1. - wd_scale * group['weight_decay'])
                    else:
                        grad = grad.add(p, alpha=group['weight_decay'])

                # Tensorflow order of ops for updating squared avg
                square_avg.add_(grad.pow(2) - square_avg, alpha=one_minus_alpha)
                # square_avg.mul_(alpha).addcmul_(grad, grad, value=1 - alpha)  # PyTorch original

                if group['centered']:
                    grad_avg = state['grad_avg']
                    # Also Tensorflow-like grad_avg update
                    grad_avg.add_(grad - grad_avg, alpha=one_minus_alpha)
                    # grad_avg.mul_(alpha).add_(grad, alpha=1 - alpha)  # PyTorch original
                    avg = square_avg.addcmul(grad_avg, grad_avg, value=-1).add(group['eps']).sqrt_()  # eps in sqrt
                else:
                    avg = square_avg.add(group['eps']).sqrt_()  # eps moved inside sqrt

                if group['momentum'] > 0:
                    buf = state['momentum_buffer']
                    buf.mul_(group['momentum'])

                    def _apply_caution(_m, _g):
                        # Apply caution as per 'Cautious Optimizers' - https://arxiv.org/abs/2411.16085
                        mask = (_m * _g > 0).to(_g.dtype)
                        mask.div_(mask.mean().clamp_(min=1e-3))
                        return _m * mask

                    if group['lr_in_momentum']:
                        # Tensorflow accumulates the LR scaling in the momentum buffer
                        buf.addcdiv_(grad, avg, value=group['lr'])
                        if group['caution']:
                            buf = _apply_caution(buf, grad)
                        p.add_(-buf)
                    else:
                        # PyTorch scales the param update by LR
                        buf.addcdiv_(grad, avg)
                        if group['caution']:
                            buf = _apply_caution(buf, grad)
                        p.add_(buf, alpha=-group['lr'])
                else:
                    p.addcdiv_(grad, avg, value=-group['lr'])

        return loss
g{Gz?g?g|=r   r   FFFTF)N)__name__
__module____qualname____doc__r   floatboolr   r    r8   no_gradr&   __classcell__)r   s   @r   r   r      s    B "# "$)+0#'!%:%: %: 	%:
 %:  %: %: %: "%: %)%: !%: %:N> U]]_R Rr   r   )rQ   r8   torch.optimr   _typesr   r    r   r   <module>rY      s#     ! a	 ar   