
""" PyTorch Lamb optimizer w/ behaviour similar to NVIDIA FusedLamb

This optimizer code was adapted from the following (starting with latest)
* https://github.com/HabanaAI/Model-References/blob/2b435114fe8e31f159b1d3063b8280ae37af7423/PyTorch/nlp/bert/pretraining/lamb.py
* https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/Transformer-XL/pytorch/lamb.py
* https://github.com/cybertronai/pytorch-lamb

Use FusedLamb if you can (GPU). The reason for including this variant of Lamb is to have a version that is
similar in behaviour to APEX FusedLamb if you aren't using NVIDIA GPUs or cannot install/use APEX.

In addition to some cleanup, this Lamb impl has been modified to support PyTorch XLA and has been tested on TPU.

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285
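
A rough sketch of the added behaviours as implemented in step() below (informal notation, not a formal statement):
    caution:                 mask = (update * grad > 0);  update <- update * mask / max(mean(mask), 1e-3)
    corrected weight decay:  p <- p - (lr**2 / max_lr) * weight_decay * p   (decoupled_decay only)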

Original copyrights for above sources are below.

Modifications Copyright 2021 Ross Wightman
    N)OptionalTuple)	Optimizer   )ParamsTc                        e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 ddedededeeef   dededed	ee   d
ededededef fdZ	 fdZ
d Z ej                         dd       Z xZS )Lamba  Implements a pure pytorch variant of FuseLAMB (NvLamb variant) optimizer from apex.optimizers.FusedLAMB
    reference: https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/Transformer-XL/pytorch/lamb.py

    LAMB was proposed in:
    - Large Batch Optimization for Deep Learning - Training BERT in 76 minutes:  https://arxiv.org/abs/1904.00962
    - On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ

    Args:
        params: Iterable of parameters to optimize or dicts defining parameter groups.
        lr: Learning rate
        betas: Coefficients used for computing running averages of gradient and its norm.
        eps: Term added to the denominator to improve numerical stability.
        weight_decay: Weight decay
        grad_averaging: Whether to apply (1 - beta1) to grad when calculating running averages of gradient.
        max_grad_norm: Value used to clip global grad norm.
        trust_clip: Enable LAMBC trust ratio clipping.
        always_adapt: Apply the layer-wise adaptive learning rate even when weight decay is 0.0.
        caution: Apply caution, masking update elements whose sign disagrees with the gradient
            (see 'Cautious Optimizers', https://arxiv.org/abs/2411.16085).
        decoupled_decay: Apply decoupled weight decay (decay applied directly to the parameters instead of
            being added to the update).
        corrected_weight_decay: Apply corrected weight decay scaling (lr**2 / max_lr) when using decoupled_decay
            (see https://arxiv.org/abs/2506.02285).
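
    Example:
        A minimal usage sketch (assumes a typical PyTorch training loop; ``model``, ``criterion``, ``inputs``
        and ``targets`` are placeholders for your own module, loss fn, and data, not part of this module)::

            optimizer = Lamb(model.parameters(), lr=1e-3, weight_decay=0.01, max_grad_norm=1.0)
            # optional variants: caution=True enables cautious updates, decoupled_decay=True switches to
            # AdamW-style decoupled weight decay

            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()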
    """
    def __init__(
            self,
            params: ParamsT,
            lr: float = 1e-3,
            bias_correction: bool = True,
            betas: Tuple[float, float] = (0.9, 0.999),
            eps: float = 1e-6,
            weight_decay: float = 0.01,
            grad_averaging: bool = True,
            max_grad_norm: Optional[float] = 1.0,
            trust_clip: bool = False,
            always_adapt: bool = False,
            caution: bool = False,
            decoupled_decay: bool = False,
            corrected_weight_decay: bool = False,
    ):
        defaults = dict(
            lr=lr,
            bias_correction=bias_correction,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            grad_averaging=grad_averaging,
            max_grad_norm=max_grad_norm,
            trust_clip=trust_clip,
            always_adapt=always_adapt,
            caution=caution,
            decoupled_decay=decoupled_decay,
            corrected_weight_decay=corrected_weight_decay,
        )
        super().__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)
        for group in self.param_groups:
            group.setdefault('caution', False)
            group.setdefault('decoupled_decay', False)
            group.setdefault('corrected_weight_decay', False)

    def _get_clip_grad_norm(self):
        max_grad_norm = self.defaults['max_grad_norm']
        if max_grad_norm is None:
            return None

        norms = []
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instead.')
                norms.append(torch.linalg.vector_norm(grad))
        global_norm = torch.linalg.vector_norm(torch.stack(norms))
        clip_global_norm = (global_norm / max_grad_norm).clamp_(min=1.0)
        return clip_global_norm

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        clip_grad_norm = self._get_clip_grad_norm()  # None if disabled

        for group in self.param_groups:
            bias_correction = 1 if group['bias_correction'] else 0
            beta1, beta2 = group['betas']
            grad_averaging = 1 if group['grad_averaging'] else 0
            beta3 = 1 - beta1 if grad_averaging else 1.0

            # assume the same step across the group to simplify things
            if 'step' in group:
                group['step'] += 1
            else:
                group['step'] = 1

            if bias_correction:
                bias_correction1 = 1 - beta1 ** group['step']
                bias_correction2 = 1 - beta2 ** group['step']
            else:
                bias_correction1, bias_correction2 = 1.0, 1.0

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                if clip_grad_norm is not None:
                    grad.div_(clip_grad_norm)

                state = self.state[p]
                if len(state) == 0:
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                # Decay the first and second moment running average coefficients
                exp_avg.mul_(beta1).add_(grad, alpha=beta3)  # m_t
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)  # v_t

                denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                update = (exp_avg / bias_correction1).div_(denom)

                if group['caution']:
                    # Apply caution as per 'Cautious Optimizers' - https://arxiv.org/abs/2411.16085
                    mask = (update * grad > 0).to(grad.dtype)
                    mask.div_(mask.mean().clamp_(min=1e-3))
                    update.mul_(mask)

                weight_decay = group['weight_decay']
                if weight_decay != 0:
                    if group.get('decoupled_decay', False):
                        if group['corrected_weight_decay']:
                            # corrected weight decay (lr**2 / max_lr), https://arxiv.org/abs/2506.02285
                            wd_scale = group['lr'] ** 2 / self.defaults['lr']
                        else:
                            wd_scale = group['lr']
                        p.add_(p, alpha=-wd_scale * weight_decay)
                    else:
                        update.add_(p, alpha=weight_decay)

                if weight_decay != 0 or group['always_adapt']:
                    # Layer-wise LR adaptation. By default, skip adaptation on parameters that are
                    # excluded from weight decay, unless always_adapt == True, then always enabled.
                    w_norm = p.norm(2.0)
                    g_norm = update.norm(2.0)
                    trust_ratio = w_norm / g_norm
                    # FIXME nested where() required since logical and/or are not working in PT XLA
                    # Set the ratio to 1.0 (no change) if either weight norm or grad norm is zero
                    trust_ratio = torch.where(
                        w_norm > 0,
                        torch.where(g_norm > 0, trust_ratio, 1.0),
                        1.0,
                    )
                    if group['trust_clip']:
                        # LAMBC trust clipping, upper bound fixed to one
                        trust_ratio = torch.clamp(trust_ratio, max=1.0)
                    update.mul_(trust_ratio)

                p.add_(update, alpha=-group['lr'])

        return loss