
import math
import torch
from torch.optim.optimizer import Optimizer


class AdaBelief(Optimizer):
    """Implements AdaBelief algorithm. Modified from Adam in PyTorch

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-16)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
        decoupled_decay (boolean, optional): (default: True) If set as True, then
            the optimizer uses decoupled weight decay as in AdamW
        fixed_decay (boolean, optional): (default: False) This is used when decoupled_decay
            is set as True.
            When fixed_decay == True, the weight decay is performed as
            $W_{new} = W_{old} - W_{old} \times decay$.
            When fixed_decay == False, the weight decay is performed as
            $W_{new} = W_{old} - W_{old} \times decay \times lr$. Note that in this case, the
            weight decay ratio decreases with learning rate (lr).
        rectify (boolean, optional): (default: True) If set as True, then perform the rectified
            update similar to RAdam
        degenerated_to_sgd (boolean, optional): (default: True) If set as True, then perform SGD update
            when variance of gradient is high
    reference: AdaBelief Optimizer, adapting stepsizes by the belief in observed gradients, NeurIPS 2020

    For a complete table of recommended hyperparameters, see https://github.com/juntang-zhuang/Adabelief-Optimizer
    For example train/args for EfficientNet see these gists
      - link to train_script: https://gist.github.com/juntang-zhuang/0a501dd51c02278d952cf159bc233037
      - link to args.yaml: https://gist.github.com/juntang-zhuang/517ce3c27022b908bb93f78e4f786dc3
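
    A minimal usage sketch (``model``, ``input``, ``target``, and ``loss_fn`` below are
    placeholders for whatever training setup is in use, not part of this module):

    Example::

        optimizer = AdaBelief(model.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-16)
        optimizer.zero_grad()
        loss = loss_fn(model(input), target)
        loss.backward()
        optimizer.step()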
    """

    def __init__(
            self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, weight_decay=0, amsgrad=False,
            decoupled_decay=True, fixed_decay=False, rectify=True, degenerated_to_sgd=True):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))

        # param groups with their own betas get their own rectification buffer
        if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict):
            for param in params:
                if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]):
                    param['buffer'] = [[None, None, None] for _ in range(10)]

        defaults = dict(
            lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad,
            degenerated_to_sgd=degenerated_to_sgd, decoupled_decay=decoupled_decay,
            rectify=rectify, fixed_decay=fixed_decay, buffer=[[None, None, None] for _ in range(10)])
        super(AdaBelief, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdaBelief, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    @torch.no_grad()
    def reset(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                amsgrad = group['amsgrad']

                # State initialization
                state['step'] = 0
                # Exponential moving average of gradient values
                state['exp_avg'] = torch.zeros_like(p)
                # Exponential moving average of squared gradient differences
                state['exp_avg_var'] = torch.zeros_like(p)
                if amsgrad:
                    # Maintains max of all exp. moving avg. of sq. grad. values
                    state['max_exp_avg_var'] = torch.zeros_like(p)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                if grad.dtype in {torch.float16, torch.bfloat16}:
                    grad = grad.float()
                if grad.is_sparse:
                    raise RuntimeError(
                        'AdaBelief does not support sparse gradients, please consider SparseAdam instead')

                p_fp32 = p
                if p.dtype in {torch.float16, torch.bfloat16}:
                    p_fp32 = p_fp32.float()

                amsgrad = group['amsgrad']
                beta1, beta2 = group['betas']
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p_fp32)
                    # Exponential moving average of squared gradient differences
                    state['exp_avg_var'] = torch.zeros_like(p_fp32)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_var'] = torch.zeros_like(p_fp32)

                # perform weight decay, decoupled (AdamW-style) or classic L2
                if group['decoupled_decay']:
                    if not group['fixed_decay']:
                        p_fp32.mul_(1.0 - group['lr'] * group['weight_decay'])
                    else:
                        p_fp32.mul_(1.0 - group['weight_decay'])
                else:
                    if group['weight_decay'] != 0:
                        grad.add_(p_fp32, alpha=group['weight_decay'])

                exp_avg, exp_avg_var = state['exp_avg'], state['exp_avg_var']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                # Update first moment and the "belief" second moment (variance around exp_avg)
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                grad_residual = grad - exp_avg
                exp_avg_var.mul_(beta2).addcmul_(grad_residual, grad_residual, value=1 - beta2)

                if amsgrad:
                    max_exp_avg_var = state['max_exp_avg_var']
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_var, exp_avg_var.add_(group['eps']), out=max_exp_avg_var)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_var.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_var.add_(group['eps']).sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                if not group['rectify']:
                    # Default (Adam-style) update
                    step_size = group['lr'] / bias_correction1
                    p_fp32.addcdiv_(exp_avg, denom, value=-step_size)
                else:
                    # Rectified update, forked from RAdam
                    buffered = group['buffer'][int(state['step'] % 10)]
                    if state['step'] == buffered[0]:
                        num_sma, step_size = buffered[1], buffered[2]
                    else:
                        buffered[0] = state['step']
                        beta2_t = beta2 ** state['step']
                        num_sma_max = 2 / (1 - beta2) - 1
                        num_sma = num_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                        buffered[1] = num_sma

                        if num_sma >= 5:
                            step_size = math.sqrt(
                                (1 - beta2_t) *
                                (num_sma - 4) / (num_sma_max - 4) *
                                (num_sma - 2) / num_sma *
                                num_sma_max / (num_sma_max - 2)) / (1 - beta1 ** state['step'])
                        elif group['degenerated_to_sgd']:
                            step_size = 1.0 / (1 - beta1 ** state['step'])
                        else:
                            step_size = -1
                        buffered[2] = step_size

                    if num_sma >= 5:
                        denom = exp_avg_var.sqrt().add_(group['eps'])
                        p_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr'])
                    elif step_size > 0:
                        p_fp32.add_(exp_avg, alpha=-step_size * group['lr'])

                if p.dtype in {torch.float16, torch.bfloat16}:
                    p.copy_(p_fp32)

        return loss