
    kh"                        d Z ddlmZmZmZ ddlZddlmZ ddlm	Z	  G d de      Z
	 	 ddd	d
eej                     deej                     deej                     dedededededededee   fdZd
eej                     deej                     deej                     dededededededee   fdZd
eej                     deej                     deej                     dededededededee   fdZy)ah   Lion Optimizer
Paper: `Symbolic Discovery of Optimization Algorithms` - https://arxiv.org/abs/2302.06675
Original Impl: https://github.com/google/automl/tree/master/lion

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285
    )ListOptionalTupleN)	Optimizer   )ParamsTc                        e Zd ZdZ	 	 	 	 	 	 	 ddededeeef   dedededed	ee   f fd
Z	 fdZ
 ej                         dd       Z xZS )LionzImplements Lion algorithm.paramslrbetasweight_decaycautioncorrected_weight_decaymaximizeforeachc	           	      <   d|k  st        dj                  |            d|d   cxk  rdk  sn t        dj                  |d               d|d   cxk  rdk  sn t        dj                  |d               t        |||||||      }	t        
|   ||	       y	)
a  Initialize the hyperparameters.

        Args:
            params: iterable of parameters to optimize or dicts defining parameter groups
            lr: learning rate
            betas: coefficients used for computing running averages of gradient and its square
            weight_decay: weight decay coefficient
            caution: apply caution
            corrected_weight_decay: apply corrected weight decay (lr**2 / max_lr)
                zInvalid learning rate: {}r   g      ?z%Invalid beta parameter at index 0: {}r   z%Invalid beta parameter at index 1: {})r   r   r   r   r   r   r   N)
ValueErrorformatdictsuper__init__)selfr   r   r   r   r   r   r   r   defaults	__class__s             K/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/optim/lion.pyr   zLion.__init__"   s    , by8??CDDeAh$$DKKERSHUVVeAh$$DKKERSHUVV%#9
 	*    c                     t         |   |       | j                  D ]J  }|j                  dd       |j                  dd       |j                  dd       |j                  dd        L y )Nr   Fr   r   r   )r   __setstate__param_groups
setdefault)r   stategroupr   s      r   r    zLion.__setstate__I   sf    U#&& 	.EY.5u=Z/Y-		.r   c                    d}|$t        j                         5   |       }ddd       | j                  D ]  }g }g }g }|d   \  }}|d   D ]  }	|	j                  |j	                  |	       |	j                  j
                  rt        d      |j	                  |	j                         | j                  |	   }
t        |
      dk(  r(t        j                  |	t         j                        |
d<   |j	                  |
d           t        ||||||d   |d	   |d
   |d   |d   |d   r| j                  d   nd       
 |S # 1 sw Y   #xY w)zPerforms a single optimization step.

        Args:
            closure: A closure that reevaluates the model and returns the loss.

        Returns:
            the loss.
        Nr   r   z&Lion does not support sparse gradientsr   )memory_formatexp_avgr   r   r   r   r   r   )beta1beta2r   r   r   r   r   max_lr)torchenable_gradr!   gradappend	is_sparseRuntimeErrorr#   len
zeros_likepreserve_formatlionr   )r   closurelossr$   params_with_gradgradsexp_avgsr(   r)   pr#   s              r   stepz	Lion.stepQ   si    ""$ !y! && "	E!EH >LE58_ 266> ''*66##&'OPPQVV$

1 u:?','7'7I^I^'_E)$i 012   ;">2i(z*i(.34L.Mt}}T*SW-"	H O! !s   EE)g-C6?)g?gGz?r   FFFN)N)__name__
__module____qualname____doc__r   floatr   boolr   r   r    r+   no_gradr;   __classcell__)r   s   @r   r
   r
      s    %
 )4"%!+0"&*%+%+ %+ &	%+
  %+ %+ %)%+ %+ d^%+N. U]]_2 2r   r
   )r*   r   r8   r9   r   r   r(   r)   r   r   r   r*   c                b   |:	 |	 xs4 dt         j                  j                  j                  j	                         v }|r)t         j
                  j                         rt        d      |r%t         j
                  j                         st        }nt        } || |||||||	||

       y#  d}Y qxY w)z=Functional API that performs Lion algorithm computation.
    NScalarFz6torch.jit.script not supported with foreach optimizers)r(   r)   r   r   r   r   r*   )
r+   opsaten_foreach_maximum_	overloadsjitis_scriptingr0   _multi_tensor_lion_single_tensor_lion)r   r8   r9   r   r   r(   r)   r   r   r   r*   funcs               r   r4   r4      s    $ 	!k]X1Q1Q1[1[1]%]G 599))+STTuyy--/!"!	Gs   9B( (B.c                   t        |       D ]N  \  }
}|s||
   n||
    }||
   }t        j                  |      r?t        j                  |      }t        j                  |      }t        j                  |      }|	|n|dz  |	z  }|j	                  d||z  z
         |j                  |      j                  |d|z
        j                         }|ra||z  dkD  j                  |j                        }|j                  |j                         j                  d             |j	                  |       |j                  ||        |j                  |d|z
         Q y )N   r   alphar   MbP?)min)	enumerater+   
is_complexview_as_realmul_muladd_sign_todtypediv_meanclamp_lerp_)r   r8   r9   r(   r)   r   r   r   r   r*   iparamr-   r'   wd_scaleupdatemasks                    r   rM   rM      sA    f% '5'uQxeAhY1+E"%%d+D((1G&&u-E  2R1Wv-=

1x,../ U#((QY(?EEGTMA%))$**5DIIdiik((T(23KK

6"
% 	dAI&3'r   c                   t        |       dk(  ry |rt        j                  t        |            }|D 
cg c].  }
t        j                  |
      rt        j
                  |
      n|
0 }}
|D 
cg c].  }
t        j                  |
      rt        j
                  |
      n|
0 }}
| D 
cg c].  }
t        j                  |
      rt        j
                  |
      n|
0 } }
|	|n|dz  |	z  }t        j                  | d||z  z
         t        j                  ||      }t        j                  ||d|z
         |D cg c]  }|j                          }}|rt        j                  ||      }t        ||      D cg c]#  \  }}|dkD  j                  |j                        % }}}|D cg c]  }|j                          }}t        j                  |d       t        j                  ||       t        j                  ||       t        j                  | ||        t        j                  ||       t        j                  ||d|z
         y c c}
w c c}
w c c}
w c c}w c c}}w c c}w )Nr   rP   r   rQ   rS   )r1   r+   _foreach_negtuplerV   rW   _foreach_mul__foreach_mul_foreach_add_r[   zipr\   r]   r_   rH   _foreach_div_)r   r8   r9   r(   r)   r   r   r   r   r*   xrd   updatesumasksmg
mask_scales                     r   rL   rL      s    6{a""5<0JOPQe&6&6q&9U"q@PEPMUV)9)9!)<""1%!CVHVKQRau'7'7':e  #ARFR ^rq6)9H	H|$; ;<   51G	a%i8")*Qqwwy*G*""7E214UE1BCA!a%AGG$CC(-.1affh.
.
D1E:.GU+	s3 
%(	%q5y95 QVR +
 D.s#   3I-3I&3I8I8(I!'I')FN)r?   typingr   r   r   r+   torch.optim.optimizerr   _typesr   r
   TensorrA   r@   r4   rM   rL    r   r   <module>r{      s  , ) (  + e9 e\ , #',U\\",ELL!, u||$, , , , , , , , ,^&'U\\"&'ELL!&' u||$&'
 &' &' &' &' &' &' &'R-:U\\"-:ELL!-: u||$-:
 -: -: -: -: -: -: -:r   