
    kh              $          d Z ddlZddlmZmZ ddlZddlmZ ddlm	Z	 dej                  dej                  d	ej                  d
ej                  dededededej                  dedededededededeeef   def$dZ G d de      Zy)a   PyTorch MARS Optimizer

Code simplified from https://github.com/AGI-Arena/MARS

Paper: MARS: Unleashing the Power of Variance Reduction for Training Large Models - https://arxiv.org/abs/2411.10438

@article{yuan2024mars,
  title={MARS: Unleashing the Power of Variance Reduction for Training Large Models},
  author={Yuan, Huizhuo and Liu, Yifeng and Wu, Shuang and Zhou, Xun and Gu, Quanquan},
  journal={arXiv preprint arXiv:2411.10438},
  year={2024}
}
    N)OptionalTuple)	Optimizer   )ParamsTpgradexp_avg
exp_avg_sqlrweight_decaybeta1beta2	last_gradepsstepgamma	mars_type
is_grad_2doptimize_1dlr_1d_factorbetas_1dcautionc                 l   |s|rd|z
  }|
dk(  r|}nH||z
  j                  |||z  z        j                  |      }t        j                  |      }|dkD  r||z  }|j                  |      j                  ||       |rU||z  dkD  j	                  |j
                        }|j                  |j                         j                  d             ||z  }|dk(  r|j                  |      j                  ||d|z
         d||
z  z
  }d||
z  z
  }|j                         t        j                  |      z  j                  |	      }| |z  ||z  j                  |      z   }n|d	k(  r| |z  |j                         z   }nJ | j                  ||        ||fS |\  }}|j                  |      j                  |d|z
         |j                  |      j                  ||d|z
         d||
z  z
  }d||
z  z
  }|j                         t        j                  |      z  j                  |	      }|rU||z  dkD  j	                  |j
                        }|j                  |j                         j                  d             ||z  }| |z  ||z  j                  |      z   }| j                  |||z          ||fS )
N      ?r   )alphar   gMbP?)minadamw)valuelion)mul_add_torchnormtodtypediv_meanclamp_addcmul_sqrtmathsign)r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   one_minus_beta1c_tc_t_normmaskbias_correction1bias_correction2denomupdatebeta1_1dbeta2_1ds                               K/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/optim/mars.py_mars_single_tensor_stepr9      s   * ju*19C)#))%5?3J*KLQQRVWCzz#H"}HnU  O <dNQ&**4::6DIIdiik((T(23nGOOE"++CBJ+G"Ud]2"Ud]2__&3C)DDJJ3OE%3C)C(I(I%(PPF& %6F5	vbS! J &(X##DX#>!**4R(]*KT!11T!11"TYY/?%@@FFsKdNQ&**4::6DIIdiik((T(23nG\!W/?%?$E$Ee$LL	vrL012J    c                        e Zd ZdZ	 	 	 	 	 	 	 	 	 	 ddededeeef   dedededed	ed
ede	eeef      def fdZ
 fdZ ej                         dd       Z xZS )Marsz MARS Optimizer

    Paper: MARS: Unleashing the Power of Variance Reduction for Training Large Models
        https://arxiv.org/abs/2411.10438

    paramsr   betasr   r   r   r   r   r   r   r   c                    d|k  st        dj                  |            d|k  st        dj                  |            d|d   cxk  rdk  sn t        dj                  |d               d|d   cxk  rdk  sn t        dj                  |d               |d	v sJ d
       t        ||||||||	|
xs ||
      }t        t        |   ||       y )N        zInvalid learning rate: {}zInvalid epsilon value: {}r   r   z%Invalid beta parameter at index 0: {}r   z%Invalid beta parameter at index 1: {})r   r    zMARS type not supported)
r   r>   r   r   r   r   r   r   r   r   )
ValueErrorformatdictsuperr<   __init__)selfr=   r   r>   r   r   r   r   r   r   r   r   defaults	__class__s                r8   rE   zMars.__init__b   s     by8??CDDcz8??DEEeAh$$DKKERSHUVVeAh$$DKKERSHUVV--H/HH-%#%&
 	dD"684r:   c                 r    t         t        |   |       | j                  D ]  }|j	                  dd        y )Nr   F)rD   r<   __setstate__param_groups
setdefault)rF   stategrouprH   s      r8   rJ   zMars.__setstate__   s7    dD&u-&& 	/EY.	/r:   c                    d}|$t        j                         5   |       }ddd       | j                  D ]"  }|d   D ]  }|j                  |j                  }|j                  rt        d      | j                  |   }t        |      dk  rMd|d<   t        j                  |      |d<   t        j                  |      |d<   t        j                  |      |d	<   |dxx   dz  cc<   |d   }|d   }|d	   }	|d   }
|d
   }|d   }|d   \  }}|j                  dk\  }t        ||||	|||||
|d   ||d   |d   ||d   |d   |d   |d          ||d<    % |S # 1 sw Y   >xY w)zPerforms a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr=   zJAdam does not support sparse gradients, please consider SparseAdam insteadr   r   r   r
   r   r   r   r   r>      r   r   r   r   r   r   r   )r   r   r   r   r   r   )r#   enable_gradrK   r	   	is_sparseRuntimeErrorrM   len
zeros_likendimr9   )rF   closurelossrN   r   r	   rM   r   r
   r   r   r   wdr   r   r   s                   r8   r   z	Mars.step   s    ""$ !y! && 3	*E8_ 2*66>vv>>&'stt

1u:?$%E&M','7'7':E)$).)9)9!)<E+&*/*:*:1*=E,'f"V}	*"<0
!+.	4[>*$W~u!YY!^
 )%L'N#K0) %m 4!&~!6":.!),%* &*k"e2*3	*j q! !s   EE')
g~jth?)g?gGz?g:0yE>r@   g?r   Fr   NF)N)__name__
__module____qualname____doc__r   floatr   strboolr   rE   rJ   r#   no_gradr   __classcell__)rH   s   @r8   r<   r<   [   s     )4"$ $ %"%6:!$5$5 $5 &	$5
 $5  $5 $5 $5 $5  $5 uUE\23$5 $5L/
 U]]_A Ar:   r<   )r]   r,   typingr   r   r#   torch.optim.optimizerr   _typesr   Tensorr^   intr_   r`   r9   r<    r:   r8   <module>ri      s     "  + ?<<?ll? ? LL	?
 ? ? ? ? <<? ? ? ? ? ? ?  !?" u%#?$ %?Dt9 tr:   