
    kh&                     r    d Z ddlZddlmZmZ ddlZddlmZ  G d dej                  j                        Z
y)z Adafactor Optimizer

Lifted from https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py

Modified by Ross Wightman to fix some issues with factorization dims for non nn.Linear layers

Original header/copyright below.
    N)OptionalTuple   )ParamsTc                        e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 ddedee   dededededeeeef      d	ed
edede	def fdZ
 fdZed        Zedd       Zed        Zd Z ej$                         dd       Z xZS )	Adafactora  Implements Adafactor algorithm.

    This implementation is based on: `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost`
    (see https://arxiv.org/abs/1804.04235)

    Note that this optimizer internally adjusts the learning rate depending on the
    *scale_parameter*, *relative_step* and *warmup_init* options.

    To use a manual (external) learning rate schedule you should set `scale_parameter=False` and
    `relative_step=False`.

    Ags:
        params: iterable of parameters to optimize or dicts defining parameter groups
        lr: external learning rate
        eps: regularization constants for square gradient and parameter scale respectively
        eps_scale: regularization constants for parameter scale respectively
        clip_threshold: threshold of root-mean-square of final gradient update
        decay_rate: coefficient used to compute running averages of square gradient
        beta1: coefficient used for computing running averages of gradient
        weight_decay: weight decay
        scale_parameter: if True, learning rate is scaled by root-mean-square of parameter
        warmup_init: time-dependent learning rate computation depends on whether warm-up initialization is being used
    paramslreps	eps_scaleclip_threshold
decay_ratebetasweight_decayscale_parameterwarmup_initmin_dim_size_to_factorcautionc                     | }|
r|st        d      |d n|d   }t        ||||||||	||
||      }t        t        |   ||       y )Nz'warmup_init requires relative_step=Truer   )r
   r   r   r   r   beta1r   r   relative_stepr   r   r   )
ValueErrordictsuperr   __init__)selfr	   r
   r   r   r   r   r   r   r   r   r   r   r   r   defaults	__class__s                   P/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/optim/adafactor.pyr   zAdafactor.__init__.   sm     }FGG58)!%+'##9
 	i'9    c                     t         |   |       | j                  D ]&  }|j                  dd       |j                  dd       ( y )Nr   Fr      )r   __setstate__param_groups
setdefault)r   stategroupr   s      r   r#   zAdafactor.__setstate__R   sF    U#&& 	;EY.5r:	;r    c                     | d   rU| d   rd|d   z  nd}t        |dt        j                  |d         z        }d}| d   rt        | d   |d	         }||z  | d
<   | d
   S )Nr   r   gư>stepg{Gz?      ?r   r   RMSr
   )minmathsqrtmax)param_groupparam_statemin_steplr_tparam_scales        r   _get_lrzAdafactor._get_lrX   s    '5@5Otk&11UYHxtyyV1D'E!EFDK,-!+k":K<NO ${ 2K4  r    c                     | d   d u}d }t        |      }|dkD  r|d   |kD  r|d   |kD  rd}||fS |dk\  r|d   |kD  r|d   |kD  r
|dz
  |dz
  f}||fS )Nr      r   r   )r   r   )len)r0   param_shapemin_size_to_factoruse_first_momentfactoredndims         r   _get_optionszAdafactor._get_optionsc   s    &w/t;;
 !8A);;AQc@cH
 )))	 QY;r?-??KPROVhDhax)H)))r    c                 L    | j                  d      | j                         dz  z  S )Nr7   g      ?)normnumel)tensors    r   _rmszAdafactor._rmsu   s     {{1~3!677r    c                     ||j                  |d      z  j                         j                  |      }|j                  |      j                         }t	        j
                  ||      S )NT)dimkeepdim)meanrsqrt_	unsqueezersqrttorchmul)r   exp_avg_sq_rowexp_avg_sq_coldim_coldim_rowr_factorc_factors          r   _approx_sq_gradzAdafactor._approx_sq_grady   s[    "^%8%8Wd%8%SS[[]gghop!++G4::<yy8,,r    c                 	   d}|$t        j                         5   |       }ddd       | j                  D ]  }|d   D ]  }|j                  |j                  }|j                  t         j
                  t         j                  hv r|j                         }|j                  rt        d      | j                  |   }| j                  ||j                  |d         \  }}t        |      dk(  rd|d<   |rt        j                  |      |d<   |y|\  }	}
d	 }t        j                   ||j                  |
            j!                  |      |d
<   t        j                   ||j                  |	            j!                  |      |d<   nt        j                  |      |d<   d|d<   na|r|d   j!                  |      |d<   |/|d
   j!                  |      |d
<   |d   j!                  |      |d<   n|d   j!                  |      |d<   |}|j                  t         j
                  t         j                  hv r|j                         }|dxx   dz  cc<   | j#                  |      |d<   | j%                  ||      }dt'        j(                  |d   |d         z
  }|dz  |d   z   }||\  }	}
|d
   }|d   }|j+                  |      j-                  |j/                  |
      d|z
         |j+                  |      j-                  |j/                  |	      d|z
         | j1                  |||	|
      }|j+                  |       nI|d   }|j+                  |      j-                  |d|z
         |j3                         j+                  |      }|j5                  | j#                  |      |d   z  j7                  d             |j+                  |       |r|d   }|j+                  |d         j-                  |d|d   z
         |d   rV||z  dkD  j!                  |j                        }|j5                  |j/                         j7                  d             ||z  }n|}|d   dk7  r|j-                  ||d    |z         |j-                  |        |j                  t         j
                  t         j                  hv s|j9                  |         |S # 1 sw Y   xY w)zPerforms a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model and returns the loss.
        Nr	   z,Adafactor does not support sparse gradients.r   )r<   r   r)   exp_avgc                     | d | | |dz   d  z   S )Nr    )shaperG   s     r   _remove_dimz#Adafactor.step.<locals>._remove_dim   s    #(#;sQwx#@@r    rO   rP   
exp_avg_sqr+   r   r*   r   r7   r   )rG   )alphar   )r,   r   r   MbP?r   )rM   enable_gradr$   graddtypefloat16bfloat16float	is_sparseRuntimeErrorr&   r@   rZ   r:   
zeros_likezerostorE   r5   r-   powmul_add_rI   rU   rL   div_clamp_copy_)r   closurelossr'   pr`   r&   factored_dimsr=   rQ   rR   r[   p_fp32r3   beta2tupdaterO   rP   r\   rW   masks                        r   r)   zAdafactor.step   s    ""$ !y! && Y	$E8_ X$66>vv::%--!@@::<D>>&'UVV

1262C2CJJ',-E'F 3D 3// u:?$%E&M'+0+;+;D+Ai($0+8(A27++k$**V]>^2_2b2bcg2h./27++k$**V]>^2_2b2bcg2h./.3.>.>t.Dl+#$E%L'+0+;+>+>t+Di($0278H2I2L2LT2R./278H2I2L2LT2R./.3L.A.D.DT.Jl+77u}}enn==#\\^Ff"#yy0e||E51txxfu\7JKKU5\1 ,'4$GW%*+;%<N%*+;%<N"''/44V[[W[5MUX[aUa4b"''/44V[[W[5MUX[aUa4b "11..RY[bcFKK%!&|!4JOOF+00sV|0L'--/44T:FTYYv.7G1HHPPUXPYZD!##I.GLLw055fAgDV5WY' '$ 266tzzB		$))+"4"4"4">?!(4!((A-KKu^/D.Dt.KKLVG$77u}}enn==GGFOqX$Y	$v }! !s   SS()NgKH9r^   r*   gNg        TFr"   F)r"   )N)__name__
__module____qualname____doc__r   r   rd   r   boolintr   r#   staticmethodr5   r@   rE   rU   rM   no_gradr)   __classcell__)r   s   @r   r   r      s   6 #'#$' $37"%$( %*,!":": ": 	":
 ": "": ": E%,/0":  ": "": ": %(": ":H; ! ! * *" 8 8- U]]_e er    r   )r{   r-   typingr   r   rM   _typesr   optim	Optimizerr   rY   r    r   <module>r      s0     "  P%% Pr    