
    kh,                        d Z ddlmZmZ ddlZddlmZ ddlmZ 	 ddlmZm	Z	 dZ
d	d
lmZ ddgZ G d de      Z	 	 ddddee   dee   deee      dedee   dedededededededee   fdZdee   dee   deee      dededededededededee   fdZdee   dee   deee      dededededededededee   fdZy# e$ r dZ
Y w xY w) a   SGD with decoupled weight-decay.

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285

Hacked together by Ross Wightman
    )ListOptionalN)Tensor)	Optimizer)_use_grad_for_differentiable_default_to_fused_or_foreachTF   )ParamsTSGDWsgdwc                        e Zd Z	 	 	 	 	 ddddddddededededed	ed
edededee   def fdZ fdZd Z	 e
j                         dd       Z xZS )r   FN)cautioncorrected_weight_decaymaximizeforeachdifferentiableparamslrmomentum	dampeningweight_decaynesterovr   r   r   r   r   c                    |dk  rt        d|       |dk  rt        d|       |dk  rt        d|       t        ||||||||	|
|
      }|r|dk  s|dk7  rt        d      t        |   ||       y )N        zInvalid learning rate: zInvalid momentum value: zInvalid weight_decay value: )
r   r   r   r   r   r   r   r   r   r   r   z8Nesterov momentum requires a momentum and zero dampening)
ValueErrordictsuper__init__)selfr   r   r   r   r   r   r   r   r   r   r   defaults	__class__s                K/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/optim/sgdw.pyr   zSGDW.__init__   s     86rd;<<c>7zBCC#;L>JKK%#9)
 Q)q.WXX*    c                    t         |   |       | j                  D ]n  }|j                  dd       |j                  dd       |j                  dd       |j                  dd       |j                  dd        |j                  dd       p y )Nr   Fr   r   r   r   r   )r   __setstate__param_groups
setdefault)r   stategroupr!   s      r"   r%   zSGDW.__setstate__@   s    U#&& 	6EY.5u=Z/Z/Y--u5	6r#   c                 2   d}|d   D ]  }|j                   |j                  |       |j                  |j                          |j                   j                  rd}| j                  |   }d|vr|j                  d        y|j                  |d           |S )NFr   Tmomentum_buffer)gradappend	is_sparser(   )r   r)   params_with_gradgradsmomentum_buffer_listhas_sparse_gradpr(   s           r"   _init_groupzSGDW._init_groupJ   s    x 	JAvv! ''*QVV$66##&*O

1$E1(//5(//6G0HI	J r#   c                    d}|$t        j                         5   |       }ddd       | j                  D ]  }g }g }g }| j                  ||||      }t	        ||||d   |d   |d   |d   |d   |d   |d   ||d	   |d
   r| j
                  d   nd       t        ||      D ]  \  }}	| j                  |   }
|	|
d<     |S # 1 sw Y   xY w)zPerforms a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr   r   r   r   r   r   r   r   r   )
r   r   r   r   r   r   r   r2   r   max_lrr+   )torchenable_gradr&   r4   r   r    zipr(   )r   closurelossr)   r/   r0   r1   r2   r3   r+   r(   s              r"   stepz	SGDW.step^   s    ""$ !y! && 	;E!E#% "..u6FOcdO $">2z*;,z*i(z* /i(.34L.Mt}}T*SW" '**:<P&Q ;"?

1+:'(;1	;8 ?! !s   CC)MbP?r   r   r   FN)__name__
__module____qualname__r
   floatboolr   r   r%   r4   r7   no_gradr<   __classcell__)r!   s   @r"   r   r      s      !"$"$+ "+0"&*#($+$+ $+ 	$+
 $+  $+ $+ $+ %)$+ $+ d^$+ !$+L6( U]]_( (r#   )r6   r   r0   r1   r2   r   r   r   r   r   r   r   r   r6   c                t   t         rot        t        d      r_|2t        j                  j                         st        | dd      \  }}nd}|r+t        j                  j                         rt        d      d}|r%t        j                  j                         st        }nt        } || |||||||	|
|||       y)zlFunctional API that performs SGD algorithm computation.

    See :class:`~torch.optim.SGD` for details.
    "_group_tensors_by_device_and_dtypeNF)r   	use_fusedz6torch.jit.script not supported with foreach optimizers)	r   r   r   r   r   r   r2   r   r6   )
has_recent_pthasattrr   r7   jitis_scriptingr   RuntimeError_multi_tensor_sgdw_single_tensor_sgdw)r   r0   r1   r2   r   r   r   r   r   r   r   r   r6   _funcs                  r"   r   r      s    , ,PQ? 99))+9&QVbgh
7uyy--/WXXuyy--/!"!'r#   c       	         t   t        |       D ])  \  }}|	s||   n||    }||n|dz  |z  }|j                  d||z  z
         |dk7  r||   }|)t        j                  |      j	                         }|||<   n%|j                  |      j                  |d|z
         |rk|r|j                  ||      }||z  dkD  j                  |j                        }|j                  |j                         j                  d             ||z  }n|r|j                  ||      }n|}|j                  ||        , y )N         ?r   r	   alphar=   )min)	enumeratemul_r7   clonedetachadd_addtodtypediv_meanclamp_)r   r0   r1   r   r   r   r   r   r   r   r2   r6   iparamr,   wd_scalebufmasks                     r"   rO   rO      s:    f% $5'uQxeAhY2R1Wv-=

2<//0q=&q)C{kk$'..0*-$Q'"''A	M'B((3h(7Cd
Q**4::6		$))+,,,67Tz88Cx88DD

4s
#9$r#   c       	            t        |       dk(  ry t        j                  | ||gd      }|j                         D ]  \  \  }}}}|
xr t	        d |D              }|	rt        j                  |      }||n|dz  |z  }t        j                  | d||z  z
         |dk7  rg }d}t        t        |            D ]  }||   d} n|j                  ||          ! |r2t        j                  ||       t        j                  ||d|z
  	       ng }t        t        |            D ]y  }||   4t        j                  ||         j                         x}x||<   |||   <   n-||   }|j                  |      j                  ||   d|z
  	       |j                  |       { |r|rt        j                  |||	      }t        j                   ||      }t#        ||      D cg c]#  \  }}|dkD  j%                  |j&                        % }}}|D cg c]  }|j)                          }}t        j*                  |d
       t        j,                  ||       t        j                   ||      }n|rt        j                  |||	       n|}|st        j                  ||| 	       lt        t        |            D ]  }||   j                  ||   | 	         y c c}}w c c}w )Nr   T)with_indicesc              3   4   K   | ]  }|j                     y wr>   )r.   ).0r,   s     r"   	<genexpr>z%_multi_tensor_sgdw.<locals>.<genexpr>  s     8aD8as   rS   rT   Fr	   rU   r=   )lenr   rG   valuesanyr7   _foreach_neg_foreach_mul_ranger-   _foreach_add_rZ   r[   rY   r\   _foreach_add_foreach_mulr9   r^   r_   ra   _foreach_maximum__foreach_div_)r   r0   r1   r   r   r   r   r   r   r   r2   r6   grouped_tensorsdevice_paramsdevice_gradsdevice_momentum_buffer_listindicesdevice_has_sparse_gradre   bufsall_states_with_momentum_bufferrc   rf   masksmg
mask_scales                              r"   rN   rN      s    6{aBB	,-DBOQ`QgQgQi 9BM	C-'BW!0!aS8aT`8a5a --l;L2R1Wv-=FBL)@$@Aq=D.2+3:;< @.q196;3KK ;A >?@ /##D(3##D,a)mLs#>?@ %A215=!KKQ8??AB B9!<?ST[\]T^?_ :!<*//Qq9}/UKK$%  --lDQD**4>9<UL9QRA!a%AGG,RR0561affh6
6''
D9##E:6$11$>''d(K#'L%|B3G 3}-. Ba %%l1obS%ABq9BR S6s   !(KK#)NN)__doc__typingr   r   r7   r   torch.optim.optimizerr   r   r   rI   ImportError_typesr
   __all__r   rC   rB   r   rO   rN    r#   r"   <module>r      sC   "   +`M 6
n9 nn !%"&6  #'!6V6F|6 #8F#346 6 $6 6 6 6 6 6 6 6  !6r+$V+$F|+$ #8F#34+$
 +$ +$ +$ +$ +$ +$ +$ +$ +$\MBVMBF|MB #8F#34MB
 MB MB MB MB MB MB MB MB MBA  Ms   
C CC