
    kh6W                     4   d Z ddlZddlZddlZddlZddlmZmZmZm	Z	m
Z
mZ ddlZddlZ	 ddlZddlZdej"                  j                  _        dej"                  j                  _        dZ	 dej,                  j.                  _        dZdd	lmZ  ej:                  e      Z	 	 	 	 dd
e de de de de dejB                  fdZ" G d dejF                  jH                        Z%ddZ&	 	 ddZ'd Z(d Z)d Z*d Z+d Z,d Z-d Z.y# e$ r dZY w xY w# e$ r dZY w xY w)a   PyTorch Implementation of the Kron (PSGD) optimizer

This is a PSGD optimizer using a Kronecker-factored preconditioner.

This impl was adapted from https://github.com/evanatyourservice/kron_torch
by Evan Walters, licensed CC-BY-4.0.

Contributions to above also made by
* Lucas Nestler, added to his https://github.com/ClashLuke/HeavyBall implementation.
* Omead Pooladzandi https://github.com/opooladz

The above work drew from https://github.com/lixilinx/psgd_torch by Xi-Lin Li

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285

This `timm` impl
* works with a wider variety of torch versions
* fixes some checkpoint save/restore (resume issues)
* adds decoupled weight-decay option
* has some refactoring, cleanup of args, default/group items
* warning about not having opt_einsum (unusable without)

    N)AnyCallableDictOptionalTupleUnionTzauto-hqFi@B    )ParamsTnmax_probmin_probdecay
flat_startreturnc                     	 t        j                  | t         j                        } |t        j                  | | |z
  z        z  }|j	                  ||       |S )a  Anneal preconditioner update probability during beginning of training.

    PSGD benefits from more preconditioner updates at the beginning of training,
    but once the preconditioner is learned the update probability can drop low.

    This schedule is an exponential anneal with a flat start. Default settings keep
    update probability at 1.0 for 200 steps then exponentially anneal down to
    `min_prob` by 4000 steps. Default settings work very well for most models and
    training regimes.
    dtype)minmax)torchtensorfloat32expclamp_)r   r   r   r   r   probs         K/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/optim/kron.pyprecond_update_prob_scheduler   7   sN    $ .Qemm,Aeii!j. 9::DKKH(K+K    c            )       l    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d dededededeeeef      de	d	e	d
ee
   dedededeej                     deej                     dededede	de	dedef( fdZ fdZ fdZdee
ef   f fdZdee
ef   ddf fdZ fdZ ej*                         d!d       Z xZS )"Krona  Implements PSGD Kron from https://github.com/lixilinx/psgd_torch.

    Args:
        params: Iterable of parameters to optimize or dicts defining parameter groups.
        lr: Learning rate.
        momentum: Momentum parameter.
        weight_decay: Weight decay.
        preconditioner_update_probability: Probability of updating the preconditioner.
            If None, defaults to a schedule that anneals from 1.0 to 0.03 by 4000 steps.
        max_size_triangular: Max size for dim's preconditioner to be triangular.
        min_ndim_triangular: Minimum number of dimensions a layer needs to have triangular preconditioners.
        memory_save_mode: 'one_diag', 'smart_one_diag', or 'all_diag', None is default
            to set all preconditioners to be triangular, 'one_diag' sets the largest
            or last dim to be diagonal per layer, and 'all_diag' sets all preconditioners to be diagonal.
        momentum_into_precond_update: whether to send momentum into preconditioner
            update instead of raw gradients.
        mu_dtype: Dtype of the momentum accumulator.
        precond_dtype: Dtype of the preconditioner.
        decoupled_decay: AdamW style decoupled weight decay
        corrected_weight_decay: apply corrected weight decay when using decoupled_decay (lr**2 / max_lr)
        flatten: Flatten dimensions instead of fully relying on expressions for higher rank params
        flatten_start_dim: Start of flatten range, defaults to 2. Seems good tradeoff for ConvNets.
        flatten_end_dim: End of flatten range, defaults to -1.
        stochastic_weight_decay: Enable random modulation of weight decay
        deterministic: Deterministic behaviour across save / load (resume). FIXME slow, needs work
    Nparamslrmomentumweight_decay!preconditioner_update_probabilitymax_size_triangularmin_ndim_triangularmemory_save_modemomentum_into_precond_update
precond_lrprecond_init_scalemu_dtypeprecond_dtypedecoupled_decaycorrected_weight_decayflattenflatten_start_dimflatten_end_dimstochastic_weight_decaydeterministicc                    t         st        j                  d       d|k  st        d|       d|cxk  rdk  sn t        d|       d|k  st        d|       t	        di d|d|d	|d
|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d|}t
        t        |   ||       i | _        t        j                  t        j                        j                  | _        t        j                  d      | _        || _        t$        rt        j&                  t(        dd      | _        t        j&                  t*        dd      | _        t        j&                  t,        dd      | _        t        j&                  t.        dd      | _        y t(        | _        t*        | _        t,        | _        t.        | _        y )NzKIt is highly recommended to have 'opt_einsum' installed for this optimizer.        zInvalid learning rate:       ?zInvalid beta parameter: zInvalid weight_decay value: r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   i9  TF)	fullgraphdynamic )has_opt_einsumwarningswarn
ValueErrordictsuperr    __init___param_exprsr   finfobfloat16tiny_tinyrandomRandomrngr4   
has_dynamocompile_calc_A_and_conjB_q_terms_precond_grad
_balance_Q)selfr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   defaults	__class__s                         r   rA   zKron.__init__m   s   . MMghby6rd;<<h$$7zBCCl";L>JKK 


 &
 /P	

 !4
 !4
 .
 *F
 "
  2
 
 (
 ,
 $:
 
  0!
" ,#
$ %<%
( 	dD"684[[055
==&* %*]]3DPT^c%dD"!MM(dERDM!&}V[!\D#mmJ$PUVDO%6D"$DM!.D(DOr   c                 j    t         |   |       | j                  D ]  }|j                  dd        y )Nr/   F)r@   __setstate__param_groups
setdefault)rP   stategrouprR   s      r   rT   zKron.__setstate__   s6    U#&& 	>E5u=	>r   c                 B    t         |          }| j                  |d<   |S )NrI   )r@   __getstate__rI   )rP   _dictrR   s     r   rZ   zKron.__getstate__   s"    $&xxer   r   c                 ^    t         |          }| j                  j                         |d<   |S N	rng_state)r@   
state_dictrI   getstate)rP   optimizer_staterR   s     r   r_   zKron.state_dict   s/    ',. (,xx'8'8':$r   r_   c                     i }d|v r|j                  d      |d<   t        | 	  |       |j                  |       d|v r| j                  j                  |d          y y r]   )popr@   load_state_dictupdaterI   setstate)rP   r_   
rng_statesrR   s      r   rd   zKron.load_state_dict   sf    
*$&0nn[&AJ{# 	
+*% *$HHj56 %r   c                 2    t         |   |       i | _        y N)r@   rT   rB   )rP   rW   rR   s     r   rT   zKron.__setstate__   s    U#r   c                 D   d }|$t        j                         5   |       }d d d        d}d}d}d}| j                  D ]  }|j                  d      }|j                  dt         j                        }	|j                  dd      }
|j                  dd       }|d   D ]2  }|j
                  |j
                  }| j                  |   }d}|d	   rt        ||d
   |d         }d}t        |      dk(  rd|d<   d|d<   t        j                  ||xs |j                        |d<   t        ||d   |d   |d   |d   |	      \  |d<   }|| j                  |<   |d   j                         }||d   j                         z  dz  }||z  }||z  }t        d |d   D              }t        d |d   D              dz  }||z  }||z  }nK|| j                  vr.t        ||d   |d   |d   |d   |	d      }|| j                  |<   n| j                  |   }|t         }t#        |      r ||d         }|dxx   dz  cc<   |d   d|z  k\  }|rd|d<   |dxx   dz  cc<   |d   }d||d   z  z
  }|d   }|j%                  |d         j'                  |d|d   z
         |!|j)                  |j+                  |             ||z  j+                  |	      }| j,                  j/                         dk  xr |}|j1                         dkD  r|r| j3                  |d          |ru|\  }}}|d   }| j4                  rLt        j6                  |j8                        }|j;                  | j,                  j=                  dd             nd }t        j>                  |j@                  ||	|j8                        } |
r|n|}!| jC                  ||!||       \  }"}#| jE                  ||"|#      }$tG        ||$      D ]  \  }%\  }&}'|&|'z
  }(|(|d    z  }(|%j1                         d!k  r3|(|%z  }(|(|&|'z   jI                  tK        d"            | jL                  z   z  }(n8t        jN                  |(      }(|(tQ        |&|'z         | jL                  z   z  }(|(|%z  }(|%jS                  |(        | jU                  |d   ||      j+                  |j                        })|)j%                  t        jV                  d#|)jY                         j[                         j]                         d$z   z  d%&             |r|)j_                  |j@                        })|d'   }*|*dk7  rx|d(   r d!| j,                  j/                         z  |*z  }*|d)   r;|d*   r|d+   d!z  | j`                  d+   z  }+n|d+   }+|j%                  d%|+|*z  z
         n|)j'                  ||*       |j'                  |)|d+           5  |dkD  r:tb        je                  d,| d-|d.d/       tb        je                  d0| d-|d.d/       |S # 1 sw Y   xY w)1Nr   r,   r-   r)   Tr%   r!   Fr0   r1   r2   stepupdate_counterr   momentum_bufferr+   r&   r'   r(   Qi   c              3   <   K   | ]  }|j                           y wri   )numel.0qs     r   	<genexpr>zKron.step.<locals>.<genexpr>  s     &EQqwwy&Es   c              3   ^   K   | ]%  }|j                         |j                         z   ' y wri   )rp   element_sizerq   s     r   rt   zKron.step.<locals>.<genexpr>  s"     $VaQWWY1A%A$Vs   +-)r   init_qr	   r#   )alphag{Gz?)devicel        )	generatorr   ry   r*      infg?g:0yE>r7   )r   r$   r3   r.   r/   r"   zPSGD Momentum buffer size: z elements, z.2fz MBzPSGD Preconditioners size: )3r   enable_gradrU   getr   gradrW   safe_flattenlen
zeros_liker   _init_Q_exprsrB   rp   rv   sumr   callablemul_add_copy_torI   rG   dimrO   r4   	Generatorry   manual_seedrandintrandnshaperL   rM   zipnormfloatrF   triu_norm_lower_boundsub_rN   clampsquaremeansqrt_viewrQ   _loggerinfo),rP   closurelosstotal_momentum_sizetotal_momentum_mbtotal_precond_sizetotal_precond_mbrX   r,   r-   r)   update_probpr   rW   	flattenedexprsmomentum_sizemomentum_mbprecond_size
precond_mb	do_updatebetabias_correctionrm   debiased_momentumbalanceexprAexprGs_rn   	torch_rngVGAconjBtermsrs   term1term2tmppre_gradr$   wd_scales,                                               r   rk   z	Kron.step   s   ""$ !y!  && T	5Eyy,H!IIou}}EM+0995SUY+Z())$GNK8_ N566>vv

1!	#'e4G.H%PaJbcD $Iu:?$%E&M./E*+/4/?/?HLbX\XbXb/cE+,(523343401+)%E#J ,1D%%a( %**;$<$B$B$DM"/%8I2J2W2W2Y"Y\a"aK'=8'%4%#&&E%*&E#EL!$$V5QT:$V!VY^!^J&,6&$
2$d///)23343401+$E ,1D%%a( !--a0E &">KK("-eFm"<K&'1,'!"23q;F	./E*+f" Z("#deFm&;";"'(9":$$U:%67<<TUS]M^I^<_ '#))/*<*<8*<*LM%4%F$J$JQ^$J$_! ((//+d2@y88:>gOOE#J/ ',$E61c
A))$)OO;L;S;S$T	!--dhh.>.>q'.JK$(	)//"++077	A .J)tA#55eQ1EHAu MM&!U;E-0E] 
$)>E5#emu\22557Q;1HCEEM#7#7e#E

#RRC"'**S/C#4UU]#Cdjj#PPC1HCs
$  --#J% "177"#	  ekk#1B1G1G1I1O1O1QTX1X*Y_bcd'}}QWW5H  %^41$67'(488??+<'<|'K./ !9:',T{a'7$--:M'MH',T{HrH|$;;< a|< xd|4]N5T	5l "LL67J6K;WhilVmmpqrLL67I6J+VfgjUkknopE! !s   XX)MbP?g?r6   Ni   r{   NTg?r7   NNFFFr{   FFri   )__name__
__module____qualname____doc__r
   r   r   r   r   intstrboolr   r   rA   rT   rZ   r   r   r_   rd   no_gradrk   __classcell__)rR   s   @r   r    r    Q   s   < !NR#'#$*.-1$'*./3 %',!"!(-#+F)F) F) 	F)
 F) ,4E(E/4J+KF) !F) !F) #3-F) '+F) F) "F) 5;;'F)  ,F) F)  !%!F)" #F)$ %F)& 'F)( "&)F)* +F)P>

DcN 7$sCx. 7T 7 U]]_e er   r    c                     | j                   }t        |dk\  r|n||z   |dz
        }||k  s||kD  r| S | j                  ||      S )Nr   r	   )ndimr   r0   )r   	start_dimend_dimr   s       r   r   r     sR    ;;D W\'tg~taxHG yI/ >>)W--r   c           	      	   t         j                  t         j                  z   }||n| j                  }| j                  }g }	t        |      dk(  r4|r)|	j                  |t        j                  | |      z         d}
dg}d}nt        |      dkD  r"t        dt        | j                         d      |d	t        |      z  z  }||D cg c]  }d
 }}n|dk(  r3t        j                  |      ddd   }|D cg c]  }d
 }}d||d   <   n|dk(  rWt        j                  |      ddd   }t        |      }|D cg c]  }d
 }}t        |      dk\  r8|d   |d   kD  r-d||d   <   n$|dk(  r|D cg c]  }d }}nt        d| d      g dd}}}g }g g ddf\  }}}}t        t        ||            D ]  \  }\  }}|d	k(  s||kD  st        |      |k  s|r|r4|	j                  |t        j                  ||| j                         z         |j                  ||          |||   z   }|||   z   }dj#                  t%        t        |            D cg c]  }||k(  r||dz      n||    c}      }|dz   |z   dz   ||dz      z   }|j                  |       |j                  ||dz             |j                  ||dz             |||dz      z   }|||dz      z   }+|r4|	j                  |t        j&                  ||| j                         z         |j                  ||   ||dz      z          |||dz      z   }|||   z   }dj#                  t%        t        |            D cg c]  }||k(  r||dz      n||    c}      }dj#                  t%        t        |            D cg c]  }||k(  r||dz      n||    c}      }|dz   |z   dz   ||dz      z   ||dz      z   }|j                  |       ||   ||dz      ||dz      }!} }|j                  || z          |j                  ||!z          ||!z   }|| z   } dj#                  |      dz   |z   dz   |z   }
dj#                  |      dz   dj#                  |      z   dz   |z   dz   |z   }t)        |      }|r|	|
||fgS |
||fS c c}w c c}w c c}w c c}w c c}w c c}w c c}w )zFor a scalar or tensor t, we initialize its preconditioner Q and
    reusable einsum expressions for updating Q and preconditioning gradient.
    Nr   r   z,->z,,->   zGot tensor with dim z; Einstein runs out of letters!r	   Fone_diagr   Tsmart_one_diagr{   all_diagzInvalid memory_save_mode: z/, must be one of [None, 'one_diag', 'all_diag'] )r   ry   ,z->   )stringascii_lowercaseascii_uppercaser   r   r   appendr   	ones_liker>   npargsortsorted	enumerater   onesry   joinrangeeyetuple)"tscalemax_sizer'   r(   r   rw   lettersr   rn   r   r   exprPr   dim_diagrev_sorted_dimssorted_shapepiece1Apiece2Apiece3Apiece1Ppiece2Ppiece3Ppiece4Pisizedim_djpiece1
subscriptspiece2abcs"                                     r   r   r     s8    $$v'='==G&EAGGEGGE
A
5zQHHUU__Qe<<=u:?3CL>A`abb!c%j.)#',-!-H-+ jj/"5O',-!-H-+/H_Q'(!11 jj/"5O!%=L',-!-H-5zQ<#3l26F#F/3+,+&+,,H,,-=,>>mnp p &(R'.0"b"-=*'7 )#eX*> ? )	&A}e	(?u: 33 HHUUZZE!((%SSTwqz*!GAJ.!GAJ.UZ[^_d[eUf!gPQQ!V'!b&/"K!gh#c\F2T9GAFOK
j)wq2v/wq2v/!GAFO3!GAFO3 HHUUYYt5%RRSwqzGAFO;<!GAFO3!GAJ.UZ[^_d[eUf!gPQQ!V'!b&/"K!ghUZ[^_d[eUf!gPQQ!V'!b&/"K!gh#c\F2T9GAFOKgVWZ\V\o]
j)"1:wq2vBa1q1u%q1u%!A+!A+S)	&V !C''1D87B!C'#((7*;;cAGKdRU\\6]FE65)**fe##O . . . -. "h" "h!gs*   		R(9	R-<	R22	R7R<
'S
(S
c                    | |z  } t        j                  | | j                         z        }t        j                  t        j                  |d      d      \  }}t        j                  t        j                  |d      d      \  }}||kD  ri| d d |f   j                         | z  }|t         j
                  j                  |t         j
                  j                  |      z  | j                  z        z  S | | |   j                         z  }|t         j
                  j                  | j                  |t         j
                  j                  |      z  z        z  S )Nr   )r   r	   )r   realconjr   r   linalgvector_normH)r   max_absaavalue0r   value1r   xs           r   _lbr    s   	GA	AL	!B		%))BA.2IFA		%))BA.2IFAadGLLNQ111u||7O7OPQ7R3RVWVYVY2YZZZ!		O11!##U\\=U=UVW=X9X2YZZZr   c                 ~    | j                  t        d            }t        j                  |dkD  t	        | |      |      S )z-Cheap lower bound for the spectral norm of A.r|   r   )r   r   r   wherer  )r   r  s     r   r   r     s1    ffU5\"G;;w{C7OW==r   c           	      `   | j                   }| j                  t        j                        } |j                  t        j                        }t        j                  j                  || j                  d| j                  d            dd      j                  |       }|j                  |      S )z
X @ inv(A)r   r   TF)upperleft)	r   r   r   r   r   solve_triangularreshaper   
reshape_as)Xr   
orig_dtypeouts       r   _solve_triangular_rightr    s    J	5==!A	5==!A
,,
'
'199R+DDW\
'
]
h
hij
kC66
6##r   c           
         t        j                  | D cg c]  }|j                  t        d             c}      }|j	                         dt        |       z  z  }||z  }t        |       D ]  \  }}|j                  ||           y c c}w )Nr|   r	   )r   stackr   r   prodr   r   r   )Q_inrs   normsgeometric_meanr   s        r   rO   rO     sv    KKt<!e-<=EZZ\a#d)m4NU"E$ 1	uQx =s   !Bc                 ~    t        j                  |d   g| D cg c]  }|j                          c}| | S c c}w )z.Precondition gradient G with preconditioner Q.r   )r   einsumr   )rn   r   r   rs   s       r   rN   rN     s8    <<b	Aq$9!QVVX$9AAAqAA$9s   :
c                    t        j                  | g|| }|j                         }t        t	        |            }t        j
                  |j                         |dd  |d d z         }t        |      D ]L  \  }}	|	j                         dk  r||	z  nt        ||	      }||dz
  k  s3t        j                  |||dz
        }N ||fS )Nr	   r{   )
r   r  r   r   r   permuter   r   r  	transpose)
r   r   rn   r   r   orderr   r   r   rs   s
             r   rL   rL   !  s    U"Q""AEEGEeElAMM!&&(AabEAbqEM2E! 91UUWq[	.EeQ.Ouqy=OOE1eai8E9 e8Or   c                     g }| D ]_  }t        j                  |||j                               }t        j                  ||j                         |      }|j                  ||f       a |S ri   )r   r  r   r   )r   r   r   r   exprGr   r   s          r   rM   rM   -  s]    E %UAqvvx0UEJJL%8eU^$% Lr   )r7   gQ?r   i  )r   r   )NT)/r   loggingr   rG   r<   typingr   r   r   r   r   r   numpyr   r   
opt_einsumtorch.backends.opt_einsumbackendsenabledstrategyr;   ImportError_dynamoconfigcache_size_limitrJ   AttributeError_typesr
   	getLoggerr   r   r   Tensorr   optim	Optimizerr    r   r   r  r   r  rO   rN   rL   rM   r:   r   r   <module>r5     sX  2     > >  $(,ENN%)2ENN&N,5EMM)J 
'

H
%
   	
  \\4n5;;   nb	.( e$P
[>$B
	I  N  Js$   A D  -D  D
	D
DD