
    kh                        d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZmZmZmZmZ ddlmZ ddlZddlZddlmZ ddlZddlmZmZ dd	lmZmZmZ dd
lmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8 ddl9m:Z: ddl;m<Z< ddl=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZD ddlEmFZF ddlGmHZH dd lImJZJ  ej                  eL      ZMd!eNd"efd#ZO ed$%       G d& d'             ZP G d( d)      ZQd*eQd"dfd+ZRd*eQd"dfd,ZSd*eQd"dfd-ZTd*eQd"dfd.ZUd*eQd"dfd/ZVd*eQd"dfd0ZWd*eQd"dfd1ZXd*eQd"dfd2ZY eQ       ZZdJd3Z[ e[        	 	 	 dKd4eeNe
eN   f   d5ee
eN      d6e\d"e
eeNeeNeNf   f      fd7Z]d8eNd"ePfd9Z^	 dLd8eNd:e\d"eeef   fd;Z_	 	 	 	 	 	 	 	 dMd<eej                  ef   d=eNd>eea   d?ead@eadAee\   dBe\dCeea   dDeeej                  gef      dEed"ej                  j                  fdFZddG Ze	 dLdHeej                  ef   dBe\d"ej                  j                  fdIZfy)Nzs Optimizer Factory w/ custom Weight Decay & Layer Decay support

Hacked together by / Copyright 2021 Ross Wightman
    N)	dataclass)partial)	AnyCallableDictListOptionalSetTupleTypeUnionfnmatch   )param_groups_layer_decayparam_groups_weight_decay)ParamsT	OptimTypeOptimizerCallable)	AdaBelief)	Adafactor)AdafactorBigVision)
Adahessian)AdamP)AdamWLegacy)Adan)Adopt)Kron)Lamb)LaProp)Lars)Lion)	Lookahead)MADGRAD)Mars)NAdamLegacy)NAdamW)
NvNovoGrad)RAdamLegacy)	RMSpropTF)SGDP)SGDWclass_stringreturnc                     	 | j                  dd      \  }}t        j                  |      }t        ||      S # t        t
        f$ r}t	        d|  d|       d}~ww xY w)z)Dynamically import a class from a string..r   zCould not import z: N)rsplit	importlibimport_modulegetattrImportErrorAttributeError)r-   module_name
class_namemodulees        U/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/optim/_optim_factory.py_import_classr<   -   sl    C"."5"5c1"=Z((5vz**( C-l^2aSABBCs   58 AAAT)frozenc                       e Zd ZU dZeed<   eeef   ed<   dZeed<   dZ	e
ed<   dZe
ed	<   dZe
ed
<   dZeed<   dZe
ed<   dZeeeef      ed<   y)	OptimInfoaL  Immutable configuration for an optimizer.

    Attributes:
        name: Unique identifier for the optimizer
        opt_class: The optimizer class
        description: Brief description of the optimizer's characteristics and behavior
        has_eps: Whether the optimizer accepts epsilon parameter
        has_momentum: Whether the optimizer accepts momentum parameter
        has_betas: Whether the optimizer accepts a tuple of beta parameters
        num_betas: number of betas in tuple (valid IFF has_betas = True)
        defaults: Optional default parameters for the optimizer
    name	opt_class descriptionThas_epsFhas_momentum	has_betas   	num_betassecond_orderNdefaults)__name__
__module____qualname____doc__str__annotations__r   r   rC   rD   boolrE   rF   rH   intrI   rJ   r	   r   r        r;   r?   r?   8   sp     IS)^$$KGTL$ItIsL$)-HhtCH~&-rT   r?   c                      e Zd ZdZddZdeddfdZdededdfd	Zd
eddfdZ		 	 	 d de
eee   f   deee      dedee
eeeef   f      fdZd
edefdZ	 d!de
eef   dede
eef   fdZ	 	 	 	 	 	 	 d"de
ej*                  ef   dedee   dededee   dedee   deeej*                  gef      dedej6                  j8                  fdZy)#OptimizerRegistryzRegistry managing optimizer configurations and instantiation.

    This class provides a central registry for optimizer configurations and handles
    their instantiation with appropriate parameter groups and settings.
    r.   Nc                 "    i | _         dh| _        y )Nlion)_optimizers_foreach_defaults)selfs    r;   __init__zOptimizerRegistry.__init__X   s    13,28rT   infoc                     |j                   j                         }|| j                  v rt        j	                  d| d       || j                  |<   y)zRegister an optimizer configuration.

        Args:
            info: The OptimInfo configuration containing name, type and description
        
Optimizer z  already registered, overwritingN)r@   lowerrY   _loggerwarning)r[   r]   r@   s      r;   registerzOptimizerRegistry.register\   sH     yy 4###OOj.NOP!%rT   aliastargetc                     |j                         }|| j                  vrt        d|       | j                  |   | j                  |j                         <   y)zRegister an alias for an existing optimizer.

        Args:
            alias: The alias name
            target: The target optimizer name

        Raises:
            KeyError: If target optimizer doesn't exist
        z/Cannot create alias for non-existent optimizer N)r`   rY   KeyError)r[   rd   re   s      r;   register_aliasz OptimizerRegistry.register_aliasg   sQ     )))LVHUVV*.*:*:6*B'rT   r@   c                 V    | j                   j                  |j                                y)z4Register an optimizer as defaulting to foreach=True.N)rZ   addr`   r[   r@   s     r;   register_foreach_defaultz*OptimizerRegistry.register_foreach_defaultv   s    ""4::<0rT   filterexclude_filterswith_descriptionc                   
 t        | j                  j                               }|rLt        |t              r|g}n|}t               }|D ]  
|j                  
fd|D                t        |      }|r#|D ]  }|D cg c]  }t        ||      r| }}  |r)|D 	cg c]  }	|	| j                  |	   j                  f c}	S |S c c}w c c}	w )a  List available optimizer names, optionally filtered.

        Args:
            filter: Wildcard style filter string (e.g., 'adam*')
            exclude_filters: Optional list of wildcard patterns to exclude
            with_description: If True, return tuples of (name, description)

        Returns:
            List of either optimizer names or (name, description) tuples
        c              3   <   K   | ]  }t        |      s|  y wNr   ).0nfs     r;   	<genexpr>z4OptimizerRegistry.list_optimizers.<locals>.<genexpr>   s     %HA'!Q-a%Hs   )	sortedrY   keys
isinstancerO   setupdater   rC   )r[   rm   rn   ro   namesfiltersfiltered_namesexclude_filterrt   r@   ru   s             @r;   list_optimizersz!OptimizerRegistry.list_optimizersz   s      t'',,./&#&!(  UN I%%%H%HHI>*E"1 M$)LqN1KLLM KPQ4T4++D1==>QQ M Rs   >CC"C
c                 z    |j                         }|| j                  vrt        d| d      | j                  |   S )zGet the OptimInfo for an optimizer.

        Args:
            name: Name of the optimizer

        Returns:
            OptimInfo configuration

        Raises:
            ValueError: If optimizer is not found
        r_   z not found in registry)r`   rY   
ValueErrorrk   s     r;   get_optimizer_infoz$OptimizerRegistry.get_optimizer_info   sC     zz|t'''z$/EFGG%%rT   name_or_infobind_defaultsc                    t        |t              r| j                  |      }nt        |t              sJ |}t        |j                  t              r|j                  j                  d      r<t        j                  j                         sJ d       	 t        |j                        }ny|j                  j                  d      r<t        j                  j                         sJ d       	 t        |j                        }n"t        |j                        }n|j                  }|r"|j                  rt        |fi |j                  }|S # t        $ r}t        d      |d}~ww xY w# t        $ r}t        d      |d}~ww xY w)a  Get the optimizer class with any default arguments applied.

        This allows direct instantiation of optimizers with their default configs
        without going through the full factory.

        Args:
            name_or_info: Name of the optimizer
            bind_defaults: Bind default arguments to optimizer class via `partial` before returning

        Returns:
            Optimizer class or partial with defaults applied

        Raises:
            ValueError: If optimizer not found
        zapex.z!CUDA required for APEX optimizersz,APEX optimizers require apex to be installedNzbitsandbytes.z)CUDA required for bitsandbytes optimizersz<bitsandbytes optimizers require bitsandbytes to be installed)ry   rO   r   r?   rA   
startswithtorchcudais_availabler<   r5   rJ   r   )r[   r   r   opt_inforA   r:   s         r;   get_optimizer_classz%OptimizerRegistry.get_optimizer_class   s[   ( lC(..|<HlI666#Hh((#.!!,,W5zz..0U2UU0] -h.@.@ AI ##..?zz..0]2]]0m -h.@.@ AI *(*<*<=	 **I X..	?X->->?I# # ]%&TU[\\] # m%&dekllms0   E )E$ 	E!EE!$	E>-E99E>model_or_paramsoptlrweight_decaymomentumforeachweight_decay_exclude_1dlayer_decayparam_group_fnkwargsc
                 R   t        |t        j                        rY t        |dd              }|	r	 |	|      }n=|t	        |||||      }d}n(|r|rt        |||      }d}n|j                         }n|}|j                         j                  d      }|d   }t        |      d	kD  r|d
   dk(  nd}| j                  |      }d|i|
}|||d<   |j                  r4|j                  j                         D ]  \  }}|j                  ||        |j                  r|j                  d|       |j                  s|j!                  dd       |j"                  s|j!                  dd       ||j                  d|       | j%                  |d      } ||fi |}|rt'        |      }|S )a  Create an optimizer instance.

        Args:
            model_or_params: Model or parameters to optimize
            opt: Name of optimizer to create
            lr: Learning rate
            weight_decay: Weight decay factor
            momentum: Momentum factor for applicable optimizers
            foreach: Enable/disable foreach operation
            weight_decay_exclude_1d: Whether to skip weight decay for 1d params (biases and norm affine)
            layer_decay: Layer-wise learning rate decay
            param_group_fn: Optional custom parameter grouping function
            **kwargs: Additional optimizer-specific arguments

        Returns:
            Configured optimizer instance

        Raises:
            ValueError: If optimizer not found or configuration invalid
        no_weight_decayc                      t               S rr   )rz   rS   rT   r;   <lambda>z4OptimizerRegistry.create_optimizer.<locals>.<lambda>  s
    RURW rT   N)r   r   no_weight_decay_listr           )r   r   _r   r   	lookaheadFr   r   r   epsbetasr   r   )ry   nnModuler4   r   r   
parametersr`   splitlenr   rJ   items
setdefaultrE   rD   poprF   r   r#   )r[   r   r   r   r   r   r   r   r   r   r   r   params	opt_splitopt_nameuse_lookaheadr   opt_argskvrA   	optimizers                         r;   create_optimizerz"OptimizerRegistry.create_optimizer   s   F oryy1Xgo7H-XZO'8(1#!- +)8,C  ""92#!-)8
  "(335 %F IIK%%c*	R=7:9~7I	!3u**84 %3L#KF#K >HTN  ))//1 *1##Aq)*   
H5 LL%!!LL$' 	73 ,,XU,K	f11	 !),IrT   r.   NrB   NFT)Nr   ?NTNN)rK   rL   rM   rN   r\   r?   rc   rO   rh   rl   r   r   r	   rQ   r   r   r   r   r   r   r   r   r   floatr   r   r   optim	Optimizerr   rS   rT   r;   rV   rV   Q   s   4	&Y 	&4 	&CC C C C1S 1T 1 -/37%*	##tCy.)# &d3i0# #	#
 
eCsCx()	*#J&s &y &( #'1Y/1  1 
y++	,	1n #'"$!&*,0+/GKk"299g#56k k 	k
  k k d^k &*k "%k %Xryyk7.B%CDk k 
		krT   rV   registryc                 6   t        dt        j                  j                  dddddi      t        dt        j                  j                  dddddi      t        d	t        d
dddi      t        dt
        dddddi      g}|D ]  }| j                  |        y)zRegister SGD-based optimizerssgdzDtorch.Optim Stochastic Gradient Descent (SGD) with Nesterov momentumFTnesterovr@   rA   rC   rD   rE   rJ   r   zEtorch.Optim Stochastic Gradient Descent (SGD) with classical momentumsgdpz0SGD with built-in projection to unit norm spherer@   rA   rC   rE   rJ   sgdwz5SGD with decoupled weight decay and Nesterov momentumN)r?   r   r   SGDr+   r,   rc   )r   sgd_optimizersr   s      r;   _register_sgd_variantsr   Q  s     	kkoo^ $'	
 	kkoo_ %(	
 	J $'	
 	O $'	
1 NB  #rT   c                 P   t        dt        j                  j                  dd      t        dt        j                  j                  dd      t        dt
        dd      t        d	t        d
dddd      t        dt        j                  j                  dd      t        dt        dd      t        dt        dd      t        dt        j                  j                  dd      t        dt        dd      t        dt        j                  j                  ddddi      t        dt        j                  j                  dd      t        dt        d      t        d t        d!      t        d"t        d#      t        d$t        d%d&di'      g}|D ]  }| j!                  |        y())zRegister Adam-based optimizersadamz,torch.optim.Adam, Adaptive Moment EstimationTr@   rA   rC   rF   adamwz3torch.optim.AdamW, Adam with decoupled weight decayadamwlegacyz<legacy impl of AdamW that pre-dates inclusion to torch.optimadampz1Adam with built-in projection to unit norm sphereg{Gz?)wd_ratior   r@   rA   rC   rF   rJ   nadamz.torch.optim.NAdam, Adam with Nesterov momentumnadamlegacyz<legacy impl of NAdam that pre-dates inclusion in torch.optimnadamwz]Adam with Nesterov momentum and decoupled weight decay, mlcommons/algorithmic-efficiency implradamz:torch.optim.RAdam, Rectified Adam with variance adaptationradamlegacyz;legacy impl of RAdam that predates inclusion in torch.optimradamwzVtorch.optim.RAdamW, Rectified Adam with variance adaptation and decoupled weight decaydecoupled_weight_decayadamaxzCtorch.optim.Adamax, Adam with infinity norm for more stable updates	adafactorz?Memory-efficient implementation of Adam with factored gradientsr@   rA   rC   adafactorbvzPBig Vision variant of Adafactor with factored gradients, half precision momentumadoptuB   Modified Adam that can converge with any β2 with the optimal rateadoptwuU   Modified AdamW (decoupled decay) that can converge with any β2 with the optimal rate	decoupledr@   rA   rC   rJ   N)r?   r   r   AdamAdamWr   r   NAdamr&   r'   RAdamr)   Adamaxr   r   r   rc   )r   adam_optimizersr   s      r;   _register_adam_variantsr   x  s    	kk&&F		
 	kk''M		
 	!V		
 	K"&D9	
 	kk''H		
 	!V		
 	w		
 	kk''T		
 	!U		
 	kk''p.5	
 	kk((]		
 	Y	

 	(j	

 	\	

 	o!4(		
iZOv  #rT   c                    t        dt        dd      t        dt        ddddi      t        d	t        d
dddi      t        dt        ddddd      t        dt        dd      t        dt        ddddi      t        dt        ddddi      t        dt        ddddd      g}|D ]  }| j                  |        y)zRegister LAMB and LARS variantslambz2Layer-wise Adaptive Moments for batch optimizationTr   lambcz,LAMB with trust ratio clipping for stability
trust_clipr   lambwz LAMB with decoupled weight decaydecoupled_decaylambcwz@LAMB with trust ratio clipping for stability and decoupled decay)r   r   larsz Layer-wise Adaptive Rate Scalingr@   rA   rC   rE   larcz,LARS with trust ratio clipping for stabilityr   nlarszLARS with Nesterov momentumr   nlarcz2LARS with Nesterov momentum & trust ratio clipping)r   r   N)r?   r   r!   rc   )r   lamb_lars_optimizersr   s      r;   _register_lamb_larsr     s    	L		
 	F"D)	
 	:'.	
 	Z$(TB	
 	:		
 	F"D)	
 	5 $'	
 	L"&d;	
a7p $ #rT   c                 (   t        dt        ddddi      t        dt        ddddi      t        dt        d	d
dddd      t        dt        dddd      t        dt
        ddddd      t        dt        ddddd      t        dt        dd
dddi      t        dt        ddddi      t        dt        dddddd      t        d t        d!ddi      g
}|D ]  }| j                  |        t        d"t        d#dddd$      t        d%t        d&dddd'      t        d(t        d)dddd$      t        d*t        d+d
ddddd,      t        d-t        d.d
dddd$      t        d/t        d0ddd$      g}|D ]  }| j                  |        y1)2z2Register corrected weight decay optimizer variantsadamcu7   AdamW with corrected weight decay (lr²/max_lr scaling)Tcorrected_weight_decayr   nadamcu8   NAdamW with corrected weight decay (lr²/max_lr scaling)sgdcu?   SGD with corrected decoupled weight decay (lr²/max_lr scaling)F)r   r   r   adoptcuA   Adopt with corrected decoupled weight decay (lr²/max_lr scaling))r   r   r   lambcdu@   LAMB with corrected decoupled weight decay (lr²/max_lr scaling))r   r   kroncuE   PSGD Kron with corrected decoupled weight decay (lr²/max_lr scaling)r   lioncu6   Lion with corrected weight decay (lr²/max_lr scaling)r@   rA   rC   rD   rF   rJ   lapropcu8   LaProp with corrected weight decay (lr²/max_lr scaling)
rmsproptfcuL   RMSprop TF-style with corrected decoupled weight decay (lr²/max_lr scaling)r   )alphar   r   adafactorbvcuS   Adafactor Big Vision with corrected weight decay (lr²/max_lr or lr/max_lr scaling)cadamcu@   Cautious AdamW with corrected weight decay (lr²/max_lr scaling))cautionr   cadoptcuJ   Cautious Adopt with corrected decoupled weight decay (lr²/max_lr scaling))r   r  r   cnadamcuA   Cautious NAdamW with corrected weight decay (lr²/max_lr scaling)csgdcuH   Cautious SGD with corrected decoupled weight decay (lr²/max_lr scaling))r   r  r   clioncu?   Cautious Lion with corrected weight decay (lr²/max_lr scaling)cadafactorbvcz9Cautious Adafactor Big Vision with corrected weight decayN)r?   r   r'   r,   r   r   r   r"   r    r*   r   rc   )r   corrected_optimizersr   cautious_correcteds       r;   $_register_corrected_decay_optimizersr	    s>    	!Q.5	
 	R.5	
 	Y"&$G	
 	[#'4H		
 	Z)-N	
 	_)-N	
 	P.5	
 	R.5	
 	f"tW[\	
 	(m.5		
CGP $ #
 	!Z!%F	
 	d#'DTXY		
 	[!%F	
 	b"&4SWX	
 	Y!%F	
 	 (S!%F		
K+X " #rT   c                    t        dt        dddi      t        dt        dddi      t        dt        d	dddi
      t        dt        dddi      t        dt
        dddddd      t        dt
        dddddd      t        dt        dddd      t        dt        ddddi
      t        dt        ddddd
      t        dt        ddddi
      t        dt        d ddddi!      t        d"t        d#dddi
      t        d$t        d%dddi
      t        d&t        d'dd(dd)*      t        d+t        d,ddddd-.      g}|D ]  }| j                  |        y )/N
cadafactorzCautious Adafactorr  Tr   cadafactorbvzCautious Big Vision AdafactorcadamwzCautious AdamWr   cadoptzCautious Adoptcadanz-Cautious Adaptive Nesterov Momentum AlgorithmF)r  no_prox   r@   rA   rC   rJ   rF   rH   cadanwz?Cautious Adaptive Nesterov Momentum with decoupled weight decaycadoptwz!Cautious AdoptW (decoupled decay))r   r  clambzCautious LAMBclambwz)Cautious LAMB with decoupled weight decay)r  r   clapropzCautious LaPropclionzCautious Lionr   cmarszCautious MARScnadamwzCautious NAdamW
crmsproptfz!Cautious TensorFlow-style RMSpropr   )r   r  r   csgdwz>Cautious SGD with decoupled weight decay and Nesterov momentum)r   r  r   )r?   r   r   r   r   r   r   r    r"   r%   r'   r*   r,   rc   )r   cautious_optimizersr   s      r;   _register_cautious_optimizersr    s   ,&		
 	(7&		
 	!(&	
 	(&		
 	G!%%8	
 	Y!%$7	
 	;#'D9		
 	'&	
 	C!%$?	
 	)&	
 	'!4(	
 	'&	
 	)&	
 	;"t4	
 	X"&48	
EjV # #rT   c                 p   t        dt        ddddi      t        dt        ddddi      t        d	t        j                  j                  d
      t        dt        j                  j
                  dddi      t        dt        dddidd      t        dt        dddidd      t        dt        ddd      t        dt        dd      t        dt        ddd di!      t        d"t        d#d$      t        d%t        d&dd'      t        d(t        d)d      t        d*t        d+dd di!      t        d,t        d-d$      t        d.t        d/d$      t        d0t        j                  j                  d1dd2d3i!      t        d4t        d5dd2d3i!      g}|D ]  }| j!                  |        | j#                  d%       y6)7z!Register miscellaneous optimizers	adabeliefz7Adapts learning rate based on gradient prediction errorTrectifyFr   
radabeliefz,Rectified AdaBelief with variance adaptationadadeltazQtorch.optim.Adadelta, Adapts learning rates based on running windows of gradientsr   adagradzMtorch.optim.Adagrad, Adapts learning rates using cumulative squared gradientsr   g:0yE>r   adanz$Adaptive Nesterov Momentum Algorithmr  r  r  adanwz6Adaptive Nesterov Momentum with decoupled weight decay
adahessianz"An Adaptive Second Order Optimizer)r@   rA   rC   rF   rI   kronz5PSGD optimizer with Kronecker-factored preconditionerr   kronwzPPSGD optimizer with Kronecker-factored preconditioner and decoupled weight decayr   r   lapropz*Separating Momentum and Adaptivity in Adamr   rX   z8Evolved Sign Momentum optimizer for improved convergence)r@   rA   rC   rD   rF   madgradz'Momentum-based Adaptive gradient methodmadgradwz#MADGRAD with decoupled weight decaymarszDUnleashing the Power of Variance Reduction for Training Large Modelsnovogradz3Normalized Adam with L2 norm gradient normalizationrmspropz1torch.optim.RMSprop, Root Mean Square Propagationr   r   	rmsproptfzETensorFlow-style RMSprop implementation, Root Mean Square PropagationN)r?   r   r   r   AdadeltaAdagradr   r   r   r    r"   r$   r%   r(   RMSpropr*   rc   rl   )r   other_optimizersr   s      r;   _register_other_optimizersr5    s    	Q'	
 	F&	
 	kk**k	

 	kk))gT]		
 	>'	
 	P&	
 	 <	
 	O		
 	j'.	
 	D		
 	R	
 	A		
 	='.	
 	^		
 	 M		
 	kk))Ks^	
 	_s^	
Wrf   #%%f-rT   c                     t        dddddddi      t        dd	d
dddi      t        dd	ddddi      t        dddd      t        ddddddi      g}|D ]  }| j                  |        y)z&Register APEX optimizers (lazy import)fusedsgdzapex.optimizers.FusedSGDz8NVIDIA APEX fused SGD implementation for faster trainingFTr   r   	fusedadamzapex.optimizers.FusedAdamz%NVIDIA APEX fused Adam implementationadam_w_moder   
fusedadamwz&NVIDIA APEX fused AdamW implementation	fusedlambzapex.optimizers.FusedLAMBz%NVIDIA APEX fused LAMB implementationr   fusednovogradzapex.optimizers.FusedNovoGradz)NVIDIA APEX fused NovoGrad implementationr   )gffffff?g\(\?Nr?   rc   )r   apex_optimizersr   s      r;   _register_apex_optimizersr?  ~  s     	0R $'	
 	1?#U+	
 	1@#T*	
 	1?		
 	 5C|,	
;$OJ  #rT   c                 h   t        dddddddi      t        dd	d
ddddi      t        dddd      t        dddd      t        dddd      t        dddd      t        ddddd      t        ddddd      t        ddddd !      t        d"d#d$dd !      g
}|D ]  }| j                  |        y%)&z.Register bitsandbytes optimizers (lazy import)bnbsgdzbitsandbytes.optim.SGDzbitsandbytes SGDFTr   r   
bnbsgd8bitzbitsandbytes.optim.SGD8bitz0bitsandbytes 8-bit SGD with dynamic quantizationbnbadamzbitsandbytes.optim.Adamzbitsandbytes Adamr   bnbadam8bitz1bitsandbytes 8-bit Adam with dynamic quantizationbnbadamwzbitsandbytes.optim.AdamWzbitsandbytes AdamWbnbadamw8bitz2bitsandbytes 8-bit AdamW with dynamic quantizationbnblionzbitsandbytes.optim.Lionzbitsandbytes Lion)rC   rD   rF   bnblion8bitzbitsandbytes.optim.Lion8bitz1bitsandbytes 8-bit Lion with dynamic quantizationbnbademamixzbitsandbytes.optim.AdEMAMixzbitsandbytes AdEMAMixr  )rC   rF   rH   bnbademamix8bitzbitsandbytes.optim.AdEMAMix8bitz5bitsandbytes 8-bit AdEMAMix with dynamic quantizationNr=  )r   bnb_optimizersr   s      r;   _register_bnb_optimizersrL    s)    	.* $'	
 	2J $'	
 	/+		
 	/K		
 	0,		
 	0L		
 	%+	
 	)K	
 	)/	
 	-O	
}ENL  #rT   c                  L   t        t               t        t               t        t               t	        t               t        t               t        t               t        t               t        t               t        j                  dd       t        j                  dd       y)z7Register all default optimizers to the global registry.r   r   	nesterovwr   N)
r   default_registryr   r   r5  r?  rL  r  r	  rh   rS   rT   r;   _register_default_optimizersrP    sn     +,,-()/0./-.!"23()9: ##J6##K8rT   rm   rn   ro   c                 0    t         j                  | ||      S )a^  List available optimizer names, optionally filtered.

    List all registered optimizers, with optional filtering using wildcard patterns.
    Optimizers can be filtered using include and exclude patterns, and can optionally
    return descriptions with each optimizer name.

    Args:
        filter: Wildcard style filter string or list of filter strings
            (e.g., 'adam*' for all Adam variants, or ['adam*', '*8bit'] for
            Adam variants and 8-bit optimizers). Empty string means no filtering.
        exclude_filters: Optional list of wildcard patterns to exclude. For example,
            ['*8bit', 'fused*'] would exclude 8-bit and fused implementations.
        with_description: If True, returns tuples of (name, description) instead of
            just names. Descriptions provide brief explanations of optimizer characteristics.

    Returns:
        If with_description is False:
            List of optimizer names as strings (e.g., ['adam', 'adamw', ...])
        If with_description is True:
            List of tuples of (name, description) (e.g., [('adam', 'Adaptive Moment...'), ...])

    Examples:
        >>> list_optimizers()
        ['adam', 'adamw', 'sgd', ...]

        >>> list_optimizers(['la*', 'nla*'])  # List lamb & lars
        ['lamb', 'lambc', 'larc', 'lars', 'nlarc', 'nlars']

        >>> list_optimizers('*adam*', exclude_filters=['bnb*', 'fused*'])  # Exclude bnb & apex adam optimizers
        ['adam', 'adamax', 'adamp', 'adamw', 'nadam', 'nadamw', 'radam']

        >>> list_optimizers(with_description=True)  # Get descriptions
        [('adabelief', 'Adapts learning rate based on gradient prediction error'),
         ('adadelta', 'torch.optim Adadelta, Adapts learning rates based on running windows of gradients'),
         ('adafactor', 'Memory-efficient implementation of Adam with factored gradients'),
        ...]
    )rO  r   )rm   rn   ro   s      r;   r   r     s    T ++FOEUVVrT   r@   c                 ,    t         j                  |       S )zGet the OptimInfo for an optimizer.

    Args:
        name: Name of the optimizer

    Returns:
        OptimInfo configuration

    Raises:
        ValueError: If optimizer is not found
    )rO  r   )r@   s    r;   r   r   :  s     ..t44rT   r   c                 0    t         j                  | |      S )a  Get optimizer class by name with option to bind default arguments.

    Retrieves the optimizer class or a partial function with default arguments bound.
    This allows direct instantiation of optimizers with their default configurations
    without going through the full factory.

    Args:
        name: Name of the optimizer to retrieve (e.g., 'adam', 'sgd')
        bind_defaults: If True, returns a partial function with default arguments from OptimInfo bound.
            If False, returns the raw optimizer class.

    Returns:
        If bind_defaults is False:
            The optimizer class (e.g., torch.optim.Adam)
        If bind_defaults is True:
            A partial function with default arguments bound

    Raises:
        ValueError: If optimizer name is not found in registry

    Examples:
        >>> # Get SGD with nesterov momentum default
        >>> SGD = get_optimizer_class('sgd')  # nesterov=True bound
        >>> opt = SGD(model.parameters(), lr=0.1, momentum=0.9)

        >>> # Get raw optimizer class
        >>> SGD = get_optimizer_class('sgd')
        >>> opt = SGD(model.parameters(), lr=1e-3, momentum=0.9)

    r   )rO  r   )r@   r   s     r;   r   r   I  s    D //M/RRrT   r   r   r   r   r   r   filter_bias_and_bnr   r   r   c	                 @    t        j                  | f||||||||d|	S )a  Create an optimizer instance via timm registry.

    Creates and configures an optimizer with appropriate parameter groups and settings.
    Supports automatic parameter group creation for weight decay and layer-wise learning
    rates, as well as custom parameter grouping.

    Args:
        model_or_params: A PyTorch model or an iterable of parameters/parameter groups.
            If a model is provided, parameters will be automatically extracted and grouped
            based on the other arguments.
        opt: Name of the optimizer to create (e.g., 'adam', 'adamw', 'sgd').
            Use list_optimizers() to see available options.
        lr: Learning rate. If None, will use the optimizer's default.
        weight_decay: Weight decay factor. Will be used to create param groups if model_or_params is a model.
        momentum: Momentum factor for optimizers that support it. Only used if the
            chosen optimizer accepts a momentum parameter.
        foreach: Enable/disable foreach (multi-tensor) implementation if available.
            If None, will use optimizer-specific defaults.
        filter_bias_and_bn: If True, bias, norm layer parameters (all 1d params) will not have
            weight decay applied. Only used when model_or_params is a model and
            weight_decay > 0.
        layer_decay: Optional layer-wise learning rate decay factor. If provided,
            learning rates will be scaled by layer_decay^(max_depth - layer_depth).
            Only used when model_or_params is a model.
        param_group_fn: Optional function to create custom parameter groups.
            If provided, other parameter grouping options will be ignored.
        **kwargs: Additional optimizer-specific arguments (e.g., betas for Adam).

    Returns:
        Configured optimizer instance.

    Examples:
        >>> # Basic usage with a model
        >>> optimizer = create_optimizer_v2(model, 'adamw', lr=1e-3)

        >>> # SGD with momentum and weight decay
        >>> optimizer = create_optimizer_v2(
        ...     model, 'sgd', lr=0.1, momentum=0.9, weight_decay=1e-4
        ... )

        >>> # Adam with layer-wise learning rate decay
        >>> optimizer = create_optimizer_v2(
        ...     model, 'adam', lr=1e-3, layer_decay=0.7
        ... )

        >>> # Custom parameter groups
        >>> def group_fn(model):
        ...     return [
        ...         {'params': model.backbone.parameters(), 'lr': 1e-4},
        ...         {'params': model.head.parameters(), 'lr': 1e-3}
        ...     ]
        >>> optimizer = create_optimizer_v2(
        ...     model, 'sgd', param_group_fn=group_fn
        ... )

    Note:
        Parameter group handling precedence:
        1. If param_group_fn is provided, it will be used exclusively
        2. If layer_decay is provided, layer-wise groups will be created
        3. If weight_decay > 0 and filter_bias_and_bn is True, weight decay groups will be created
        4. Otherwise, all parameters will be in a single group
    )r   r   r   r   r   r   r   r   )rO  r   )
r   r   r   r   r   r   rT  r   r   r   s
             r;   create_optimizer_v2rV  n  sB    V ,,! 2%  rT   c                    t        | j                  | j                  | j                  | j                        }t        | dd      | j                  |d<   t        | dd      | j                  |d<   t        | dd      | j                  |d<   t        | dd      |j                  | j                         t        | d	d      | j                  |d
<   |S )z cfg/argparse to kwargs helper
    Convert optimizer args in argparse args or cfg like object to keyword args for updated create fn.
    )r   r   r   r   opt_epsNr   	opt_betasr   r   r   opt_foreachr   )dictr   r   r   r   r4   rX  rY  r   r{   r   rZ  )cfgr   s     r;   optimizer_kwargsr]    s     GG66%%	F sIt$0usK&2--wsM4(4 #}sJ%1cll#sM4(4OOyMrT   modelc                 6    t        |fi t        |       d|iS )zk Legacy optimizer factory for backwards compatibility.
    NOTE: Use create_optimizer_v2 for new code.
    )r\  rT  )rV  r]  )argsr^  rT  s      r;   r   r     s+     
t
$ . rT   r   r   r   )r   Nr   r   NTNN)grN   loggingdataclassesr   	functoolsr   typingr   r   r   r   r	   r
   r   r   r   r   r2   r   torch.nnr   torch.optim_param_groupsr   r   _typesr   r   r   r   r   r   r   adafactor_bvr   r'  r   r   r   r   r   r%  r   r   r   r(  r   r   r   r*  r    r   r!   rX   r"   r   r#   r+  r$   r-  r%   r   r&   r   r'   
nvnovogradr(   r   r)   
rmsprop_tfr*   r   r+   r   r,   	getLoggerrK   ra   rO   r<   r?   rV   r   r   r   r	  r  r5  r?  rL  rO  rP  rQ   r   r   r   r   r   r   r   rV  r]  r   rS   rT   r;   <module>rm     s]    !  O O O      N 9 9     , "                "  !  
'

H
%C C C $. . .0} }@$%6 $4 $N^&7 ^D ^B;"3 ; ;|{3D { {|m,= m$ m^w.): w.t w.t((9 (d (VI'8 IT IX %& 9$  
 )+/3!&*Wc49n%*W!$s),*W *W 
%U38_$
%&	*WZ5S 5Y 5" #"S"S"S 9''("SN " "&#''+CGVryy'12VV UOV 	V
 V $V !V e_V !299+w*>!?@V V [[Vr4 $(RYY'( ! [[	rT   