
    khh2              '       &   d Z ddlmZmZmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ d Zd	eed
f   dededeeeef      fdZ G d de	      Zdee   dee   deee      deee      deee      deee      dee   dededededededee   deeej(                  f   dee   d ed!ed"ee   f&d#Zdee   dee   deee      deee      deee      deee      dee   dededededededee   deeej(                  f   dee   d ed!ed"ee   f&d$Zy)%a   Adafactor (Big Vision variant) for PyTorch

Adapted from the implementation in big vision: https://github.com/google-research/big_vision

Described in 'Scaling Vision Transformers': https://arxiv.org/abs/2106.04560

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285

Adaptation and PyTorch modifications by Ross Wightman
    )ListOptionalTupleUnionN)Tensor)	Optimizer   )ParamsTc                  "    t         j                  S )z6Get the scalar dtype that the optimizer uses for state)torchfloat64     S/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/optim/adafactor_bv.py_get_scalar_dtyper      s    ==r   shape.factoredmin_dim_size_to_factorreturnc                     |rt        |       dk  ryt        d t        |       D              }| |d   d      |k  ryt        |d   d         t        |d   d         fS )a  Whether to use a factored second moment estimator.

    This function returns a tuple with the two largest axes to reduce over.
    If no two dimensions have size >= min_dim_size_to_factor, return None.

    Args:
      shape: an input shape
      factored: whether to use factored second-moment estimator for > 2d vars.
      min_dim_size_to_factor: only factor accumulator if two array dimensions have at least this size.

    Returns:
      None or a tuple of ints
       Nc              3   *   K   | ]  \  }}||f  y wNr   ).0ixs      r   	<genexpr>z!_factored_dims.<locals>.<genexpr>/   s     >TQ1a&>s   r	   )lensorted	enumerateint)r   r   r   sorted_dimss       r   _factored_dimsr%      sj    $ s5zA~>Yu-=>?K[_Q #99{2q!"CB(:$;;;r   c            !           e Zd ZdZddddddej
                  dd	dd
d
d
fd
ddedededededede	e   de
eej                  f   de	e   dede	e   dededede	e   f fdZ fdZ ej                          dd       Z xZS )AdafactorBigVisionz
    PyTorch implementation of BigVision's Adafactor variant with both single and multi tensor implementations.

    Adapted from https://github.com/google-research/big_vision by Ross Wightman
          ?   g?r   g+?g?N        F)foreachparamslrr   
decay_ratedecay_offset	beta2_capmomentummomentum_dtypeepsweight_decayclipping_thresholdunscaled_wdcautioncorrected_weight_decayr+   c                   t        |t              rK|dk(  rt        j                  }n5|dk(  rt        j                  }n|dk(  s
J | d       t        j
                  }t        ||||||||	|
|||||      }t        | !  ||       y )Nfloat16bfloat16float32z dtype not supported)r-   r   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r+   )	
isinstancestrr   r:   r;   r<   dictsuper__init__)selfr,   r-   r   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r+   defaults	__class__s                    r   rA   zAdafactorBigVision.__init__<   s    & nc**!&:-!&%2[~6FFZ4[[2!& #9!%)%1##9
  	*r   c                 ,   t         |   |       | j                  D ]  }|j                  dd       |j                  dd       |j                  dd        |d   D ]  }| j                  j                  |i       }t        |      dk7  rFt        j                  |d         s.t        j                  t        |d         t                     |d<   d	|v sxt        j                  |d	         s|d	   j                  | j                  d
         |d	<     y )Nr7   Fr8   r+   r,   r   stepdtypeexp_avgr2   )r@   __setstate__param_groups
setdefaultstategetr    r   	is_tensortensorfloatr   torC   )rB   rM   grouppp_staterD   s        r   rJ   zAdafactorBigVision.__setstate__k   s    U#&& 	fEY.5u=Y-8_ 	f**..B/w<1$U__WV_-M&+ll53IQbQd&eGFO'EOOGI<N,O *1);)>)>T]]ScEd)>)eGI&	f		fr   c                    d }|$t        j                         5   |       }d d d        | j                  D ]  }g }g }g }g }g }g }	g }
|d   D ]d  }|j                  |j                  j                  rt        d      |j                  |       |j                  |j                         | j                  |   }t        |      dk(  rQt        j                  dt                     |d<   |j                  j                  }t        |d| j                  d   	      }||\  }}t        |j                  j                        }d
||<   t        |j                  j                        }d
||<   |j                  j                  |      |d<   |j                  j                  |      |d<   n2t        j                   |j                  t         j"                        |d<   | j                  d   1t        j                   |j                  | j                  d         |d<   |	j                  |d          |j                  |j%                  dd              |j                  |j%                  dd              |j                  |j%                  dd              |
j                  |j%                  dd              g |d   rt&        }nt(        } |d$i d|d|d|d|d|d|
d|	d|d   d|d   d|d   d|d   d|d   d|d   d|d   d|d   d|d   d |d    d!|d!   d"|d#   r| j                  d   nd   |S # 1 sw Y    xY w)%Nr,   zSparse gradients not supportedr   r*   rG   rF   Tr   )r   r   r	   exp_avg_sq_rexp_avg_sq_c)memory_format
exp_avg_sqr1   r2   rI   r+   gradsexp_avg_sq_rsexp_avg_sq_csexp_avg_sqsexp_avgsstate_stepsbeta2_decayr.   r0   r3   r-   r4   r5   r6   r7   max_lrr8   r   )r   enable_gradrK   grad	is_sparseRuntimeErrorappendrM   r    rP   r   r   r%   rC   list	new_zeros
zeros_likepreserve_formatrN   _multi_tensor_adafactor_single_tensor_adafactor)rB   closurelossrS   params_with_gradr[   r\   r]   r^   r`   r_   rT   rM   r   factored_dimsdcdr	row_shape	col_shapefuncs                       r   rF   zAdafactorBigVision.step|   s   ""$ !y! && M	E!EMMKKH8_ )<66>66##&'GHH ''*QVV$

1u:?$)LL<M<O$PE&MFFLLE$2!%/3}}=U/V%M %0!.B$($6	()	"$($6	()	"010@0@0Kn-010@0@0Kn-.3.>.>qvvUZUjUj.kl+}}Z0<+0+;+;AFF$--XhJi+ji(""5=1$$UYY~t%DE$$UYY~t%DE""599\4#@A		)T :;S)<V Y./ ' , ,	
 ( " ( ",/  , (--E'F %L ; #>2 z*  %%56  $))=#>!" "-0#$ i(%& /44L.Mt}}T*SW'sM	^ e! !s   L??M	r   )__name__
__module____qualname____doc__r   r;   r
   rQ   r#   r   r   r>   rH   boolrA   rJ   no_gradrF   __classcell__)rD   s   @r   r'   r'   5   s    *, # !$(+6;nn#'"%26 %!+0-+" ',#-+-+ -+ %(	-+
 -+ -+ -+ uo-+ "#u{{"23-+ %-+  -+ !)-+ -+ -+ %)-+" d^#-+^f" U]]_U Ur   r'   r,   r[   r\   r]   r^   r_   r`   ra   r0   r3   r-   r4   r1   r2   r5   r6   r7   rb   c                R   t        |       D ]  \  }}||   }||   }||   }||   }||   }||   }|
!|j                  t        j                  k(  rdnd}
|dz  }t	        |dt        |      | z  z
        }d|z
  }t        j                  |      |
z   }|t        |j                  d|	      \  }}|j                  |j                  |d      |       |j                  |j                  |d      |       ||kD  r|dz
  n|} |j                  | d      }!||!z  j                         }"|j                         }#||"z  |#z  }$n+||J |j                  ||       ||j                         z  }$|I|$j                  d      |$j                         d	z  |z  z  j                  d
      }%|$j                  |%       ||||j                  k7  r@|j                  |$j!                  |      d|z
         |j!                  |j                        }$n%|j                  |$d|z
         |j#                         }$|ra|$|z  dkD  j!                  |j                        }&|&j                  |&j                         j                  d             |$j%                  |&       |$j%                  |       |dk7  rk|r2||j%                  d|z
         nR|j%                  d||z  |z  z
         n7||j%                  d||z  z
         n|j%                  d|dz  |z  |z  z
         |j'                  |$d        y )NgHz>gKH9r	   r(   T)r   )dimkeepdimr   g      ?)maxr   gMbP?)ming      )alpha)r"   rH   r   r:   r   rQ   squarer%   r   lerp_meanrsqrtnormnumelclamp_div_rR   clonemul_add_)'r,   r[   r\   r]   r^   r_   r`   ra   r0   r   r3   r-   r4   r1   r2   r5   r6   r7   rb   r   paramrd   rW   rX   rZ   rI   step_tbeta2_tone_minus_beta2_tgrad_sqrrr   rs   	reduce_dcrow_col_mean
row_factor
col_factorupdatedenommasks'                                          r   rm   rm      s-   , f% M'5Qx$Q'$Q' ^
1+Q;**5$5C 	!iuV}+'F!FGK<<%+#DJJMcdFBx}}T}BDUVx}}T}BDUV"$r'QrI',,D,IL&5<<>J%++-JJ&3F  'L,@@@X'89J,,..F )[[^#(=AS'ST\\ad\eEKK G$7+fii7XF DJJ/fa(l3 )--djj9		$))+,,,67D! 	B 1>JJrL01 JJrR&[L$@@A >JJrB$556 JJrR1Wv%5$EEF 	

6
&[M'r   c                    J d       )Nz2multi-tensor fn (foreach=True) not implemented yetr   )r,   r[   r\   r]   r^   r_   r`   ra   r0   r   r3   r-   r4   r1   r2   r5   r6   r7   rb   s                      r   rl   rl   ;  s    . GFF5r   )rz   typingr   r   r   r   r   r   torch.optimr   _typesr
   r   r#   r{   tupler%   r'   rQ   r>   rH   rm   rl   r   r   r   <module>r      s   0 /   ! 
<S#X<< !$< eCHo	<4] ]@c'Vc'F|c' HV,-c' HV,-	c'
 (6*+c' x'(c' &\c' c' c' !$c' c' c' c' 5/c'  c5;;./!c'" %UO#c'$ %c'& 'c'( )c'LGVGF|G HV,-G HV,-	G
 (6*+G x'(G &\G G G !$G G G G 5/G  c5;;./!G" %UO#G$ %G& 'G( )Gr   