
    khdP                        d Z ddlZddlmZmZ ddlmZ ddlmZm	Z	m
Z
 ddlZddlmZ ddlmZmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZm Z  ddl!m"Z"m#Z# e G d d             Z$e G d d             Z%dHdZ& G d dejN                        Z( G d dejN                        Z) G d dejN                        Z* G d dejN                        Z+ G d dejN                        Z, G d dejN                        Z-dIdZ.dHdZ/ e  e/d d!"       e/d#d!"       e/d$d%"       e/d&d%"       e/d'd%"       e/d(d%d)d*+       e/d,d%d-d*+       e/d.d%d/d*+       e/d0d1"       e/d2d1d)d*+       e/d3d1d-d*+       e/d4d1d/d*+       e/d5d6d)d*+       e/d7d6d-d*+       e/d8d6d/d*+      d9      Z0edJd:e"fd;       Z1edJd:e"fd<       Z2edJd:e"fd=       Z3edJd:e"fd>       Z4edJd:e"fd?       Z5edJd:e"fd@       Z6edJd:e"fdA       Z7edJd:e"fdB       Z8edJd:e"fdC       Z9edJd:e"fdD       Z:edJd:e"fdE       Z;edJd:e"fdF       Z<edJd:e"fdG       Z=y)Kab   ViTamin

Paper: Designing Scalable Vison Models in the Vision-Language Era
A family of model weights on Huggingface: https://huggingface.co/collections/jienengchen/vitamin-family-661048126b72debdaca060bf

@inproceedings{chen2024vitamin,
  title={ViTamin: Designing Scalable Vision Models in the Vision-language Era},
  author={Chen, Jieneng and Yu, Qihang and Shen, Xiaohui and Yuille, Alan and Chen, Liang-Chieh},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year={2024}
}

Based on Apache 2.0 licensed code at https://github.com/ViTamin/ViTamin

Modifications and timm support by Jieneng Chen 2024

Reference:
https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py
https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer_hybrid.py
    N)	dataclassfield)partial)OptionalUnionTupleOPENAI_CLIP_MEANOPENAI_CLIP_STD)create_act_layerget_norm_layerget_norm_act_layercreate_conv2dmake_divisibleDropPathHybridEmbed   )build_model_with_cfg)named_applycheckpoint_seq)register_modelgenerate_default_cfgs)VisionTransformercheckpoint_filter_fnc                       e Zd ZU dZeed<   dZeed<   dZe	ed<   dZ
e	ed<   d	Zeed
<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZee   ed<   dZeed<   y)
VitConvCfg      @expand_ratioTexpand_output   kernel_sizer   
group_sizeFpre_norm_actdwstride_modeavg2	pool_typedownsample_pool_typegelu	act_layer 
norm_layergh㈵>norm_epsdown_shortcutmlpN)__name__
__module____qualname__r   float__annotations__r   boolr!   intr"   r#   r%   strr'   r(   r*   r,   r-   r.   r   r/        O/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/models/vitamin.pyr   r   '   s    L%M4KJL$KIs &#&IsJHe$(M8D>(Cr9   r   c                       e Zd ZU dZeeeeedf   f   df   ed<   dZeeeeedf   f   df   ed<   dZ	eed<    e
e      Zeed	<   d
Zeed<   y)VitCfg)`           .	embed_dim)   r       rB   depths@   
stem_width)default_factoryconv_cfgr+   	head_typeN)r0   r1   r2   rA   r   r   r6   r4   rD   rF   r   r   rH   rI   r7   r8   r9   r:   r<   r<   8   sq    9LIuU3c3h/0#56L6BFE%U38_,-s23BJ <Hj<Isr9   r<   c                    t        | t        j                        r| j                  d   | j                  d   z  | j                  z  }|| j
                  z  }t        j                  j                  | j                  dt        j                  d|z               | j                  *t        j                  j                  | j                         y y y )Nr   r          @)
isinstancennConv2dr!   out_channelsgroupsinitnormal_weightmathsqrtbiaszeros_)modulenameschemefan_outs       r:   
_init_convr\   A   s    &"))$$$Q'&*<*<Q*??&BUBUUFMM!
q$))C'M*BC;;"GGNN6;;' #	 %r9   c                   H     e Zd Z	 	 	 	 d	dedededededef fdZd Z xZ	S )
Stemin_chsout_chsr*   r,   r-   rV   c                     t         |           t        t        ||      |      }|| _        t        ||dd|      | _         ||      | _        t        ||dd|      | _        t        t        |        y )Nepsr    rB   striderV   r   )super__init__r   r   r`   r   conv1norm1conv2r   r\   )	selfr_   r`   r*   r,   r-   rV   norm_act_layer	__class__s	           r:   rg   zStem.__init__K   sj     	 !3J	!JPXY"67AadK
#G,
"7GQqtL
J%r9   c                 l    | j                  |      }| j                  |      }| j                  |      }|S N)rh   ri   rj   rk   xs     r:   forwardzStem.forward^   s.    JJqMJJqMJJqMr9   )r)   layernorm2dư>T)
r0   r1   r2   r6   r7   r3   r5   rg   rr   __classcell__rm   s   @r:   r^   r^   J   sU    
 $+"&& & 	&
 & & &&r9   r^   c            	       <     e Zd Z	 	 ddedededef fdZd Z xZS )Downsample2ddimdim_outr'   rV   c                     t         |           t        j                  dddd      | _        ||k7  rt        j
                  ||d|      | _        y t        j                         | _        y )Nr    rB   r   F)r!   re   paddingcount_include_padrV   )rf   rg   rM   	AvgPool2dpoolrN   expandIdentity)rk   ry   rz   r'   rV   rm   s        r:   rg   zDownsample2d.__init__f   sQ     	LLQq!W\]	'>))C!$?DK++-DKr9   c                 J    | j                  |      }| j                  |      }|S ro   )r   r   rp   s     r:   rr   zDownsample2d.forwardu   s!    IIaLKKNr9   )r&   T)	r0   r1   r2   r6   r7   r5   rg   rr   ru   rv   s   @r:   rx   rx   e   s;    
 $(( ( 	(
 (r9   rx   c                   4     e Zd ZdZ	 	 	 	 	 d fd	Zd Z xZS )StridedConvz downsample 2d as well
    c                     t         |           t        t        d      d      }t	        j
                  |||||      | _         ||      | _        y )Nrs   rt   rb   )r!   re   r|   )rf   rg   r   r   rM   rN   projnorm)rk   r!   re   r|   in_chansrA   r,   rm   s          r:   rg   zStridedConv.__init__~   sG     	^M:E
IIh	{SYcjk	x(	r9   c                 J    | j                  |      }| j                  |      }|S ro   )r   r   rp   s     r:   rr   zStridedConv.forward   s!    IIaLIIaLr9   )r    rB   r   r    r@   )r0   r1   r2   __doc__rg   rr   ru   rv   s   @r:   r   r   {   s#     )r9   r   c                   f     e Zd ZdZ	 	 	 	 	 	 	 ddededededededed	ed
ef fdZddZd Z	 xZ
S )MbConvLNBlockzL Pre-Norm Conv Block - 1x1 - kxk - 1x1, w/ inverted bottleneck (expand)
    r_   r`   re   	drop_pathr!   r,   r-   r*   r   c
           	         t         t        |           |||c| _        | _        | _        t        ||	z        }
t        t        ||      |      }|dk(  rt        ||dd      | _
        n=||k7  rt        j                  ||dd      | _
        nt        j                         | _
         ||d	      | _        t        j                         | _        t!        ||
ddd
      | _        t%        |d      | _        t!        |
|
||d|
d      | _        t%        |d      | _        t!        |
|dd      | _        |dkD  rt/        |      | _        y t        j                         | _        y )Nrb   rB   avgT)r'   rV   r   r~   F)	apply_actrd   )inplace)re   dilationrP   rV           )rf   r   rg   re   r_   r`   r   r   r   rx   shortcutrM   rN   r   pre_normdownr   	conv1_1x1r   act1	conv2_kxkact2	conv3_1x1r   r   )rk   r_   r`   re   r   r!   r,   r-   r*   r   mid_chsprenorm_act_layerrm   s               r:   rg   zMbConvLNBlock.__init__   s*    	mT+-17.T[$, <!78#$6z9$MS[\Q;(EPTUDMwIIfgqtDDMKKMDM)&EBKKM	&vw!$O$Y=	&Wk&1W[_a$Y=	&wF09B),BKKMr9   c                 :    t        t        t        |      |        y )N)rZ   )r   r   r\   )rk   rZ   s     r:   init_weightszMbConvLNBlock.init_weights   s    GJv6=r9   c                 >   | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      |z   }|S ro   )	r   r   r   r   r   r   r   r   r   )rk   rq   r   s      r:   rr   zMbConvLNBlock.forward   s    ==#MM!IIaL NN1IIaL NN1IIaL NN1NN1(r9   )r   r   r    rs   rt   r)   r   r+   )r0   r1   r2   r   r6   r3   r7   rg   r   rr   ru   rv   s   @r:   r   r      s     ! +"#"% R R  R 	 R
  R  R  R  R  R   RF>r9   r   c            	       P     e Zd ZdZ	 	 ddedeeeeef   f   def fdZd Z	 xZ
S )MbConvStagesz3 MobileConv for stage 1 and stage 2 of ViTamin
    cfgimg_sizer   c           
      b   t         
|           d| _        t        ||j                        | _        g }t        |j                        | _        t        |j                  d d       D ]}  \  }}|dkD  r|j                  |dz
     n|j                  }t        |j                  |         D cg c]  }t        |dk(  r|n|||dk(  rdnd        }	}|t        j                  |	 gz  } t        j                  | | _        t!        d|j                  d   |j                  d         | _        y c c}w )NF)r_   r`   rB   r   r   )r_   r`   re   )re   r   rA   )rf   rg   grad_checkpointingr^   rF   stemlenrA   
num_stages	enumeraterangerD   r   rM   
Sequentialstagesr   r   )rk   r   r   r   r   sry   stage_in_chsdblocksrm   s             r:   rg   zMbConvStages.__init__   s(    	"'NN
	
 cmm,bq 12 
	/FAs1213==1-#..L szz!}-  -.T\s!"#q&QaF  r}}f-..F
	/ mmV,]]1%mmA&
	s   *#D,c                     | j                  |      }| j                  r5t        j                  j	                         st        | j                  |      }n| j                  |      }| j                  |      }|S ro   )r   r   torchjitis_scriptingr   r   r   rp   s     r:   rr   zMbConvStages.forward   sV    IIaL""599+A+A+Ct{{A.AAAIIaLr9   )   r    )r0   r1   r2   r   r<   r   r6   r   rg   rr   ru   rv   s   @r:   r   r      sI    
 58	!
!
 CsCx01!
 	!
Fr9   r   c                   .     e Zd Z	 	 	 	 d fd	Zd Z xZS )GeGluMlpc                 >   t         |           t        t        |xs d      d      } ||      | _        t        j                  |||      | _        t        |      | _	        t        j                  |||      | _
        t        j                  |||      | _        y )N	layernormrt   rb   r~   )rf   rg   r   r   r   rM   Linearw0r   actw1w2)rk   in_featureshidden_featuresr*   r,   rV   droprm   s          r:   rg   zGeGluMlp.__init__  sz     	^J,E+FDQ
{+	))KtD#I.))KtD))O[tDr9   c                     | j                  |      }| j                  | j                  |            | j                  |      z  }| j	                  |      }|S ro   )r   r   r   r   r   rp   s     r:   rr   zGeGluMlp.forward  sD    IIaLHHTWWQZ 4771:-GGAJr9   )r)   NTr   )r0   r1   r2   rg   rr   ru   rv   s   @r:   r   r      s    
 E$r9   r   c                     |j                  dd      }|J t        ||j                  dd            }t        t        |d      |d<   |j                  dd	       t        t        | |ft        t        |d
      d|S )Nout_indicesr    r   )r   r   F)backboner   embed_layer
patch_sizer   getter)r   feature_cls)pretrained_filter_fnfeature_cfg)
popr   getr   r   
setdefaultr   r   r   dict)variant
pretrained	embed_cfgkwargsr   r   s         r:   _create_vitaminr     s    **]A.K   	FJJz14MNH#K(OF=
lA& 2[hG  r9   c                 2    | ddd dddt         t        ddd|S )	Ni  )r    r   r   g?bicubicTzpatch_embed.backbone.stem.conv1head)urlnum_classes
input_size	pool_sizecrop_pctinterpolationfixed_input_sizemeanstd
first_conv
classifierr	   )r   r   s     r:   _cfgr   +  s2    =t 7  r9   zjienengchen/ViTamin-S-LTTr?   )	hf_hub_idr   zjienengchen/ViTamin-Szjienengchen/ViTamin-B-LTTr@   zjienengchen/ViTamin-Bzjienengchen/ViTamin-L-224pxzjienengchen/ViTamin-L-256px)r       r   g      ?)r   r   r   r   zjienengchen/ViTamin-L-336px)r    P  r   zjienengchen/ViTamin-L-384px)r    r?   r?   zjienengchen/ViTamin-L2-224px   zjienengchen/ViTamin-L2-256pxzjienengchen/ViTamin-L2-336pxzjienengchen/ViTamin-L2-384pxzjienengchen/ViTamin-XL-256px  zjienengchen/ViTamin-XL-336pxzjienengchen/ViTamin-XL-384px)z%vitamin_small_224.datacomp1b_clip_lttz!vitamin_small_224.datacomp1b_clipz$vitamin_base_224.datacomp1b_clip_lttz vitamin_base_224.datacomp1b_clipz!vitamin_large_224.datacomp1b_clipz!vitamin_large_256.datacomp1b_clipz!vitamin_large_336.datacomp1b_clipz!vitamin_large_384.datacomp1b_clipz"vitamin_large2_224.datacomp1b_clipz"vitamin_large2_256.datacomp1b_clipz"vitamin_large2_336.datacomp1b_clipz"vitamin_large2_384.datacomp1b_clipz"vitamin_xlarge_256.datacomp1b_clipz"vitamin_xlarge_336.datacomp1b_clipz"vitamin_xlarge_384.datacomp1b_clipreturnc           
          t        dddt        dd      d      }t        d	d
dt        ddd|      }t	        dd| it        |fi |}|S )N)rE      r?   rB      r   rE   rs   rt   r,   r-   1drA   rD   rF   rH   rI   r?         rK   Fr   rA   depth	num_heads	mlp_layer	mlp_ratioclass_tokenglobal_poolr   r   )vitamin_small_224r<   r   r   r   r   r   r   r   
model_argsmodels        r:   r   r   b  sj     $
 	I R1Bu	J eJe$zJd]cJdeELr9   c           
          t        dddt        dd      d      }t        d	d
dt        ddd|      }t	        dd| it        |fi |}|S )N)r   r   r@   r   r   rs   rt   r   r   r   r@   r      rK   Fr   r   r   )vitamin_base_224r   r   s        r:   r  r  v  sk    !$
 	I R2Ru	CJ d:djIc\bIcdELr9   c           
          t        dddt        dd      d      }t        d	d
dt        ddd|      }t	        dd| it        |fi |}|S )N   i@  r   r   r  rs   rt   r   r   r   r         rK   Fr   r   r   )vitamin_large_224r   r   s        r:   r
  r
    sj    "$
 	I bB(bu	J eJe$zJd]cJdeELr9   c                     t        dddt        dd      d      }t        d	d
ddt        ddd|	      }t	        dd| it        |fi |}|S )Nr  r   r  rs   rt   r   r   r   r   r   r  r	  rK   Fr   	r   rA   r   r   r   r   r   r   r   r   )vitamin_large_256r   r   s        r:   r  r    n    "$
 	I B"\^u	CJ eJe$zJd]cJdeELr9   c                     t        dddt        dd      d      }t        d	d
ddt        ddd|	      }t	        dd| it        |fi |}|S )Nr  r   r  rs   rt   r   r   r   r   r   r  r	  rK   Fr   r  r   )vitamin_large_336r   r   s        r:   r  r    sm    "$
 	I B"\^u	J eJe$zJd]cJdeELr9   c                     t        dddt        dd      d      }t        d	d
ddt        ddd|	      }t	        dd| it        |fi |}|S )Nr  r   r  rs   rt   r   r   r   r?   r   r  r	  rK   Fr   r  r   )vitamin_large_384r   r   s        r:   r  r    r  r9   c           
          t        dddt        dd      d      }t        d	d
dt        ddd|      }t	        dd| it        |fi |}|S )Nr  r   r  rs   rt   r   r   r   r   r  r	  rK   Fr   r   r   )vitamin_large2_224r   r   s        r:   r  r    sk    "$
 	I bB(bu	J fZf4PZKe^dKefELr9   c                     t        dddt        dd      d      }t        d	d
ddt        ddd|	      }t	        dd| it        |fi |}|S )Nr  r   r  rs   rt   r   r   r   r   r   r  r	  rK   Fr   r  r   )vitamin_large2_256r   r   s        r:   r  r    o    "$
 	I B"\^u	CJ fZf4PZKe^dKefELr9   c                     t        dddt        dd      d      }t        d	d
ddt        ddd|	      }t	        dd| it        |fi |}|S )Nr  r   r  rs   rt   r   r   r   r   r   r  r	  rK   Fr   r  r   )vitamin_large2_336r   r   s        r:   r  r    sn    "$
 	I B"\^u	J fZf4PZKe^dKefELr9   c                     t        dddt        dd      d      }t        d	d
ddt        ddd|	      }t	        dd| it        |fi |}|S )Nr  r   r  rs   rt   r   r   r   r?   r   r  r	  rK   Fr   r  r   )vitamin_large2_384r   r   s        r:   r  r    r  r9   c                     t        dddt        dd      d      }t        d	d
ddt        dddd|
      }t	        	 dd| it        |fi |}|S )Nr>   r?   r   r   r>   rs   rt   r   r   r   r   r       r	  rK   Fr   none
r   rA   r   r   r   r   r   r   	pos_embedr   r   vitamin_xlarge_256r   r   s        r:   r#  r#  %  s}    "$
 	I B"\^u)UJ S)3S7;J7Q&7QSELr9   c                     t        dddt        dd      d      }t        d	d
ddt        dddd|
      }t	        dd| it        |fi |}|S )Nr  r   r>   rs   rt   r   r   r   r   r   r  r	  rK   Fr   r  r   r   r"  r   r   s        r:   vitamin_xlarge_336r%  9  q    "$
 	I B"\^u)UJ fZf4PZKe^dKefELr9   c                     t        dddt        dd      d      }t        d	d
ddt        dddd|
      }t	        dd| it        |fi |}|S )Nr  r   r>   rs   rt   r   r   r   r?   r   r  r	  rK   Fr   r  r   r   )vitamin_xlarge_384r   r   s        r:   r(  r(  L  r&  r9   r   )FN)F)>r   rT   dataclassesr   r   	functoolsr   typingr   r   r   r   torch.nnrM   	timm.datar
   r   timm.layersr   r   r   r   r   r   r   _builderr   _manipulater   r   	_registryr   r   vision_transformerr   r   r   r<   r\   Moduler^   rx   r   r   r   r   r   r   default_cfgsr   r  r
  r  r  r  r  r  r  r  r#  r%  r(  r8   r9   r:   <module>r5     se  *  (  ) )   7* * * * 4 < G       (299 6299 ,")) .;BII ;|-299 -`ryy 4"	 %-1-3.@)-)s*<,0-3-@(,)s)<)-/S*B)-/S 3*0 *./S 3*0 *./S 3*0 +/0d+D*.0d 3+0 +/0d 3+0 +/0d 3+0 +/0d 3+0 +/0d 3+0 +/0d 3+0K(& (V 5F  & 4E  $ 5F  & 5F  $ 5F  & 5F  $ 6G  & 6G  $ 6G  & 6G  $ 6G  & 6G  $ 6G  r9   