
    kh%F                        d Z ddlmZ ddlmZmZmZmZ ddlZddl	m
Z
 ddlmZmZ ddlmZmZmZmZmZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZmZ ddl m!Z!  G d de
jD                        Z#d^dZ$d_dZ%	 	 d`dee&ejN                  f   de!de&de(dee&ejN                  f   f
dZ)dadZ*dbdZ+ ei d e+dddd      d e+d ddd!d"d#      d$ e+d%dd&      d' e+d(dd!d"d)      d* e+       d+ e+d,dd!d"-      d. e+d/dd&      d0 e+d1dd!d"d)      d2 e+d3dd4d5dd6      d7 e+d8dd4d5d9      d: e+ddd5;      d< e+d=dd4d5d9      d> e+eed?@      dA e+eed?@      dB e+eed?@      dC e+eed?@      dD e+dEdFdGdHdIdJK      dL e+dMdNdGdHdIdJK      i      Z,edcde!fdO       Z-edcde!fdP       Z.edcde!fdQ       Z/edcde!fdR       Z0edcde!fdS       Z1edcde!fdT       Z2edcde!fdU       Z3edcde!fdV       Z4edcde!fdW       Z5edcde!fdX       Z6edcde!fdY       Z7edcde!fdZ       Z8edcde!fd[       Z9edcde!fd\       Z: ee;d2d7d:d:d<d+d]       y)da   Hybrid Vision Transformer (ViT) in PyTorch

A PyTorch implement of the Hybrid Vision Transformers as described in:

'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale'
    - https://arxiv.org/abs/2010.11929

`How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers`
    - https://arxiv.org/abs/2106.10270

NOTE These hybrid model definitions depend on code in vision_transformer.py.
They were moved here to keep file sizes sane.

Hacked together by / Copyright 2020, Ross Wightman
    )partial)DictTupleTypeUnionN)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)StdConv2dSame	StdConv2dConvNormAct	to_ntupleHybridEmbed   )build_model_with_cfg)generate_default_cfgsregister_modelregister_model_deprecations)	resnet26d	resnet50d)ResNetV2create_resnetv2_stem)VisionTransformerc                        e Zd Zddddddej                  ej
                  fdededeeeedf   f   d	eeeedf   f   d
eeeedf   f   dee	eeedf   f   de
ej                     de
ej                     f fdZ xZS )ConvStem   @   )   r   r    in_chansdepthchannels.kernel_sizestridepadding
norm_layer	act_layerc	                    t         |           t        |t              r.t	        t        |      D 	cg c]
  }	|d|	z  z   c}	d d d         } t        |      |      } t        |      |      }|t        |      cxk(  rt        |      cxk(  rt        |      k(  sJ  J |}
t        t        |            D ]L  }	|	t        |      dz
  k(  }| j                  |	 t        |
||	   ||	   ||	   ||	   || | ||
             ||	   }
N y c c}	w )Nr   r   )r"   r#   r$   bias
apply_norm	apply_actr%   r&   )
super__init__
isinstanceinttupleranger   len
add_moduler   )selfr   r    r!   r"   r#   r$   r%   r&   iin_chs	last_conv	__class__s               a/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/models/vision_transformer_hybrid.pyr-   zConvStem.__init__!   s    	h$eE1h!Q$.EddKLH&i&{3")E"7+FHs;'7H3x=HHHHHs8}% 	!AS]Q..IOOqcK'Nay
(='-%#%  a[F	! Fs   D)__name__
__module____qualname__nnBatchNorm2dReLUr/   r   r   strr   Moduler-   __classcell__)r8   s   @r9   r   r       s     46782;8:*,..)+#!#! #! CsCx01	#!
 sE#s(O34#! #uS#X./#! 3U38_45#! RYY#! BII#! #!    r   c           	         |j                  dd      }|rdnd}|rt        t        d      nt        t        d      }t	        |       r$t        | dd|j                  dd	      d
||      }|S t        |j                  dd	      |d
|      }|S )z ResNet-V2 backbone helperpadding_sameTsamer   g:0yE>)epsr   r   r   F)layersnum_classesglobal_poolr   preact	stem_type
conv_layer)rL   rK   rM   )getr   r
   r   r2   r   r   )rH   kwargsrE   rL   rM   backbones         r9   	_resnetv2rQ   G   s    ::nd3L&BI5AD1wy^bGcJ
6{qb6::jZ[C\I*F O (JJz1%5U_aOrC   c                 "   i }| j                         D ]x  \  }}|j                  |      s|j                  |d      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd	      }|j                  d
d      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|dk(  rd}|j                  d      }d|v rU|j                  dd      }|j                  dd      }|j                  }t        j                  |j                  d         ||<   |||<   { |S ) Nr   z
patch_emb.zpatch_embed.backbone.z
block.convconvz
block.normbnzpost_transformer_norm.znorm.zpre_norm_mha.0norm1zpre_norm_mha.1attnzpre_norm_ffn.0norm2zpre_norm_ffn.1zmlp.fc1zpre_norm_ffn.4zmlp.fc2z	qkv_proj.zqkv.z	out_proj.zproj.ztransformer.zblocks.zpos_embed.pos_embed.pos_embed	pos_embedr   zclassifier.projz	head.biaszhead.weight)items
startswithreplacesqueezeTtorchzerosshape)
state_dictmodelprefixoutkvbias_ks          r9   _convert_mobilecliprh   V   s   
C  " 1||F#IIfb!IIl$;<IIlF+IIlD)II.8II&0II&/II&0II&	2II&	2IIk6*IIk7+IIni0//A		!A!YY0+>F		+];AA++aggaj1CKA12 JrC   Tra   rb   interpolation	antialiasreturnc                 F    ddl m} d| v rt        | |      }  || |||      S )Nr   )checkpoint_filter_fnz1image_encoder.model.patch_emb.0.block.conv.weight)ri   rj   )vision_transformerrm   rh   )ra   rb   ri   rj   
_filter_fns        r9   rm   rm   t   s.     G:jH(U;
j%}PYZZrC   c                     |j                  dd      }|xs i }t        t        fd|i|}|j                  d|       |j                  dd       t	        t
        | |ft        t        |d      d	|S )
Nout_indicesr   rP   embed_layer
patch_sizer   getter)rq   feature_cls)pretrained_filter_fnfeature_cfg)popr   r   
setdefaultr   r   rm   dict)variantrP   
embed_args
pretrainedrO   rq   rr   s          r9   !_create_vision_transformer_hybridr~      s    **]A.K!rJ+GGJGK
m[1
lA& 2[hG  rC   c                 "    | ddd dddddddd	|S )
Ni  )r      r   ?bicubicT)      ?r   r   zpatch_embed.backbone.stem.convhead)urlrI   
input_size	pool_sizecrop_pctri   fixed_input_sizemeanstd
first_conv
classifier )r   rO   s     r9   _cfgr      s1    =t6f  rC   z*vit_tiny_r_s16_p8_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npzztimm/zpatch_embed.backbone.conv)r   	hf_hub_idcustom_loadr   z*vit_tiny_r_s16_p8_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz)r     r         ?)r   r   r   r   r   r   z*vit_small_r26_s32_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R26_S_32-i21k-300ep-lr_0.001-aug_light0-wd_0.03-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.03-res_224.npz)r   r   r   z*vit_small_r26_s32_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R26_S_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz)r   r   r   r   r   zvit_base_r26_s32_224.untrainedz'vit_base_r50_s16_384.orig_in21k_ft_in1kzthttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_resnet50_384-9fd3c705.pth)r   r   r   r   z*vit_large_r50_s32_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R50_L_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npzz*vit_large_r50_s32_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R50_L_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npzz"vit_tiny_r_s16_p8_224.augreg_in21kzohttps://storage.googleapis.com/vit_models/augreg/R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npziSU  r   )r   r   rI   r   r   r   z"vit_small_r26_s32_224.augreg_in21kzshttps://storage.googleapis.com/vit_models/augreg/R26_S_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.03-do_0.0-sd_0.0.npz)r   r   rI   r   r   zvit_base_r50_s16_224.orig_in21k)r   rI   r   z"vit_large_r50_s32_224.augreg_in21kzrhttps://storage.googleapis.com/vit_models/augreg/R50_L_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0.npzz!vit_small_resnet26d_224.untrainedzpatch_embed.backbone.conv1.0)r   r   r   z%vit_small_resnet50d_s16_224.untrainedz vit_base_resnet26d_224.untrainedz vit_base_resnet50d_224.untrainedzvit_base_mci_224.apple_mclip_ltzapple/mobileclip_b_lt_timmzYhttps://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_blt.pti   )        r   r   )r   r   r   zpatch_embed.backbone.0.conv)r   r   rI   r   r   r   zvit_base_mci_224.apple_mclipzapple/mobileclip_b_timmzWhttps://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_b.ptc           	      p    t        dddi|}t        dddd      }t        	 d	|| dt        |fi |}|S )
z3 R+ViT-Ti/S16 w/ 8x8 patch hybrid @ 224 x 224.
    rH   r            r   rs   	embed_dimr    	num_headsrP   r}   )vit_tiny_r_s16_p8_224rQ   rz   r~   r}   rO   rP   
model_argsrb   s        r9   r   r      Y     --f-HcqIJ-i*2ziMQR\Mg`fMgiELrC   c           	      p    t        dddi|}t        dddd      }t        	 d	|| dt        |fi |}|S )
z3 R+ViT-Ti/S16 w/ 8x8 patch hybrid @ 384 x 384.
    rH   r   r   r   r   r   r   r   )vit_tiny_r_s16_p8_384r   r   s        r9   r   r      r   rC   c           	      j    t        di |}t        ddd      }t        	 d|| dt        |fi |}|S ) R26+ViT-S/S32 hybrid.
    r   r      r   r    r   r   )r   r   r   r   )vit_small_r26_s32_224r   r   s        r9   r   r     R     00H2;J-i*2ziMQR\Mg`fMgiELrC   c           	      j    t        di |}t        ddd      }t        	 d|| dt        |fi |}|S )r   r   r   r   r   r   r   )vit_small_r26_s32_384r   r   s        r9   r   r     r   rC   c           	      j    t        di |}t        ddd      }t        	 d|| dt        |fi |}|S )z R26+ViT-B/S32 hybrid.
       r   r   r   r   )vit_base_r26_s32_224r   r   s        r9   r   r     sR     00H2<J-h)1jhLPQ[Lf_eLfhELrC   c           	      j    t        di |}t        ddd      }t        	 d|| dt        |fi |}|S )zR R50+ViT-B/S16 hybrid from original paper (https://arxiv.org/abs/2010.11929).
    r   r   r   r   )r      	   )vit_base_r50_s16_224r   r   s        r9   r   r   #  sR     -f-H2<J-h)1jhLPQ[Lf_eLfhELrC   c           	      j    t        di |}t        ddd      }t        	 d|| dt        |fi |}|S )z R50+ViT-B/16 hybrid from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
    r   r   r   r   r   )vit_base_r50_s16_384r   r   s        r9   r   r   .  sR    
 -f-H2<J-h)1jhLPQ[Lf_eLfhELrC   c           	      j    t        di |}t        ddd      }t        	 d|| dt        |fi |}|S ) R50+ViT-L/S32 hybrid.
             r   r   )r   r   r   r   )vit_large_r50_s32_224r   r   s        r9   r   r   :  R     00HB"=J-i*2ziMQR\Mg`fMgiELrC   c           	      j    t        di |}t        ddd      }t        	 d|| dt        |fi |}|S )r   r   r   r   r   r   r   )vit_large_r50_s32_384r   r   s        r9   r   r   E  r   rC   c           	          t        | |j                  dd      ddg      }t        dddd      }t        	 d
|| d	t        |fi |}|S )zL Custom ViT small hybrid w/ ResNet26D stride 32. No pretrained weights.
    r   r   Tr   r}   r   features_onlyrq   r   r   r   r    r   	mlp_ratior   )vit_small_resnet26d_224r   rN   rz   r~   r   s        r9   r   r   P  sh     JJPQ9RbfuvtwxH1QGJ-!k,4kOST^OibhOikELrC   c           	          t        | |j                  dd      ddg      }t        dddd      }t        	 d	|| dt        |fi |}|S )
zV Custom ViT small hybrid w/ ResNet50D 3-stages, stride 16. No pretrained weights.
    r   r   Tr   r   r   r   r   )vit_small_resnet50d_s16_224r   rN   rz   r~   r   s        r9   r   r   [  sh     JJPQ9RbfuvtwxH1QGJ-%o08ZoSWXbSmflSmoELrC   c           	          t        | |j                  dd      ddg      }t        ddd      }t        	 d
|| d	t        |fi |}|S )zK Custom ViT base hybrid w/ ResNet26D stride 32. No pretrained weights.
    r   r   Tr   r   r   r   r   r   )vit_base_resnet26d_224r   r   s        r9   r   r   f  f     JJPQ9RbfuvtwxH2<J- j+3
jNRS]NhagNhjELrC   c           	          t        | |j                  dd      ddg      }t        ddd      }t        	 d
|| d	t        |fi |}|S )K Custom ViT base hybrid w/ ResNet50D stride 32. No pretrained weights.
    r   r   Tr   r   r   r   r   r   )vit_base_resnet50d_224r   r   s        r9   r   r   q  r   rC   c           
          t        dddd|j                  dd      t        j                        }t	        dddd	
      }t        	 d|t	        d      | dt	        |fi |}|S )r   )r   r   r   )r   r   r   r   r   r   )r!   r#   r"   r$   r   r&   r   r   T)r   r    r   no_embed_classF)proj)rP   r|   r}   )vit_base_mci_224)r   rN   r=   GELUrz   r~   r   s        r9   r   r   |  sy     &J*''H 2DQJ-%-$E:J!%j!;F!;E LrC   )vit_tiny_r_s16_p8_224_in21kvit_small_r26_s32_224_in21kvit_base_r50_s16_224_in21kvit_base_resnet50_224_in21kvit_large_r50_s32_224_in21kvit_base_resnet50_384r   )zimage_encoder.model.)r   T)NF)r   )F)<__doc__	functoolsr   typingr   r   r   r   r^   torch.nnr=   	timm.datar   r	   timm.layersr
   r   r   r   r   _builderr   	_registryr   r   r   resnetr   r   resnetv2r   r   rn   r   
Sequentialr   rQ   rh   r@   Tensorboolrm   r~   r   default_cfgsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r:   r   rC   r9   <module>r      s    + +   A U U * Y Y ( 4 1$!r}} $!NB '	[ell*+[ [ [ 	[
 
#u||
[  % L&0$ f.	30L& 1$ f.=SVdh3jL& 1$ i3L&  1$ j 3D3B!L&( %df)L&* .t C 300+L&2 1$ i33L&< 1$ i 3D3=L&J )$}C4O]a+cKL&R )$ BCT+;SL&Z &t(%[L&b )$ ACT+;cL&n ("(<Ig*ioL&r ,T"(<Ig.isL&v '"(<Ig)iwL&z '"(<Ig)i{L&@ &t.g|8U	(AL&L #D+e|8U	%ML& L^ 9J   9J   9J   9J   8I   8I   8I   9J   9J   ;L   ?P   :K   :K   4E  & H#G#G"C#D#GF' rC   