
""" EVA

EVA from https://github.com/baaivision/EVA , paper: https://arxiv.org/abs/2211.07636

@article{EVA,
  title={EVA: Exploring the Limits of Masked Visual Representation Learning at Scale},
  author={Fang, Yuxin and Wang, Wen and Xie, Binhui and Sun, Quan and Wu, Ledell and Wang, Xinggang and Huang,
  Tiejun and Wang, Xinlong and Cao, Yue},
  journal={arXiv preprint arXiv:2211.07636},
  year={2022}
}

EVA-02: A Visual Representation for Neon Genesis - https://arxiv.org/abs/2303.11331
@article{EVA02,
  title={EVA-02: A Visual Representation for Neon Genesis},
  author={Fang, Yuxin and Sun, Quan and Wang, Xinggang and Huang, Tiejun and Wang, Xinlong and Cao, Yue},
  journal={arXiv preprint arXiv:2303.11331},
  year={2023}
}

This file contains EVA & EVA02 model implementations evolved from BEiT, additional models in vision_transformer.py.
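
To see which EVA / EVA02 / PE variants this module registers (an illustrative snippet,
assuming the standard timm registry API):

    import timm

    print(timm.list_models('eva*'))      # EVA & EVA02 variants registered below
    print(timm.list_models('vit_pe_*'))  # Perception Encoder variants registered below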

Modifications by / Copyright 2023 Ross Wightman, original copyrights below
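
Example usage (a minimal sketch via the timm factory API; any variant name registered in
`default_cfgs` below can be substituted):

    import timm
    import torch

    model = timm.create_model('eva02_base_patch14_448.mim_in22k_ft_in22k_in1k', pretrained=False)
    model.eval()
    x = torch.randn(1, 3, 448, 448)
    logits = model(x)                   # classification logits
    tokens = model.forward_features(x)  # unpooled token features (B, N, C)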
    N)partial)AnyCallableDictListOptionalSetTupleUnion)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STDOPENAI_CLIP_MEANOPENAI_CLIP_STD)
PatchEmbedMlpGluMlpSwiGLU	LayerNormDropPathPatchDropoutRotaryEmbeddingCatapply_rot_embed_catapply_keep_indices_nlctrunc_normal_resample_patch_embedresample_abs_pos_embedglobal_pool_nlc	to_2tupleuse_fused_attnAttentionRopeAttentionPoolLatent   )build_model_with_cfg)feature_take_indices)
checkpoint)generate_default_cfgsregister_modelEvac                        e Zd ZU dZej
                  j                  e   ed<   	 	 	 	 	 	 	 	 	 	 	 dde	de	dededede	d	e
d
e
dee	   dee   dedef fdZ	 	 ddeej                     deej                     fdZ xZS )EvaAttentionzG EVA Attention with ROPE, no k-bias, and fused/unfused qkv options
    
fused_attndim	num_headsqkv_bias	qkv_fusedqkv_bias_separatenum_prefix_tokens	attn_drop	proj_dropattn_head_dim
norm_layerqk_norm
scale_normc                    t         |           |s|r	|
J d       || _        ||z  }|	|	}|| j                  z  }|dz  | _        || _        t               | _        || _        |rt        j                  ||dz  d      | _
        dx| _        x| _        | _        |rt        j                  t        j                   |            | _        | j%                  dt        j                   |      d       t        j                  t        j                   |            | _        ndx| _        x| _        | _        nst        j                  |||      | _        t        j                  ||d      | _        t        j                  |||      | _        d| _
        dx| _        x| _        | _        |r |
| j*                        nt        j,                         | _        |r |
| j*                        nt        j,                         | _        t        j2                  |      | _        |r |
|      nt        j,                         | _        t        j                  ||      | _        t        j2                  |      | _        y)	a,  
        Args:
            dim: Input dimension of the token embeddings
            num_heads: Number of attention heads
            qkv_bias: Whether to add a bias term to the query, key, and value projections
            qkv_fused: Whether qkv projections are fused into one projection or separate
            qkv_bias_separate: Whether to apply bias to qkv as a separate addition or part of F.linear() call
            num_prefix_tokens: Number of reg/cls tokens at the beginning of the sequence that
                should not have position embeddings applied
            attn_drop: Dropout rate for attention weights
            proj_drop: Dropout rate for the output projection
            attn_head_dim: Dimension of each attention head (if None, computed as dim // num_heads)
            norm_layer: Normalization layer constructor to use for QK and scale normalization
            qk_norm: Enable normalization of query (Q) and key (K) vectors with norm_layer
            scale_norm: Enable normalization (scaling) of attention output with norm_layer
        Nz<norm_layer must be provided if qk_norm or scale_norm is Trueg         F)biask_bias)
persistent)super__init__r-   scaler1   r   r+   r0   nnLinearqkvq_projk_projv_proj	Parametertorchzerosq_biasregister_bufferv_biasr;   head_dimIdentityq_normk_normDropoutr2   normprojr3   )selfr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   rL   attn_dim	__class__s                  K/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/models/eva.pyr>   zEvaAttention.__init__5   s   > 	)i+ii)")#$$Hdnn,%
!2(*!2yyhl?DH6::DK:$+ ll5;;x+@A$$Xu{{8/DQV$W ll5;;x+@A:>>>dkDK))CADK))C>DK))CADKDH6::DK:$+3:j/3:j/I.,6Jx(BKKM	IIh,	I.    rope	attn_maskc                    |j                   \  }}}| j                  | j                  | j                  |      }nt        j                  | j                  | j
                  | j                  f      }| j                  r| j                  |      }||z  }n,t        j                  || j                  j                  |      }|j                  ||d| j                  d      j                  ddddd      }|j                  d      \  }	}
}n| j                  |      j                  ||| j                  d      j!                  dd      }	| j#                  |      j                  ||| j                  d      j!                  dd      }
| j%                  |      j                  ||| j                  d      j!                  dd      }| j'                  |	      | j)                  |
      }
}	|| j*                  }t        j                  |	ddddd|ddf   t-        |	dddd|dddf   |      gd	      j/                  |      }	t        j                  |
ddddd|ddf   t-        |
dddd|dddf   |      gd	      j/                  |      }
| j0                  r=t        j2                  |	|
||| j4                  r| j6                  j8                  nd
      }n|	| j:                  z  }	|	|
j!                  dd      z  }|F|j=                  t        j>                        }|jA                  |ddddddf    tC        d            }|jE                  d	      }| j7                  |      }||z  }|j!                  dd      j                  |||      }| jG                  |      }| jI                  |      }| jK                  |      }|S )a  Forward pass for the attention module.

        Args:
            x: Input tensor of shape (batch_size, sequence_length, embedding_dim)
            rope: Rotary position embeddings tensor for position-aware attention
            attn_mask: Optional attention mask to apply during attention computation

        Returns:
            Tensor of shape (batch_size, sequence_length, embedding_dim)
        N)weightr:   r9      r   r"      r,           )rY   	dropout_pz-inf)&shaperB   rI   rG   catr;   rK   r0   Flinearr[   reshaper-   permuteunbindrC   	transposerD   rE   rN   rO   r1   r   type_asr+   scaled_dot_product_attentiontrainingr2   pr?   toboolmasked_fillfloatsoftmaxrQ   rR   r3   )rS   xrX   rY   BNCrB   r.   qkvnptattns                 rV   forwardzEvaAttention.forwardw   s'     ''1a88{{"hhqk 99dkk4;;%LM))((1+C8OC((1TXX__8LC++aAt~~r:BB1aAqQCjjmGAq!A&&q!T^^R@JJ1aPAA&&q!T^^R@JJ1aPAA&&q!T^^R@JJ1aPA{{1~t{{1~1((C		1Q4C4]+-@1aq=AQSW-XY_`aiijklA		1Q4C4]+-@1aq=AQSW-XY_`aiijklA??..1a#.2mm$..**A DJJAB++D$%LL4	''1dD!3C)D(DeFmT<<B<'D>>$'DqAKK1%%aA.IIaLIIaLNN1rW   )   TTFr"   r`   r`   NNFTNN)__name__
__module____qualname____doc__rG   jitFinalrp   __annotations__intrr   r   r   r>   Tensorr}   __classcell__rU   s   @rV   r*   r*   0   s   		%%
 !"&+%&!!+/-1!#@/@/ @/ 	@/
 @/  $@/  #@/ @/ @/ $C=@/ !*@/ @/ @/J ,004	@ 5<<(@  -	@rW   r*   c            $            e Zd Zddddddddddddej                  edfded	ed
ededededededede	dededede
e   dedede
e   f" fdZdde
ej                     de
ej                     fdZ xZS )EvaBlockT      @Fr"   evar`   Nr,   r-   r.   r/   	mlp_ratio
swiglu_mlp	scale_mlpscale_attn_innerr1   	attn_typer3   r2   	drop_pathinit_values	act_layerr5   r4   c                    t         |            ||      | _        |
dk(  rt        nt        } ||||||	|||||
      | _        |+t        j                  |t        j                  |      z        nd| _
        |dkD  rt        |      nt        j                         | _         ||      | _        t        ||z        }|rG|rt!        |||r|nd|      | _        nEt%        ||dz  |r|ndt        j&                  d|      | _        nt)        ||||r|nd|	      | _        |+t        j                  |t        j                  |      z        nd| _        |dkD  rt        |      | _        yt        j                         | _        y)
a   Initialize the EVA transformer block.

        Args:
          dim: Input dimension of the token embeddings
            num_heads: Number of attention heads
            qkv_bias: Whether to use bias terms in query, key, value projections
            qkv_fused: Whether to use a single projection for query, key, value
            mlp_ratio: Ratio of MLP hidden dimension to input dimension
            swiglu_mlp: Whether to use SwiGLU activation in the MLP
            scale_mlp: Whether to use normalization in the MLP
            scale_attn_inner: Whether to use normalization within the attention mechanism
            num_prefix_tokens: Number of tokens at the beginning of the sequence (class tokens, etc.)
            attn_type: Type of attention module to use ('eva' or 'rope')
            proj_drop: Dropout rate for projection layers
            attn_drop: Dropout rate for attention matrix
            drop_path: Stochastic depth rate
            init_values: Initial value for LayerScale, None = no LayerScale
            act_layer: Activation layer constructor
            norm_layer: Normalization layer constructor
            attn_head_dim: Dimension of each attention head (if None, computed as dim // num_heads)
        rX   	r-   r.   r/   r1   r2   r3   r4   r5   r7   Nr`   in_featureshidden_featuresr5   dropr]   Fr   r   r5   r   	gate_lastr   r   r   r   r5   r   )r=   r>   norm1r    r*   r|   r@   rF   rG   onesgamma_1r   rM   
drop_path1norm2r   r   mlpr   SiLUr   gamma_2
drop_path2)rS   r,   r-   r.   r/   r   r   r   r   r1   r   r3   r2   r   r   r   r5   r4   attn_clsr   rU   s                       rV   r>   zEvaBlock.__init__   sg   R 	_
$-$7=\/'!'
	 GRF]r||K%**S/$ABcg1:R(9-R[[]_
cIo.! #$3-6zD"	 " #$3a$7-6zD gg#"  /#)2:DH GRF]r||K%**S/$ABcg1:R(9-R[[]rW   rX   rY   c           	         | j                   i|| j                  | j                  | j                  |      ||            z   }|| j	                  | j                  | j                  |                  z   }|S || j                  | j                   | j                  | j                  |      ||      z        z   }|| j	                  | j                  | j                  | j                  |            z        z   }|S N)rX   rY   )r   r   r|   r   r   r   r   r   rS   rt   rX   rY   s       rV   r}   zEvaBlock.forward  s    <<DOODIIdjjm$R[I$\]]ADOODHHTZZ]$;<<A  DOODLL499TZZ]QUaj93k$kllADOODLL488DJJqM3J$JKKArW   r   )r   r   r   r@   GELUr   r   rp   rr   strr   r   r>   rG   r   r}   r   r   s   @rV   r   r      s8    ""!$#%*%&"!!!+/"$''#,+/%YSYS YS 	YS
 YS YS YS YS #YS  #YS YS YS YS YS "%YS   !YS" !#YS$ $C=%YSvx5 RWR^R^I_ rW   r   c            $           e Zd ZdZddddddddddddej
                  ej                  dfd	ed
ededede	de
dedededede	de	de	dee	   dededee   f" fdZddeej                     deej                     fdZ xZS )EvaBlockPostNormzF EVA block w/ post-norm and support for swiglu, MLP norm scale, ROPE. Tr   r   Fr"   r`   Nr,   r-   r.   r/   r   r   r   r   r   r1   r3   r2   r   r   r   r5   r4   c                 6   t         |           |dk(  rt        nt        } ||||||
|||||	
      | _         ||      | _        |dkD  rt        |      nt        j                         | _	        t        ||z        }|rG|rt        |||r|nd|      | _        nEt        ||dz  |r|ndt        j                  d|      | _        nt        ||||r|nd|	      | _         ||      | _        |dkD  rt        |      | _        yt        j                         | _        y)
a   Initialize the post-norm EVA transformer block.

        Args:
          dim: Input dimension of the token embeddings
            num_heads: Number of attention heads
            qkv_bias: Whether to use bias terms in query, key, value projections
            qkv_fused: Whether to use a single projection for query, key, value
            mlp_ratio: Ratio of MLP hidden dimension to input dimension
            swiglu_mlp: Whether to use SwiGLU activation in the MLP
            scale_mlp: Whether to use normalization in the MLP
            scale_attn_inner: Whether to use normalization within the attention mechanism
            num_prefix_tokens: Number of tokens at the beginning of the sequence (class tokens, etc.)
            attn_type: Type of attention module to use ('eva' or 'rope')
            proj_drop: Dropout rate for projection layers
            attn_drop: Dropout rate for attention matrix
            drop_path: Stochastic depth rate
            init_values: Initial value for LayerScale, None = no LayerScale (NOTE: ignored for post-norm block)
            act_layer: Activation layer constructor
            norm_layer: Normalization layer constructor
            attn_head_dim: Dimension of each attention head (if None, computed as dim // num_heads)
        rX   r   r`   Nr   r]   Fr   r   )r=   r>   r    r*   r|   r   r   r@   rM   r   r   r   r   r   r   r   r   r   )rS   r,   r-   r.   r/   r   r   r   r   r   r1   r3   r2   r   r   r   r5   r4   r   r   rU   s                       rV   r>   zEvaBlockPostNorm.__init__#  s   R 	$-$7=\/'!'
	  _
1:R(9-R[[]cIo.! #$3-6zD"	 " #$3a$7-6zD gg#"  /#)2:DH  _
1:R(9-R[[]rW   rX   rY   c           
          || j                  | j                  | j                  |||                  z   }|| j                  | j	                  | j                  |                  z   }|S r   )r   r   r|   r   r   r   r   s       rV   r}   zEvaBlockPostNorm.forward|  sV    

499QTY9+W XYY

488A; 788rW   r   )r   r   r   r   r@   r   r   r   rp   rr   r   r   r   r>   rG   r   r}   r   r   s   @rV   r   r   !  s?   P
 ""!"$#%*%&!!!+/"$''#%<<+/%WSWS WS 	WS
 WS WS WS WS WS #WS  #WS WS WS WS "%WS   !WS" !#WS$ $C=%WSrx5 RWR^R^I_ rW   r   c            Q           e Zd ZdZddddddddd	d	d
ddddddddddedd	dd	dddddddddddddf'deeeeef   f   deeeeef   f   dedededededede	de	de
de	de	de	d ed!e
d"e
d#e
d$e
d%e
d&e
d'ed(ee
   d)e	d*ed+e	d,e	d-e
d.ed/e	d0e	d1ee	   d2ee	   d3ee   d4ee
   d5e	d6e	d7eeeeef   ef      d8e
fN fd9ZdWd;Zd<ej                   d:dfd=Zej&                  j(                  d:ee   fd>       Zej&                  j(                  dXd?e	d:dfd@       Zej&                  j(                  dYdAe	d:eeef   fdB       Zej&                  j(                  d:ej                   fdC       ZdZdedee   d:dfdDZd:eej:                  eej:                     f   fdEZ	 	 	 	 	 	 d[dFej:                  dGeeeee   f      dHe	dIe	dJe	dKedLe	d:eeej:                     eej:                  eej:                     f   f   fdMZ 	 	 	 d\dGeeee   f   dNe	dOe	fdPZ!dZdFej:                  dQee   d:ej:                  fdRZ"dFej:                  d:ej:                  fdSZ#dYdFej:                  dTe	d:ej:                  fdUZ$dFej:                  d:ej:                  fdVZ% xZ&S )]r(   a!   Eva Vision Transformer w/ Abs & Rotary Pos Embed

    This class implements the EVA and EVA02 models that were based on the BEiT ViT variant
      * EVA - abs pos embed, global avg pool
      * EVA02 - abs + rope pos embed, global avg pool, SwiGLU, scale Norm in MLP (ala normformer)
          r9     avg      Tr   Fr   r`   Nr   ijgMbP?img_size
patch_sizein_chansnum_classesglobal_pool	embed_dimdepthr-   r.   r/   r   r   r   r   r   	drop_ratepos_drop_ratepatch_drop_rateproj_drop_rateattn_drop_ratedrop_path_rater5   r   class_tokennum_reg_tokensuse_abs_pos_embuse_rot_pos_embrope_grid_offsetrope_grid_indexinguse_post_normuse_pre_transformer_normuse_post_transformer_normuse_fc_normattn_pool_num_headsattn_pool_mlp_ratiodynamic_img_sizedynamic_img_padref_feat_shapehead_init_scalec(                 t
   t         2|           |dv sJ || _        || _        |x| _        x| _        | _        |rdnd|z   | _        |$| _        d| _	        |}(|!|!})n|dk(  })| | }*n|) }*i }+|$r|+j                  t        dd             t        d|||||%| d	|+| _        | j                  j                  },t        | j                  d
      r| j                  j!                         n|}-|r*t#        j$                  t'        j(                  dd|            nd| _        |r*t#        j$                  t'        j(                  d||            nd| _        |xr | j,                  du | _        |r7t#        j$                  t'        j(                  d|,| j                  z   |            nd| _        t#        j2                  |      | _        |dkD  rt7        || j                  d      | _        nd| _        |rA|&t;        |&      nd}&t=        ||z  d|$rdn| j                  j>                  |&||      | _         nd| _         |(r ||      nt#        jB                         | _"        t'        jF                  d||      D .cg c]  }.|.jI                          }/}.|rtJ        ntL        }0t#        jN                  tQ        |      D 1cg c]&  }1 |0|||	|
|||||| j                  |||/|1   ||      ( c}1      | _)        tQ        |      D 1cg c]  }1t        d|1 ||-       c}1| _*        |*r ||      nt#        jB                         | _+        |dk(  r:|"xs |}"|#xs |}#tY        | j                  |"|#|t"        jZ                        | _.        nd| _.        |)r ||      nt#        jB                         | _/        t#        j2                  |      | _0        |dkD  rt#        jb                  ||      nt#        jB                         | _2        | jg                  | jh                         | j0                  tk        | j0                  d       | j*                  tk        | j*                  d       | j,                  tk        | j,                  d       | jm                          to        | jd                  t"        jb                        rtk        | jd                  jp                  d       | jd                  jp                  jr                  ju                  |'       | jd                  jv                  jr                  ju                  |'       yyc c}.w c c}1w c c}1w )a  Initialize the EVA Vision Transformer model.

        Args:
            img_size: Input image size (single int for square, or tuple for rectangular)
            patch_size: Patch size to divide image into tokens (single int for square, or tuple)
            in_chans: Number of input image channels
            num_classes: Number of classes (output dim) for classification head (final projection), 0 for pass-through
            global_pool: Type of global pooling for final sequence ('avg', 'token', 'map', etc.)
            embed_dim: Embedding dimension for tokens
            depth: Number of transformer blocks
            num_heads: Number of attention heads
            qkv_bias: Enable bias for query, key, value projections
            qkv_fused: Use a single projection for query, key, value
            mlp_ratio: Ratio of mlp hidden dim to embedding dim
            swiglu_mlp: Use SwiGLU activation in MLP
            scale_mlp: Apply scaling normalization in MLP (normformer style)
            scale_attn_inner: Apply scaling normalization inside attention
            attn_type: Type of attention module to use
            drop_rate: Dropout rate after final projection and pooling
            pos_drop_rate: Dropout rate for positional embeddings
            patch_drop_rate: Rate of dropping patches during training
            proj_drop_rate: Dropout rate for projections
            attn_drop_rate: Dropout rate for attention
            drop_path_rate: Stochastic depth rate
            norm_layer: Normalization layer constructor
            init_values: Initial layer-scale values
            class_token: Use class token
            num_reg_tokens: Number of additional learnable 'register' tokens to add to the sequence
            use_abs_pos_emb: Use absolute (learned) positional embeddings
            use_rot_pos_emb: Use rotary position embeddings
            rope_grid_offset: Offset for rotary position embedding grid
            rope_grid_indexing: Indexing mode for rotary position embeddings ('ij' or 'xy')
            use_post_norm: Use post-norm transformer block type
            use_pre_transformer_norm: Use normalization layer before transformer blocks
            use_post_transformer_norm: Use normalization layer after transformer blocks
            use_fc_norm: Use normalization layer after pooling, before final classifier
            attn_pool_num_heads: Number of heads in attention pooling
            attn_pool_mlp_ratio: MLP ratio in attention pooling
            dynamic_img_size: Support dynamic image sizes in forward pass
            dynamic_img_pad: Apply dynamic padding for irregular image sizes
            ref_feat_shape: Reference feature shape for rotary position embedding scale
            head_init_scale: Initialization scale for classification head weights
        ) r   avgmaxmaxtokenmapr"   r   FNr   NHWC)strict_img_size
output_fmt)r   r   r   r   r   r:   
feat_ratio)rn   T)r1   return_indices)	in_pixels
feat_shaper   grid_offsetgrid_indexing)r,   r-   r.   r/   r   r   r   r   r   r1   r3   r2   r   r5   r   blocks.)modulenum_chs	reductionr   )r-   r   r5   r   {Gz?std )<r=   r>   r   r   num_featureshead_hidden_sizer   r1   r   grad_checkpointingupdatedictr   patch_embednum_patcheshasattrr   r@   rF   rG   rH   	cls_token	reg_token	cls_embed	pos_embedrP   pos_dropr   
patch_dropr   r   	grid_sizerX   rM   norm_prelinspaceitemr   r   
ModuleListrangeblocksfeature_inforQ   r!   r   	attn_poolfc_norm	head_droprA   headapply_init_weightsr   fix_init_weight
isinstancer[   datamul_r:   )3rS   r   r   r   r   r   r   r   r-   r.   r/   r   r   r   r   r   r   r   r   r   r   r   r5   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   activate_pre_normactivate_fc_normactivate_post_norm
embed_argsr   rrt   dprblock_fnirU   s3                                                     rV   r>   zEva.__init__  s   j 	JJJJ&&ENNND1DN'2!^!K 0"' 5"**e3$0!:%5!5
d5VLM% 
!+--
 
 &&22-4T5E5E|-TD'')ZdGRekk!Q	&BCX\Tbekk!^Y&OPhl$?4)? Q` KK;)?)??KMei 	

]3Q*"&"8"8#DO #DO:H:TY~6Z^N*Y&#349I9I9S9S-,0DI DI1B
9-!&>5!IJAqvvxJJ'4#(mm$ 5\%%#$ # #!##%#!1#"&"8"8((a&%'%# $( QVV[P\^KLD'!yAF^ /AZ	*bkkm	%"5"B"5"B0--%''DN "DN0@z),bkkmI.9DqBIIi5bkkm	

4%%&>>%$..c2>>%$..c2>>%$..c2dii+$))**4II!!&&7IINN$$_5 ,g K%#&^s   1T+4+T08T5returnc                    d }t        | j                        D ]m  \  }} ||j                  j                  j                  j
                  |dz           ||j                  j                  j                  j
                  |dz          o y)z=Fix initialization weights by rescaling based on layer depth.c                 R    | j                  t        j                  d|z               y )Ng       @)div_mathsqrt)paramlayer_ids     rV   rescalez$Eva.fix_init_weight.<locals>.rescale\  s    JJtyyx01rW   r"   N)	enumerater  r|   rR   r[   r  r   fc2)rS   r  r  layers       rV   r  zEva.fix_init_weightZ  si    	2  )5 	=OHeEJJOO**//A>EIIMM((--x!|<	=rW   mc                     t        |t        j                        rNt        |j                  d       |j
                  *t        j                  j                  |j
                         yyy)zbInitialize weights for Linear layers.

        Args:
            m: Module to initialize.
        r   r   N)r  r@   rA   r   r[   r:   initzeros_)rS   r#  s     rV   r
  zEva._init_weightsc  sH     a#!((,vv!qvv& " $rW   c                     ddh}|S )z(Parameters to exclude from weight decay.r   r   r   )rS   nwds     rV   no_weight_decayzEva.no_weight_decayn  s     K(
rW   enablec                     || _         y)z)Enable or disable gradient checkpointing.N)r   )rS   r*  s     rV   set_grad_checkpointingzEva.set_grad_checkpointingt  s     #)rW   coarsec                 $    t        dddg      }|S )z(Create layer groupings for optimization.z ^cls_token|pos_embed|patch_embed)z^blocks\.(\d+)N)z^norm)i )stemr  )r   )rS   r-  matchers      rV   group_matcherzEva.group_matchery  s!     4-/CD
 rW   c                     | j                   S N)r  )rS   s    rV   get_classifierzEva.get_classifier  s    yyrW   c                     || _         ||| _        |dkD  r&t        j                  | j                  |      | _        yt        j
                         | _        y)zReset the classifier head.

        Args:
            num_classes: Number of output classes.
            global_pool: Global pooling type.
        Nr   )r   r   r@   rA   r   rM   r  )rS   r   r   s      rV   reset_classifierzEva.reset_classifier  sF     '"*D>IAoBIIdnnk:	SUS^S^S`	rW   c                 :   | j                   r|j                  \  }}}}| j                  <| j                  j                  }t        | j                  ||f|| j                        }nd }|j                  |d|      }| j                  | j                  j                  ||f      nd }n4| j                  }| j                  | j                  j                         nd }| j                  At        j                  | j                  j                  |j                  d   dd      |fd      }|||z   }| j                  g }	| j                  9|	j                  | j                  j                  |j                  d   dd             |	j                  | j                  j                  |j                  d   dd             t        j                  |	|gz   d      }| j!                  |      }| j"                  %| j#                  |      \  }}
||
t%        |||
      }||fS )N)new_sizeold_sizer1   r\   )rc   r   r"   r_   )r   rc   r   r   r   r   r1   viewrX   	get_embedr   rG   rd   expandr   appendr   r   r   )rS   rt   ru   HWrw   prev_grid_sizer   rot_pos_embedto_catkeep_indicess              rV   
_pos_embedzEva._pos_embed  s     JAq!Q~~)!%!1!1!;!;2NNV+&*&<&<		 !	q"a AAEAVDII//q!f/=\`MI59YY5JDII//1PTM>>%		4>>00RDaHaPA IA>>%F~~)dnn33AGGAJBGHMM$..//
BCD		&A3,A.AMM! ??&"ooa0OA|(\-E 6q- V-rW   rt   indicesreturn_prefix_tokensrQ   
stop_earlyr   intermediates_onlyc           	         |dv sJ d       |dk(  }g }	t        t        | j                        |      \  }
}|j                  \  }}}}| j	                  |      }| j                  |      \  }}| j                  |      }t        j                  j                         s|s| j                  }n| j                  d|dz    }t        |      D ]q  \  }}| j                  r-t        j                  j                         st        |||      }n
 |||      }||
v sN|	j                  |r| j                  |      n|       s | j                  rD|	D cg c]  }|ddd| j                  f    }}|	D cg c]  }|dd| j                  df    }	}|ra| j                  j!                  ||f      \  }}|	D cg c]6  }|j#                  |||d      j%                  dd	dd
      j'                         8 }	}t        j                  j                         s|rt)        t+        |	            }	|r|	S | j                  |      }||	fS c c}w c c}w c c}w )a)   Forward features that returns intermediates.
        Args:
            x: Input image tensor
            indices: Take last n blocks if an int, if is a sequence, select by matching indices
            return_prefix_tokens: Return both prefix and spatial intermediate tokens
            norm: Apply norm layer to all intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        )NCHWNLCz>Output format for EVA-ViT features must be one of NCHW or NLC.rJ  Nr"   rX   r   r\   r9   r]   )r$   lenr  rc   r   rD  r   rG   r   is_scriptingr   r   r%   r=  rQ   r1   dynamic_feat_sizerg   rh   
contiguouslistzip)rS   rt   rE  rF  rQ   rG  r   rH  rg   intermediatestake_indices	max_indexru   _heightwidthrA  r  r  blkyprefix_tokensr>  r?  s                           rV   forward_intermediateszEva.forward_intermediates  s.   ( _,n.nn,&"6s4;;7G"Qi  gg1feQ??1-=MM!99!!#:[[F[[)a-0F' 	BFAs&&uyy/E/E/GsAM:.L $$TTYYq\qA	B !!ERSQq!D$:$:"::;SMSDQRqQq$"8"8"99:RMR##55vuoFDAq^klYZQYYq!Q3;;Aq!QGRRTlMlyy%%',@ ]M!BCM  IIaL- TR ms   
H;,I 0;I
prune_norm
prune_headc                    t        t        | j                        |      \  }}| j                  d|dz    | _        |rt        j                         | _        |r2d| _        t        j                         | _        | j                  dd       |S )z@ Prune layers not required for specified intermediates.
        Nr"   r   r   )	r$   rM  r  r@   rM   rQ   r  r  r6  )rS   rE  r]  r^  rT  rU  s         rV   prune_intermediate_layerszEva.prune_intermediate_layers  sq     #7s4;;7G"Qikk.9q=1DI!DN;;=DL!!!R(rW   	pool_typec                     | j                   | j                  |      }|S || j                  n|}t        ||| j                        }|S )N)ra  r1   )r  r   r   r1   )rS   rt   ra  s      rV   poolzEva.pool  sJ    >>%q!AH(1(9D$$y	AdF\F\]rW   c                 <   | j                  |      }| j                  |      \  }}| j                  |      }| j                  D ]E  }| j                  r-t
        j                  j                         st        |||      }< |||      }G | j                  |      }|S )zForward pass through feature extraction layers.

        Args:
            x: Input tensor.

        Returns:
            Feature tensor.
        rL  )
r   rD  r   r  r   rG   r   rN  r%   rQ   )rS   rt   rA  rY  s       rV   forward_featureszEva.forward_features  s     Q??1-=MM!;; 	/C&&uyy/E/E/GsAM:.		/
 IIaLrW   
pre_logitsc                     | j                  |      }| j                  |      }| j                  |      }|r|S | j                  |      S )zForward pass through classifier head.

        Args:
            x: Feature tensor.
            pre_logits: Return pre-logits if True.

        Returns:
            Output tensor.
        )rc  r  r  r  )rS   rt   rf  s      rV   forward_headzEva.forward_head'  sA     IIaLLLONN1q0DIIaL0rW   c                 J    | j                  |      }| j                  |      }|S )zoForward pass.

        Args:
            x: Input tensor.

        Returns:
            Output tensor.
        )re  rh  )rS   rt   s     rV   r}   zEva.forward6  s)     !!!$a rW   )r  N)TFr3  )NFFFrJ  F)r"   FT)'r   r   r   r   r   r   r   r
   r   rp   rr   r   r   r>   r  r@   Moduler
  rG   r   ignorer	   r)  r,  r   r   r1  r4  r6  r   rD  r   r\  r`  rc  re  rh  r}   r   r   s   @rV   r(   r(     s    5868#$ !"!$#%*"!#%%'$&$&$&#,+/ $"#$($)&(&*"'-28<*.1537%*$)DH%*QN6CsCx01N6 c5c?23N6 	N6
 N6 N6 N6 N6 N6 N6 N6 N6 N6 N6 #N6  !N6" #N6$ !%N6& #'N6( ")N6* "+N6, "-N6. !/N60 "%1N62 3N64  5N66 "7N68 "9N6: $;N6< !$=N6>  ?N6@ '+AN6B (0~CN6D "$EN6F "*#GN6H "*%IN6J #KN6L "MN6N %U5c?C+?%@AON6P #QN6`=	'ryy 	'T 	' YYS  
 YY)T )T ) ) YYD T#s(^   YY		  
aC 
ahsm 
aW[ 
a' uU\\8ELL3I%IJ ' X 8<).$$',< ||<  eCcN34<  #'	< 
 <  <  <  !%<  
tELL!5tELL7I)I#JJ	K< @ ./$#	3S	>*  	$ell x}  %,, 5<< (1ell 1 1 1 %,, rW   
state_dictmodelprefixr  c                    | j                  d|       } | j                         D ci c]  \  }}|j                  dd      | } }}i }g d}t        |      }| j                         D ]n  \  }}|r|j	                  |      s||d }|D ]  }|j                  |d   |d         } |j	                  d      r|j                  d	d      }|j                  d
d      }|j                  dd      }|j	                  d      r|j
                  d   dz  }	|j                  d      r|d|	 |d<   ||	d |d<   n!|j                  d      r|d|	 |d<   ||	d |d<   |dk(  r:d}|j                  dd      }t        j                  |j
                  d         |d<   n>|dk(  r#d}|j                  d      j                  d      }n|dk(  r|j                  d      }|||<   q |S c c}}w )zConvert Perception Encoder weights.

    Args:
        state_dict: State dictionary to convert.
        model: Target model instance.
        prefix: Prefix to strip from keys.

    Returns:
        Converted state dictionary.
    rn  zmodule.r   ))conv1patch_embed.proj)positional_embeddingr   )ztransformer.resblocks.r   )ln_prer   )ln_postrQ   )ln_rQ   )z
ls_1.gammar   )z
ls_2.gammar   )in_proj_zqkv.)out_projrR   )zmlp.c_fcmlp.fc1)z
mlp.c_projmlp.fc2Nr   r"   r  zattn_pool.attnzattn_pool.layernormzattn_pool.normzattn_pool.probezattn_pool.latentzattn_pool.qkvr9   r[   zattn_pool.q.weightzattn_pool.kv.weightr:   zattn_pool.q.biaszattn_pool.kv.biasrR   zhead.weightz	head.biasclass_embeddingr   r   )getitemsreplacerM  
startswithrc   endswithrj   rG   rH   	unsqueeze)
rm  rn  ro  ry   rz   out_dictswaps
len_prefixspr,   s
             rV   _convert_per  D  s    4J:D:J:J:LM$!Q!))Ir*A-MJMHE VJ  " 1<<'*+A 	(B		"Q%A'A	( <<$		*K8A		/1ABA		+-?@A||O,ggajAo::h'56tWH1267gH23ZZ'34Tc7H/045cdGH01&[AAq!A$)KK
$;H[!##AA((+A+AA?B Og Ns   Ginterpolation	antialiasc           	         i }| j                  d|       } | j                  d|       } | j                  d|       } | j                  d|       } d| v rt        | |      S d| v rt        | |d      S d	| v rd
}n	d| v rd}nd}|dz   | v }|dz   | v }t        |      }| j                         D ]  \  }	}
|r|	j	                  |      s|	|d }	d|	v r%d|	v rf|j
                  j                  j                  j                  \  }}}}|
j                  d   |k7  s|
j                  d   |k7  rt        |
||f||d      }
no|	dk(  rj|
j                  d   |j                  j                  d   k7  rAt        |dd      rdnt        |dd      }t        |
|j
                  j                  |||d      }
|	j                  dd      }	|	j                  dd       }	|	j                  d!d"      }	|	j                  d#d$      }	|	j                  d%d&      }	|	j                  d'd(      }	|r$|	j                  d)d*      }	|	j                  d+d,      }	|r#|	d-v r|	d.k(  s|	d/k(  r|	j                  d0d1      }	n|
||	<    |S )2aZ  Convert patch embedding weight from manual patchify + linear proj to conv.

    Args:
        state_dict: Checkpoint state dictionary.
        model: Target model instance.
        interpolation: Interpolation method for resizing.
        antialias: Whether to use antialiasing when resizing.

    Returns:
        Filtered state dictionary.
    	model_emarn  r   rm  zvisual.conv1.weightzconv1.weightr   )ro  zvisual.trunk.pos_embedzvisual.trunk.zvisual.pos_embedvisual.
mask_tokenzblocks.0.attn.q_proj.weightNrX   zpatch_embed.proj.weightr\   rb   T)r  r  verboser   r"   no_embed_classFr   r1   )r8  r1   r  r  r  z
mlp.ffn_lnzmlp.normzattn.inner_attn_lnz	attn.normzmlp.w12ry  zmlp.w1z	mlp.fc1_gzmlp.w2z	mlp.fc1_xzmlp.w3rz  rI   zq_proj.biasrK   zv_proj.bias)r  zlm_head.weightzlm_head.biasnorm.weight	norm.biasr  r  rQ   r  )r|  r  rM  r}  r  r   rR   r[   rc   r   r   getattrr   r   r~  )rm  rn  r  r  r  ro  mim_weightsno_qkvr  ry   rz   rV  r>  r?  r1   s                  rV   checkpoint_filter_fnr    s   " HZ8J4J*5Jj9J 
*:u--	:	%:uR88  :- 	z	)<':5K33zAFVJ  " 21<<'*+AQ;$)**//66<<JAq!Qwwr{a1772;!#3(F"/'  +!''!*0E0Ea0H"H%,U4De%LRYZ_atvwRx&**44"3+#A IIlJ/II*K8IIi+IIh,IIh,IIh	*		(M2A		(M2A1 llM!Q+%5IIfi0 e2h OrW   variant
pretrainedc                 r    |j                  dd      }t        t        | |ft        t	        |d      d|}|S )zCreate an EVA model.

    Args:
        variant: Model variant name.
        pretrained: Load pretrained weights.
        **kwargs: Additional model arguments.

    Returns:
        Instantiated Eva model.
    out_indicesr9   getter)r  feature_cls)pretrained_filter_fnfeature_cfg)popr#   r(   r  r   )r  r  kwargsr  rn  s        rV   _create_evar    sH     **]A.K Wj1[hG 	E LrW   urlc                 4    | ddddddt         t        ddd	d
|S )zGenerate default configuration for EVA models.

    Args:
        url: Model weights URL.
        **kwargs: Additional configuration parameters.

    Returns:
        Model configuration dictionary.
    r   r9   r   r   Ng?bicubicTrr  r  mitr  r   
input_size	pool_sizecrop_pctr  fixed_input_sizemeanr   
first_conv
classifierlicense)r   r   r  r  s     rV   _cfgr    s6     =t ( # rW   c                 $    | dddddddddd	d
d|S )zGenerate default configuration for Perception Encoder models.

    Args:
        url: Model weights URL.
        **kwargs: Additional configuration parameters.

    Returns:
        Model configuration dictionary.
    r   r  N      ?r  T      ?r  r  rr  r  customr  r   r  s     rV   _pe_cfgr    s6     D)(  & rW   z"eva_giant_patch14_224.clip_ft_in1kztimm/)	hf_hub_idz"eva_giant_patch14_336.clip_ft_in1k)r9   P  r  r  squash)r  r  r  	crop_modez(eva_giant_patch14_336.m30m_ft_in22k_in1k)r  r  r   r  r  r  z(eva_giant_patch14_560.m30m_ft_in22k_in1k)r9   0  r  z.eva02_base_patch14_448.mim_in22k_ft_in22k_in1k)r9     r  z/eva02_large_patch14_448.mim_in22k_ft_in22k_in1kz.eva02_large_patch14_448.mim_m38m_ft_in22k_in1kz(eva02_tiny_patch14_336.mim_in22k_ft_in1k)r  r  r  z)eva02_small_patch14_336.mim_in22k_ft_in1kz(eva02_base_patch14_448.mim_in22k_ft_in1kz)eva02_large_patch14_448.mim_in22k_ft_in1kz(eva02_large_patch14_448.mim_m38m_ft_in1kz)eva02_base_patch14_448.mim_in22k_ft_in22kiQU  )r  r  r  r  r   z*eva02_large_patch14_448.mim_in22k_ft_in22kz)eva02_large_patch14_448.mim_m38m_ft_in22kz eva02_tiny_patch14_224.mim_in22k)r  r   z!eva02_small_patch14_224.mim_in22kz eva02_base_patch14_224.mim_in22kz!eva02_large_patch14_224.mim_in22kz eva02_large_patch14_224.mim_m38mz$eva_giant_patch14_clip_224.laion400m   z#eva_giant_patch14_clip_224.merged2bz$eva02_base_patch16_clip_224.merged2b   z%eva02_large_patch14_clip_224.merged2br   z%eva02_large_patch14_clip_336.merged2b)r  r  r  r   z'eva02_enormous_patch14_clip_224.laion2bz,eva02_enormous_patch14_clip_224.laion2b_plusz(eva02_enormous_patch14_clip_224.pretrain)r   z-vit_medium_patch16_rope_reg1_gap_256.sbb_in1k)r9      r  gffffff?r  )r  r  r  r  r   z.vit_mediumd_patch16_rope_reg1_gap_256.sbb_in1kz.vit_betwixt_patch16_rope_reg4_gap_256.sbb_in1kz+vit_base_patch16_rope_reg1_gap_256.sbb_in1kzvit_pe_core_base_patch16_224.fbr  )r  r  r   z vit_pe_core_large_patch14_336.fbi   )z#vit_pe_core_gigantic_patch14_448.fbz vit_pe_lang_large_patch14_448.fbz#vit_pe_lang_gigantic_patch14_448.fbz&vit_pe_spatial_gigantic_patch14_448.fbc           	      T    t        ddddd      }t        dd| it        |fi |}|S )	. EVA-g model https://arxiv.org/abs/2211.07636      (   r   tE]t@r   r   r   r-   r   r  )eva_giant_patch14_224r   r  r  r  
model_argsrn  s       rV   r  r    9     t2WbcJeJe$zJd]cJdeELrW   c           	      T    t        ddddd      }t        dd| it        |fi |}|S )	r  r  r  r  r   r  r  r  )eva_giant_patch14_336r  r  s       rV   r  r    r  rW   c           	      T    t        ddddd      }t        dd| it        |fi |}|S )	r  r  r  r  r   r  r  r  )eva_giant_patch14_560r  r  s       rV   r  r     r  rW   c                 \    t        ddddddddd		      }t        dd
| it        |fi |}|S )Nr   r     r   r9   UUUUUU@Tr   r   	r   r   r   r   r-   r   r   r   r   r  )eva02_tiny_patch14_224r  r  s       rV   r  r  (  L    
J fZf4PZKe^dKefELrW   c                 \    t        ddddddddd		      }t        dd
| it        |fi |}|S )Nr   r    r      r  Tr  r  r  )eva02_small_patch14_224r  r  s       rV   r  r  9  L    
J gjgDQ[Lf_eLfgELrW   c                 `    t        ddddddddddd	      }t        dd
| it        |fi |}|S )Nr   r  r   r   Fr  Tr  r   r   r   r   r-   r/   r   r   r   r   r   r  )eva02_base_patch14_224r  r  s       rV   r  r  J  R    J fZf4PZKe^dKefELrW   c                 `    t        ddddddddddd	
      }t        dd| it        |fi |}|S )Nr   r  r     r   r  FTr  r   r   r   r   r-   r   r/   r   r   r   r   r  )eva02_large_patch14_224r  r  s       rV   r  r  ]  R    J gjgDQ[Lf_eLfgELrW   c                 \    t        ddddddddd		      }t        dd
| it        |fi |}|S )Nr  r  r  r   r9   r  Tr  r  r  )eva02_tiny_patch14_336r  r  s       rV   r  r  p  r  rW   c                 \    t        ddddddddd		      }t        dd
| it        |fi |}|S )Nr  r  r  r   r  r  Tr  r  r  )eva02_small_patch14_336r  r  s       rV   r  r    r  rW   c                 `    t        ddddddddddd	      }t        dd
| it        |fi |}|S )Nr  r  r   r   Fr  Tr  r  r  )eva02_base_patch14_448r  r  s       rV   r  r    r  rW   c                 `    t        ddddddddddd	
      }t        dd| it        |fi |}|S )Nr  r  r  r  r   r  FTr  r  r  )eva02_large_patch14_448r  r  s       rV   r  r    r  rW   c                 v    t        ddddd|j                  dd            }t        d
d	| it        |fi |}|S )zB EVA-g CLIP model (only difference from non-CLIP is the pooling)  r  r  r  r   r  r   r   )r   r   r   r-   r   r   r  )eva_giant_patch14_clip_224r   r  r  r  s       rV   r  r    sL     R2JJ}g68J jjtT^OibhOijELrW   c                     t        dddddddddddd|j                  d	d
            }t        dd| it        |fi |}|S )zU A EVA-CLIP specific variant that adds additional attn scale layernorm to eva02_base r   r   r   r   Fr  Tr  r   r   )r   r   r   r   r-   r/   r   r   r   r   r   r   r   r  )eva02_base_patch16_clip_224r  r  s       rV   r  r    sf     JJ}g6J k*kPTU_PjciPjkELrW   c                     t        dddddddddddd	|j                  d
d            }t        dd| it        |fi |}|S )V A EVA-CLIP specific variant that adds additional attn scale layernorm to eva02_large r   r  r  r  r   r  FTr  r   r   r   r   r   r   r-   r   r/   r   r   r   r   r   r   r  )eva02_large_patch14_clip_224r  r  s       rV   r  r    f     JJ}g6J l:lQUV`QkdjQklELrW   c                     t        dddddddddddd	|j                  d
d            }t        dd| it        |fi |}|S )r  r  r  r  r  r   r  FTr  r   r   r  r  )eva02_large_patch14_clip_336r  r  s       rV   r  r    r  rW   c                 z    t        ddddddd|j                  dd	      
      }t        dd| it        |fi |}|S )zD A EVA-CLIP specific variant that uses residual post-norm in blocks r   r  i   @   r   gI$I$!@Tr   r   )r   r   r   r   r-   r   r   r   r  )eva02_enormous_patch14_clip_224r  r  s       rV   r  r    sW     JJ}g6	J ojoTXYcTngmTnoELrW   c                 d    t        dddddddddd	ddd
      }t        dd| it        |fi |}|S )Nr  r   r  r   r~   Th㈵>Fr"   r  r   r   r   r   r-   r/   r.   r   r   r   r   r   r   r  )$vit_medium_patch16_rope_reg1_gap_256r  r  s       rV   r  r    sY    J t:tY]^hYslrYstELrW   c                 d    t        dddddddddd	ddd
      }t        dd| it        |fi |}|S )Nr  r   r     r~   TFr  r"   r  r  r  )%vit_mediumd_patch16_rope_reg1_gap_256r  r  s       rV   r  r  *  sY    J uJuZ^_iZtmsZtuELrW   c                 d    t        dddddddddd	ddd
      }t        dd| it        |fi |}|S )Nr  r   i  r   
   Tr  Fr^   r  r  r  )%vit_betwixt_patch16_rope_reg4_gap_256r  r  s       rV   r  r  ?  sY    J uJuZ^_iZtmsZtuELrW   c                 d    t        ddddddddddddd	
      }t        dd| it        |fi |}|S )Nr  r   r   r   Tr  Fr"   r  r  r  )"vit_base_patch16_rope_reg1_gap_256r  r  s       rV   r  r  T  sY    J rrW[\fWqjpWqrELrW   c                     t        ddddddddddd	d
ddt        t        d            }t        dd| it        |fi |S )Nr   r   r   r   r   rX   T)r  r  r  xyr~   r  epsr   r   r   r-   r   r   r   r   r   r   r   r   r   r   r5   r  )vit_pe_core_base_patch16_224r   r   r   r  r  r  r  s      rV   r  r  i  sb    !%9$/J$ k*kPTU_PjciPjkkrW   c                     t        dddddddddd	d
dddt        t        d            }t        dd| it        |fi |S )Nr  r  r  r   r   r   rX   T)r  r  r  r  r~   r  r	  r  r  )vit_pe_core_large_patch14_336r  r  s      rV   r  r    sb    !%9$/J$ l:lQUV`QkdjQkllrW   c                     t        ddddddddd	d	d
dddt        t        d            }t        dd| it        |fi |S )Nr     2   r   UUUUUU@r   rX   FT    r  r  r~   r   r  r	  )r   r   r   r-   r   r   r   r   r   r   r   r   r   r   r5   r  ) vit_pe_core_gigantic_patch14_448r  r  s      rV   r  r    sb    !%9$/J$ ojoTXYcTngmTnoorW   c           	          t        d i ddddddddd	d
dddddddddddddddddddddt        t        d      }t        d!d| it        |fi |S )"Nr   r  r   r  r      r-   r   r   r   r   rX   r   Tr   r   r  r   r  r   r  r   r   Fr   r   皙?r5   r  r	  r  r   )vit_pe_lang_large_patch14_448r  r  s      rV   r  r    s       	
          "& #(    9$/!J& l:lQUV`QkdjQkllrW   c                     t        ddddddddd	d
ddddt        t        d            }t        dd| it        |fi |S )Nr  r  /   r   r  rX   FTr  r  r  r  r	  r   r   r   r-   r   r   r   r   r   r   r   r   r   r   r5   r  ) vit_pe_lang_gigantic_patch14_448r  r  s      rV   r  r    sb    !%"'9$/J$ ojoTXYcTngmTnoorW   c                     t        ddddddddd	d
ddddt        t        d            }t        dd| it        |fi |S )Nr  r  r  r   r  rX   FTr  r  r  r  r	  r  r  )#vit_pe_spatial_gigantic_patch14_448r  r  s      rV   r!  r!    sb    !%"'9$/J$ rrW[\fWqjpWqrrrW   )r  )r  Trj  )r   )]r   r  	functoolsr   typingr   r   r   r   r   r	   r
   r   rG   torch.nnr@   torch.nn.functional
functionalre   	timm.datar   r   r   r   timm.layersr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   _builderr#   	_featuresr$   _manipulater%   	_registryr&   r'   __all__rk  r*   r   r   r(   r   r   r  rp   r  r  r  r  default_cfgsr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r!  r   rW   rV   <module>r/     s	  4   I I I     d dS S S S S + + # <'G299 GTdryy dN^ryy ^B")) J CS%,,&'C99C C 
#u||
	CR '	\ell*+\yy\ \ 	\
 
#u||
\~ $ S *c T#s(^ ( S#X ( % g& )$+g& )$ 3(+Dg& /"(< 3(	1Dg&$ /"(< 3(	1D%g&2 5d 3(73g&< 6t 3(8=g&F 5d 3(7Gg&T / 31Ug&^ 0 32_g&h / 31ig&r 0 32sg&| / 31}g&J 0 3(PU2Kg&T 1$ 3(PU3Ug&^ 0 3(PU2_g&l ')mg&v (*wg&@ ')Ag&J (*Kg&T ')Ug&b +D -cg&p *4 ,qg&~ +D -g&L ,T .Mg&Z ,T  3.[g&j .t 0kg&x 3D 5yg&F /1Gg&P 4T 4/6Qg&Z 5d 4/7[g&d 5d 47eg&l 24 4/4mg&z &w !({g&H ' !)Ig&V ,3 !, )0 !) ,3 !, /6 !/Ag& gT d    d    d    t #     3    t #  $  3  $ t #     3    t #  $  3  $ 4 c   D s  * T   * T   *  3    T PS  ( d QT  ( d QT  ( 4 c  ( lT l l l, md m m m, p pC p p, md m m m. p pC p p, sD ss s srW   