
""" BEiT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254)

Model from official source: https://github.com/microsoft/unilm/tree/master/beit

@inproceedings{beit,
title={{BEiT}: {BERT} Pre-Training of Image Transformers},
author={Hangbo Bao and Li Dong and Songhao Piao and Furu Wei},
booktitle={International Conference on Learning Representations},
year={2022},
url={https://openreview.net/forum?id=p-BhZSz59o4}
}

BEiT-v2 from https://github.com/microsoft/unilm/tree/master/beit2

@article{beitv2,
title={{BEiT v2}: Masked Image Modeling with Vector-Quantized Visual Tokenizers},
author={Zhiliang Peng and Li Dong and Hangbo Bao and Qixiang Ye and Furu Wei},
year={2022},
eprint={2208.06366},
archivePrefix={arXiv},
primaryClass={cs.CV}
}

At this point only the 1k fine-tuned classification weights and model configs have been added,
see original source above for pre-training models and procedure.

Modifications by / Copyright 2021 Ross Wightman, original copyrights below
"""
import math
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F

from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from timm.layers import PatchEmbed, Mlp, SwiGLU, LayerNorm, DropPath, trunc_normal_, use_fused_attn
from timm.layers import resample_patch_embed, resample_abs_pos_embed, resize_rel_pos_bias_table, ndgrid
from ._builder import build_model_with_cfg
from ._features import feature_take_indices
from ._manipulate import checkpoint
from ._registry import generate_default_cfgs, register_model

__all__ = ['Beit']


def gen_relative_position_index(window_size: Tuple[int, int]) -> torch.Tensor:
    """Generate relative position index for window-based attention.

    Creates a lookup table for relative position indices between all pairs of positions
    within a window, including special handling for cls token interactions.

    Args:
        window_size: Height and width of the attention window.

    Returns:
        Relative position index tensor of shape (window_area+1, window_area+1)
        where +1 accounts for the cls token.
    """
    num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
    window_area = window_size[0] * window_size[1]
    coords = torch.stack(ndgrid(torch.arange(window_size[0]), torch.arange(window_size[1])))  # 2, Wh, Ww
    coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
    relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
    relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
    relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
    relative_coords[:, :, 1] += window_size[1] - 1
    relative_coords[:, :, 0] *= 2 * window_size[1] - 1
    relative_position_index = torch.zeros(size=(window_area + 1,) * 2, dtype=relative_coords.dtype)
    relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
    relative_position_index[0, 0:] = num_relative_distance - 3
    relative_position_index[0:, 0] = num_relative_distance - 2
    relative_position_index[0, 0] = num_relative_distance - 1
    return relative_position_index
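
# Worked example (illustrative sketch, not part of the upstream module): for a
# 2x2 window the returned table is 5x5 -- 4 patch positions plus the cls token --
# and every entry indexes a bias table of length (2*2 - 1) * (2*2 - 1) + 3 = 12,
# so the largest index is 11 (the dedicated cls-to-cls slot):
#
#   _idx = gen_relative_position_index((2, 2))
#   _idx.shape       # torch.Size([5, 5])
#   int(_idx.max())  # 11 == num_relative_distance - 1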


class Attention(nn.Module):
    """Multi-head attention module with optional relative position bias.

    Implements multi-head self-attention with support for relative position bias
    and fused attention operations. Can use either standard or custom head dimensions.
    """
    fused_attn: torch.jit.Final[bool]

    def __init__(
            self,
            dim: int,
            num_heads: int = 8,
            qkv_bias: bool = False,
            qkv_bias_separate: bool = False,
            attn_drop: float = 0.,
            proj_drop: float = 0.,
            window_size: Optional[Tuple[int, int]] = None,
            attn_head_dim: Optional[int] = None,
    ):
        """Initialize attention module.

        Args:
            dim: Input feature dimension.
            num_heads: Number of attention heads.
            qkv_bias: If True, add learnable bias to query, key, value projections.
            qkv_bias_separate: If True, use separate bias for q, k, v projections.
            attn_drop: Dropout rate for attention weights.
            proj_drop: Dropout rate for output projection.
            window_size: Window size for relative position bias. If None, no relative position bias.
            attn_head_dim: Dimension per attention head. If None, uses dim // num_heads.
        """
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads
        self.scale = head_dim ** -0.5
        self.fused_attn = use_fused_attn()
        self.qkv_bias_separate = qkv_bias_separate

        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.register_buffer('k_bias', torch.zeros(all_head_dim), persistent=False)
            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
        else:
            self.q_bias = None
            self.k_bias = None
            self.v_bias = None

        if window_size:
            self.window_size = window_size
            self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
            self.relative_position_bias_table = nn.Parameter(torch.zeros(self.num_relative_distance, num_heads))
            self.register_buffer('relative_position_index', gen_relative_position_index(window_size), persistent=False)
        else:
            self.window_size = None
            self.relative_position_bias_table = None
            self.relative_position_index = None

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(all_head_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def _get_rel_pos_bias(self) -> torch.Tensor:
        """Get relative position bias for the attention window.

        Returns:
            Relative position bias tensor of shape (1, num_heads, window_area+1, window_area+1).
        """
        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1] + 1,
            self.window_size[0] * self.window_size[1] + 1,
            -1)  # Wh*Ww, Wh*Ww, nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        return relative_position_bias.unsqueeze(0)

    def forward(self, x: torch.Tensor, shared_rel_pos_bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Forward pass of attention module.

        Args:
            x: Input tensor of shape (batch_size, num_tokens, dim).
            shared_rel_pos_bias: Optional shared relative position bias from parent module.

        Returns:
            Output tensor of shape (batch_size, num_tokens, dim).
        """
        B, N, C = x.shape

        if self.q_bias is None:
            qkv = self.qkv(x)
        else:
            qkv_bias = torch.cat((self.q_bias, self.k_bias, self.v_bias))
            if self.qkv_bias_separate:
                qkv = self.qkv(x)
                qkv += qkv_bias
            else:
                qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)
        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)  # B, num_heads, N, head_dim

        if self.fused_attn:
            rel_pos_bias = None
            if self.relative_position_bias_table is not None:
                rel_pos_bias = self._get_rel_pos_bias()
                if shared_rel_pos_bias is not None:
                    rel_pos_bias = rel_pos_bias + shared_rel_pos_bias
            elif shared_rel_pos_bias is not None:
                rel_pos_bias = shared_rel_pos_bias

            x = F.scaled_dot_product_attention(
                q, k, v,
                attn_mask=rel_pos_bias,
                dropout_p=self.attn_drop.p if self.training else 0.,
            )
        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)

            if self.relative_position_bias_table is not None:
                attn = attn + self._get_rel_pos_bias()
            if shared_rel_pos_bias is not None:
                attn = attn + shared_rel_pos_bias

            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            x = attn @ v

        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class Block(nn.Module):
    """Transformer block with attention and MLP.

    Standard transformer block consisting of multi-head self-attention and MLP
    with residual connections and layer normalization. Supports layer scale and
    stochastic depth regularization.
    """
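    # Residual form computed below (sketch; the gamma_1/gamma_2 layer-scale
    # factors exist only when init_values is set, otherwise they are omitted):
    #   x = x + drop_path1(gamma_1 * attn(norm1(x), shared_rel_pos_bias))
    #   x = x + drop_path2(gamma_2 * mlp(norm2(x)))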
    def __init__(
            self,
            dim: int,
            num_heads: int,
            qkv_bias: bool = False,
            mlp_ratio: float = 4.,
            scale_mlp: bool = False,
            swiglu_mlp: bool = False,
            proj_drop: float = 0.,
            attn_drop: float = 0.,
            drop_path: float = 0.,
            init_values: Optional[float] = None,
            act_layer: Callable = nn.GELU,
            norm_layer: Callable = LayerNorm,
            window_size: Optional[Tuple[int, int]] = None,
            attn_head_dim: Optional[int] = None,
    ):
        """Initialize transformer block.

        Args:
            dim: Input feature dimension.
            num_heads: Number of attention heads.
            qkv_bias: If True, add learnable bias to query, key, value projections.
            mlp_ratio: Ratio of MLP hidden dimension to input dimension.
            scale_mlp: If True, apply layer normalization in MLP.
            swiglu_mlp: If True, use SwiGLU activation in MLP.
            proj_drop: Dropout rate for projections.
            attn_drop: Dropout rate for attention.
            drop_path: Drop path rate for stochastic depth.
            init_values: Initial values for layer scale. If None, no layer scale.
            act_layer: Activation function class.
            norm_layer: Normalization layer class.
            window_size: Window size for relative position bias in attention.
            attn_head_dim: Dimension per attention head.
        """
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
            window_size=window_size,
            attn_head_dim=attn_head_dim,
        )
        self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

        self.norm2 = norm_layer(dim)
        if swiglu_mlp:
            self.mlp = SwiGLU(
                in_features=dim,
                hidden_features=int(dim * mlp_ratio),
                norm_layer=norm_layer if scale_mlp else None,
                drop=proj_drop,
            )
        else:
            self.mlp = Mlp(
                in_features=dim,
                hidden_features=int(dim * mlp_ratio),
                act_layer=act_layer,
                norm_layer=norm_layer if scale_mlp else None,
                drop=proj_drop,
            )
        self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

        if init_values:
            self.gamma_1 = nn.Parameter(init_values * torch.ones(dim))
            self.gamma_2 = nn.Parameter(init_values * torch.ones(dim))
        else:
            self.gamma_1, self.gamma_2 = None, None

    def forward(self, x: torch.Tensor, shared_rel_pos_bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Forward pass of transformer block.

        Args:
            x: Input tensor of shape (batch_size, num_tokens, dim).
            shared_rel_pos_bias: Optional shared relative position bias.

        Returns:
            Output tensor of shape (batch_size, num_tokens, dim).
        """
        if self.gamma_1 is None:
            x = x + self.drop_path1(self.attn(self.norm1(x), shared_rel_pos_bias=shared_rel_pos_bias))
            x = x + self.drop_path2(self.mlp(self.norm2(x)))
        else:
            x = x + self.drop_path1(self.gamma_1 * self.attn(self.norm1(x), shared_rel_pos_bias=shared_rel_pos_bias))
            x = x + self.drop_path2(self.gamma_2 * self.mlp(self.norm2(x)))
        return x


class RelativePositionBias(nn.Module):
    """Relative position bias module for window-based attention.

    Generates learnable relative position biases for all pairs of positions
    within a window, including special handling for cls token.
    """

    def __init__(self, window_size: Tuple[int, int], num_heads: int):
        """Initialize relative position bias module.

        Args:
            window_size: Height and width of the attention window.
            num_heads: Number of attention heads.
        """
        super().__init__()
        self.window_size = window_size
        self.window_area = window_size[0] * window_size[1]
        num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
        self.relative_position_bias_table = nn.Parameter(torch.zeros(num_relative_distance, num_heads))
        self.register_buffer('relative_position_index', gen_relative_position_index(window_size))

    def forward(self) -> torch.Tensor:
        """Generate relative position bias.

        Returns:
            Relative position bias tensor of shape (num_heads, window_area+1, window_area+1).
        """
        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.view(-1)].view(self.window_area + 1, self.window_area + 1, -1)
        return relative_position_bias.permute(2, 0, 1).contiguous()


class Beit(nn.Module):
    """BEiT: BERT Pre-Training of Image Transformers.

    Vision Transformer model with support for relative position bias and
    shared relative position bias across layers. Implements both BEiT v1 and v2
    architectures with flexible configuration options.
    """

    def __init__(
            self,
            img_size: Union[int, Tuple[int, int]] = 224,
            patch_size: Union[int, Tuple[int, int]] = 16,
            in_chans: int = 3,
            num_classes: int = 1000,
            global_pool: str = 'avg',
            embed_dim: int = 768,
            depth: int = 12,
            num_heads: int = 12,
            qkv_bias: bool = True,
            mlp_ratio: float = 4.,
            swiglu_mlp: bool = False,
            scale_mlp: bool = False,
            drop_rate: float = 0.,
            pos_drop_rate: float = 0.,
            proj_drop_rate: float = 0.,
            attn_drop_rate: float = 0.,
            drop_path_rate: float = 0.,
            norm_layer: Callable = LayerNorm,
            init_values: Optional[float] = None,
            use_abs_pos_emb: bool = True,
            use_rel_pos_bias: bool = False,
            use_shared_rel_pos_bias: bool = False,
            head_init_scale: float = 0.001,
    ):
        """Initialize BEiT model.

        Args:
            img_size: Input image size.
            patch_size: Patch size for patch embedding.
            in_chans: Number of input image channels.
            num_classes: Number of classes for classification head.
            global_pool: Type of global pooling ('avg' or '').
            embed_dim: Embedding dimension.
            depth: Number of transformer blocks.
            num_heads: Number of attention heads.
            qkv_bias: If True, add learnable bias to query, key, value projections.
            mlp_ratio: Ratio of MLP hidden dimension to embedding dimension.
            swiglu_mlp: If True, use SwiGLU activation in MLP.
            scale_mlp: If True, apply layer normalization in MLP.
            drop_rate: Dropout rate.
            pos_drop_rate: Dropout rate for position embeddings.
            proj_drop_rate: Dropout rate for projections.
            attn_drop_rate: Dropout rate for attention.
            drop_path_rate: Stochastic depth rate.
            norm_layer: Normalization layer class.
            init_values: Initial values for layer scale.
            use_abs_pos_emb: If True, use absolute position embeddings.
            use_rel_pos_bias: If True, use relative position bias in attention.
            use_shared_rel_pos_bias: If True, share relative position bias across layers.
            head_init_scale: Scale factor for head initialization.
        """
        super().__init__()
        self.num_classes = num_classes
        self.global_pool = global_pool
        self.num_features = self.head_hidden_size = self.embed_dim = embed_dim  # for consistency with other models
        self.num_prefix_tokens = 1
        self.grad_checkpointing = False

        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )
        num_patches = self.patch_embed.num_patches
        r = self.patch_embed.feat_ratio() if hasattr(self.patch_embed, 'feat_ratio') else patch_size

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) if use_abs_pos_emb else None
        self.pos_drop = nn.Dropout(p=pos_drop_rate)

        if use_shared_rel_pos_bias:
            self.rel_pos_bias = RelativePositionBias(
                window_size=self.patch_embed.grid_size,
                num_heads=num_heads,
            )
        else:
            self.rel_pos_bias = None

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim,
                num_heads=num_heads,
                qkv_bias=qkv_bias,
                mlp_ratio=mlp_ratio,
                scale_mlp=scale_mlp,
                swiglu_mlp=swiglu_mlp,
                proj_drop=proj_drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                init_values=init_values,
                window_size=self.patch_embed.grid_size if use_rel_pos_bias else None,
            )
            for i in range(depth)])
        self.feature_info = [
            dict(module=f'blocks.{i}', num_chs=embed_dim, reduction=r) for i in range(depth)]

        use_fc_norm = self.global_pool == 'avg'
        self.norm = nn.Identity() if use_fc_norm else norm_layer(embed_dim)
        self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity()
        self.head_drop = nn.Dropout(drop_rate)
        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()

        self.apply(self._init_weights)
        if self.pos_embed is not None:
            trunc_normal_(self.pos_embed, std=.02)
        trunc_normal_(self.cls_token, std=.02)

        self.fix_init_weight()
        if isinstance(self.head, nn.Linear):
            trunc_normal_(self.head.weight, std=.02)
            self.head.weight.data.mul_(head_init_scale)
            self.head.bias.data.mul_(head_init_scale)

    def fix_init_weight(self) -> None:
        """Fix initialization weights according to BEiT paper.

        Rescales attention and MLP weights based on layer depth to improve
        training stability.
        """
        def rescale(param, layer_id):
            param.div_(math.sqrt(2.0 * layer_id))

        for layer_id, layer in enumerate(self.blocks):
            rescale(layer.attn.proj.weight.data, layer_id + 1)
            rescale(layer.mlp.fc2.weight.data, layer_id + 1)

    def _init_weights(self, m: nn.Module) -> None:
        """Initialize model weights.

        Args:
            m: Module to initialize.
        """
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self) -> Set[str]:
        """Get parameter names that should not use weight decay.

        Returns:
            Set of parameter names to exclude from weight decay.
        """
        nwd = {'pos_embed', 'cls_token'}
        for n, _ in self.named_parameters():
            if 'relative_position_bias_table' in n:
                nwd.add(n)
        return nwd

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable: bool = True) -> None:
        """Enable or disable gradient checkpointing.

        Args:
            enable: If True, enable gradient checkpointing.
        """
        self.grad_checkpointing = enable

    @torch.jit.ignore
    def group_matcher(self, coarse: bool = False) -> Dict[str, Any]:
        """Create parameter group matcher for optimizer parameter groups.

        Args:
            coarse: If True, use coarse grouping.

        Returns:
            Dictionary mapping group names to regex patterns.
        """
        matcher = dict(
            stem=r'^cls_token|pos_embed|patch_embed|rel_pos_bias',  # stem and embed
            blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999,))],
        )
        return matcher

    @torch.jit.ignore
    def get_classifier(self) -> nn.Module:
        """Get the classifier head.

        Returns:
            The classification head module.
        """
        return self.head

    def reset_classifier(self, num_classes: int, global_pool: Optional[str] = None) -> None:
        """Reset the classification head.

        Args:
            num_classes: Number of classes for new head.
            global_pool: Global pooling type.
        """
        self.num_classes = num_classes
        if global_pool is not None:
            self.global_pool = global_pool
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    def forward_intermediates(
            self,
            x: torch.Tensor,
            indices: Optional[Union[int, List[int]]] = None,
            return_prefix_tokens: bool = False,
            norm: bool = False,
            stop_early: bool = False,
            output_fmt: str = 'NCHW',
            intermediates_only: bool = False,
    ) -> Union[List[torch.Tensor], Tuple[torch.Tensor, List[torch.Tensor]]]:
        """Forward pass that returns intermediate feature maps.

        Args:
            x: Input image tensor of shape (batch_size, channels, height, width).
            indices: Block indices to return features from. If int, returns last n blocks.
            return_prefix_tokens: If True, return both prefix and spatial tokens.
            norm: If True, apply normalization to intermediate features.
            stop_early: If True, stop at last selected intermediate.
            output_fmt: Output format ('NCHW' or 'NLC').
            intermediates_only: If True, only return intermediate features.

        Returns:
            If intermediates_only is True, returns list of intermediate tensors.
            Otherwise, returns tuple of (final_features, intermediates).
        """
        assert output_fmt in ('NCHW', 'NLC'), 'Output format must be one of NCHW or NLC.'
        reshape = output_fmt == 'NCHW'
        intermediates = []
        take_indices, max_index = feature_take_indices(len(self.blocks), indices)

        # forward pass
        B, _, height, width = x.shape
        x = self.patch_embed(x)
        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
        if self.pos_embed is not None:
            x = x + self.pos_embed
        x = self.pos_drop(x)

        rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
        if torch.jit.is_scripting() or not stop_early:  # can't slice blocks in torchscript
            blocks = self.blocks
        else:
            blocks = self.blocks[:max_index + 1]
        for i, blk in enumerate(blocks):
            if self.grad_checkpointing and not torch.jit.is_scripting():
                x = checkpoint(blk, x, shared_rel_pos_bias=rel_pos_bias)
            else:
                x = blk(x, shared_rel_pos_bias=rel_pos_bias)
            if i in take_indices:
                # normalize intermediates with final norm layer if enabled
                intermediates.append(self.norm(x) if norm else x)

        # process intermediates
        if self.num_prefix_tokens:
            # split prefix (e.g. class, distill) and spatial feature tokens
            prefix_tokens = [y[:, 0:self.num_prefix_tokens] for y in intermediates]
            intermediates = [y[:, self.num_prefix_tokens:] for y in intermediates]
        if reshape:
            # reshape to BCHW output format
            H, W = self.patch_embed.dynamic_feat_size((height, width))
            intermediates = [y.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() for y in intermediates]
        if not torch.jit.is_scripting() and return_prefix_tokens:
            # return_prefix not supported in torchscript due to poor type handling
            intermediates = list(zip(intermediates, prefix_tokens))

        if intermediates_only:
            return intermediates

        x = self.norm(x)

        return x, intermediates

    def prune_intermediate_layers(
            self,
            indices: Union[int, List[int]] = 1,
            prune_norm: bool = False,
            prune_head: bool = True,
    ) -> List[int]:
        """Prune layers not required for specified intermediate outputs.

        Args:
            indices: Indices of blocks to keep.
            prune_norm: If True, remove final normalization.
            prune_head: If True, remove classification head.

        Returns:
            List of indices that were kept.
        """
        take_indices, max_index = feature_take_indices(len(self.blocks), indices)
        self.blocks = self.blocks[:max_index + 1]  # truncate blocks
        if prune_norm:
            self.norm = nn.Identity()
        if prune_head:
            self.fc_norm = nn.Identity()
            self.reset_classifier(0, '')
        return take_indices

    def forward_features(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass through feature extraction layers.

        Args:
            x: Input tensor of shape (batch_size, channels, height, width).

        Returns:
            Feature tensor of shape (batch_size, num_tokens, embed_dim).
        """
        x = self.patch_embed(x)
        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
        if self.pos_embed is not None:
            x = x + self.pos_embed
        x = self.pos_drop(x)

        rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
        for blk in self.blocks:
            if self.grad_checkpointing and not torch.jit.is_scripting():
                x = checkpoint(blk, x, shared_rel_pos_bias=rel_pos_bias)
            else:
                x = blk(x, shared_rel_pos_bias=rel_pos_bias)
        x = self.norm(x)
        return x

    def forward_head(self, x: torch.Tensor, pre_logits: bool = False) -> torch.Tensor:
        """Forward pass through classification head.

        Args:
            x: Feature tensor of shape (batch_size, num_tokens, embed_dim).
            pre_logits: If True, return features before final linear layer.

        Returns:
            Logits tensor of shape (batch_size, num_classes) or pre-logits.
        """
        if self.global_pool:
            x = x[:, self.num_prefix_tokens:].mean(dim=1) if self.global_pool == 'avg' else x[:, 0]
        x = self.fc_norm(x)
        x = self.head_drop(x)
        return x if pre_logits else self.head(x)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass through the model.

        Args:
            x: Input tensor of shape (batch_size, channels, height, width).

        Returns:
            Logits tensor of shape (batch_size, num_classes).
        """
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x


def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
    """Create a default configuration dictionary for BEiT models.

    Args:
        url: Model weights URL.
        **kwargs: Additional configuration parameters.

    Returns:
        Configuration dictionary.
    """
    return {
        'url': url,
        'num_classes': 1000,
        'input_size': (3, 224, 224),
        'pool_size': None,
        'crop_pct': .9,
        'interpolation': 'bicubic',
        'fixed_input_size': True,
        'mean': (0.5, 0.5, 0.5),
        'std': (0.5, 0.5, 0.5),
        'first_conv': 'patch_embed.proj',
        'classifier': 'head',
        **kwargs,
    }


default_cfgs = generate_default_cfgs({
    'beit_base_patch16_224.in22k_ft_in22k_in1k': _cfg(hf_hub_id='timm/'),
    'beit_base_patch16_384.in22k_ft_in22k_in1k': _cfg(
        hf_hub_id='timm/',
        input_size=(3, 384, 384), crop_pct=1.0,
    ),
    'beit_base_patch16_224.in22k_ft_in22k': _cfg(
        hf_hub_id='timm/',
        num_classes=21841,
    ),
    'beit_large_patch16_224.in22k_ft_in22k_in1k': _cfg(hf_hub_id='timm/'),
    'beit_large_patch16_384.in22k_ft_in22k_in1k': _cfg(
        hf_hub_id='timm/',
        input_size=(3, 384, 384), crop_pct=1.0,
    ),
    'beit_large_patch16_512.in22k_ft_in22k_in1k': _cfg(
        hf_hub_id='timm/',
        input_size=(3, 512, 512), crop_pct=1.0,
    ),
    'beit_large_patch16_224.in22k_ft_in22k': _cfg(
        hf_hub_id='timm/',
        num_classes=21841,
    ),
    'beitv2_base_patch16_224.in1k_ft_in22k_in1k': _cfg(
        hf_hub_id='timm/',
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
    ),
    'beitv2_base_patch16_224.in1k_ft_in1k': _cfg(
        hf_hub_id='timm/',
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
    ),
    'beitv2_base_patch16_224.in1k_ft_in22k': _cfg(
        hf_hub_id='timm/',
        num_classes=21841,
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
    ),
    'beitv2_large_patch16_224.in1k_ft_in22k_in1k': _cfg(
        hf_hub_id='timm/',
        crop_pct=0.95,
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
    ),
    'beitv2_large_patch16_224.in1k_ft_in1k': _cfg(
        hf_hub_id='timm/',
        crop_pct=0.95,
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
    ),
    'beitv2_large_patch16_224.in1k_ft_in22k': _cfg(
        hf_hub_id='timm/',
        num_classes=21841,
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
    ),
})


def checkpoint_filter_fn(
        state_dict: Dict[str, torch.Tensor],
        model: nn.Module,
        interpolation: str = 'bicubic',
        antialias: bool = True,
) -> Dict[str, torch.Tensor]:
    """Filter and process checkpoint state dict for loading.

    Handles resizing of patch embeddings, position embeddings, and relative position
    bias tables when model size differs from checkpoint.

    Args:
        state_dict: Checkpoint state dictionary.
        model: Target model to load weights into.
        interpolation: Interpolation method for resizing.
        antialias: If True, use antialiasing when resizing.

    Returns:
        Filtered state dictionary.
    """
    state_dict = state_dict.get('model', state_dict)
    state_dict = state_dict.get('module', state_dict)  # beit v2 didn't strip module

    out_dict = {}
    for k, v in state_dict.items():
        if 'relative_position_index' in k:
            continue
        if 'patch_embed.proj.weight' in k:
            O, I, H, W = model.patch_embed.proj.weight.shape
            if v.shape[-1] != W or v.shape[-2] != H:
                v = resample_patch_embed(
                    v,
                    (H, W),
                    interpolation=interpolation,
                    antialias=antialias,
                    verbose=True,
                )
        elif k == 'pos_embed' and v.shape[1] != model.pos_embed.shape[1]:
            # to resize pos embedding when using model at different size from pretrained weights
            num_prefix_tokens = 1
            v = resample_abs_pos_embed(
                v,
                new_size=model.patch_embed.grid_size,
                num_prefix_tokens=num_prefix_tokens,
                interpolation=interpolation,
                antialias=antialias,
                verbose=True,
            )
        elif k.endswith('relative_position_bias_table'):
            m = model.get_submodule(k[:-29])
            if v.shape != m.relative_position_bias_table.shape or m.window_size[0] != m.window_size[1]:
                v = resize_rel_pos_bias_table(
                    v,
                    new_window_size=m.window_size,
                    new_bias_shape=m.relative_position_bias_table.shape,
                )
        out_dict[k] = v
    return out_dict


def _create_beit(variant: str, pretrained: bool = False, **kwargs) -> Beit:
    """Create a BEiT model.

    Args:
        variant: Model variant name.
        pretrained: If True, load pretrained weights.
        **kwargs: Additional model arguments.

    Returns:
        BEiT model instance.
    """
    out_indices = kwargs.pop('out_indices', 3)
    model = build_model_with_cfg(
        Beit, variant, pretrained,
        pretrained_filter_fn=checkpoint_filter_fn,
        feature_cfg=dict(out_indices=out_indices, feature_cls='getter'),
        **kwargs,
    )
    return model


@register_model
def beit_base_patch16_224(pretrained: bool = False, **kwargs) -> Beit:
    """BEiT base model @ 224x224 with patch size 16x16."""
    model_args = dict(
        patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1)
    model = _create_beit('beit_base_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
    return model


@register_model
def beit_base_patch16_384(pretrained: bool = False, **kwargs) -> Beit:
    """BEiT base model @ 384x384 with patch size 16x16."""
    model_args = dict(
        img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1)
    model = _create_beit('beit_base_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
    return model


@register_model
def beit_large_patch16_224(pretrained: bool = False, **kwargs) -> Beit:
    """BEiT large model @ 224x224 with patch size 16x16."""
    model_args = dict(
        patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5)
    model = _create_beit('beit_large_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
    return model


@register_model
def beit_large_patch16_384(pretrained: bool = False, **kwargs) -> Beit:
    """BEiT large model @ 384x384 with patch size 16x16."""
    model_args = dict(
        img_size=384, patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5)
    model = _create_beit('beit_large_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
    return model


@register_model
def beit_large_patch16_512(pretrained: bool = False, **kwargs) -> Beit:
    """BEiT large model @ 512x512 with patch size 16x16."""
    model_args = dict(
        img_size=512, patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5)
    model = _create_beit('beit_large_patch16_512', pretrained=pretrained, **dict(model_args, **kwargs))
    return model


@register_model
def beitv2_base_patch16_224(pretrained: bool = False, **kwargs) -> Beit:
    """BEiT v2 base model @ 224x224 with patch size 16x16."""
    model_args = dict(
        patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5)
    model = _create_beit('beitv2_base_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
    return model


@register_model
def beitv2_large_patch16_224(pretrained: bool = False, **kwargs) -> Beit:
    """BEiT v2 large model @ 224x224 with patch size 16x16."""
    model_args = dict(
        patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5)
    model = _create_beit('beitv2_large_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
    return model