
    kh                     4   d Z ddlZddlZddlmZmZmZ ddlmZ ddl	m
Z
mZmZmZmZmZmZmZmZ ddlZddlmZ ddlmc mZ ddlmZ ddlmZmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% dd	l&m'Z' dd
l(m)Z) ddl*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3 ddl4m5Z5m6Z6 ddgZ7 ejp                  e9      Z:e G d d             Z;de;de;fdZ<	 d@dejz                  dee>e>f   de?deejz                  ee>e>f   f   fdZ@dejz                  fdZAe, G d dej                               ZCe+dddej                  fdejz                  d e>d!e?d"ee>   d#ej                  deejz                     fd$       ZFe+	 	 	 	 dAdejz                  deejz                     d%eGd e>d&e?dejz                  fd'       ZH G d( dej                        ZIdBd)eGd*eJde
fd+ZKd,eeGef   d-eIdeeGef   fd.ZLdCd/eGdeeGef   fd0ZM e/ eMd12       eMd12       eMd12       eM        eM        eM       d3      ZNdDd4eGd5e?deIfd6ZO	 dDd4eGd5e?deIfd7ZPe.dDd5e?deIfd8       ZQe.dDd5e?deIfd9       ZRe.dDd5e?deIfd:       ZSe.dDd5e?deIfd;       ZTe.dDd5e?deIfd<       ZUe.dDd5e?deIfd=       ZVe.dDd5e?deIfd>       ZWe.dDd5e?deIfd?       ZXy)Ea   NaFlex Vision Transformer

An improved version of the Vision Transformer with:
1. Encapsulated embedding and position encoding in a single module
2. Support for linear patch embedding on pre-patchified inputs
3. Support for NaFlex variable aspect, variable resolution
4. Support for FlexiViT variable patch size
5. Support for NaViT fractional/factorized position embedding

Based on ideas from:
- Original Vision Transformer: https://arxiv.org/abs/2010.11929
- FlexiViT: https://arxiv.org/abs/2212.08013
- NaViT: https://arxiv.org/abs/2307.06304
- NaFlex (SigLip-2): https://arxiv.org/abs/2502.14786

Hacked together by / Copyright 2025, Ross Wightman, Hugging Face
    N)	dataclassfieldsreplace)partial)	CallableDictListOptionalSetTupleTypeUnionAny)pad_sequenceIMAGENET_INCEPTION_MEANIMAGENET_INCEPTION_STD)AttentionPoolLatentMlp	to_2tupleget_act_layerget_norm_layer	LayerNorm_assert)build_model_with_cfg)feature_take_indices)register_notrace_functionregister_notrace_module)register_modelgenerate_default_cfgs)
checkpointcheckpoint_seqnamed_apply   )Blockglobal_pool_nlcNaFlexVitCfg	NaFlexVitc                   z   e Zd ZU dZdZeeeeef   f   ed<   dZ	eed<   dZ
eed<   dZeed<   d	Zeed
<   dZeed<   dZeed<   dZeed<   dZeed<   dZee   ed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeeeef      ed<   d Zeed!<   dZ eed"<   dZ!eed#<   dZ"eed$<   dZ#eed%<   dZ$eed&<   dZ%ee   ed'<   d(Z&eed)<   dZ'eed*<   d+Z(eed,<   dZ)eed-<   d.Z*eed/<   dZ+ee   ed0<   dZ,ee   ed1<   dZ-ee   ed2<   dZ.ee   ed3<   dZ/ee   ed4<   dZ0ee   ed5<   dZ1eed6<   y)7r'   zConfiguration for FlexVit model.

    This dataclass contains the bulk of model configuration parameters,
    with core parameters (img_size, in_chans, num_classes, etc.) remaining
    as direct constructor arguments for API compatibility.
       
patch_size   	embed_dim   depth	num_headsg      @	mlp_ratioTqkv_biasFqk_norm	proj_bias        attn_drop_rateNinit_values	drop_ratepos_drop_ratepatch_drop_rateproj_drop_ratedrop_path_rateclass_tokenr   
reg_tokenslearned	pos_embed)r*   r*   pos_embed_grid_sizebicubicpos_embed_interp_modepos_embed_ar_preservingpos_embed_use_grid_sampledynamic_img_padpre_norm
final_normfc_normmapglobal_poolpool_include_prefix weight_initfix_initlinearembed_proj_typeinput_norm_layerembed_norm_layer
norm_layer	act_layerblock_fn	mlp_layerenable_patch_interpolator)2__name__
__module____qualname____doc__r+   r   intr   __annotations__r-   r/   r0   r1   floatr2   boolr3   r4   r6   r7   r
   r8   r9   r:   r;   r<   r=   r>   r@   strrA   rC   rD   rE   rF   rG   rH   rI   rK   rL   rN   rO   rQ   rR   rS   rT   rU   rV   rW   rX        Q/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/models/naflexvit.pyr'   r'   6   s    /1Jc5c?*+0IsE3OIsIu HdGTItNE $(K%'IuM5 OU NENE KJ Is5=%S/2=!*3*$)T)&+t+ "OT! HdJ"GXd^"K %% KHd $OS#&*hsm*&*hsm* !%J$#Ix}#"Hhsm"#Ix}# ',t+rc   cfgreturnc                     t        | j                  j                               }|j                         D ci c]  \  }}||v s|| }}}|rt	        | fi |} | S c c}}w )zIOverlay kwargs onto config, replacing config values with provided kwargs.)set__dataclass_fields__keysitemsr   )re   kwargsconfig_fieldskvconfig_kwargss         rd   _overlay_kwargsrq   {   sb     005578M&,llnKda]8JQTKMKc+]+J Ls   AATxr+   padc                 Z   | j                   \  }}}}|\  }}|r@||z  dk7  s||z  dk7  r0|||z  z
  |z  }	|||z  z
  |z  }
t        j                  | d|
d|	f      } ||z  ||z  }}| j                  ||||||      j	                  dddddd      j                  |||z  ||z  |z        }|||ffS )ah  Patchify a batch of images.

    Args:
        x: Input tensor of shape [B, C, H, W].
        patch_size: Patch dimensions (patch_h, patch_w).
        pad: Whether to pad images to be divisible by patch size.

    Returns:
        Tuple of (patches, grid_size) where patches has shape [B, N, P*P*C]
        and grid_size is (num_patches_h, num_patches_w).
    r               r$   )shapeFrs   viewpermutereshape)rr   r+   rs   BCHWphpwpad_hpad_wnhnwpatchess                 rd   batch_patchifyr      s      JAq!QFB B!q2v{a"f"a"f"EE!a5)*"Wa2gBffQ2r2r*221aAq!DLLQPRUWPWY[^`Y`cdYdeG RHrc   _coordc           	      (   | d d d d df   j                  d      dz   }| d d d d df   j                  d      dz   }t        ||      D cg c]7  \  }}t        |j                               t        |j                               f9 c}}S c c}}w )Nr   r$   dim)amaxzipr]   item)r   max_ymax_xhws        rd   calculate_naflex_grid_sizesr      s~    1a7O  Q '!+E1a7O  Q '!+E7:5%7HItq!S]CM*IIIs   <Bc            +       V    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d(deeeeef   f   dededee   de	de	d	ed
e	deeeeeef   f      dedeeeef      dede	de	dee
ej                        dee	ee
ej                        f   dee
ej                        dedede	ddf* fdZdeeef   fdZd)de	deeeeef   f   fdZdeeef   deeef   fdZdej*                  dej*                  ddfd Zdej*                  dej*                  ddfd!Zdej*                  d"ee   ddfd#Zdej*                  dej*                  ddfd$Zdej*                  dej*                  ddfd%Zdej*                  d"ee   ddfd&Z	 d*dej*                  deej*                     dej*                  fd'Z xZS )+NaFlexEmbedsa4  NaFlex Embedding module for Vision Transformers.

    This module encapsulates the complete embedding process for Vision Transformers,
    supporting both standard and NaFlex (NaViT + FlexiViT) functionality:

    1. Patch embedding (via Conv2d or Linear)
    2. Class and register token preparation
    3. Position embedding addition with interpolation support
    4. Pre-normalization (if requested)
    5. Dropout application

    NaFlex capabilities include:
    - Variable aspect ratio and resolution via patch coordinates
    - Patch type indicators for handling padding tokens in attention
    - Flexible position embedding interpolation for arbitrary grid sizes
    - Support for factorized position embeddings

    The patch embedding can be one of two types:
    - Conv2d-based (default): For standard image inputs [B, C, H, W]
    - Linear-based: For pre-patchified inputs [B, N, P*P*C]

    Args:
        patch_size: Size of patches for patch embedding
        in_chans: Number of input image channels
        embed_dim: Dimensionality of patch embedding
        proj_type: Type of embedding projection layer ('conv' or 'linear')
        input_norm_layer: Normalization layer applied to input (linear mode only)
        proj_norm_layer: Normalization layer applied after projection
        pos_embed: Type of position embedding ('learned', 'factorized', 'rope', 'none')
        pos_drop_rate: Dropout rate for position embeddings
        patch_drop_rate: Dropout rate for patch tokens
        class_token: Whether to include a class token
        reg_tokens: Number of register tokens to include
        bias: Whether to use bias in projection layers
        dynamic_img_pad: Whether to enable dynamic padding for variable resolution
        pos_embed_grid_size: Grid size for position embedding initialization
        pos_embed_interp_mode: Interpolation mode for position embedding resizing
        pos_embed_ar_preserving: Whether to preserve aspect ratio during position embedding interpolation
        default_img_size: Default image size for position embedding grid calculation
    Nr+   in_chansr-   	proj_typer4   r=   r>   rF   default_img_sizer@   rA   rC   rD   rE   rR   proj_norm_layerrT   r9   r:   rX   rf   c                    t         |           || _        || _        || _        || _        || _        t        |      | _        || _	        || _
        || _        || _        |rdnd| _        | xj                  |z  c_        |r*t        j                  t!        j"                  dd|            nd| _        |r*t        j                  t!        j"                  d||            nd| _        d| _        d| _        ||| _        nV|	Tt        |	      | _        t-        t/        | j(                  | j                        D cg c]
  \  }}||z   c}}      | _        |dk(  rx| j                  d   | j                  d   z  |z  }|du r	|J d       |du r|n|xs d}|r ||      nd| _        t        j2                  |||      | _        d| _        d| _        n8|rJ d| _        t        j:                  |||||	      | _        d| _        d| _        | j                  r#dd
lm}  || j                  |||d      | _         nd| _         |du r	|J d       |du r|n|xs d}|r ||      nt        jB                         | _"        |
dv r| j*                  tG        d      d| _$        d| _%        d| _&        |
r|
dk(  rd| _'        n|
dk(  rd| _'        n|
dk(  r| j*                  J | j*                  \  }}d| _'        t        j                  t!        jP                  d||      dz        | _%        t        j                  t!        jP                  d||      dz        | _&        nW| j*                  J | j*                  \  }}t        j                  t!        jP                  d|||      dz        | _$        d| _'        t        jR                  |      | _*        |dkD  r ddl+m,}  ||| j                        | _-        yt        jB                         | _-        yc c}}w )a  Initialize NaFlexEmbeds module.

        Args:
            patch_size: Size of patches for patch embedding.
            in_chans: Number of input image channels.
            embed_dim: Dimensionality of patch embedding.
            proj_type: Type of embedding projection layer ('conv' or 'linear').
            proj_bias: Whether to use bias in projection layers.
            class_token: Whether to include a class token.
            reg_tokens: Number of register tokens to include.
            dynamic_img_pad: Whether to enable dynamic padding for variable resolution.
            default_img_size: Default image size for position embedding grid calculation.
            pos_embed: Type of position embedding ('learned', 'factorized', 'rope', 'none').
            pos_embed_grid_size: Grid size for position embedding initialization.
            pos_embed_interp_mode: Interpolation mode for position embedding resizing.
            pos_embed_ar_preserving: Whether to preserve aspect ratio during interpolation.
            input_norm_layer: Normalization layer applied to input (linear mode only).
            proj_norm_layer: Normalization layer applied after projection.
            norm_layer: Default normalization layer.
            pos_drop_rate: Dropout rate for position embeddings.
            patch_drop_rate: Dropout rate for patch tokens.
            enable_patch_interpolator: Enable dynamic patch size support.
        r$   r   NrP   Tz5`norm_layer` must be given when input_norm_layer=True)biasF)kernel_sizestrider   )PatchEmbedInterpolator)base_patch_sizer   r-   interpolation	antialiasz4`norm_layer` must be given when proj_norm_layer=True)
factorizedr?   zgCannot initialize position embeddings without grid_size.Please provide img_size or pos_embed_grid_size.noneroper   g{Gz?r?   )p)PatchDropout)num_prefix_tokens).super__init__has_class_tokennum_reg_tokensrC   rD   rE   r   r+   r   r-   rF   rX   r   nn	Parametertorchzeros	cls_token	reg_tokenr   rA   tupler   
norm_inputLinearprojflatten	is_linearConv2dtimm.layersr   patch_interpolatorIdentitynorm
ValueErrorr@   pos_embed_ypos_embed_xpos_embed_typerandnDropoutpos_droptimm.layers.patch_dropoutr   
patch_drop)selfr+   r   r-   r   r4   r=   r>   rF   r   r@   rA   rC   rD   rE   rR   r   rT   r9   r:   rX   sr   	patch_dimr   r   r   r   	__class__s                               rd   r   zNaFlexEmbeds.__init__   s   \ 	*(%:"'>$)B&#J/ ".)B& '2q*, HSekk!Q	&BCX\PZekk!Z&KL`d <@>B *':D$)$-.>$?D!',TEZEZ\`\k\kAl-mAa1f-m'nD$   *T__Q-??(JI(D0Z5G HGHH-=-EzL\Ld`d=M.y9SWDO		)YYGDI DL!DN (''"DO		)JU^DI  DL"DN )):&< $!#3'D# '+D# $t+
0B 	CB	CC(74(?*oF]Y]2AOI.r{{}	 11d6N6N6VBC C 263737I/"(D& "(D,&++777++DAq".D!||EKK1i,H3,NOD!||EKK1i,H3,NOD++777++DAq\\%++aAy*IC*OPDN"+D 

]3Q>*"&"8"8DO
 !kkmDO[ .ns   7P
c                 D    t        | j                  | j                        S )zGet feature information for feature extraction.

        Args:
            location: Feature extraction location identifier

        Returns:
            Dictionary containing feature channel count and reduction factor
        )num_chs	reduction)dictr-   r+   )r   locations     rd   feature_infozNaFlexEmbeds.feature_infos  s     DNNdooFFrc   	as_scalarc                 H    |rt        | j                        S | j                  S )zGet the feature reduction ratio (stride) of the patch embedding.

        Args:
            as_scalar: Whether to return the maximum dimension as a scalar

        Returns:
            Feature reduction ratio as scalar or tuple
        )maxr+   )r   r   s     rd   
feat_ratiozNaFlexEmbeds.feat_ratio~  s      t''??"rc   img_sizec                    | j                   rPt        j                  |d   | j                  d   z        t        j                  |d   | j                  d   z        fS |d   | j                  d   z  |d   | j                  d   z  fS )a  Calculate grid (feature) size for given image size.

        Takes into account dynamic padding when enabled.

        Args:
            img_size: Input image size as (height, width)

        Returns:
            Grid size as (grid_height, grid_width)
        r   r$   )rF   mathceilr+   )r   r   s     rd   dynamic_feat_sizezNaFlexEmbeds.dynamic_feat_size  s     99Xa[4??1+==>		(ST+X\XgXghiXjJj@kkkA;$//!"44hqkT__UVEW6WWWrc   rr   patch_coordc           
      z    t        |      } j                  j                  dd \   j                  j                  dddd      j	                          fd}i }t        |      D ]&  \  }}|j                  |g       j                  |       ( |j                         D ]  \  }} ||      }	t        j                  d   |	j                  d         }
ddd|
f   j                  dt        j                  |j                        |	ddd|
f   j                  t        |      dd              y)	a_  Apply learned position embeddings to NaFlex batch in-place.

        Interpolates learned 2D position embeddings for each sample in the batch
        based on their individual grid sizes.

        Args:
            x: Input tensor to add position embeddings to [B, N, C]
            patch_coord: Patch coordinates [B, N, 2] with (y, x) values
        r$   rw   r   ru   c                    | d   k(  r)| d   k(  r!j                   j                  dz  d      }nzj                  rt        t	        |             n| }t        j                  |j                  dd      ddddd| d   d| d   f   j                  d      j                  dd      }|j                  j                  	      S )
a  
            Return a flattened positional-embedding grid at an arbitrary spatial resolution.

            Converts the learned 2-D table stored in NCHW format (pos_embed_nchw) into
            a (1, H*W, C) sequence that matches the requested size.
            r   r$   FTsizemodealign_cornersr   Nru   dtype)r@   r}   rD   r   r   rz   interpolaterC   r   	transposetor   )r   pos_embed_flat_interp_sizeorig_horig_wpos_embed_nchwr   rr   s      rd   	_interp2dz?NaFlexEmbeds._apply_learned_naflex_pos_embed.<locals>._interp2d  s     Q6!Q6(9!%!7!76F?B!O7;7S7SyT3Y]!""%33"'"" Qa(47(*", -4GAJyyA  "$$177$33rc   Ndevicer   )r   r@   ry   r|   r_   	enumerate
setdefaultappendrk   min
index_add_r   	as_tensorr   expandlen)r   rr   r   naflex_grid_sizesr   size_to_indicesbirn   batch_indicesr   seq_lenr   r   r   s   ``         @@@rd   _apply_learned_naflex_pos_embedz,NaFlexEmbeds._apply_learned_naflex_pos_embed  s.    8D--a2//1a;AAC	4 	4* =?01 	9EB&&q"-44R8	9 !0 5 5 7 		A} 'q\N!''!*n&:&:1&=>Ga'kN%%ahh?q(7({+223}3Er2N		rc   c                    |j                   }|j                  \  }}}|j                  d      j                  dz   }| j                  r.|j                  d      }|j                         }	|	x}
}|	|z  x}}n-|j                  d      \  }
}|
|dddf   z  }||dddf   z  }t        j                  |dd|t        j                        }||ddddf<   ||ddddf<   |dz
  |ddddf<   |dz
  |ddddf<   t        j                  ||||
|fd	      }t        j                  | j                  j                  dddd      j                  |d
d
d
      j                         || j                   dd      j#                  |j$                        }t        j&                  ||      j)                  d      }|||dd|d   |d   f   z  }y)a  Apply learned position embeddings to NaFlex batch using grid_sample.

        Uses F.grid_sample for efficient interpolation of learned 2D position embeddings
        based on patch coordinates. Based on proposal by https://github.com/stas-sl

        Args:
            x: Input tensor to add position embeddings to [B, N, C]
            patch_coord: Patch coordinates [B, N, 2] with (y, x) values
        r$   r   r   Nru   rw   )r   r   Fr   r   borderr   r   padding_moder   r   .r   .r$   )r   ry   r   valuesrD   r   r   r   float32rz   affine_gridgrid_sampler@   r|   r   r_   rC   r   r   arange	unsqueeze)r   rr   r   r   r~   Nr   shapesL_iL_globalgrid_size_ygrid_size_xscale_xscale_ythetagridr@   r   s                     rd   +_apply_learned_naflex_pos_embed_grid_samplez8NaFlexEmbeds._apply_learned_naflex_pos_embed_grid_sample  s    ''1aQ'..2''++!+$CxxzH(00K+ (3.Gg'-{{q{'9$K!F1a4L0G!F1a4L0GAq!F%--H aAg aAg 1aAg 1aAg}}UQ;$DTYZMMNN""1aA.55aRDJJL++!
 "177"
 	 \\!F+55a8	Yr1k&1;v3FFGGrc   	grid_sizec                 8   | j                   j                  dd \  }}|d   |k(  r)|d   |k(  r!| j                   j                  d||z  d      }n| j                  rt	        |      }||f}n|}t        j                  | j                   j                  dddd      j                         || j                  dd      d	d	d	d	d	|d   d	|d   f   j                  d      j                  dd      }|j                  |j                  
      }|j                  |       y	)a0  Apply learned position embeddings to standard 2D batch in-place.

        Interpolates learned 2D position embeddings to match the specified grid size.

        Args:
            x: Input tensor to add position embeddings to [B, H*W, C]
            grid_size: Target grid size as [height, width]
        r$   rw   r   r   ru   FTr   Nr   )r@   ry   r}   rD   r   rz   r   r|   r_   rC   r   r   r   r   add_)r   rr   r  r   r   r   Lr   s           rd   _apply_learned_pos_embedz%NaFlexEmbeds._apply_learned_pos_embed  s    --a2Q<6!ilf&<!^^33AvKN ++	N !t(]]&&q!Q288:!//# MYq\M=IaL=02 3:'!*YYq!_  (***9	~rc   c           
         t        |      }t        |      j                  d      k(  sJ | j                  j                  d   | j
                  j                  d   }}i }t        |      D ]&  \  }}|j                  |g       j                  |       ( dt        j                  dt        dt        dt        j                  ffd}	|j                         D ]  \  }}
|\  }}| j                  rt        ||      x}}n||}} |	| j                  ||      ddd|f   } |	| j
                  ||      ddd|f   }|j                  d	      |j                  d      z   }|j!                  dd	      }t#        j                  d   |j                  d         }ddd|f   j%                  dt        j&                  |
j(                  
      |ddd|f   j+                  t        |
      dd               y)ac  Apply factorized position embeddings to NaFlex batch in-place.

        Uses separate Y and X position embedding tables that are interpolated
        and combined for each sample's grid size.

        Args:
            x: Input tensor to add position embeddings to [B, N, C]
            patch_coord: Patch coordinates [B, N, 2] with (y, x) values
        r   r$   table
new_lengthorig_lengthrf   c                    ||k(  r| j                  j                        S t        j                  | j	                  ddd      j                         |dd      j	                  ddd      j                  j                        S )z
            Resample a 1-D positional-embedding table to specified length
            and return it in (1, L, C) layout, dtype matching x.
            r   r   ru   r$   rP   Fr   r   r   r   r   rz   r   r|   r_   r  r  r  rr   s      rd   	_interp1dzBNaFlexEmbeds._apply_factorized_naflex_pos_embed.<locals>._interp1d@  st    
 [(xxaggx..==aA&,,.#	
 gaArrr01rc   Nru   r   r   )r   r   r   r   ry   r   r   r   r   r   Tensorr]   rk   rD   r   r
  r   r   r   r   r   r   )r   rr   r   r   r   r   r   r   rn   r#  r   target_htarget_wlen_ylen_xpe_ype_xposr   s    `                 rd   "_apply_factorized_naflex_pos_embedz/NaFlexEmbeds._apply_factorized_naflex_pos_embed&  s    8D$%222 ))//2D4D4D4J4J14M =?01 	9EB&&q"-44R8	9	1U\\ 	1s 	1 	1QVQ]Q] 	1 !0 5 5 7 	A}!"Hh++ #Hh 77'uT--uf=a(lKDT--uf=a(lKD ..#dnnQ&77C++a#C!''!*ciil3Ga'kN%%ahh?AxxK ''M(:BC	rc   c                    j                   }j                  \  }|j                  d      dz   }| j                  r.|j                  d      }|j                         }|x}}	||z  x}
}n,|j                  d      \  }}	|	|dddf   z  }
||dddf   z  }dt        j
                  dt        j
                  dt        j
                  dt        j
                  ffd	} || j                  |
|	
      } || j                  ||
      }t	        j                  |      j                  d      }||ddd|d   f   ||ddd|d   f   z   z  y)a  Apply factorized position embeddings to NaFlex batch using grid_sample.

        Uses F.grid_sample for efficient interpolation of separate Y and X position
        embedding tables based on patch coordinates. Based on proposal by https://github.com/stas-sl

        Args:
            x: Input tensor to add position embeddings to [B, N, C]
            patch_coord: Patch coordinates [B, N, 2] with (y, x) values
        r$   r   r   Nr  scale
out_lengthrf   c                    | j                  ddd      j                  d      j                  ddd      j                         }t	        j
                  ddj                        }||d d ddf<   |dz
  |d d ddf<   d|d d ddf<   t        j                  |d|fd      }t        j                  ||d	dd
      }|j                  j                        S )Nr   ru   r$   r   rw   r   Fr   bilinearr   r  )r|   r
  r   r_   r   r   r   rz   r  r  r   r   )	r  r.  r/  per  r  r~   r   rr   s	         rd   r#  zNNaFlexEmbeds._apply_factorized_naflex_pos_embed_grid_sample.<locals>._interp1d  s    q!Q'11!4;;Ar2rJPPRBKK1a9E"E!Q'N"QYE!Q'NE!Q'N==Aq*(=USDr4j\deB55>!rc   )r.  r/  r   r  r  )
r   ry   r   rD   r   r$  r   r   r	  r
  )r   rr   r   r   _r  r  r  r  r  r  r  r#  r*  r)  r   r~   r   s    `              @@rd   ._apply_factorized_naflex_pos_embed_grid_samplez;NaFlexEmbeds._apply_factorized_naflex_pos_embed_grid_samplec  sW    ''1a!!a!(1,''++!+$CxxzH(00K+ (3.Gg (.{{1~$K!F1a4L0G!F1a4L0G	"U\\ 	"%,, 	"ELL 	"]b]i]i 	" ))[Q))[Q\\!F+55a8	T"aK//04Aq+fBU8U3VVVrc   c                    | j                   j                  d   | j                  j                  d   }}|\  }}| j                  rt	        ||      x}}n||}}dt
        j                  dt        dt        dt
        j                  ffd}	 |	| j                   ||      ddd|f   }
 |	| j                  ||      ddd|f   }|
j                  d      |j                  d      z   }|j                  dd      }j                  |       y)	a]  Apply factorized position embeddings to standard 2D batch in-place.

        Uses separate Y and X position embedding tables that are interpolated
        and combined for the specified grid size.

        Args:
            x: Input tensor to add position embeddings to [B, H*W, C]
            grid_size: Target grid size as [height, width]
        r$   r  r  r  rf   c                    ||k(  r| j                  j                        S t        j                  | j	                  ddd      j                         |dd      j	                  ddd      j                  j                        S )Nr   r   ru   r$   rP   Fr   r!  r"  s      rd   r#  z;NaFlexEmbeds._apply_factorized_pos_embed.<locals>._interp1d  sr    [(xxaggx..==aA&,,.#	
 gaArrr01rc   Nru   )r   ry   r   rD   r   r   r$  r]   r
  r   r  )r   rr   r  r   r   r%  r&  r'  r(  r#  r)  r*  r@   r   s    `            rd   _apply_factorized_pos_embedz(NaFlexEmbeds._apply_factorized_pos_embed  s	    ))//2D4D4D4J4J14M&(''(33EE#X5E	1U\\ 	1s 	1 	1QVQ]Q] 	1 ))5&9!YhY,G))5&9!YhY,G NN1%q(99	"**1a0	~rc   c           
         d}|j                   d   }| j                  r5|?t        |j                  dk(  d       t	        || j
                  | j                        \  }}n*t        |j                  dk(  xs |j                  dk(  d       | j                  r}|j                  dk(  rnt        | j                  du d	       | j                  || j                  j                  | j                  j                  t        |j                   d
d       d      }nC|j                  d
      }| j                  | j                  |      }| j                  |      }nt        |j                  dk(  d       | j                  r|j                   dd \  }}| j
                  d   || j
                  d   z  z
  | j
                  d   z  }| j
                  d   || j
                  d   z  z
  | j
                  d   z  }t        j                   |d|d|f      }| j                  |      }|j                   dd }| j                  r!|j                  d
      j#                  dd
      }| j%                  |      }| j&                  dk(  rJ|| j)                  ||       n| j*                  r| j-                  ||       n| j/                  ||       no| j&                  dk(  rJ|| j1                  ||       nJ| j*                  r| j3                  ||       n*| j5                  ||       n| j&                  dk(  rJ d       g }	| j6                  ,|	j9                  | j6                  j;                  |dd             | j<                  ,|	j9                  | j<                  j;                  |dd             |	rt?        j@                  |	|gz   d      }| jC                  |      }| jE                  |      }|S )a6  Forward pass for patch embedding with position encoding.

        Args:
            x: Input tensor. Supported formats:
                - [B, C, H, W] for conv mode
                - [B, N, P*P*C] for pre-patchified linear mode (normal)
                - [B, N, Ph, Pw, C] for pre-patchified linear mode (variable patch size)
            patch_coord: Optional patch coordinates [B, N, 2] for NaFlex mode.

        Returns:
            Embedded tensor with position encoding and class/register tokens.
            Shape: [B, num_prefix_tokens + N, embed_dim]
        Nr   rv   z-Expecting 2D image input with input ndim == 4)rs   rx   rw   z/Expecting patchified input with ndim == 3 or 5.z,input norm not supported with patch resizingru   T)r+   r   zConvolutional input must be 4Dr$   r?   )r  r   r   r   zROPE not yet implementedr   r   )#ry   r   r   ndimr   r+   rF   rX   r   r   r   weightr   r   r   rz   rs   r   r   r   r  rE   r  r   r7  r4  r,  r   r   r   r   r   catr   r   )
r   rr   r   r  r~   r   r   r   r   to_cats
             rd   forwardzNaFlexEmbeds.forward  sl   $ *.	GGAJ>>"!%TU-adFZFZ[9 !2qvv{4ef --!&&A+4/1_` ++II$$IINN$QWWQq\2" ,  IIaL??.*AIIaLAFFaK!AB##wwrs|1+a$//!2D.DDXYHZZ+a$//!2D.DDXYHZZEE!a512		!AI||IIaL**1a0 IIaL)+$--a9-E 11DDQT_D`888T  L0$00i0H 11GGWbGc;;A;;W  F*4445 >>%MM$..//2r:;>>%MM$..//2r:;		&A3,A.A MM!OOArc   )r*   rw   r,   NTTr   FNr?   )   r@  rB   FFNNNr5   r5   FTN)rY   rZ   r[   r\   r   r]   r   r
   ra   r`   r   r   Moduler_   r   r   r   r   r   r   r   r$  r   r  r	   r  r,  r4  r7  r?  __classcell__r   s   @rd   r   r      s>   'V 79 '+" $$)FJ&=E)2,1.3:>FJ48#%%'.3+X,c5c?23X, X, 	X,
  }X, X, X, X, "X, 'uS%S/-A'BCX, X, "*%S/!:X, $'X, &*X, (,X,  'tBII7!X," #4$ryy/)B#BC#X,$ !bii1%X,& !'X,( #)X,* (,+X,, 
-X,t	GS#X 	G#D #E#uS#X:N4O #X%S/ XeCHo X"6||6 6 
	6p,H||,H ,H 
	,H\!||! Cy! 
	!F;||; ; 
	;z-W||-W -W 
	-W^(||( Cy( 
	(Z 37f||f "%,,/f 
	frc   r   patch_validr   	symmetricq_lenr   c                     | y| j                         } | j                  \  }}|}|dkD  rA| j                  ||ft        j                         }t        j                  || gd      } ||z  }|r5| j                  d      | j                  d      z  }	|	j                  d      }	n%|xs |}| ddddddf   j                  |d||      }	t        j                  |	|      }
|
j                  |	 t        j                  |      j                         |
S )a3  Creates an attention mask from patch validity information.

    Supports two modes controlled by `symmetric`:
    1. `symmetric=True` (default): Creates a symmetric mask of shape
       [B, 1, seq_len, seq_len]. An attention pair (i, j) is allowed only if
       both token i and token j are valid. Suitable for standard self-attention.
    2. `symmetric=False`: Creates a potentially non-square mask of shape
       [B, 1, q_len, kv_len]. An attention pair (q, k) is allowed only if
       the key/value token k is valid. Query token validity is not checked
       in the mask itself. Useful for cross-attention or specific self-attention
       implementations `q_len` can be specified.

    Used for NaFlex mode to handle variable token counts and padding tokens.

    Args:
        patch_valid: Tensor of shape [B, N] with True for valid patches, False for padding.
        num_prefix_tokens: Number of prefix tokens (class token, register tokens)
            to prepend, which are always considered valid.
        symmetric: If True, create a symmetric mask.
            If False, create an expanded mask based only on key/value validity.
        q_len: Query sequence length override. Only used when `symmetric` is False.
            Defaults to the key/value sequence length (`kv_len`) if None.
        dtype: Dtype of the output attention mask (e.g., torch.float32).

    Returns:
        Attention mask tensor. Additive mask (-inf for masked, 0 for unmasked).
        Shape is [B, 1, seq_len, seq_len] if symmetric=True,
        or [B, 1, q_len, kv_len] if symmetric=False.
    Nr   r   r$   r   r   )r`   ry   new_onesr   r=  r
  r   
zeros_likemasked_fill_finfor   )rF  r   rG  rH  r   r~   r  kv_lenprefix_valid	mask_bool
mask_floats              rd   create_attention_maskrR  %  s   J ""$KDAqF 1"++Q0A,B%**+Uii{ ;C##))"-0E0Ea0HH	''*	 4q 0188AufM	 !!)59JYJE(:(>(>?rc   	pool_typereduce_include_prefixc                    ||dvrt        | |||      } | S |dkD  rF|r9|j                  | j                  d   |      }t        j                  ||gd      }n| dd|df   } |j                  | j                        }|dk(  rN| |j                  d      z  j                  d      }|j                  dd	
      j                  d      }||z  }	|	S |dk(  r| |j                  d      z  j                  d      }|j                  dd	
      j                  d      }||z  }
| j                         }t        j                  |j                        j                  || <   |j                  d      }d|
|z   z  S |dk(  rO| j                         }t        j                  |j                        j                  || <   |j                  d      S J )aP  Global pooling with NaFlex support for masked tokens.

    Applies global pooling while respecting patch validity masks to exclude
    padding tokens from pooling operations.

    Args:
        x: Input tensor with shape [B, N, C]
        patch_valid: Optional validity mask for patches [B, N-num_prefix_tokens]
        pool_type: Type of pooling ('token', 'avg', 'avgmax', 'max')
        num_prefix_tokens: Number of prefix tokens (class/register)
        reduce_include_prefix: Whether to include prefix tokens in pooling reduction

    Returns:
        Pooled tensor with shape [B, C]
    N)avgavgmaxr   rS  r   rT  r   r$   r   rV  r   T)r   keepdim)r   rW  g      ?r   )r&   rJ  ry   r   r=  r   r   r
  sumclampclonerM  r   r   )rr   rF  rS  r   rT  rO  patch_valid_floatmasked_sumsvalid_countspooled
masked_avgmasked_x
masked_maxs                rd   global_pool_naflexrd  j  s   . i/GG/"7	
  1  '//
<MNL))\;$?QGK !&''(A#qww/E,66r::??A?F(,,D,AGGAGN|+	h	,66r::??A?F(,,D,AGGAGN </
 779!&X^^!<!@!@+]]q])
 j:-..	e	779!&X^^!<!@!@+}}}##urc   c                   X    e Zd ZdZ	 	 	 	 d)dee   dededeeeeeef   f      ddf
 fdZ	d*d	Z
d+d
eddfdZej                  j                         d+dededdfd       Zej                  j                  defd       Zej                  j                  d,dedefd       Zej                  j                  d-deddfd       Zej                  j                  dej0                  fd       Zd.dedee   ddfdZ	 	 	 	 	 	 	 	 	 	 d/deej6                  eeej6                  f   f   deeeee   f      dededededededeej6                     d eej6                     d!eej6                     deeej6                     eej6                  eej6                     f   eeef   f   fd"Z	 	 	 d0dej6                  deej6                     d eej6                     d!eej6                     dej6                  f
d#Z	 	 d1dej6                  d$ee   d eej6                     dej6                  fd%Z 	 	 d2dej6                  d&ed eej6                     dej6                  fd'Z!	 	 d1deej6                  eeej6                  f   f   deej6                     d eej6                     dej6                  fd(Z" xZ#S )3r(   az  NaFlexVit: Vision Transformer with NaFlex support for flexible input handling.

    A flexible implementation of Vision Transformer that supports:
    - Standard image classification with various pooling strategies
    - NaFlex functionality for variable aspect ratios and resolutions
    - Linear patch embedding for pre-patchified inputs
    - Multiple position embedding strategies (learned, factorized, rope)
    - Comprehensive attention masking for efficient batch processing
    - Encapsulated embedding and position encoding in FlexEmbeds module
    - Compatible with standard ViT checkpoints through checkpoint filtering
    Nre   r   num_classesr   rf   c                    t         |           |xs
 t               }|rt        |fi |}|j                  dv sJ |j
                  s|j                  dk7  sJ |j                  dv sJ t        |j                        xs t        }t        |j                        }t        |j                        xs t        j                  }|j                  xs t         }	|j"                  xs t$        }
|| _        |j                  | _        |j(                  x| _        x| _        | _        |j
                  rdnd| _        | xj.                  |j0                  z  c_        |j0                  | _        |j
                  | _        |j6                  | _        d| _        t;        d#i d|j<                  d|d	|j(                  d
|j>                  d|j@                   d|j
                  d|j0                  d|d|jB                  d|j                  d|jD                  d|jF                  d|jH                  d|jJ                  d|d|jL                  d|jN                  dtQ        |dd      | _)        |j@                  r ||j(                        nt        jT                         | _+        tY        jZ                  d|j\                  |j^                        D cg c]  }|ja                          }}t        jb                  te        |j^                        D cg c]t  } |	|j(                  |jf                  |jh                  |jj                  |jl                  |jn                  |jp                  |jr                  |jt                  ||   |||
      v c} | _;        | jR                  jy                  d      }te        |j^                        D cg c]  }t{        d| |j(                  |       c}| _>        |j~                  r|j                  s ||j(                        nt        jT                         | _A        |j                  dk(  r4t        | j(                  |jf                  |jh                  ||      | _C        nd | _C        |j                  }||j                  d!k(  }|j~                  r|r ||j(                        nt        jT                         | _@        t        j                  |j                        | _F        |dkD  r t        j                  | j(                  |      nt        jT                         | _H        |j                  d"k7  r| j                  |j                         |j                  r| j                          y y c c}w c c}w c c}w )$a  Initialize NaFlexVit model.

        Args:
            cfg: Model configuration. If None, uses default NaFlexVitCfg.
            in_chans: Number of input image channels.
            num_classes: Number of classification classes.
            img_size: Input image size (for backwards compatibility with classic vit).
            **kwargs: Additional config parameters to override cfg values.
        rM   rV  rW  r   tokenrJ   ri  )rM   r   r?   r   r$   r   Fr+   r   r-   r   r4   r=   r>   r   rF   r@   rA   rC   rD   rE   r   r9   r:   rX   )r   r0   r1   r2   r3   r4   r7   	proj_drop	attn_drop	drop_pathrT   rU   rW   T)r   zblocks.)moduler   r   rJ   )r0   r1   rT   rU   NrV  skiprb   )Mr   r   r'   rq   rK   r=   r@   r   rT   r   rS   r   rU   r   GELUrV   r%   rW   r   rf  r-   num_featureshead_hidden_sizer   r>   r   r   rL   grad_checkpointingr   r+   rQ   rG   rF   rA   rC   rD   rE   r9   r:   getattrembedsr   norm_prer   linspacer<   r/   r   
Sequentialranger0   r1   r2   r3   r4   r7   r;   r6   blocksr   r   r   rH   rI   r   r   	attn_poolr   r8   	head_dropr   headrN   init_weightsrO   fix_init_weight)r   re   r   rf  r   rl   rT   rS   rU   rV   rW   rr   dpripatch_reductionrI   r   s                   rd   r   zNaFlexVit.__init__  s   " 	 #\^!#00C "NNNN#//W"<<<}} EEEE $CNN3@y
)#*>*>?!#--0;BGG	<<(5MM(S	 '??EH]]RRD1DN&)oo1#..0!nn"#&#:#: "' # 
~~

 mm
 ))	

 ,,&
 
 ~~
 &
  //
 mm
 !$ 7 7
 #&";";
 %($?$?
 '*&C&C
 -
  ++!
"  //#
$ '.c3NPU&V%
( 69\\
3==1r{{} "'33E3Esyy!QRAqvvxRRmm  399%!&'   MM------OO,,,,a&%##&' (& ++0040@ 399%
 '!s}}X

 25Js}}-Y[YdYdYf	 ??e#0----%#DN "DN ++?oo.G47NNwz#--0TVT_T_TaCMM2>IAoBIIdnnk:SUS^S^S`	??f$coo.<<  " e S&'(
s   U:A9U?"Vc                 F   dt         j                  dt        ddfd}t        | j                        D ]m  \  }} ||j
                  j                  j                  j                  |dz           ||j                  j                  j                  j                  |dz          o y)z8Apply initialization weight fix with layer-wise scaling.param	_layer_idrf   Nc                 R    | j                  t        j                  d|z               y )Ng       @)div_r   sqrt)r  r  s     rd   rescalez*NaFlexVit.fix_init_weight.<locals>.rescale?  s    JJtyyy12rc   r$   )r   r$  r]   r   ry  attnr   r<  datamlpfc2)r   r  layer_idlayers       rd   r~  zNaFlexVit.fix_init_weight=  s    	35<< 	3C 	3D 	3  )5 	=OHeEJJOO**//A>EIIMM((--x!|<	=rc   r   c                     |dv sJ d|v r t        j                  | j                         nd}t        t	        ||      |        y)zInitialize model weights according to specified scheme.

        Args:
            mode: Initialization mode ('jax', 'jax_nlhb', 'moco', or '')
        )jaxjax_nlhbmocorM   nlhbr5   N)r   logrf  r#   get_init_weights_vit)r   r   	head_biass      rd   r}  zNaFlexVit.init_weightsF  sD     666639T>TXXd..//r	(y94@rc   checkpoint_pathprefixc                 2    ddl m dfd	} || ||       y )Nr$   )_load_weightsc                    t        j                  |d      }t        |t              r	d|v r|d   }t	        |j                               D ]  }|j                  d      r|j                  |      |d|z   <   ,|j                  d      r|j                  |      |d|z   <   U|j                  d      r|j                  |      |d|z   <   ~|j                  d      s|j                  |      |d|d	d
 z   <     | ||      S )z8Adapter function to handle the different model structurecpu)map_location
state_dictr   embeds.r   r@   patch_embedr.   N)r   load
isinstancer   listrj   
startswithpop)modelr  r  r  rn   _orig_load_weightss        rd   _load_weights_adapterz8NaFlexVit.load_pretrained.<locals>._load_weights_adapterU  s    O%HJ*d+
0J'5
 *//+, G<<,0:q0AJy1}-\\+.0:q0AJy1}-\\+.0:q0AJy1}-\\-05?^^A5FJy1RS612G &eZ@@rc   rM   )vision_transformerr  )r   r  r  r  r  s       @rd   load_pretrainedzNaFlexVit.load_pretrainedP  s     	L	A& 	dOV<rc   c                     h d}|S )zGet set of parameter names that should not have weight decay applied.

        Returns:
            Set of parameter names to skip during weight decay
        >   embeds.cls_tokenembeds.pos_embedembeds.reg_tokenrb   )r   	skip_lists     rd   no_weight_decayzNaFlexVit.no_weight_decayj  s     Q	rc   coarsec                      t        dddg      S )zGet parameter group matcher for optimizer parameter grouping.

        Args:
            coarse: Whether to use coarse-grained grouping

        Returns:
            Dictionary mapping group names to regex patterns
        z^embeds)z^blocks\.(\d+)N)z^norm)i )stemry  )r   )r   r  s     rd   group_matcherzNaFlexVit.group_matchert  s     -/CD
 	
rc   enablec                     || _         t        | j                  d      rGt        | j                  j                  d      r&| j                  j                  j	                  |       yyy)zEnable or disable gradient checkpointing for memory efficiency.

        Args:
            enable: Whether to enable gradient checkpointing
        r  set_grad_checkpointingN)rr  hasattrrt  r  r  )r   r  s     rd   r  z NaFlexVit.set_grad_checkpointing  sO     #)4;;.74;;;R;RTl3mKK##::6B 4n.rc   c                     | j                   S )zeGet the classification head module.

        Returns:
            Classification head module
        )r|  )r   s    rd   get_classifierzNaFlexVit.get_classifier  s     yyrc   rK   c                    || _         |=|dv sJ |dk(  r| j                  J d       |dk7  r| j                  d| _        || _        |dkD  r&t        j                  | j
                  |      | _        yt        j                         | _        y)zReset the classification head with new number of classes and pooling.

        Args:
            num_classes: Number of classes for new classification head
            global_pool: Optional new global pooling type
        Nrh  rJ   z=Cannot currently add attention pooling in reset_classifier().r   )rf  rz  rK   r   r   r-   r   r|  )r   rf  rK   s      rd   reset_classifierzNaFlexVit.reset_classifier  s     '""NNNNe#(>]]]u%$..*D!%*D>IAoBIIdnnk:	SUS^S^S`	rc   rr   indicesreturn_prefix_tokensr   
stop_early
output_fmtintermediates_onlyoutput_dictr   rF  	attn_maskc           	         |dv sJ d       |dk(  }g }t        t        | j                        |      \  }}t        |t              r|d   }	|d   }
|d   }J d       |}|j
                  dd	 \  }}| j                  j                  ||f      \  }}|#|
!t        |
| j                  |j                        }| j                  ||	
      }| j                  |      }t        j                  j                         s|s| j                  }n| j                  d	|dz    }t        |      D ]z  \  }}| |||      }n?| j                   r+t        j                  j                         st#        ||      }n ||      }||v sW|j%                  |r| j'                  |      n|       | | j                  rE|D cg c]  }|d	d	d| j                  f    }}|D cg c]  }|d	d	| j                  d	f    }}nd	}|rN|D cg c]C  }|j)                  |j
                  d   ||d      j+                  dddd      j-                         E }}|r*i }||d<   ||r||d<   |s| j'                  |      }||d<   |S t        j                  j                         s|r|t/        t1        ||            }|r|S | j'                  |      }||fS c c}w c c}w c c}w )an   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            return_prefix_tokens: Return both prefix and spatial intermediate tokens
            norm: Apply norm layer to all intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
            output_dict: Return outputs as a dictionary with 'image_features' and 'image_intermediates' keys
            patch_coord: Optional patch coordinates [B, N, 2] for NaFlex mode
            patch_valid: Optional patch type indicators (1=patch, 0=padding) for NaFlex
            attn_mask: Optional attention mask for masked attention
        Returns:
            A tuple with (final_features, intermediates), a list of intermediate features, or a dictionary containing
            'image_features' and 'image_intermediates' (and optionally 'image_intermediates_prefix')
        )NCHWNLCz)Output format must be one of NCHW or NLC.r  r   rF  r   zWIP, patch mode needs more workr9  Nr:  r$   r  r   r   rw   ru   image_intermediatesimage_intermediates_prefiximage_features)r   r   ry  r  r   ry   rt  r   rR  r   r   ru  r   jitis_scriptingr   rr  r!   r   r   r}   r|   
contiguousr  r   )r   rr   r  r  r   r  r  r  r  r   rF  r  r}   intermediatestake_indices	max_indexr   heightwidthr   r   ry  r  blkyprefix_tokensresult_dictx_finals                               rd   forward_intermediateszNaFlexVit.forward_intermediates  s   F _,Y.YY,&"6s4;;7G"QiaM*KM*K	lG;;;5GGGBCLMFE;;00&%ADAq !8-k4;Q;QSZS`S`aI KK[K9MM! 99!!#:[[F[[)a-0F' 		BFAs$Y/((1G1G1IsA&FL $$TTYYq\qA		B !!ERSQq!D$:$:"::;SMSDQRqQq$"8"8"99:RMR M ' 		!''!*aB/771aCNNPM  K1>K-.(-A<I89 &))A,07,- yy%%',@]E^ ]M!BCM  IIaL-K TRs   /J>K8AKc                    |"t        || j                  |j                        }| j                  ||      }| j	                  |      }|| j
                  D ]  } |||      } nR| j                  r5t        j                  j                         st        | j
                  |      }n| j                  |      }| j                  |      }|S )Nr   r   r:  r  )rR  r   r   rt  ru  ry  rr  r   r  r  r"   r   )r   rr   r   rF  r  r  s         rd   forward_featureszNaFlexVit.forward_features  s     -"&"8"8ggI KK{K3MM! {{ 0Y/0$$UYY-C-C-Et{{A.AAAIIaLrc   rS  c                 V   | j                   ht        || j                  r| j                  nddd|j                        }| j                  s|d d | j                  d f   }| j                  ||      }|S || j
                  n|}t        |||| j                  | j                        }|S )Nr   Fr$   )r   rG  rH  r   r  rX  )rz  rR  rL   r   r   rK   rd  )r   rr   rS  rF  r  s        rd   _poolzNaFlexVit._pool<  s     >>%-<@<T<T$"8"8Z[ggI ++a//001qI6AH(1(9D$$y	"44"&":":
 rc   
pre_logitsc                     | j                  ||      }| j                  |      }| j                  |      }|r|S | j                  |      S )NrF  )r  rI   r{  r|  )r   rr   r  rF  s       rd   forward_headzNaFlexVit.forward_headZ  sF     JJqkJ2LLONN1q0DIIaL0rc   c                     t        |t              r|d   }|d   }|d   }n|}t        || j                  |j                        }| j                  ||||      }| j                  ||      }|S )a6  Forward pass with optional NaFlex support.

        Args:
            x: Input tensor. Supported formats:
                - [B, C, H, W] standard image input
                - [B, N, P*P*C] pre-patchified tensor (flattened patches)
                - [B, N, Ph, Pw, C] pre-patchified tensor (variable patch size)
                - Dict from NaFlex collator
            patch_coord: Optional patch coordinates [B, N, 2] for NaFlex mode.
            patch_valid: Optional patch validity indicators for NaFlex.

        Returns:
            Model output tensor.
        r   rF  r   r  )r   rF  r  r  )r  r   rR  r   r   r  r  )r   rr   r   rF  r   r  s         rd   r?  zNaFlexVit.forwarde  s    ( aM*KM*K	lG G *"44--
	 !!##	 " 
 #  
 rc   )Nrw     N)rf   Nr  FrA  rB  )
NFFFr  FFNNN)NNN)NN)FN)$rY   rZ   r[   r\   r
   r'   r]   r   r   r   r~  ra   r}  r   r  ignorer  r   r  r`   r   r  r  r   rC  r  r  r$  r	   r   r  r  r  r  r?  rD  rE  s   @rd   r(   r(     s   
 +/#>B{#,'{# {# 	{#
 uS%S/%9:;{# 
{#z=A Ad A YY=s =C = = =2 YY   YY
D 
T 
 
 YYCT CT C C YY		  aC ahsm aW[ a( 8<).$$', %262604s U\\4U\\(9#::;s  eCcN34s  #'	s 
 s  s  s  !%s  s  "%,,/s  "%,,/s   -s  
tELL!5tELL7I)I#JDQTVYQYNZ	[s p 372604|| "%,,/ "%,,/	
  - 
D (,26	||  } "%,,/	
 
B  %26		1||	1 	1 "%,,/		1
 
	1 3726	:U\\4U\\(9#::;: "%,,/: "%,,/	:
 
:rc   r   r  c                 H    ddl m}m}m} d| v rt	        ||      S d| v r|S |S )zFFunction imported from vision_transformer.py to maintain compatibilityr$   )init_weights_vit_jaxinit_weights_vit_mocoinit_weights_vit_timmr  )r  r  )r  r  r  r  r   )r   r  r  r  r  s        rd   r  r    s0    ff}+yAA	4$$$$rc   r  r  c                    ddl m} i }| j                         D ]Y  \  }}|dk(  rt        |j                  d      r|j
                  dk(  rd}d}d| v r| d   j                  d   }d| v r| d   j                  d   }||z   }|j                  d   }	|	|z
  }
t        j                  |
      }t        j                  |	      }||k7  r|j                         ro|j                         s_|
}	|ddd|f   }|j                         r| dxx   |z  cc<   |dd||f   }|j                         r| dxx   |z  cc<   |dd|df   }|}t        |      }||z  |	k(  r"|j                  d|||j                  d	         }nt        |j                  j                  d
      rl|j                  j                  j                  \  }}||z  |	k(  r"|j                  d|||j                  d	         }nt        j!                  d|	 d||z   d       ||d<   |dk(  r||d<   |dk(  r||d<   |j#                  d      r9|dd }|dk(  r#|j%                  dd	dd      j'                  d      }d|z   }|||<   U|||<   \ |S )zZHandle state dict conversion from original ViT to the new version with combined embedding.r$   )checkpoint_filter_fnr@   rw   r   r   r   Nru   r  z-Position embedding size mismatch: checkpoint=z, model=z?. Using default initialization and will resize in forward pass.r  r  r  zpatch_embed.r.   zproj.weightr  )r  r  rk   r  rt  r;  ry   r   r  
is_integernumelr]   r}   r  r  _loggerwarningr  r|   r   )r  r  orig_filter_fnout_dictrn   ro   num_cls_tokennum_reg_tokenr   num_patchesnum_patches_no_prefixgrid_size_no_prefixr  cls_token_embreg_token_embr   r   suffixnew_keys                      rd   r  r    s   J H  " A1u||[1affk ! !*,$.{$;$A$A!$DM*,$.{$;$A$A!$DM$1M$A!  ggaj(36G(G%&*ii0E&F# IIk2	'940;;=iFZFZF\ #8K$%a=&8$9M$**,";/=@/$%a})D&D$EM$**,";/=@/!.//0A 3I	N	 y(K7		!Y	1771:FA u||77E$||77AA1q5K/ !		!Q1771: >A $OO"OP[}\dfgjkfkdm n` !a ,-H'(++,H'(++,H'(\\.)rsVF&IIaAq)11!4&(G !HWHQKCAF Orc   urlc                 2    | ddd ddt         t        dddd|S )	Nr  )rw     r  g      ?rB   zembeds.projr|  z
apache-2.0)r  rf  
input_size	pool_sizecrop_pctr   meanstd
first_conv
classifierlicenser   )r  rl   s     rd   _cfgr    s7    #"'%#  rc   ztimm/)	hf_hub_id)z)naflexvit_base_patch16_gap.e300_s576_in1kz-naflexvit_base_patch16_par_gap.e300_s576_in1kz0naflexvit_base_patch16_parfac_gap.e300_s576_in1kz$naflexvit_base_patch16_map.untrainedz'naflexvit_base_patch16_siglip.untrainedz)naflexvit_so400m_patch16_siglip.untrainedvariant
pretrainedc           	      ~   |j                  dd      }|j                  dt                     }t        t              D ch c]  }|j                   }}t	        |      D ci c]  }||v s||j                  |       }}|rt        |fi |}t        t        | |ft        |t        |d      d|}	|	S c c}w c c}w )Nout_indicesrw   re   getter)r  feature_cls)pretrained_filter_fnre   feature_cfg)
r  r'   r   namer  rq   r   r(   r  r   )
r	  r
  rl   r  re   fcfg_field_namesrn   cfg_updatesr  s
             rd   _create_naflexvitr    s    **]A.K
**ULN
+C'-l';<!qvv<O<-1&\RQ/=Q1fjjm#RKRc1[1 7J1[hG	
 E L =Rs   B5 	B:*B:c                     |j                  dd       |j                  dd       |j                  dd      }|j                  dd      }||dk(  rd}d|j                  d	d      ||d
|}t        | |fi |S )a  Create FlexVit model from classic VisionTransformer configuration.

    This function handles the parameter mapping and configuration logic needed
    to create FlexVit models that are compatible with classic VisionTransformer
    configurations and pretrained weights.

    Args:
        variant: Model variant name
        pretrained: Whether to load pretrained weights
        **kwargs: Classic VisionTransformer parameters

    Returns:
        FlexVit model instance
    no_embed_classNdynamic_img_sizerK   ri  rI   rV  Tr=   )rA   r=   rK   rI   )r  getr  )r	  r
  rl   gprI   flex_kwargss         rd   _create_naflexvit_from_classicr  /  s    ( JJ&
JJ!4( 
M7	+BjjD)G2;  $zz-6	
 K Wj@K@@rc   c           
      H    t        dddddddd      }t        d
| |d	|}|S )zCViT-Base with NaFlex functionality and global average pooling.
    r*   r,   r.   h㈵>rV  rv   T)r+   r-   r/   r0   r7   rK   r>   rI   r
  re   )naflexvit_base_patch16_gapr'   r  r
  rl   re   r  s       rd   r   r   X  sC     	C ezWZe^deELrc   c                 J    t        ddddddddd	      }t        d
| |d	|}|S )z]ViT-Base with NaFlex functionality, aspect preserving pos embed, global average pooling.
    r*   r,   r.   r  TrV  rv   )	r+   r-   r/   r0   r7   rD   rK   r>   rI   r  )naflexvit_base_patch16_par_gapr!  r"  s       rd   r$  r$  j  sF      $
C i:[^ibhiELrc   c                 L    t        dddddddddd	
      }t        d| |d
|}|S )zjViT-Base with NaFlex functionality, aspect preserving & factorized pos embed, global average pooling.
    r*   r,   r.   r  Tr   rV  rv   )
r+   r-   r/   r0   r7   rD   r@   rK   r>   rI   r  )!naflexvit_base_patch16_parfac_gapr!  r"  s       rd   r&  r&  }  sI      $C lj^aleklELrc   c           	      F    t        ddddddd      }t        d	| |d|}|S )
zBViT-Base with NaFlex functionality and MAP attention pooling.
    r*   r,   r.   r  rJ   r$   )r+   r-   r/   r0   r7   rK   r>   r  )naflexvit_base_patch16_mapr!  r"  s       rd   r(  r(    s@     C ezWZe^deELrc   c                 L    t        ddddddddd	d

      }t        d| |d|}|S )/  ViT-SO150M2 with NaFlex functionality for variable aspect ratios and resolutions.

    This model supports:
    1. Variable aspect ratios and resolutions via patch coordinates
    2. Position embedding interpolation for arbitrary grid sizes
    3. Explicit patch coordinates and valid token masking
    r*   @        NN@r  Fr$   rV  T)
r+   r-   r/   r0   r1   r7   r2   r>   rK   rI   r  )"naflexvit_so150m2_patch16_reg1_gapr!  r"  s       rd   r/  r/    sI     C mz_bmflmELrc   c                 J    t        ddddddddd	
	      }t        d| |d|}|S )r*  r*   r+  r,  r-  r.  r  Fr$   rJ   )	r+   r-   r/   r0   r1   r7   r2   r>   rK   r  )"naflexvit_so150m2_patch16_reg1_mapr!  r"  s       rd   r1  r1    sF     
C mz_bmflmELrc   c                 D    t        dddddd      }t        d| |d|}|S )	zGViT-Base with NaFlex functionality and SigLIP-style configuration.
    r*   r,   r.   	gelu_tanhrJ   )r+   r-   r/   r0   rU   rK   r  )naflexvit_base_patch16_siglipr!  r"  s       rd   r4  r4    s=     C h*Z]haghELrc   c           	      F    t        ddddddd      }t        d	| |d|}|S )
zUViT-SO400M with NaFlex functionality for variable aspect ratios and resolutions.
    r*   i     gZӼ@r3  rJ   )r+   r-   r/   r0   r1   rU   rK   r  )naflexvit_so400m_patch16_siglipr!  r"  s       rd   r7  r7    s@     C jJ\_jcijELrc   rA  )Nri  r$   F)r  r5   r  r  )Yr\   loggingr   dataclassesr   r   r   	functoolsr   typingr   r   r	   r
   r   r   r   r   r   r   torch.nnr   torch.nn.functional
functionalrz   torch.nn.utils.rnnr   	timm.datar   r   r   r   r   r   r   r   r   r   timm.models._builderr   timm.models._featuresr   timm.models._features_fxr   r   timm.models._registryr   r    timm.models._manipulater!   r"   r#   r  r%   r&   __all__	getLoggerrY   r  r'   rq   r$  r]   r`   r   r   rC  r   r  r   rR  ra   rd  r(   r_   r  r  r  default_cfgsr  r  r   r$  r&  r(  r/  r1  r4  r7  rb   rc   rd   <module>rI     s2  $   2 2  O O O     + E   6 6 W G K K 6;
' '

H
% A, A, A,H	 	L 	 <<#s(O  5<<sCx()	@J J s	299 s	 s	l  "##"]]A\\AA A }	A
 {{A ellA AH  /3 !"&+E<<Eell+E E 	E
  $E \\E EPl		 l^	%s 	%u 	%x 	%IT#s(^ II I$sTWx. IXc T#s(^ " %152 6:6 9=9 -1F/3v15& "s  9 * !&A&A&A 	&AR 4 i  " t )  $ $ Y  & 4 i    4 i  0 4 i  . d     9  rc   