
    khv                     B   d Z ddlmZ ddlmZmZmZmZ ddlZddl	m
Z
 ddlm
c mZ ddlmZ ddlmZmZ ddlmZmZmZmZmZmZmZ ddlmZmZ d	d
lmZ d	dlmZ d	dl m!Z! d	dl"m#Z#m$Z$ d	dl%m&Z&m'Z' dgZ( G d de
jR                        Z* G d de
jR                        Z+ G d de
jR                        Z, G d de
jR                        Z- G d de
jR                        Z. G d de
jR                        Z/dedee0e0f   fdZ1e!dedee0e0f   d e0d!e0fd"       Z2 G d# d$e
jR                        Z3 G d% d&e
jR                        Z4 G d' d(e
jR                        Z5 G d) de
jR                        Z6d>d*Z7d+ Z8d?d,Z9d@d-Z: e& e:d./       e:d./       e:d./       e:        e:        e:        e:d0dd12       e:d3dd12      d4      Z;e'd?d5e6fd6       Z<e'd?d5e6fd7       Z=e'd?d5e6fd8       Z>e'd?d5e6fd9       Z?e'd?d5e6fd:       Z@e'd?d5e6fd;       ZAe'd?d5e6fd<       ZBe'd?d5e6fd=       ZCy)Aaf   DaViT: Dual Attention Vision Transformers

As described in https://arxiv.org/abs/2204.03645

Input size invariant transformer architecture that combines channel and spacial
attention in each block. The attention mechanisms used are linear in complexity.

DaViT model defs and weights adapted from https://github.com/dingmyu/davit, original copyright below

    )partial)ListOptionalTupleUnionN)TensorIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)DropPath	to_2tupletrunc_normal_MlpLayerNorm2dget_norm_layeruse_fused_attn)NormMlpClassifierHeadClassifierHead   )build_model_with_cfg)feature_take_indices)register_notrace_function)
checkpointcheckpoint_seq)generate_default_cfgsregister_modelDaVitc                   :     e Zd Zddededef fdZdefdZ xZS )
ConvPosEncdimkactc                     t         t        |           t        j                  |||d|dz  |      | _        |rt        j                         | _        y t        j                         | _        y )Nr      )kernel_sizestridepaddinggroups)	superr   __init__nnConv2dprojGELUIdentityr"   )selfr    r!   r"   	__class__s       M/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/models/davit.pyr*   zConvPosEnc.__init__#   sS    j$(*IIF
	 !$2779    xc                 P    | j                  |      }|| j                  |      z   }|S N)r-   r"   )r0   r4   feats      r2   forwardzConvPosEnc.forward0   s&    yy|r3   )   F)	__name__
__module____qualname__intboolr*   r   r8   __classcell__r1   s   @r2   r   r   "   s)    7C 7C 7$ 7 r3   r   c                   8     e Zd ZdZdddef fd	ZdefdZ xZS )Stemz Size-agnostic implementation of 2D image to patch embedding,
        allowing input size to be adjusted during model forward operation
    r9   `      c                     t         |           t        |      }|| _        || _        || _        |d   dk(  sJ t        j                  ||d|d      | _         ||      | _	        y )Nr   rD      r9   r%   r&   r'   )
r)   r*   r   r&   in_chsout_chsr+   r,   convnorm)r0   rH   rI   r&   
norm_layerr1   s        r2   r*   zStem.__init__;   sl     	6"ayA~~II
	 w'	r3   r4   c                 h   |j                   \  }}}}| j                  d   || j                  d   z  z
  | j                  d   z  }| j                  d   || j                  d   z  z
  | j                  d   z  }t        j                  |d|d|f      }| j	                  |      }| j                  |      }|S )Nr   r   )shaper&   FpadrJ   rK   )r0   r4   BCHWpad_rpad_bs           r2   r8   zStem.forwardQ   s    WW
1aQ!dkk!n"44AFQ!dkk!n"44AFEE!a5)*IIaLIIaLr3   )	r:   r;   r<   __doc__r   r*   r   r8   r?   r@   s   @r2   rB   rB   6   s'     "(, r3   rB   c                   0     e Zd Zdef fd	ZdefdZ xZS )
Downsampler9   c                     t         |           || _        || _         ||      | _        |dz  dk(  | _        t        j                  |||d| j
                  rdn|dz        | _        y )Nr$   r   rG   )	r)   r*   rH   rI   rK   even_kr+   r,   rJ   )r0   rH   rI   r%   rL   r1   s        r2   r*   zDownsample.__init__\   sf     	v&	!Ao*II#A+*:
	r3   r4   c                    |j                   \  }}}}| j                  |      }| j                  rI| j                  j                  \  }}|||z  z
  |z  }|||z  z
  |z  }	t        j                  |d|d|	f      }| j                  |      }|S )Nr   )rN   rK   r[   rJ   r%   rO   rP   )
r0   r4   rQ   rR   rS   rT   k_hk_wrU   rV   s
             r2   r8   zDownsample.forwardq   s    WW
1aIIaL;;yy,,HC1s7]c)E1s7]c)Ea!UQ./AIIaLr3   )r:   r;   r<   r   r*   r   r8   r?   r@   s   @r2   rY   rY   [   s    
 "
*	 	r3   rY   c                   &     e Zd Zd fd	Zd Z xZS )ChannelAttentionV2c                     t         |           || _        ||z  | _        || _        t        j                  ||dz  |      | _        t        j                  ||      | _        y )Nr9   bias)	r)   r*   r(   head_dimdynamic_scaler+   Linearqkvr-   )r0   r    	num_headsqkv_biasre   r1   s        r2   r*   zChannelAttentionV2.__init__   sS    y(*99S#'9IIc3'	r3   c                 :   |j                   \  }}}| j                  |      j                  ||d| j                  || j                  z        j	                  ddddd      }|j                  d      \  }}}| j                  r	||dz  z  }n|| j                  dz  z  }|j                  dd      |z  }	|	j                  d	      }	|	|j                  dd      z  j                  dd      }|j                  dd      j                  |||      }| j                  |      }|S )
Nr9   r$   r   r   rD         r    )rN   rg   reshaper(   permuteunbindre   rd   	transposesoftmaxr-   
r0   r4   rQ   NrR   rg   qr!   vattns
             r2   r8   zChannelAttentionV2.forward   s
   ''1ahhqk!!!Q4;;T[[8HIQQRSUVXY[\^_`**Q-1aAIADMMT))A{{2r"Q&|||#AKKB''222r:KK1%%aA.IIaLr3   )   TT)r:   r;   r<   r*   r8   r?   r@   s   @r2   r`   r`   }   s    (r3   r`   c                   ,     e Zd Zd fd	ZdefdZ xZS )ChannelAttentionc                     t         |           || _        ||z  }|dz  | _        t	        j
                  ||dz  |      | _        t	        j
                  ||      | _        y )Nrk   r9   rb   )r)   r*   rh   scaler+   rf   rg   r-   )r0   r    rh   ri   rd   r1   s        r2   r*   zChannelAttention.__init__   sU    ")#%
99S#'9IIc3'	r3   r4   c                 
   |j                   \  }}}| j                  |      j                  ||d| j                  || j                  z        j	                  ddddd      }|j                  d      \  }}}|| j                  z  }|j                  dd      |z  }	|	j                  d      }	|	|j                  dd      z  j                  dd      }|j                  dd      j                  |||      }| j                  |      }|S )	Nr9   r$   r   r   rD   rl   rm   rn   )
rN   rg   ro   rh   rp   rq   r}   rr   rs   r-   rt   s
             r2   r8   zChannelAttention.forward   s    ''1ahhqk!!!Q4>>1;NOWWXY[\^_abdef**Q-1a

N{{2r"Q&|||#AKKB''222r:KK1%%aA.IIaLr3   )ry   F)r:   r;   r<   r*   r   r8   r?   r@   s   @r2   r{   r{      s    ( r3   r{   c                   d     e Zd Zdddej                  ej
                  dddf fd	ZdefdZ xZ	S )ChannelBlock      @F        Tc                    t         |           t        |d|	      | _        || _         ||      | _        |
rt        nt        } ||||      | _        |dkD  rt        |      nt        j                         | _        t        |d|	      | _        | j                  r\ ||      | _        t        |t!        ||z        |      | _        |dkD  rt        |      | _        y t        j                         | _        y d | _        d | _        d | _        y Nr9   )r    r!   r"   )rh   ri   r   )in_featureshidden_features	act_layer)r)   r*   r   cpe1ffnnorm1r`   r{   rx   r   r+   r/   
drop_path1cpe2norm2r   r=   mlp
drop_path2)r0   r    rh   	mlp_ratiori   	drop_pathr   rL   r   cpe_actv2
attn_layerr1   s               r2   r*   zChannelBlock.__init__   s     	3!9	_
+-'3C

	
 2;R(9-R[[]3!9	88#CDJ #C)O 4#DH
 6?^hy1DODODJDH"DOr3   r4   c                 `   |j                   \  }}}}| j                  |      j                  d      j                  dd      }| j	                  |      }| j                  |      }|| j                  |      z   }| j                  |j                  dd      j                  ||||            }| j                  w|j                  d      j                  dd      }|| j                  | j                  | j                  |                  z   }|j                  dd      j                  ||||      }|S )Nr$   r   )rN   r   flattenrr   r   rx   r   r   viewr   r   r   )r0   r4   rQ   rR   rS   rT   curs          r2   r8   zChannelBlock.forward   s    WW
1aIIaL  #--a3jjmiin$$IIakk!Q',,Q1a8988		!&&q!,ADOODHHTZZ]$;<<AAq!&&q!Q2Ar3   )
r:   r;   r<   r+   r.   	LayerNormr*   r   r8   r?   r@   s   @r2   r   r      s7     gg||&#P r3   r   r4   window_sizec                     | j                   \  }}}}| j                  |||d   z  |d   ||d   z  |d   |      } | j                  dddddd      j                         j                  d|d   |d   |      }|S )z
    Args:
        x: (B, H, W, C)
        window_size (int): window size
    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    r   r   r9   r$   rD      rl   rN   r   rp   
contiguous)r4   r   rQ   rS   rT   rR   windowss          r2   window_partitionr      s     JAq!Q	q!{1~%{1~qKN7JKXYN\]^Aii1aAq)446;;BAP[\]P^`abGNr3   r   rS   rT   c                     | j                   d   }| j                  d||d   z  ||d   z  |d   |d   |      }|j                  dddddd      j                         j                  d|||      }|S )z
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image
    Returns:
        x: (B, H, W, C)
    rl   r   r   r9   r$   rD   r   r   )r   r   rS   rT   rR   r4   s         r2   window_reverser     s     	bARk!n,a;q>.A;q>S^_`SacdeA			!Q1a#..055b!QBAHr3   c                   j     e Zd ZU dZej
                  j                  e   ed<   d fd	Z	de
fdZ xZS )WindowAttentiona   Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.
    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
    
fused_attnc                 4   t         |           || _        || _        || _        ||z  }|dz  | _        t               | _        t        j                  ||dz  |      | _
        t        j                  ||      | _        t        j                  d      | _        y )Nrk   r9   rb   rl   rn   )r)   r*   r    r   rh   r}   r   r   r+   rf   rg   r-   Softmaxrs   )r0   r    r   rh   ri   rd   r1   s         r2   r*   zWindowAttention.__init__  s~    &")#%
(*99S#'9IIc3'	zzb)r3   r4   c                    |j                   \  }}}| j                  |      j                  ||d| j                  || j                  z        j	                  ddddd      }|j                  d      \  }}}| j                  rt        j                  |||      }n:|| j                  z  }||j                  dd      z  }	| j                  |	      }	|	|z  }|j                  dd      j                  |||      }| j                  |      }|S )Nr9   r$   r   r   rD   rm   rl   )rN   rg   ro   rh   rp   rq   r   rO   scaled_dot_product_attentionr}   rr   rs   r-   )
r0   r4   B_ru   rR   rg   rv   r!   rw   rx   s
             r2   r8   zWindowAttention.forward+  s    77Aqhhqk!!"aDNNA<OPXXYZ\]_`bcefg**Q-1a??..q!Q7ADJJAB++D<<%DqAKK1%%b!Q/IIaLr3   T)r:   r;   r<   rW   torchjitFinalr>   __annotations__r*   r   r8   r?   r@   s   @r2   r   r     s.     		%%* r3   r   c                   h     e Zd ZdZddddej
                  ej                  ddf fd	Zdefd	Z	 xZ
S )
SpatialBlocka<   Windows Block.
    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    rF   r   Tr   Fc                 l   t         |           || _        |	| _        || _        t        |      | _        || _        t        |d|
      | _	         ||      | _
        t        || j                  ||      | _        |dkD  rt        |      nt        j                         | _        t        |d|
      | _        | j                  r^ ||      | _        t'        ||z        }t)        |||      | _        |dkD  rt        |      | _        y t        j                         | _        y d | _        d | _        d | _        y r   )r)   r*   r    r   rh   r   r   r   r   r   r   r   rx   r   r+   r/   r   r   r   r=   r   r   r   )r0   r    rh   r   r   ri   r   r   rL   r   r   mlp_hidden_dimr1   s               r2   r*   zSpatialBlock.__init__K  s    	"$[1"3!9	_
#	
	 2;R(9-R[[]3!9	88#CDJ y1N .#DH
 6?^hy1DODODJDH"DOr3   r4   c           	      4   |j                   \  }}}}| j                  |      j                  d      j                  dd      }| j	                  |      }|j                  ||||      }dx}}| j                  d   || j                  d   z  z
  | j                  d   z  }	| j                  d   || j                  d   z  z
  | j                  d   z  }
t        j                  |dd||	||
f      }|j                   \  }}}}t        || j                        }|j                  d| j                  d   | j                  d   z  |      }| j                  |      }|j                  d| j                  d   | j                  d   |      }t        || j                  ||      }|d d d |d |d d f   j                         }|j                  |||z  |      }|| j                  |      z   }| j                  |j                  dd      j                  ||||            }| j                  w|j                  d      j                  dd      }|| j!                  | j                  | j#                  |                  z   }|j                  dd      j                  ||||      }|S )Nr$   r   r   rl   )rN   r   r   rr   r   r   r   rO   rP   r   rx   r   r   r   r   r   r   r   )r0   r4   rQ   rR   rS   rT   shortcutpad_lpad_trU   rV   _HpWp	x_windowsattn_windowss                   r2   r8   zSpatialBlock.forwardx  sy   WW
1a99Q<''*44Q:JJx FF1aA!!!$q4+;+;A+>'>>$BRBRSTBUU!!!$q4+;+;A+>'>>$BRBRSTBUUEE!aE5%78ww2r1$Q(8(89	NN2t'7'7':T=M=Ma=P'PRST	 yy+ $((T-=-=a-@$BRBRSTBUWXY<)9)92rB a!RaRlO&&(FF1a!eQtq))IIakk!Q',,Q1a8988		!&&q!,ADOODHHTZZ]$;<<AAq!&&q!Q2Ar3   )r:   r;   r<   rW   r+   r.   r   r*   r   r8   r?   r@   s   @r2   r   r   >  s<    
  gg||+#Z% %r3   r   c                        e Zd Zddddddddeej
                  ddd	ddf fd
	Zej                  j                  dd       Z
defdZ xZS )
DaVitStager   Tspatialchannelr9   rF   r   )r   r   Fr$   c                    t         |           d| _        |rt        ||||      | _        nt        j                         | _        	 g }t        |      D ]  }ddlm	} g }t        |      D ]a  \  }}|dk(  r)|j                  dt        ||||	|
|   ||||	      f       4|dk(  s:|j                  d	t        ||||	|
|   ||||
	      f       c |r+|j                  t        j                   ||                   |j                  t        j                  |D cg c]  }|d   	 c}         t        j                  | | _        y c c}w )NF)r%   rL   r   )OrderedDictr   spatial_block)	r    rh   r   ri   r   rL   r   r   r   r   channel_block)	r    rh   r   ri   r   rL   r   r   r   r   )r)   r*   grad_checkpointingrY   
downsampler+   r/   rangecollectionsr   	enumerateappendr   r   
Sequentialblocks)r0   rH   rI   depthr   
attn_typesrh   r   r   ri   drop_path_ratesrL   norm_layer_clr   r   down_kernel_sizenamed_blockschannel_attn_v2stage_blocks	block_idxr   dual_attention_blockattn_idx	attn_typebr1   s                            r2   r*   zDaVitStage.__init__  so   ( 	"' (FVcmnDO kkmDO	 u 	ZI/#% '0'< #)	)(//,#"+"+!)"1)"<#0 '$/
C 
1 
 )+(//,#"+"+!)"1)"<#0 '*
C 
1 
2 ##BMM+>R2S$TU##BMMBV3WQAaD3W$XY?	Z@ mm\2 4Xs   Ec                     || _         y r6   )r   )r0   enables     r2   set_grad_checkpointingz!DaVitStage.set_grad_checkpointing  s
    "(r3   r4   c                     | j                  |      }| j                  r6t        j                  j	                         st        | j                  |      }|S | j                  |      }|S r6   )r   r   r   r   is_scriptingr   r   r0   r4   s     r2   r8   zDaVitStage.forward  sS    OOA""599+A+A+Ct{{A.A  AAr3   r   )r:   r;   r<   r   r+   r   r*   r   r   ignorer   r   r8   r?   r@   s   @r2   r   r     sg    
 -"",,!%F3P YY) ) r3   r   c                   .    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Zd Zej                  j                  dd       Z	ej                  j                  dd       Z
ej                  j                  dej                  fd       Zdded	ee   fd
Z	 	 	 	 	 ddej$                  deeeee   f      dededededeeej$                     eej$                  eej$                     f   f   fdZ	 	 	 ddeeee   f   dedefdZd ZddefdZd Z xZS )r   a   DaViT
        A PyTorch implementation of `DaViT: Dual Attention Vision Transformers`  - https://arxiv.org/abs/2204.03645
        Supports arbitrary input sizes and pyramid feature extraction

    Args:
        in_chans (int): Number of input image channels. Default: 3
        num_classes (int): Number of classes for classification head. Default: 1000
        depths (tuple(int)): Number of blocks in each stage. Default: (1, 1, 3, 1)
        embed_dims (tuple(int)): Patch embedding dimension. Default: (96, 192, 384, 768)
        num_heads (tuple(int)): Number of attention heads in different layers. Default: (3, 6, 12, 24)
        window_size (int): Window size. Default: 7
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        drop_path_rate (float): Stochastic depth rate. Default: 0.1
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
    c                 f   t         |           t        |      }|t        |      cxk(  rt        |      k(  sJ  J t        t	        |      |
      }t        t	        |	      |
      }	|| _        |d   x| _        | _        || _        d| _	        g | _
        t        ||d   |      | _        |d   }t        j                  d|t        |            j!                  |      D cg c]  }|j#                          }}g }t%        |      D ]l  }||   }t'        ||f||   |dkD  |||   |||||   ||	|||||d}|}|j)                  |       | xj                  t+        |d|dz   z  d| 	      gz  c_
        n t-        j.                  | | _        |r@ || j                        | _        t5        | j                  ||| j                  
      | _        nBt-        j8                         | _        t;        | j                  ||| j                  |      | _        | j=                  | j>                         y c c}w )N)epsrl   Fr   )rL   )r   r   r   rh   r   r   ri   r   rL   r   r   r   r   r   r   r$   zstages.)num_chs	reductionmodule)	pool_type	drop_rate)r   r   rL   ) r)   r*   lenr   r   num_classesnum_featureshead_hidden_sizer   r   feature_inforB   stemr   linspacesumsplittolistr   r   r   dictr+   r   stagesnorm_prer   headr/   r   apply_init_weights)r0   in_chansdepths
embed_dimsrh   r   r   ri   rL   r   norm_epsr   r   r   r   r   r   r   drop_path_rater   global_poolhead_norm_first
num_stagesrH   r4   dprr   irI   stager1   s                                 r2   r*   zDaVit.__init__  s6   0 	_
S^:s6{:::::^J7XF
} =8L&4>rNBD1""':a=ZH	A#(>>!^S[#Q#W#WX^#_`aqxxz``z" 	cA mG Qiq5%#A,'#! #A%+!1 /)#E& FMM% $w!ac(U\]^\_S`"a!bb/	c2 mmV,
 &t'8'89DM&!!%..	DI KKMDM-!!%..%DI 	

4%%&c as   "H.c                    t        |t        j                        rjt        |j                  d       t        |t        j                        r8|j
                  +t        j                  j                  |j
                  d       y y y y )Ng{Gz?)stdr   )
isinstancer+   rf   r   weightrc   init	constant_)r0   ms     r2   r   zDaVit._init_weightsa  sZ    a#!((,!RYY'AFF,>!!!&&!, -?' $r3   c                 2    t        d|rd      S g d      S )Nz^stemz^stages\.(\d+)))z^stages\.(\d+).downsample)r   )z^stages\.(\d+)\.blocks\.(\d+)N)z	^norm_pre)i )r   r   )r   )r0   coarses     r2   group_matcherzDaVit.group_matcherg  s'    (.$
 	
5
 	
r3   c                 X    || _         | j                  D ]  }|j                  |        y )N)r   )r   r   r   )r0   r   r  s      r2   r   zDaVit.set_grad_checkpointingr  s.    "([[ 	8E(((7	8r3   returnc                 .    | j                   j                  S r6   )r   fc)r0   s    r2   get_classifierzDaVit.get_classifierx  s    yy||r3   r   r   c                 J    || _         | j                  j                  ||       y r6   )r   r   reset)r0   r   r   s      r2   reset_classifierzDaVit.reset_classifier|  s    &		[1r3   r4   indicesrK   
stop_early
output_fmtintermediates_onlyc                 r   |dv sJ d       g }t        t        | j                        |      \  }}	| j                  |      }t        | j                        dz
  }
t        j
                  j                         s|s| j                  }n| j                  d|	dz    }t        |      D ]u  \  }}| j                  r+t        j
                  j                         st        ||      }n ||      }||v sJ|r||
k(  r| j                  |      }n|}|j                  |       w |r|S |
k(  r| j                  |      }||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )NCHWzOutput shape must be NCHW.r   N)r   r   r   r   r   r   r   r   r   r   r   r   )r0   r4   r  rK   r  r  r  intermediatestake_indices	max_indexlast_idxr   feat_idxr  x_inters                  r2   forward_intermediateszDaVit.forward_intermediates  s,   * Y&D(DD&"6s4;;7G"Qi IIaLt{{#a'99!!#:[[F[[)a-0F(0 
	.OHe&&uyy/E/E/Gua(!H<'H0"mmA.GG$$W-
	.   xa A-r3   
prune_norm
prune_headc                     t        t        | j                        |      \  }}| j                  d|dz    | _        |rt        j                         | _        |r| j                  dd       |S )z@ Prune layers not required for specified intermediates.
        Nr   r    )r   r   r   r+   r/   r   r  )r0   r  r$  r%  r  r  s         r2   prune_intermediate_layerszDaVit.prune_intermediate_layers  s]     #7s4;;7G"Qikk.9q=1KKMDM!!!R(r3   c                     | j                  |      }| j                  r5t        j                  j	                         st        | j                  |      }n| j                  |      }| j                  |      }|S r6   )r   r   r   r   r   r   r   r   r   s     r2   forward_featureszDaVit.forward_features  sW    IIaL""599+A+A+Ct{{A.AAAMM!r3   
pre_logitsc                 N    |r| j                  |d      S | j                  |      S )NT)r+  )r   )r0   r4   r+  s      r2   forward_headzDaVit.forward_head  s$    0:tyyty,L		!Lr3   c                 J    | j                  |      }| j                  |      }|S r6   )r*  r-  r   s     r2   r8   zDaVit.forward  s'    !!!$a r3   )r9   r   r   r9   r   rC           r9            rF   rD   Tlayernorm2d	layernormgh㈵>r   TFr$   FFr   r     avgFFr   r6   )NFFr  F)r   FT)r:   r;   r<   rW   r*   r   r   r   r   r  r   r+   Moduler  r=   r   strr  r   r   r   r>   r   r#  r(  r*  r-  r8   r?   r@   s   @r2   r   r     s   & *$$%-!!-W'r- YY
 
 YY8 8
 YY		  2C 2hsm 2 8<$$',3 ||3  eCcN343  	3 
 3  3  !%3  
tELL!5tELL7I)I#JJ	K3 n ./$#	3S	>*  	 M$ Mr3   c                 N   dd l }i }| j                         D ]
  \  }}|j                  |      r|j                  |d      }n,|j	                  dd|      }|j	                  dd|      }|j                  dd      }|j                  d	d
      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|||<    |S )Nr   r'  zconvs.([0-9]+)stages.\1.downsamplezblocks.([0-9]+)stages.\1.blocksdownsample.projdownsample.convstages.0.downsampler   zwindow_attn.norm.znorm1.zwindow_attn.fn.zattn.zchannel_attn.norm.zchannel_attn.fn.z	ffn.norm.znorm2.zffn.fn.net.zmlp.zconv1.fn.dwz	cpe1.projzconv2.fn.dwz	cpe2.proj)reitems
startswithreplacesub)
state_dictmodelprefixrE  out_dictr!   rw   s          r2   _convert_florence2rN    s&   H  " 1<<		&"%AFF$&=qAFF%':A>II'):;II+V4 II)84II'1II*H5II('2IIk8,IImV,IIm[1IIm[1'* Or3   c                    d| v r| S d| v r| d   } d| v rt        | |      S ddl}i }| j                         D ]  \  }}|j                  dd|      }|j                  dd	|      }|j	                  d
d      }|j	                  dd      }|j	                  dd      }|j	                  dd      }|j	                  dd      }|j	                  dd      }|||<    |S )z  Remap MSFT checkpoints -> timm zhead.fc.weightrJ  z vision_tower.convs.0.proj.weightr   Nzpatch_embeds.([0-9]+)r@  zmain_blocks.([0-9]+)rA  rB  rC  rD  r   zhead.zhead.fc.znorms.z
head.norm.zcpe.0r   zcpe.1r   )rN  rE  rF  rI  rH  )rJ  rK  rE  rM  r!   rw   s         r2   checkpoint_filter_fnrP    s    :%z!-
)Z7!*e44H  " 	1FF+-DaHFF*,?CII'):;II+V4IIgz*IIh-IIgv&IIgv&	 Or3   c           	         t        d t        |j                  dd            D              }|j                  d|      }|j                  dd      }| j	                  d      rd}t        t        | |ft        t        d|	      |d
|}|S )Nc              3   &   K   | ]	  \  }}|  y wr6    ).0r  r   s      r2   	<genexpr>z _create_davit.<locals>.<genexpr>  s     \da\s   r   r/  out_indicespretrained_strictT_flF)flatten_sequentialrV  )pretrained_filter_fnfeature_cfgrW  )	tupler   getpopendswithr   r   rP  r   )variant
pretrainedkwargsdefault_out_indicesrV  strictrK  s          r2   _create_davitre    s    \i

8\8Z.[\\**],?@KZZ+T2F  2DkJ  E Lr3   c                 0    | dddddt         t        ddd
|S )	Nr:  )r9      rg  )rF   rF   gffffff?bicubicz	stem.convzhead.fc)
urlr   
input_size	pool_sizecrop_pctinterpolationmeanr  
first_conv
classifierr	   )ri  rb  s     r2   _cfgrq  "  s0    =v9%.B!  r3   ztimm/)	hf_hub_idzmicrosoft/Florence-2-base)r9   r3  r3  )rr  r   rj  zmicrosoft/Florence-2-large)zdavit_tiny.msft_in1kzdavit_small.msft_in1kzdavit_base.msft_in1kdavit_large
davit_hugedavit_giantzdavit_base_fl.msft_florence2zdavit_huge_fl.msft_florence2r  c           	      L    t        ddd      }t        dd| it        |fi |S )Nr/  r0  r4  r   r   rh   ra  )
davit_tinyr   re  ra  rb  
model_argss      r2   rx  rx  B  s0    \6IUcdJ[*[Z@ZSY@Z[[r3   c           	      L    t        ddd      }t        dd| it        |fi |S )Nr   r   	   r   r0  r4  rw  ra  )davit_smallry  rz  s      r2   r  r  H  s0    \6IUcdJ\:\jA[TZA[\\r3   c           	      L    t        ddd      }t        dd| it        |fi |S )Nr}              rD   ry          rw  ra  )
davit_basery  rz  s      r2   r  r  N  s0    \6KWefJ[*[Z@ZSY@Z[[r3   c           	      L    t        ddd      }t        dd| it        |fi |S )Nr}  )r1  r2  r3     )r5  r6  r7  0   rw  ra  )rs  ry  rz  s      r2   rs  rs  T  s0    \6KWfgJ\:\jA[TZA[\\r3   c           	      L    t        ddd      }t        dd| it        |fi |S )Nr}  r  r  r  i   ry   r  r  @   rw  ra  )rt  ry  rz  s      r2   rt  rt  Z  s0    \6LXghJ[*[Z@ZSY@Z[[r3   c           	      L    t        ddd      }t        dd| it        |fi |S )N)r   r   r6  r9   )r2  r3  r  i   )r6  r7  r  rC   rw  ra  )ru  ry  rz  s      r2   ru  ru  `  s0    ]7MYijJ\:\jA[TZA[\\r3   c           	      T    t        ddddddd      }t        d	d| it        |fi |S )
Nr}  r  r  r6  r9   Tr   r   rh   r   r   r   r   ra  )davit_base_flry  rz  s      r2   r  r  g  s=    (=DtJ ^Z^4
C]V\C]^^r3   c           	      T    t        ddddddd      }t        d	d| it        |fi |S )
Nr}  r  r  r6  r9   Tr  ra  )davit_huge_flry  rz  s      r2   r  r  p  s?     (>/DtJ ^Z^4
C]V\C]^^r3   )zvision_tower.r<  )r'  )DrW   	functoolsr   typingr   r   r   r   r   torch.nnr+   torch.nn.functional
functionalrO   r   	timm.datar
   r   timm.layersr   r   r   r   r   r   r   r   r   _builderr   	_featuresr   _features_fxr   _manipulater   r   	_registryr   r   __all__r=  r   rB   rY   r`   r{   r   r=   r   r   r   r   r   r   rN  rP  re  rq  default_cfgsrx  r  r  rs  rt  ru  r  r  rS  r3   r2   <module>r     s  	  / /      A l l l = * + 3 3 <) ("299 "J D >ryy 4:299 :z U38_  F sCx S S   )bii )X_299 _DS Sl^BII ^B84* % ! 6&6$(--%1 %).-%1& ( \e \ \
 ]u ] ]
 \e \ \
 ]u ] ]
 \e \ \
 ]u ] ] _ _ _ _ _ _r3   