
    khP                        d Z ddlZddlmZmZmZmZmZmZm	Z	 ddl
Z
ddlmZ ddlmc mZ ddlmZmZ ddlmZmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
l m!Z!m"Z" dgZ# G d dejH                        Z% G d dejH                        Z& G d dejH                        Z' G d dejH                        Z( G d dejH                        Z) G d dejH                        Z*de+dedejH                  fdZ,dZdee-df   de.de-dee-e-e-e-f   fd Z/ G d! d"ejH                        Z0 G d# d$ejH                        Z1	 	 	 	 	 	 	 	 d[d%ed&e-d'e-d(ee-   d)e-d*e-d+e-d,e-d-e.d.e2d/e.d0e.d1edejf                  fd2Z4	 	 	 	 d\d%ed&e-d'e-d(ee-   d)e-d-e.d.e2d/e.d0e.d1edejf                  fd3Z5 G d4 dejH                        Z6d]d5e+d6e2d1ede6fd7Z7d^d8e+d1edee+ef   fd9Z8 e" e8d:d;d<=       e8d:d>d?d@A       e8d:dBd<=       e8d:dCd?d@A       e8d:dDd<=       e8d:dEd?dFA       e8d:dGd<=       e8d:dHdIdFA       e8d:dJd<=       e8d:dKdIdFA       e8d:dLdIdMA      dN      Z9e!d]d6e2d1ede6fdO       Z:e!d]d6e2d1ede6fdP       Z;e!d]d6e2d1ede6fdQ       Z<e!d]d6e2d1ede6fdR       Z=e!d]d6e2d1ede6fdS       Z>e!d]d6e2d1ede6fdT       Z?e!d]d6e2d1ede6fdU       Z@e!d]d6e2d1ede6fdV       ZAe!d]d6e2d1ede6fdW       ZBe!d]d6e2d1ede6fdX       ZCe!d]d6e2d1ede6fdY       ZDy)_a5   Vision OutLOoker (VOLO) implementation

Paper: `VOLO: Vision Outlooker for Visual Recognition` - https://arxiv.org/abs/2106.13112

Code adapted from official impl at https://github.com/sail-sg/volo, original copyright in comment below

Modifications and additions for timm by / Copyright 2022, Ross Wightman
    N)AnyCallableDictListOptionalTupleUnionIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)DropPathMlp	to_2tuple	to_ntupletrunc_normal_use_fused_attn   )build_model_with_cfg)feature_take_indices)
checkpoint)register_modelgenerate_default_cfgsVOLOc                        e Zd ZdZ	 	 	 	 	 	 ddededededededed	ef fd
Zdej                  dej                  fdZ
 xZS )OutlookAttentionz,Outlook attention mechanism for VOLO models.dim	num_headskernel_sizepaddingstrideqkv_bias	attn_drop	proj_dropc	                    t         
|           ||z  }	|| _        || _        || _        || _        |	dz  | _        t        j                  |||      | _	        t        j                  ||dz  |z        | _
        t        j                  |      | _        t        j                  ||      | _        t        j                  |      | _        t        j                  |||      | _        t        j"                  ||d      | _        y)a  Initialize OutlookAttention.

        Args:
            dim: Input feature dimension.
            num_heads: Number of attention heads.
            kernel_size: Kernel size for attention computation.
            padding: Padding for attention computation.
            stride: Stride for attention computation.
            qkv_bias: Whether to use bias in linear layers.
            attn_drop: Attention dropout rate.
            proj_drop: Projection dropout rate.
              ࿩bias   )r   r   r    T)r   r    	ceil_modeN)super__init__r   r   r   r    scalennLinearvattnDropoutr"   projr#   Unfoldunfold	AvgPool2dpool)selfr   r   r   r   r    r!   r"   r#   head_dim	__class__s             L/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/models/volo.pyr+   zOutlookAttention.__init__*   s    . 	)#"&%
3(3IIc;!#3i#?@	I.IIc3'	I.iiKQWXLLVFdS	    xreturnc                    |j                   \  }}}}| j                  |      j                  dddd      }t        j                  || j
                  z        t        j                  || j
                  z        }}| j                  |      j                  || j                  || j                  z  | j                  | j                  z  ||z        j                  ddddd      }| j                  |j                  dddd            j                  dddd      }	| j                  |	      j                  |||z  | j                  | j                  | j                  z  | j                  | j                  z        j                  ddddd      }	|	| j                  z  }	|	j                  d      }	| j                  |	      }	|	|z  j                  ddddd      j                  ||| j                  z  | j                  z  ||z        }t        j                   |||f| j                  | j"                  | j
                        }| j%                  |j                  dddd            }| j'                  |      }|S )	Forward pass.

        Args:
            x: Input tensor of shape (B, H, W, C).

        Returns:
            Output tensor of shape (B, H, W, C).
        r      r      r(   r   )output_sizer   r   r    )shaper/   permutemathceilr    r4   reshaper   r   r6   r0   r,   softmaxr"   Ffoldr   r2   r#   )
r7   r<   BHWCr/   hwr0   s
             r:   forwardzOutlookAttention.forwardS   s!    WW
1aFF1IaAq)yyT[[)499Q_+E1KKN""t~~qDNN2t///Q88?1aA8N 	
 yy1aA./771aCyy&&q1udnnd&6&69I9I&It///118Aq!Q1G 	 djj |||#~~d#AXq!Q1-55aT=M=M9MPTP`P`9`bcfgbghFF11a&d6F6FPTP\P\eiepepqIIaii1a+,NN1r;   )r@   r   r   F        rT   )__name__
__module____qualname____doc__intboolfloatr+   torchTensorrS   __classcell__r9   s   @r:   r   r   '   s    6  !"!!'T'T 'T 	'T
 'T 'T 'T 'T 'TR   %,,  r;   r   c                        e Zd ZdZdddddej
                  ej                  dfdededed	ed
edededede	de	de
f fdZdej                  dej                  fdZ xZS )	Outlookerz9Outlooker block that combines outlook attention with MLP.r         @rT   Fr   r   r   r    r   	mlp_ratior"   	drop_path	act_layer
norm_layerr!   c           	      r   t         |            |
|      | _        t        |||||||      | _        |dkD  rt        |      nt        j                         | _         |
|      | _	        t        |t        ||z        |	      | _        |dkD  rt        |      | _        yt        j                         | _        y)af  Initialize Outlooker block.

        Args:
            dim: Input feature dimension.
            kernel_size: Kernel size for outlook attention.
            padding: Padding for outlook attention.
            stride: Stride for outlook attention.
            num_heads: Number of attention heads.
            mlp_ratio: Ratio for MLP hidden dimension.
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth drop rate.
            act_layer: Activation layer type.
            norm_layer: Normalization layer type.
            qkv_bias: Whether to use bias in linear layers.
        )r   r   r    r!   r"   rT   in_featureshidden_featuresre   N)r*   r+   norm1r   r0   r   r-   Identity
drop_path1norm2r   rY   mlp
drop_path2)r7   r   r   r   r    r   rc   r"   rd   re   rf   r!   r9   s               r:   r+   zOutlooker.__init__y   s    : 	_
$#
	 2;R(9-R[[]_
i0

 2;R(9-R[[]r;   r<   r=   c                     || j                  | j                  | j                  |                  z   }|| j                  | j	                  | j                  |                  z   }|S zoForward pass.

        Args:
            x: Input tensor.

        Returns:
            Output tensor.
        rm   r0   rk   rp   ro   rn   r7   r<   s     r:   rS   zOutlooker.forward   Q     		$**Q- 899A 788r;   )rU   rV   rW   rX   r-   GELU	LayerNormrY   r[   r   rZ   r+   r\   r]   rS   r^   r_   s   @r:   ra   ra   v   s    C !!!"$''#%<<"0S0S 0S 	0S
 0S 0S 0S 0S 0S  0S !0S 0Sd %,, r;   ra   c                        e Zd ZU dZej
                  j                  e   ed<   	 	 	 	 dde	de	dede
de
f
 fdZd	ej                  d
ej                  fdZ xZS )	Attentionz!Multi-head self-attention module.
fused_attnr   r   r!   r"   r#   c                 J   t         |           || _        ||z  }|dz  | _        t	               | _        t        j                  ||dz  |      | _        t        j                  |      | _
        t        j                  ||      | _        t        j                  |      | _        y)a,  Initialize Attention module.

        Args:
            dim: Input feature dimension.
            num_heads: Number of attention heads.
            qkv_bias: Whether to use bias in QKV projection.
            attn_drop: Attention dropout rate.
            proj_drop: Projection dropout rate.
        r%   r@   r&   N)r*   r+   r   r,   r   rz   r-   r.   qkvr1   r"   r2   r#   )r7   r   r   r!   r"   r#   r8   r9   s          r:   r+   zAttention.__init__   s    " 	")#%
(*99S#'9I.IIc3'	I.r;   r<   r=   c                    |j                   \  }}}}| j                  |      j                  |||z  d| j                  || j                  z        j	                  ddddd      }|j                  d      \  }}}	| j                  r<t        j                  |||	| j                  r| j                  j                  nd      }nL|| j                  z  }||j                  dd	      z  }
|
j                  d	
      }
| j                  |
      }
|
|	z  }|j                  dd      j                  ||||      }| j                  |      }| j!                  |      }|S )r?   r@   rA   r   r   r(   rT   )	dropout_prB   rC   )rE   r|   rI   r   rF   unbindrz   rK   scaled_dot_product_attentiontrainingr"   pr,   	transposerJ   r2   r#   )r7   r<   rM   rN   rO   rP   r|   qkr/   r0   s              r:   rS   zAttention.forward   s0    WW
1ahhqk!!!QUAt~~qDNN?RS[[\]_`bcefhij**Q-1a??..1a.2mm$..**A
 DJJAq{{2r**D<<B<'D>>$'DqAKK1%%aAq1IIaLNN1r;   )   FrT   rT   )rU   rV   rW   rX   r\   jitFinalrZ   __annotations__rY   r[   r+   r]   rS   r^   r_   s   @r:   ry   ry      sw    +		%%
 "!!// / 	/
 / /8 %,, r;   ry   c                        e Zd ZdZddddej
                  ej                  fdededede	d	ed
ede
de
f fdZdej                  dej                  fdZ xZS )Transformerz9Transformer block with multi-head self-attention and MLP.      @FrT   r   r   rc   r!   r"   rd   re   rf   c	                 l   t         	|            ||      | _        t        ||||      | _        |dkD  rt        |      nt        j                         | _         ||      | _	        t        |t        ||z        |      | _        |dkD  rt        |      | _        yt        j                         | _        y)a  Initialize Transformer block.

        Args:
            dim: Input feature dimension.
            num_heads: Number of attention heads.
            mlp_ratio: Ratio for MLP hidden dimension.
            qkv_bias: Whether to use bias in QKV projection.
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth drop rate.
            act_layer: Activation layer type.
            norm_layer: Normalization layer type.
        )r   r!   r"   rT   rh   N)r*   r+   rk   ry   r0   r   r-   rl   rm   rn   r   rY   ro   rp   )
r7   r   r   rc   r!   r"   rd   re   rf   r9   s
            r:   r+   zTransformer.__init__   s    . 	_
cYU^_	1:R(9-R[[]_
3C)O8LXab1:R(9-R[[]r;   r<   r=   c                     || j                  | j                  | j                  |                  z   }|| j                  | j	                  | j                  |                  z   }|S rr   rs   rt   s     r:   rS   zTransformer.forward  ru   r;   )rU   rV   rW   rX   r-   rv   rw   rY   r[   rZ   r   r+   r\   r]   rS   r^   r_   s   @r:   r   r      s    C  ""!!"$''#%<<SS S 	S
 S S S  S !S@ %,, r;   r   c                        e Zd ZdZ	 	 	 	 	 ddededee   dededef fdZd	e	j                  d
e	j                  fdZ xZS )ClassAttentionz6Class attention mechanism for class token interaction.r   r   r8   r!   r"   r#   c                    t         |           || _        ||| _        n||z  }|| _        |dz  | _        t        j                  || j                  | j                  z  dz  |      | _        t        j                  || j                  | j                  z  |      | _        t        j                  |      | _
        t        j                  | j                  | j                  z  |      | _        t        j                  |      | _        y)a{  Initialize ClassAttention.

        Args:
            dim: Input feature dimension.
            num_heads: Number of attention heads.
            head_dim: Dimension per head. If None, computed as dim // num_heads.
            qkv_bias: Whether to use bias in QKV projection.
            attn_drop: Attention dropout rate.
            proj_drop: Projection dropout rate.
        Nr%   rA   r&   )r*   r+   r   r8   r,   r-   r.   kvr   r1   r"   r2   r#   )r7   r   r   r8   r!   r"   r#   r9   s          r:   r+   zClassAttention.__init__.  s    & 	"$DMi'H$DM%
))C!?!!C(S3 >XNI.IIdmmdnn<cB	I.r;   r<   r=   c                    |j                   \  }}}| j                  |      j                  ||d| j                  | j                        j                  ddddd      }|j                  d      \  }}| j                  |ddddddf         j                  || j                  d| j                        | j                  z  }||j                  dd      z  }	|	j                  d	      }	| j                  |	      }	|	|z  j                  dd      j                  |d| j                  | j                  z        }
| j                  |
      }
| j                  |
      }
|
S )
zForward pass.

        Args:
            x: Input tensor of shape (B, N, C) where first token is class token.

        Returns:
            Class token output of shape (B, 1, C).
        rA   r   r@   r   r(   Nr   rB   rC   )rE   r   rI   r   r8   rF   r   r   r,   r   rJ   r"   r2   r#   )r7   r<   rM   NrP   r   r   r/   r   r0   	cls_embeds              r:   rS   zClassAttention.forwardP  s)    ''1aWWQZ1aGOOPQSTVWYZ\]^yy|1FF1QAX;''4>>1dmmLtzzY1;;r2&&|||#~~d#AX((A.66q!T]]T^^=[\	IIi(	NN9-	r;   )r   NFrT   rT   )rU   rV   rW   rX   rY   r   rZ   r[   r+   r\   r]   rS   r^   r_   s   @r:   r   r   +  sv    @
 &*"!! / /  / sm	 /
  /  /  /D %,, r;   r   c                        e Zd ZdZddddddej
                  ej                  fdededee   d	e	d
e
de	de	de	dedef fdZdej                  dej                  fdZ xZS )
ClassBlockz3Class block that combines class attention with MLP.Nr   FrT   r   r   r8   rc   r!   dropr"   rd   re   rf   c                 r   t         |            |
|      | _        t        ||||||      | _        |dkD  rt        |      nt        j                         | _         |
|      | _	        t        |t        ||z        |	|      | _        |dkD  rt        |      | _        yt        j                         | _        y)a1  Initialize ClassBlock.

        Args:
            dim: Input feature dimension.
            num_heads: Number of attention heads.
            head_dim: Dimension per head. If None, computed as dim // num_heads.
            mlp_ratio: Ratio for MLP hidden dimension.
            qkv_bias: Whether to use bias in QKV projection.
            drop: Dropout rate.
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth drop rate.
            act_layer: Activation layer type.
            norm_layer: Normalization layer type.
        )r   r8   r!   r"   r#   rT   )ri   rj   re   r   N)r*   r+   rk   r   r0   r   r-   rl   rm   rn   r   rY   ro   rp   )r7   r   r   r8   rc   r!   r   r"   rd   re   rf   r9   s              r:   r+   zClassBlock.__init__l  s    6 	_
"
	 2;R(9-R[[]_
i0	
 2;R(9-R[[]r;   r<   r=   c                 $   |ddddf   }|| j                  | j                  | j                  |                  z   }|| j                  | j	                  | j                  |                  z   }t        j                  ||ddddf   gd      S )zForward pass.

        Args:
            x: Input tensor of shape (B, N, C) where first token is class token.

        Returns:
            Output tensor with updated class token.
        Nr   rC   )rm   r0   rk   rp   ro   rn   r\   cat)r7   r<   r   s      r:   rS   zClassBlock.forward  s~     a!eH			$**Q-0H II	I9N0O PP	yy)Qq!"uX.A66r;   )rU   rV   rW   rX   r-   rv   rw   rY   r   r[   rZ   r   r+   r\   r]   rS   r^   r_   s   @r:   r   r   i  s    = '+!"!!"$''#%<<.S.S .S sm	.S
 .S .S .S .S .S  .S !.S`7 7%,, 7r;   r   
block_typekargsr=   c                 $    | dk(  rt        di |S y)zGet block based on type.

    Args:
        block_type: Type of block ('ca' for ClassBlock).
        **kargs: Additional keyword arguments for block.

    Returns:
        The requested block module.
    caN )r   )r   r   s     r:   	get_blockr     s     T"E"" r;   size.lamr,   c                    | d   |z  }| d   |z  }t        j                  |t         j                        }t        j                  |t         j                        }t        j                  d|z
        }||z  j	                         }||z  j	                         }	t        j
                  d|d      }
t        j
                  d|d      }t        j                  |
|dz  z
  d|      }t        j                  ||	dz  z
  d|      }t        j                  |
|dz  z   d|      }t        j                  ||	dz  z   d|      }|j                         |j                         |j                         |j                         fS )a1  Get random bounding box for token labeling.

    Reference: https://github.com/zihangJiang/TokenLabeling

    Args:
        size: Input tensor size tuple.
        lam: Lambda parameter for cutmix.
        scale: Scaling factor.

    Returns:
        Bounding box coordinates (bbx1, bby1, bbx2, bby2).
    r   rA   )dtype      ?r   r   )r\   tensorfloat32sqrtrY   randintclampitem)r   r   r,   rO   rN   W_tH_tcut_ratcut_wcut_hcxcybbx1bby1bbx2bby2s                   r:   	rand_bboxr     s3    	Q5AQ5A
,,q
.C
,,q
.Cjjc"G7]!E7]!E 
q!T	"B	q!T	"B;;rEQJ1-D;;rEQJ1-D;;rEQJ1-D;;rEQJ1-D99;		TYY[$))+==r;   c                        e Zd ZdZ	 	 	 	 	 	 	 ddededededededef fd	Zd
ej                  dej                  fdZ	 xZ
S )
PatchEmbedz6Image to patch embedding with multi-layer convolution.img_size	stem_convstem_stride
patch_sizein_chans
hidden_dim	embed_dimc                 n   t         |           |dv sJ |rt        j                  t        j                  ||d|dd      t        j
                  |      t        j                  d      t        j                  ||dddd      t        j
                  |      t        j                  d      t        j                  ||dddd      t        j
                  |      t        j                  d      	      | _        nd	| _        t        j                  ||||z  ||z  
      | _        ||z  ||z  z  | _	        y	)a  Initialize PatchEmbed.

        Different from ViT which uses 1 conv layer, VOLO uses multiple conv layers for patch embedding.

        Args:
            img_size: Input image size.
            stem_conv: Whether to use stem convolution layers.
            stem_stride: Stride for stem convolution.
            patch_size: Patch size (must be 4, 8, or 16).
            in_chans: Number of input channels.
            hidden_dim: Hidden dimension for stem convolution.
            embed_dim: Output embedding dimension.
        )r(   r         r@   F)r   r    r   r'   T)inplacer   Nr   r    )
r*   r+   r-   
SequentialConv2dBatchNorm2dReLUconvr2   num_patches)	r7   r   r   r   r   r   r   r   r9   s	           r:   r+   zPatchEmbed.__init__  s   . 	Z'''		(JAk[\chiz*%		*jaST[`az*%		*jaST[`az*%
DI DIII	z[/HQ[_jQjl	$
2x:7MNr;   r<   r=   c                 b    | j                   | j                  |      }| j                  |      }|S )zForward pass.

        Args:
            x: Input tensor of shape (B, C, H, W).

        Returns:
            Output tensor of shape (B, embed_dim, H', W').
        )r   r2   rt   s     r:   rS   zPatchEmbed.forward	  s-     99 		!AIIaLr;   )   Fr   r   r@   @     )rU   rV   rW   rX   rY   rZ   r+   r\   r]   rS   r^   r_   s   @r:   r   r     s    @  #   *O*O *O 	*O
 *O *O *O *OX %,, r;   r   c                   j     e Zd ZdZd	dededef fdZdej                  dej                  fdZ xZ	S )

Downsamplez#Downsampling module between stages.in_embed_dimout_embed_dimr   c                 ^    t         |           t        j                  ||||      | _        y)zInitialize Downsample.

        Args:
            in_embed_dim: Input embedding dimension.
            out_embed_dim: Output embedding dimension.
            patch_size: Patch size for downsampling.
        r   N)r*   r+   r-   r   r2   )r7   r   r   r   r9   s       r:   r+   zDownsample.__init__  s'     	IIlMzZde	r;   r<   r=   c                 x    |j                  dddd      }| j                  |      }|j                  dddd      }|S )zForward pass.

        Args:
            x: Input tensor of shape (B, H, W, C).

        Returns:
            Output tensor of shape (B, H', W', C').
        r   r@   r   rA   )rF   r2   rt   s     r:   rS   zDownsample.forward&  s>     IIaAq!IIaLIIaAq!r;   )rA   )
rU   rV   rW   rX   rY   r+   r\   r]   rS   r^   r_   s   @r:   r   r     s?    -	fS 	f 	f# 	f %,, r;   r   block_fnindexr   layersr   r   r   r    rc   r!   r"   drop_path_ratekwargsc                     g }t        ||         D ]E  }||t        |d|       z   z  t        |      dz
  z  }|j                   | |||||||	|
|	             G t        j                  | }|S )a  Generate outlooker layers for stage 1.

    Args:
        block_fn: Block function to use (typically Outlooker).
        index: Index of current stage.
        dim: Feature dimension.
        layers: List of layer counts for each stage.
        num_heads: Number of attention heads.
        kernel_size: Kernel size for outlook attention.
        padding: Padding for outlook attention.
        stride: Stride for outlook attention.
        mlp_ratio: Ratio for MLP hidden dimension.
        qkv_bias: Whether to use bias in QKV projection.
        attn_drop: Attention dropout rate.
        drop_path_rate: Stochastic depth drop rate.
        **kwargs: Additional keyword arguments.

    Returns:
        Sequential module containing outlooker blocks.
    Nr   )r   r   r    r   rc   r!   r"   rd   rangesumappendr-   r   )r   r   r   r   r   r   r   r    rc   r!   r"   r   r   blocks	block_idx	block_dprs                   r:   outlooker_blocksr   5  s    F F6%=) 	"i#fVen2E&EF#f+XY/Z	h#

 
	 ]]F#FMr;   c	                     g }
t        ||         D ]B  }||t        |d|       z   z  t        |      dz
  z  }|
j                   | ||||||             D t        j                  |
 }
|
S )ae  Generate transformer layers for stage 2.

    Args:
        block_fn: Block function to use (typically Transformer).
        index: Index of current stage.
        dim: Feature dimension.
        layers: List of layer counts for each stage.
        num_heads: Number of attention heads.
        mlp_ratio: Ratio for MLP hidden dimension.
        qkv_bias: Whether to use bias in QKV projection.
        attn_drop: Attention dropout rate.
        drop_path_rate: Stochastic depth drop rate.
        **kwargs: Additional keyword arguments.

    Returns:
        Sequential module containing transformer blocks.
    Nr   )rc   r!   r"   rd   r   )r   r   r   r   r   rc   r!   r"   r   r   r   r   r   s                r:   transformer_blocksr   j  s    : F6%=) 		"i#fVen2E&EF#f+XY/Z	h
 		 ]]F#FMr;   c            -           e Zd ZdZddddddddd	d	d
dddddej
                  ddddfdee   dedededededede	ee      de	ee      de
edf   de
edf   dedededed ed!ed"ed#e	e
edf      d$ed%ed&ef, fd'Zd(ej                  d)dfd*Zej$                  j&                  d)efd+       Zej$                  j&                  dCd,ed)eeef   fd-       Zej$                  j&                  dDd.ed)dfd/       Zej$                  j&                  d)ej                  fd0       ZdEdede	e   d)dfd1Zd2ej8                  d)ej8                  fd3Zd2ej8                  d)ej8                  fd4Zd2ej8                  d)eej8                  e
ej8                  ej8                  e
eeeef   f   f   fd5Z 	 	 	 	 	 dFd2ej8                  d6e	eeee   f      d7ed8ed9ed:ed)eeej8                     e
ej8                  eej8                     f   f   fd;Z!	 	 	 dGd6eeee   f   d<ed=ed)ee   fd>Z"d2ej8                  d)ej8                  fd?Z#dCd2ej8                  d@ed)ej8                  fdAZ$d2ej8                  d)ej8                  fdBZ% xZ&S )Hr   zVision Outlooker (VOLO) model.r   r@     tokenr   r   N)TFFFrb   FrT   )r   r   TrA   r   r   r   num_classesglobal_poolr   stem_hidden_dim
embed_dimsr   downsamples.outlook_attentionrc   r!   	drop_ratepos_drop_rateattn_drop_rater   rf   post_layersuse_aux_headuse_mix_tokenpooling_scalec                 |   t         |           t        |      } t        |      |      }t	        |      }|| _        || _        || _        || _        |d   x| _	        | _
        |rd| _        |dk(  sJ d       d| _        t        dd||||d   	      | _        |}|d   |z  |z  |d
   |z  |z  f}t        j                   t#        j$                  d
|d   |d
   |d               | _        t        j(                  |      | _        g | _        g | _        g }d}t1        t        |            D ]  }||   r"t3        t4        |||   ||	|   ||   |||	      }n"t7        t8        |||   ||	|   ||   ||||
      }|j;                  |       | j,                  j;                  |       | j.                  j;                  t=        ||   |d|              |d
z  }|
|   s|j;                  t?        ||   ||d
z      d             |dz  }|d
z  } t        j@                  |      | _!        d| _"        |t        j@                  t1        t        |            D cg c]!  }tG        ||   |d   |	d   |d   ||d|      # c}      | _"        t        j                   t#        j$                  d
d
|d               | _$        tK        | jH                  d       |r?|dkD  r t        jL                  | j                  |      nt        jN                         | _(        nd| _(         || j                        | _)        t        j(                  |      | _*        |dkD  r t        jL                  | j                  |      nt        jN                         | _+        tK        | j&                  d       | jY                  | jZ                         yc c}w )a/  Initialize VOLO model.

        Args:
            layers: Number of blocks in each stage.
            img_size: Input image size.
            in_chans: Number of input channels.
            num_classes: Number of classes for classification.
            global_pool: Global pooling type ('token', 'avg', or '').
            patch_size: Patch size for patch embedding.
            stem_hidden_dim: Hidden dimension for stem convolution.
            embed_dims: List of embedding dimensions for each stage.
            num_heads: List of number of attention heads for each stage.
            downsamples: Whether to downsample between stages.
            outlook_attention: Whether to use outlook attention in each stage.
            mlp_ratio: Ratio for MLP hidden dimension.
            qkv_bias: Whether to use bias in QKV projection.
            drop_rate: Dropout rate.
            pos_drop_rate: Position embedding dropout rate.
            attn_drop_rate: Attention dropout rate.
            drop_path_rate: Stochastic depth drop rate.
            norm_layer: Normalization layer type.
            post_layers: Post-processing layer types.
            use_aux_head: Whether to use auxiliary head.
            use_mix_token: Whether to use token mixing for training.
            pooling_scale: Pooling scale factor.
        rB   r   r   z)return all tokens if mix_token is enabledFTrA   r   )r   r   r   r   r   r   r   )r   )rc   r!   r"   rf   )rc   r!   r   r"   rf   znetwork.)num_chs	reductionmoduleNrT   )r   r   rc   r!   r"   rd   rf   {Gz?std).r*   r+   lenr   r   r   r   	mix_tokenr   num_featureshead_hidden_sizebetagrad_checkpointingr   patch_embedr-   	Parameterr\   zeros	pos_embedr1   pos_drop
stage_endsfeature_infor   r   ra   r   r   r   dictr   
ModuleListnetworkpost_networkr   	cls_tokenr   r.   rl   aux_headnorm	head_dropheadapply_init_weights)r7   r   r   r   r   r   r   r   r   r   r   r   rc   r!   r   r   r   r   rf   r   r   r   r   
num_layersr
patch_gridr  r   istager9   s                                 r:   r+   zVOLO.__init__  s   f 	[
)Ij))4	X&&&&*4>rNBD1DI')V+VV)"'%!& m
  qkZ/=@(1+Q[B[_lBlm
ekk!Z]JqMS]^`Sa&bc

]3 	s6{# $	A #(qMaL'l%,)
 +qMaL'l%#1,) NN5!OO""9-$$T*Q-1W_`i_jUk%lmNI1~z*Q-AE9JANOQQ	I$	L }}W- !" " s;/0/  N"2'm'm%, )+/ !D  \\%++aJrN*KLDN$..c2 ITWXBIId&7&7E^`^i^i^kDM DMt001	 I.ALqBIId//=VXVaVaVc	dnn#.

4%%&7/s   &N9mr=   c                    t        |t        j                        rjt        |j                  d       t        |t        j                        r8|j
                  +t        j                  j                  |j
                  d       yyyy)z\Initialize weights for modules.

        Args:
            m: Module to initialize.
        r   r   Nr   )
isinstancer-   r.   r   weightr'   init	constant_)r7   r  s     r:   r  zVOLO._init_weights7  s\     a#!((,!RYY'AFF,>!!!&&!, -?' $r;   c                 
    ddhS )zwGet set of parameters that should not have weight decay.

        Returns:
            Set of parameter names.
        r	  r  r   r7   s    r:   no_weight_decayzVOLO.no_weight_decayB  s     [))r;   coarsec                 &    t        dddgg d      S )zGet parameter grouping for optimizer.

        Args:
            coarse: Whether to use coarse grouping.

        Returns:
            Parameter grouping dictionary.
        z ^cls_token|pos_embed|patch_embed)z^network\.(\d+)\.(\d+)N)z^network\.(\d+)r   ))z
^cls_tokenr(  )z^post_network\.(\d+)N)z^norm)i )stemr   blocks2)r  )r7   r&  s     r:   group_matcherzVOLO.group_matcherK  s$     41*
 	
r;   enablec                     || _         y)zqSet gradient checkpointing.

        Args:
            enable: Whether to enable gradient checkpointing.
        N)r  )r7   r,  s     r:   set_grad_checkpointingzVOLO.set_grad_checkpointingb  s     #)r;   c                     | j                   S )zYGet classifier module.

        Returns:
            The classifier head module.
        )r  r$  s    r:   get_classifierzVOLO.get_classifierk  s     yyr;   c                 6   || _         ||| _        |dkD  r t        j                  | j                  |      nt        j
                         | _        | j                  ?|dkD  r t        j                  | j                  |      nt        j
                         | _        yy)zReset classifier head.

        Args:
            num_classes: Number of classes for new classifier.
            global_pool: Global pooling type.
        Nr   )r   r   r-   r.   r  rl   r  r  )r7   r   r   s      r:   reset_classifierzVOLO.reset_classifiert  s~     '"*DALqBIId//=VXVaVaVc	==$ITWXBIId&7&7E^`^i^i^kDM %r;   r<   c                 P   t        | j                        D ]i  \  }}|dk(  r || j                  z   }| j                  |      }| j                  r+t
        j                  j                         st        ||      }b ||      }k |j                  \  }}}}|j                  |d|      }|S )zForward pass through token processing stages.

        Args:
            x: Input tensor of shape (B, H, W, C).

        Returns:
            Token tensor of shape (B, N, C).
        rA   rB   )	enumerater  r	  r
  r  r\   r   is_scriptingr   rE   rI   )r7   r<   idxblockrM   rN   rO   rP   s           r:   forward_tokenszVOLO.forward_tokens  s     $DLL1 	JCax&MM!$&&uyy/E/E/Gua(!H	 WW
1aIIaQr;   c                 2   |j                   \  }}}| j                  j                  |dd      }t        j                  ||gd      }| j
                  D ]A  }| j                  r+t        j                  j                         st        ||      }: ||      }C |S )zForward pass through class attention blocks.

        Args:
            x: Input token tensor of shape (B, N, C).

        Returns:
            Output tensor with class token of shape (B, N+1, C).
        rB   r   rC   )
rE   r  expandr\   r   r  r  r   r5  r   )r7   r<   rM   r   rP   
cls_tokensr7  s          r:   forward_clszVOLO.forward_cls  s     ''1a^^**1b"5
IIz1o1-&& 	E&&uyy/E/E/Gua(!H		
 r;   c                    	 | j                  |      }|j                  dddd      }| j                  r0| j                  r#t        j
                  j                  | j                  | j                        j                         }|j                  d   | j                  z  |j                  d   | j                  z  }}t        |j                         || j                        \  }}}}|j                         }	| j                  |z  | j                  |z  }}
| j                  |z  | j                  |z  }}|j                  d      dd|
|||ddf   |	dd|
|||ddf<   |	}nd\  }}}}| j                  |      }| j                   | j#                  |      }| j%                  |      }| j&                  dk(  r|j)                  d	      }n| j&                  d
k(  r
|dddf   }n|}| j*                  |S | j+                  |ddddf         }| j                  s|d|j-                  d      d   z  z   S | j                  r| j                  r|j/                  |j                  d   |j                  d         }|j                         }	|j                  d      dd||||ddf   |	dd||||ddf<   |	}|j/                  |j                  d   ||z  |j                  d         }||||||ffS )a  Forward pass for training with mix token support.

        Args:
            x: Input tensor of shape (B, C, H, W).

        Returns:
            If training with mix_token: tuple of (class_token, aux_tokens, bbox).
            Otherwise: class_token tensor.
        r   rA   r@   r   )r,   N)r   r   r   r   avgrC   r         ?rB   )r  rF   r  r   r\   distributionsBetar  samplerE   r   r   r   cloneflipr8  r  r<  r  r   meanr  maxrI   )r7   r<   r   patch_hpatch_wr   r   r   r   temp_xsbbx1sbby1sbbx2sbby2x_clsx_auxs                   r:   forward_trainzVOLO.forward_train  s   	 QIIaAq! >>dmm%%**499dii@GGIC wwqzT-?-??tOaOaAaWG%.qvvxDDVDV%W"D$dWWYF--4d6H6H46O5E--4d6H6H46O5E56VVAYq%+uUZ{\]?]5^F1eEk5;12A%/"D$d " (  #AIIaLu$FFqFME(adGEE== La12h'}}31a000>>dmmMM%++a.'7EKKPROTE[[]F16Aq$t)TRVYXY?Y1ZF1d4idA-.EMM%++a.'G2CU[[QS_UE edD$555r;   indicesr  
stop_early
output_fmtintermediates_onlyc           	         |dv sJ d       g }t        t        | j                        |      \  }}	|D 
cg c]  }
| j                  |
    }}
| j                  |	   }	|j                  \  }}}}| j	                  |      j                  dddd      }t        j                  j                         s|s| j                  }n| j                  d|	dz    }t        |      D ]  \  }}|dk(  r || j                  z   }| j                  |      }| j                  r+t        j                  j                         st        ||      }n ||      }||v so|r|dk\  r| j                  |      }n|}|j!                  |j                  dddd              |r|S |j                  \  }}}}|j#                  |d|      }| j$                  | j'                  |      }| j                  |      }||fS c c}
w )	a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to all intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )NCHWzOutput format must be NCHW.r   rA   r@   r   NrB   )r   r   r  rE   r  rF   r\   r   r5  r  r4  r	  r
  r  r   r  r   rI   r  r<  )r7   r<   rQ  r  rR  rS  rT  intermediatestake_indices	max_indexr  rM   _heightwidthr  r6  r7  x_interrN   rO   rP   s                         r:   forward_intermediateszVOLO.forward_intermediates  s   * Y&E(EE&"6s4??7KW"Ui4@Aq*AAOOI.	  gg1feQ''1a3 99!!#:llGll>IM2G#G, 	BJCax&MM!$&&uyy/E/E/Gua(!Hl"C1H"iilGG$$W__Q1a%@A	B     WW
1aIIaQ(  #AIIaL-O Bs   G"
prune_norm
prune_headc                 (   	 t        t        | j                        |      \  }}| j                  |   }| j                  d|dz    | _        |rt	        j
                         | _        |r+t	        j                         | _        | j                  dd       |S )aH  Prune layers not required for specified intermediates.

        Args:
            indices: Indices of intermediate layers to keep.
            prune_norm: Whether to prune normalization layer.
            prune_head: Whether to prune classification head.

        Returns:
            List of kept intermediate indices.
        Nr   r    )
r   r   r  r  r-   rl   r  r  r  r2  )r7   rQ  r_  r`  rX  rY  s         r:   prune_intermediate_layerszVOLO.prune_intermediate_layers*  s|     	"6s4??7KW"UiOOI.	||NY]3DI "D!!!R(r;   c                     | j                  |      j                  dddd      }| j                  |      }| j                  | j	                  |      }| j                  |      }|S )zForward pass through feature extraction.

        Args:
            x: Input tensor of shape (B, C, H, W).

        Returns:
            Feature tensor.
        r   rA   r@   r   )r  rF   r8  r  r<  r  rt   s     r:   forward_featureszVOLO.forward_featuresF  sg     Q''1a3 " (  #AIIaLr;   
pre_logitsc                 L   | j                   dk(  r|j                  d      }n| j                   dk(  r
|dddf   }n|}| j                  |      }|r|S | j                  |      }| j                  4| j	                  |ddddf         }|d|j                  d      d   z  z   }|S )zForward pass through classification head.

        Args:
            x: Input feature tensor.
            pre_logits: Whether to return pre-logits features.

        Returns:
            Classification logits or pre-logits features.
        r>  r   rC   r   Nr   r?  )r   rE  r  r  r  rF  )r7   r<   rf  outauxs        r:   forward_headzVOLO.forward_headZ  s     u$&&Q&-C(AqD'CCNN1Jiin==$--!QR%)Ccggajm++C
r;   c                 L    	 | j                  |      }| j                  |      }|S )zForward pass (simplified, without mix token training).

        Args:
            x: Input tensor of shape (B, C, H, W).

        Returns:
            Classification logits.
        )re  rj  rt   s     r:   rS   zVOLO.forwardt  s-     	@!!!$a r;   F)T)N)NFFrV  F)r   FT)'rU   rV   rW   rX   r-   rw   r   rY   strr   r   rZ   r[   r   r+   Moduler  r\   r   ignoresetr%  r   r   r+  r.  r0  r2  r]   r8  r<  r	   rP  r^  rc  re  rj  rS   r^   r_   s   @r:   r   r     s)   (
  #&#%.2-1,G2M""!#%$&$&#%<<5A!%"'!"/\'I\' \' 	\'
 \' \' \' !\' !c+\'  S	*\' tSy)\'  %T3Y/\' \' \' \'  !!\'" "#\'$ "%\'& !'\'( "%S/2)\'* +\',  -\'. /\'|	-ryy 	-T 	- YY* * * YY
D 
T#s(^ 
 
, YY)T )T ) ) YY		  lC lhsm lW[ l  .U\\ ell &;6u|| ;6ellE%,,X]XdXdfkloqtvy{~l~fJ  EA  7A  1B ;6@ 8<$$',? ||?  eCcN34?  	? 
 ?  ?  !%?  
tELL!5tELL7I)I#JJ	K? F ./$#	3S	>*  	
 
c8%,, 5<< (ell   4 %,, r;   variant
pretrainedc                 d    |j                  dd      }t        t        | |fdt        |d      i|S )zCreate VOLO model.

    Args:
        variant: Model variant name.
        pretrained: Whether to load pretrained weights.
        **kwargs: Additional model arguments.

    Returns:
        VOLO model instance.
    out_indicesr@   feature_cfggetter)rt  feature_cls)popr   r   r  )rq  rr  r   rt  s       r:   _create_volory    sE     **]A.K [hG	
  r;   urlc                 2    | ddddddt         t        ddd	|S )
zCreate model configuration.

    Args:
        url: URL for pretrained weights.
        **kwargs: Additional configuration options.

    Returns:
        Model configuration dictionary.
    r   )r@   r   r   NQ?bicubicTzpatch_embed.conv.0)r  r  )rz  r   
input_size	pool_sizecrop_pctinterpolationfixed_input_sizerE  r   
first_conv
classifierr
   )rz  r   s     r:   _cfgr    s5     =t)%.B*:N  r;   ztimm/zLhttps://github.com/sail-sg/volo/releases/download/volo_1/d1_224_84.2.pth.tarr|  )	hf_hub_idrz  r  zLhttps://github.com/sail-sg/volo/releases/download/volo_1/d1_384_85.2.pth.tarr   )r@   r   r   )r  rz  r  r~  zLhttps://github.com/sail-sg/volo/releases/download/volo_1/d2_224_85.2.pth.tarzLhttps://github.com/sail-sg/volo/releases/download/volo_1/d2_384_86.0.pth.tarzLhttps://github.com/sail-sg/volo/releases/download/volo_1/d3_224_85.4.pth.tarzLhttps://github.com/sail-sg/volo/releases/download/volo_1/d3_448_86.3.pth.tar)r@     r  zLhttps://github.com/sail-sg/volo/releases/download/volo_1/d4_224_85.7.pth.tarzMhttps://github.com/sail-sg/volo/releases/download/volo_1/d4_448_86.79.pth.targffffff?zMhttps://github.com/sail-sg/volo/releases/download/volo_1/d5_224_86.10.pth.tarzLhttps://github.com/sail-sg/volo/releases/download/volo_1/d5_448_87.0.pth.tarzMhttps://github.com/sail-sg/volo/releases/download/volo_1/d5_512_87.07.pth.tar)r@      r  )zvolo_d1_224.sail_in1kzvolo_d1_384.sail_in1kzvolo_d2_224.sail_in1kzvolo_d2_384.sail_in1kzvolo_d3_224.sail_in1kzvolo_d3_448.sail_in1kzvolo_d4_224.sail_in1kzvolo_d4_448.sail_in1kzvolo_d5_224.sail_in1kzvolo_d5_448.sail_in1kzvolo_d5_512.sail_in1kc                 >    t        ddddd|}t        dd| i|}|S )VOLO-D1 model, Params: 27M.r(   r(   r   rA      r   r   r         r  r  r   r   r   rr  r   )volo_d1_224r  ry  rr  r   
model_argsmodels       r:   r  r    4     p\6JVepiopJL:LLELr;   c                 >    t        ddddd|}t        dd| i|}|S )r  r  r  r  r  rr  r   )volo_d1_384r  r  s       r:   r  r    r  r;   c                 >    t        ddddd|}t        dd| i|}|S )VOLO-D2 model, Params: 59M.r  r(   
   r(      r  r  r  r   r   r   r   r  rr  r   )volo_d2_224r  r  s       r:   r  r    4     q]7KWfqjpqJL:LLELr;   c                 >    t        ddddd|}t        dd| i|}|S )r  r  r  r  r  rr  r   )volo_d2_384r  r  s       r:   r  r    r  r;   c                 >    t        ddddd|}t        dd| i|}|S )VOLO-D3 model, Params: 86M.r   r   r   r(   r  r  r  rr  r   )volo_d3_224r  r  s       r:   r  r    r  r;   c                 >    t        ddddd|}t        dd| i|}|S )r  r  r  r  r  rr  r   )volo_d3_448r  r  s       r:   r  r    r  r;   c                 >    t        ddddd|}t        dd| i|}|S )VOLO-D4 model, Params: 193M.r  r      r  r  r  r   r   r   r  rr  r   )volo_d4_224r  r  s       r:   r  r    4     r]7KWgrkqrJL:LLELr;   c                 >    t        ddddd|}t        dd| i|}|S )r  r  r  r  r  rr  r   )volo_d4_448r  r  s       r:   r  r    r  r;   c           	      B    t        ddddddd|}t        d	d| i|}|S )
jVOLO-D5 model, Params: 296M.

    stem_hidden_dim=128, the dim in patch embedding is 128 for VOLO-D5.
    r  r     r(   r  r  r(      r   r   r   rc   r   rr  r   )volo_d5_224r  r  s       r:   r  r    B      4+?K[S4,24J L:LLELr;   c           	      B    t        ddddddd|}t        d	d| i|}|S )
r  r  r  r  r(   r  r  rr  r   )volo_d5_448r  r  s       r:   r  r  )  r  r;   c           	      B    t        ddddddd|}t        d	d| i|}|S )
r  r  r  r  r(   r  r  rr  r   )volo_d5_512r  r  s       r:   r  r  6  r  r;   r   )r   r@   r   rA   rb   Fr   rT   )rb   Fr   rT   rl  )rb  )ErX   rG   typingr   r   r   r   r   r   r	   r\   torch.nnr-   torch.nn.functional
functionalrK   	timm.datar   r   timm.layersr   r   r   r   r   r   _builderr   	_featuresr   _manipulater   	_registryr   r   __all__rn  r   ra   ry   r   r   r   rm  r   rY   r[   r   r   r   rZ   r   r   r   r   ry  r  default_cfgsr  r  r  r  r  r  r  r  r  r  r  r   r;   r:   <module>r     sa  *  D D D     A Z Z * + # <(Lryy L^@		 @F>		 >B.")) .b;RYY ;|?7 ?7D## # #		 #>E#s(O >% > >E#sTWY\J\D] >B; ;| D  "222 2 S		2
 2 2 2 2 2 2 2 2 2 ]]2v  "))) ) S		)
 ) ) ) ) ) ) ]])Xj299 jZ# 4 3 4 *c # $sCx. ( %!Z "Z0 "Z "Z0 "Z "Z0 "Z "[-1 "[ "Z-1 "[-1S-& -` D C D   D C D   D C D   D C D   D C D   D C D   D C D   D C D   	D 	C 	D 	 	 	D 	C 	D 	 	 	D 	C 	D 	 	r;   