
    kh`}                        d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZ d dlZd dlZd dlmc mZ d dlmZmZ d dlmZmZmZ d dlmZ d dlmZmZ d d	lmZmZ d d
l m!Z! d dl"m#Z#m$Z$ d dl%m&Z& g dZ'dee(e(f   de(de(de(dee(e(f   f
dZ)dee(e(f   de(deee(e(f      fdZ*de(de(dej&                  fdZ+ G d dejX                        Z- G d dejX                        Z. G d dejX                        Z/ G d d ejX                        Z0 G d! d"ejX                        Z1 G d# d$ejX                        Z2 G d% d&ejX                        Z3 G d' d(ejX                        Z4 G d) d*ejX                        Z5	 	 d<d+e(d,ee(   d-ee(   d.e6d/e(d0e(d1e	e   d2e7d3ede5fd4Z8 G d5 d6e      Z9 e        ed7e9jt                  f8      dd9d:d1e	e9   d2e7d3ede5fd;              Z;y)=    N)OrderedDict)partial)AnyCallableListOptionalSequenceTuple)nnTensor)register_modelWeightsWeightsEnum)_IMAGENET_CATEGORIES)_ovewrite_named_paramhandle_legacy_interface)Conv2dNormActivationSqueezeExcitation)StochasticDepth)ImageClassificationInterpolationMode)_log_api_usage_once)MaxVitMaxVit_T_Weightsmaxvit_t
input_sizekernel_sizestridepaddingreturnc                 R    | d   |z
  d|z  z   |z  dz   | d   |z
  d|z  z   |z  dz   fS )Nr          )r   r   r   r   s       U/var/www/teggl/fontify/venv/lib/python3.12/site-packages/torchvision/models/maxvit.py_get_conv_output_shaper&      sJ    	A	$q7{	2v=A	A	$q7{	2v=A     n_blocksc                     g }t        | ddd      }t        |      D ]!  }t        |ddd      }|j                  |       # |S )zQUtil function to check that the input size is correct for a MaxVit configuration.   r"   r#   )r&   rangeappend)r   r(   shapesblock_input_shape_s        r%   _make_block_input_shapesr0       sQ    F.z1aC8_ )23DaAN'() Mr'   heightwidthc                    t        j                  t        j                  t        j                  |       t        j                  |      gd            }t        j                  |d      }|d d d d d f   |d d d d d f   z
  }|j                  ddd      j                         }|d d d d dfxx   | dz
  z  cc<   |d d d d dfxx   |dz
  z  cc<   |d d d d dfxx   d|z  dz
  z  cc<   |j                  d      S )Nij)indexingr#   r"   r   )torchstackmeshgridarangeflattenpermute
contiguoussum)r1   r2   coordscoords_flatrelative_coordss        r%   _get_relative_position_indexrB   *   s    [[f)=u||E?R(S^bcdF--*K!!Q*-AtQJ0GGO%--aA6AACOAq!G
*Aq!G	)Aq!GE	A-r""r'   c                        e Zd ZdZ	 ddededededededej                  f   d	edej                  f   d
eddf fdZ	de
de
fdZ xZS )MBConva=  MBConv: Mobile Inverted Residual Bottleneck.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        expansion_ratio (float): Expansion ratio in the bottleneck.
        squeeze_ratio (float): Squeeze ratio in the SE Layer.
        stride (int): Stride of the depthwise convolution.
        activation_layer (Callable[..., nn.Module]): Activation function.
        norm_layer (Callable[..., nn.Module]): Normalization function.
        p_stochastic_dropout (float): Probability of stochastic depth.
    in_channelsout_channelsexpansion_ratiosqueeze_ratior   activation_layer.
norm_layerp_stochastic_dropoutr    Nc	                    t         |           |  |dk7  xs ||k7  }	|	rTt        j                  ||ddd      g}
|dk(  rt        j                  d|d      g|
z   }
t        j
                  |
 | _        nt        j                         | _        t        ||z        }t        ||z        }|rt        |d      | _
        nt        j                         | _
        t               } ||      |d	<   t        ||ddd
||d       |d<   t        ||d|d|||d 	      |d<   t        ||t        j                        |d<   t        j                  ||dd      |d<   t        j
                  |      | _        y )Nr#   T)r   r   biasr"   r*   r   r   r   rowmodepre_normr   )r   r   r   rI   rJ   inplaceconv_a)r   r   r   rI   rJ   groupsrS   conv_b)
activationsqueeze_excitation)rE   rF   r   rM   conv_c)super__init__r   Conv2d	AvgPool2d
SequentialprojIdentityintr   stochastic_depthr   r   r   SiLUlayers)selfrE   rF   rG   rH   r   rI   rJ   rK   should_projr_   mid_channelssqz_channels_layers	__class__s                 r%   r[   zMBConv.__init__C   st    	 	k@[L%@IIk<QqW[\]D{61MNQUUt,DIDI</9:<-78$34Hu$UD!$&KKMD!-(5
0-!	
 1-!

 ):,acahah(i$%II,\ghostmmG,r'   xc                 n    | j                  |      }| j                  | j                  |            }||z   S )z
        Args:
            x (Tensor): Input tensor with expected layout of [B, C, H, W].
        Returns:
            Tensor: Output tensor with expected layout of [B, C, H / stride, W / stride].
        )r_   rb   rd   re   rk   ress      r%   forwardzMBConv.forward   s2     iil!!$++a.1Qwr'   )        )__name__
__module____qualname____doc__ra   floatr   r   Moduler[   r   ro   __classcell__rj   s   @r%   rD   rD   5   s    , '*;-;- ;- 	;-
 ;- ;- #3		>2;- S"))^,;- $;- 
;-z	 	F 	r'   rD   c                   d     e Zd ZdZdedededdf fdZdej                  fdZd	edefd
Z	 xZ
S )$RelativePositionalMultiHeadAttentionzRelative Positional Multi-Head Attention.

    Args:
        feat_dim (int): Number of input features.
        head_dim (int): Number of features per head.
        max_seq_len (int): Maximum sequence length.
    feat_dimhead_dimmax_seq_lenr    Nc                 b   t         |           ||z  dk7  rt        d| d|       ||z  | _        || _        t        t        j                  |            | _        || _	        t        j                  || j                  | j                  z  dz        | _        |dz  | _        t        j                  | j                  | j                  z  |      | _        t        j                  j!                  t#        j$                  d| j                  z  dz
  d| j                  z  dz
  z  | j                  ft"        j&                              | _        | j+                  d	t-        | j                  | j                               t"        j                  j.                  j1                  | j(                  d
       y )Nr   z
feat_dim: z  must be divisible by head_dim: r*   g      r"   r#   )dtyperelative_position_index{Gz?std)rZ   r[   
ValueErrorn_headsr|   ra   mathsqrtsizer}   r   Linearto_qkvscale_factormerge	parameter	Parameterr7   emptyfloat32relative_position_bias_tableregister_bufferrB   inittrunc_normal_)re   r{   r|   r}   rj   s       r%   r[   z-RelativePositionalMultiHeadAttention.__init__   sU    	h!#z(3ST\S]^__8+ 		+./	&ii$,,*F*JK$dNYYt}}t||;XF
,.LL,B,BKK!dii-!+DII0ABDLLQY^YfYfg-
) 	68TUYU^U^`d`i`i8jk##D$E$E4#Pr'   c                    | j                   j                  d      }| j                  |   j                  | j                  | j                  d      }|j	                  ddd      j                         }|j                  d      S )Nr6   r"   r   r#   )r   viewr   r}   r<   r=   	unsqueeze)re   
bias_indexrelative_biass      r%   get_relative_positional_biaszARelativePositionalMultiHeadAttention.get_relative_positional_bias   ss    1166r:
99*EJJ4K[K[]a]m]moqr%--aA6AAC&&q))r'   rk   c                    |j                   \  }}}}| j                  | j                  }}| j                  |      }t	        j
                  |dd      \  }	}
}|	j                  |||||      j                  ddddd      }	|
j                  |||||      j                  ddddd      }
|j                  |||||      j                  ddddd      }|
| j                  z  }
t	        j                  d|	|
      }| j                         }t        j                  ||z   d      }t	        j                  d	||      }|j                  ddddd      j                  ||||      }| j                  |      }|S )
z
        Args:
            x (Tensor): Input tensor with expected layout of [B, G, P, D].
        Returns:
            Tensor: Output tensor with expected layout of [B, G, P, D].
        r*   r6   )dimr   r#   r"      z!B G H I D, B G H J D -> B G H I Jz!B G H I J, B G H J D -> B G H I D)shaper   r|   r   r7   chunkreshaper<   r   einsumr   Fsoftmaxr   )re   rk   BGPDHDHqkvqkvdot_prodpos_biasouts                  r%   ro   z,RelativePositionalMultiHeadAttention.forward   sX    WW
1admm2kk!n++c1"-1aIIaAq"%--aAq!<IIaAq"%--aAq!<IIaAq"%--aAq!<!!!<< CQJ44699X0b9ll>!Lkk!Q1a(00Aq!<jjo
r'   )rq   rr   rs   rt   ra   r[   r7   r   r   ro   rw   rx   s   @r%   rz   rz      s[    QQ Q 	Q
 
Q8*ell * F r'   rz   c                   h     e Zd ZdZdededdf fdZdej                  dej                  fdZ xZ	S )	SwapAxeszPermute the axes of a tensor.abr    Nc                 >    t         |           || _        || _        y N)rZ   r[   r   r   )re   r   r   rj   s      r%   r[   zSwapAxes.__init__   s    r'   rk   c                 \    t        j                  || j                  | j                        }|S r   )r7   swapaxesr   r   rm   s      r%   ro   zSwapAxes.forward   s!    nnQ/
r'   )
rq   rr   rs   rt   ra   r[   r7   r   ro   rw   rx   s   @r%   r   r      s;    '# # $ 
 %,, r'   r   c                   8     e Zd ZdZd fdZdededefdZ xZS )WindowPartitionzB
    Partition the input tensor into non-overlapping windows.
    r    c                 "    t         |           y r   rZ   r[   re   rj   s    r%   r[   zWindowPartition.__init__       r'   rk   pc                     |j                   \  }}}}|}|j                  ||||z  |||z  |      }|j                  dddddd      }|j                  |||z  ||z  z  ||z  |      }|S )z
        Args:
            x (Tensor): Input tensor with expected layout of [B, C, H, W].
            p (int): Number of partitions.
        Returns:
            Tensor: Output tensor with expected layout of [B, H/P, W/P, P*P, C].
        r   r"   r   r*      r#   r   r   r<   )re   rk   r   r   Cr   Wr   s           r%   ro   zWindowPartition.forward   s|     WW
1aIIaAFAqAvq1IIaAq!Q'IIa!q&Q!V,a!eQ7r'   r    N	rq   rr   rs   rt   r[   r   ra   ro   rw   rx   s   @r%   r   r      s'     C F r'   r   c            
       @     e Zd ZdZd	 fdZdededededef
dZ xZS )
WindowDepartitionzo
    Departition the input tensor of non-overlapping windows into a feature volume of layout [B, C, H, W].
    r    c                 "    t         |           y r   r   r   s    r%   r[   zWindowDepartition.__init__  r   r'   rk   r   h_partitionsw_partitionsc                     |j                   \  }}}}|}	||}}
|j                  ||
||	|	|      }|j                  dddddd      }|j                  |||
|	z  ||	z        }|S )ar  
        Args:
            x (Tensor): Input tensor with expected layout of [B, (H/P * W/P), P*P, C].
            p (int): Number of partitions.
            h_partitions (int): Number of vertical partitions.
            w_partitions (int): Number of horizontal partitions.
        Returns:
            Tensor: Output tensor with expected layout of [B, C, H, W].
        r   r   r#   r*   r"   r   r   )re   rk   r   r   r   r   r   PPr   r   HPWPs               r%   ro   zWindowDepartition.forward  st     gg1b!|BIIaRAq)IIaAq!Q'IIaBFBF+r'   r   r   rx   s   @r%   r   r      s6     C s # RX r'   r   c                        e Zd ZdZdededededeeef   deded	ej                  f   d
ed	ej                  f   de
de
de
ddf fdZdedefdZ xZS )PartitionAttentionLayera  
    Layer for partitioning the input tensor into non-overlapping windows and applying attention to each window.

    Args:
        in_channels (int): Number of input channels.
        head_dim (int): Dimension of each attention head.
        partition_size (int): Size of the partitions.
        partition_type (str): Type of partitioning to use. Can be either "grid" or "window".
        grid_size (Tuple[int, int]): Size of the grid to partition the input tensor into.
        mlp_ratio (int): Ratio of the  feature size expansion in the MLP layer.
        activation_layer (Callable[..., nn.Module]): Activation function to use.
        norm_layer (Callable[..., nn.Module]): Normalization function to use.
        attention_dropout (float): Dropout probability for the attention layer.
        mlp_dropout (float): Dropout probability for the MLP layer.
        p_stochastic_dropout (float): Probability of dropping out a partition.
    rE   r|   partition_sizepartition_type	grid_size	mlp_ratiorI   .rJ   attention_dropoutmlp_dropoutrK   r    Nc           	         t         |           ||z  | _        || _        |d   |z  | _        || _        || _        |dvrt        d      |dk(  r|| j                  c| _        | _	        n| j                  |c| _        | _	        t               | _        t               | _        |dk(  rt        dd      nt        j                          | _        |dk(  rt        dd      nt        j                          | _        t        j&                   ||      t)        |||dz        t        j*                  |	            | _        t        j&                  t        j.                  |      t        j0                  |||z         |       t        j0                  ||z  |      t        j*                  |
            | _        t5        |d	
      | _        y )Nr   )gridwindowz0partition_type must be either 'grid' or 'window'r   r   r"   rO   rP   )rZ   r[   r   r|   n_partitionsr   r   r   r   gr   partition_opr   departition_opr   r   r`   partition_swapdepartition_swapr^   rz   Dropout
attn_layer	LayerNormr   	mlp_layerr   stochastic_dropout)re   rE   r|   r   r   r   r   rI   rJ   r   r   rK   rj   s               r%   r[   z PartitionAttentionLayer.__init__,  s   " 	"h. %aLN:,"!33OPPX%+T->->NDFDF!..NDFDF+-/12@F2Jhr2.PRP[P[P]4Bf4LR 0RTR]R]R_--{# 1hXYHYZJJ()
 LL%IIk;#:;IIkI-{;JJ{#
 #22FU"Sr'   rk   c                    | j                   d   | j                  z  | j                   d   | j                  z  }}t        j                  | j                   d   | j                  z  dk(  xr | j                   d   | j                  z  dk(  dj	                  | j                   | j                               | j                  || j                        }| j                  |      }|| j                  | j                  |            z   }|| j                  | j                  |            z   }| j                  |      }| j                  || j                  ||      }|S )z
        Args:
            x (Tensor): Input tensor with expected layout of [B, C, H, W].
        Returns:
            Tensor: Output tensor with expected layout of [B, C, H, W].
        r   r#   z[Grid size must be divisible by partition size. Got grid size of {} and partition size of {})r   r   r7   _assertformatr   r   r   r   r   r   r   )re   rk   ghgws       r%   ro   zPartitionAttentionLayer.forwardf  s!    "dff,dnnQ.?466.IBNN1&!+Oq0ADFF0Ja0Oipp	
 a("''(:;;''q(9::!!!$4662r2r'   )rq   rr   rs   rt   ra   strr
   r   r   rv   ru   r[   r   ro   rw   rx   s   @r%   r   r     s    "8T8T 8T
 8T 8T c?8T 8T #3		>28T S"))^,8T !8T 8T $8T  
!8Tt F r'   r   c                        e Zd ZdZdededededededej                  f   d	edej                  f   d
edededededede	eef   ddf fdZ
dedefdZ xZS )MaxVitLayera  
    MaxVit layer consisting of a MBConv layer followed by a PartitionAttentionLayer with `window` and a PartitionAttentionLayer with `grid`.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        expansion_ratio (float): Expansion ratio in the bottleneck.
        squeeze_ratio (float): Squeeze ratio in the SE Layer.
        stride (int): Stride of the depthwise convolution.
        activation_layer (Callable[..., nn.Module]): Activation function.
        norm_layer (Callable[..., nn.Module]): Normalization function.
        head_dim (int): Dimension of the attention heads.
        mlp_ratio (int): Ratio of the MLP layer.
        mlp_dropout (float): Dropout probability for the MLP layer.
        attention_dropout (float): Dropout probability for the attention layer.
        p_stochastic_dropout (float): Probability of stochastic depth.
        partition_size (int): Size of the partitions.
        grid_size (Tuple[int, int]): Size of the input feature grid.
    rE   rF   rH   rG   r   rJ   .rI   r|   r   r   r   rK   r   r   r    Nc                 2   t         |           t               }t        ||||||||      |d<   t	        |||d||	|t
        j                  ||
|      |d<   t	        |||d||	|t
        j                  ||
|      |d<   t        j                  |      | _        y )N)rE   rF   rG   rH   r   rI   rJ   rK   MBconvr   )rE   r|   r   r   r   r   rI   rJ   r   r   rK   window_attentionr   grid_attention)	rZ   r[   r   rD   r   r   r   r^   rd   )re   rE   rF   rH   rG   r   rJ   rI   r|   r   r   r   rK   r   r   rd   rj   s                   r%   r[   zMaxVitLayer.__init__  s    * 	)m "#%+'-!!5	
x &=$)#-||/#!5&
!" $;$)!-||/#!5$
  mmF+r'   rk   c                 (    | j                  |      }|S z
        Args:
            x (Tensor): Input tensor of shape (B, C, H, W).
        Returns:
            Tensor: Output tensor of shape (B, C, H, W).
        rd   )re   rk   s     r%   ro   zMaxVitLayer.forward  s     KKNr'   )rq   rr   rs   rt   ra   ru   r   r   rv   r
   r[   r   ro   rw   rx   s   @r%   r   r     s    (?, ?, 	?,
 ?, ?, ?, S"))^,?, #3		>2?, ?, ?, ?, !?,  $!?,$ %?,& c?'?,( 
)?,B F r'   r   c                        e Zd ZdZdedededededej                  f   dedej                  f   d	ed
edededede	eef   dede
e   ddf fdZdedefdZ xZS )MaxVitBlocka(  
    A MaxVit block consisting of `n_layers` MaxVit layers.

     Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        expansion_ratio (float): Expansion ratio in the bottleneck.
        squeeze_ratio (float): Squeeze ratio in the SE Layer.
        activation_layer (Callable[..., nn.Module]): Activation function.
        norm_layer (Callable[..., nn.Module]): Normalization function.
        head_dim (int): Dimension of the attention heads.
        mlp_ratio (int): Ratio of the MLP layer.
        mlp_dropout (float): Dropout probability for the MLP layer.
        attention_dropout (float): Dropout probability for the attention layer.
        p_stochastic_dropout (float): Probability of stochastic depth.
        partition_size (int): Size of the partitions.
        input_grid_size (Tuple[int, int]): Size of the input feature grid.
        n_layers (int): Number of layers in the block.
        p_stochastic (List[float]): List of probabilities for stochastic depth for each layer.
    rE   rF   rH   rG   rJ   .rI   r|   r   r   r   r   input_grid_sizen_layersp_stochasticr    Nc                 p   t         |           t        |      |k(  st        d| d| d      t	        j
                         | _        t        |ddd      | _        t        |      D ]L  \  }}|dk(  rdnd}| xj                  t        |dk(  r|n||||||||||	|
|| j                  |	      gz  c_        N y )
Nz'p_stochastic must have length n_layers=z, got p_stochastic=.r*   r"   r#   rN   r   )rE   rF   rH   rG   r   rJ   rI   r|   r   r   r   r   r   rK   )rZ   r[   lenr   r   
ModuleListrd   r&   r   	enumerater   )re   rE   rF   rH   rG   rJ   rI   r|   r   r   r   r   r   r   r   idxr   r   rj   s                     r%   r[   zMaxVitBlock.__init__  s    , 	< H,FxjPcdpcqqrsttmmo/QWXbcd- 	FC(QFKK/2ax\!-"/$3!)%5%' +&7#1"nn)* K	r'   rk   c                 8    | j                   D ]
  } ||      } |S r   r   )re   rk   layers      r%   ro   zMaxVitBlock.forward,  s%     [[ 	EaA	r'   )rq   rr   rs   rt   ra   ru   r   r   rv   r
   r   r[   r   ro   rw   rx   s   @r%   r   r     s    *1 1 	1
 1 1 S"))^,1 #3		>21 1 1 1 !1  !1" sCx#1& '1( 5k)1* 
+1f	 	F 	r'   r   c            !            e Zd ZdZdej
                  ddddddfdeeef   ded	ed
ee   dee   dede	de
edej                  f      dedej                  f   de	de	dede	de	deddf  fdZdedefdZd Z xZS )r   ay  
    Implements MaxVit Transformer from the `MaxViT: Multi-Axis Vision Transformer <https://arxiv.org/abs/2204.01697>`_ paper.
    Args:
        input_size (Tuple[int, int]): Size of the input image.
        stem_channels (int): Number of channels in the stem.
        partition_size (int): Size of the partitions.
        block_channels (List[int]): Number of channels in each block.
        block_layers (List[int]): Number of layers in each block.
        stochastic_depth_prob (float): Probability of stochastic depth. Expands to a list of probabilities for each layer that scales linearly to the specified value.
        squeeze_ratio (float): Squeeze ratio in the SE Layer. Default: 0.25.
        expansion_ratio (float): Expansion ratio in the MBConv bottleneck. Default: 4.
        norm_layer (Callable[..., nn.Module]): Normalization function. Default: None (setting to None will produce a `BatchNorm2d(eps=1e-3, momentum=0.01)`).
        activation_layer (Callable[..., nn.Module]): Activation function Default: nn.GELU.
        head_dim (int): Dimension of the attention heads.
        mlp_ratio (int): Expansion ratio of the MLP layer. Default: 4.
        mlp_dropout (float): Dropout probability for the MLP layer. Default: 0.0.
        attention_dropout (float): Dropout probability for the attention layer. Default: 0.0.
        num_classes (int): Number of classes. Default: 1000.
    Ng      ?r   rp   i  r   stem_channelsr   block_channelsblock_layersr|   stochastic_depth_probrJ   .rI   rH   rG   r   r   r   num_classesr    c                    t         |           t        |        d}|t        t        j
                  dd      }t        |t        |            }t        |      D ]3  \  }}|d   |z  dk7  s|d   |z  dk7  st        d| d| d	| d
| d	       t	        j                  t        ||dd||	dd       t        ||ddd d d            | _        t        |ddd      }|| _        t	        j                         | _        |g|d d z   }|}t#        j$                  d|t'        |            j)                         }d}t+        |||      D ]\  \  }}}| j                   j-                  t/        |||
|||	|||||||||||z                 | j                   d   j0                  }||z  }^ t	        j                  t	        j2                  d      t	        j4                         t	        j6                  |d         t	        j8                  |d   |d         t	        j:                         t	        j8                  |d   |d            | _        | j?                          y )Nr*   gMbP?g{Gz?)epsmomentumr   r#   zInput size z
 of block z$ is not divisible by partition size zx. Consider changing the partition size or the input size.
Current configuration yields the following block input sizes: r   r"   F)r   rJ   rI   rM   rS   T)r   rJ   rI   rM   rN   r6   )rE   rF   rH   rG   rJ   rI   r|   r   r   r   r   r   r   r   )rM   ) rZ   r[   r   r   r   BatchNorm2dr0   r   r   r   r^   r   stemr&   r   r   blocksnplinspacer>   tolistzipr,   r   r   AdaptiveAvgPool2dFlattenr   r   Tanh
classifier_init_weights)re   r   r  r   r  r  r|   r  rJ   rI   rH   rG   r   r   r   r	  input_channelsblock_input_sizesr  block_input_sizerE   rF   r   p_idx
in_channelout_channel
num_layersrj   s                              r%   r[   zMaxVit.__init__M  s   : 	D!  TDIJ
 5Z^ATU%./@%A 	!C!"^3q8<LQ<OR`<`de<e !"2!3:cUBfgufv wUUfTgghj 	 MM %!1	 !}ad]ahl
	" ,JAaYZ[
, mmo$os(;;%
 {{1&;S=NOVVX36{LR^3_ 	 /JZKK *!,"/$3)%5%' +&7#1$.'!-eej6H!I$ R22JZE)	 0 --  #JJLLL+,IInR(.*<=GGIIInR(+EB
 	r'   rk   c                 |    | j                  |      }| j                  D ]
  } ||      } | j                  |      }|S r   )r  r  r  )re   rk   blocks      r%   ro   zMaxVit.forward  s>    IIaL[[ 	EaA	OOAr'   c                    | j                         D ]l  }t        |t        j                        rbt        j                  j                  |j                  d       |j                  Vt        j                  j                  |j                         t        |t        j                        rUt        j                  j                  |j                  d       t        j                  j                  |j                  d       t        |t        j                        st        j                  j                  |j                  d       |j                  Dt        j                  j                  |j                         o y )Nr   r   r#   r   )modules
isinstancer   r\   r   normal_weightrM   zeros_r  	constant_r   )re   ms     r%   r  zMaxVit._init_weights  s     	+A!RYY'd366%GGNN166*Ar~~.!!!((A.!!!&&!,Aryy)d366%GGNN166*	+r'   )rq   rr   rs   rt   r   GELUr
   ra   r   ru   r   r   rv   r[   r   ro   r  rw   rx   s   @r%   r   r   8  s%   J :>57WW#!" #&7t #s(Ot
 t t S	t 3it t  %t" Xc299n56#t$ #3		>2%t( )t* +t. /t0 1t2 !3t6 7t8 
9tl F +r'   r   r  r  r  r  r   r|   weightsprogresskwargsc                 d   |dt        |dt        |j                  d                |j                  d   d   |j                  d   d   k(  sJ t        |d|j                  d          |j                  dd      }	t	        d| ||||||	d|}
|"|
j                  |j                  |d	
             |
S )Nr	  
categoriesmin_sizer   r#   r      r2  )r  r  r  r  r|   r   r   T)r,  
check_hashr$   )r   r   metapopr   load_state_dictget_state_dict)r  r  r  r  r   r|   r+  r,  r-  r   models              r%   _maxvitr9    s    $ fmSl9S5TU||J'*gll:.Fq.IIIIflGLL4LML*5J 	#%!3%	 	E g44hSW4XYLr'   c                   j    e Zd Z ed eeddej                        edddddd	d
idddd      Z	e	Z
y)r   z9https://download.pytorch.org/models/maxvit_t-bc5ab103.pthr2  )	crop_sizeresize_sizeinterpolationir1  zLhttps://github.com/pytorch/vision/tree/main/references/classification#maxvitzImageNet-1KgT@g|?5.X@)zacc@1zacc@5gZd;@gK7]@zThese weights reproduce closely the results of the paper using a similar training recipe.
            They were trained with a BatchNorm2D momentum of 0.99 instead of the more correct 0.01.)r/  
num_paramsr0  recipe_metrics_ops
_file_size_docs)url
transformsr4  N)rq   rr   rs   r   r   r   r   BICUBICr   IMAGENET1K_V1DEFAULTr$   r'   r%   r   r     sb    G3CO`OhOh
 /""d##  !g
M. Gr'   r   
pretrained)r+  T)r+  r,  c                 \    t         j                  |       } t        ddg dg dddd| |d|S )	a  
    Constructs a maxvit_t architecture from
    `MaxViT: Multi-Axis Vision Transformer <https://arxiv.org/abs/2204.01697>`_.

    Args:
        weights (:class:`~torchvision.models.MaxVit_T_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.MaxVit_T_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.maxvit.MaxVit``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/maxvit.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.MaxVit_T_Weights
        :members:
    @   )rK        i   )r"   r"   r   r"       g?   )r  r  r  r|   r  r   r+  r,  r$   )r   verifyr9  )r+  r,  r-  s      r%   r   r     sH    . %%g.G 
*!!
 
 
r'   )NF)<r   collectionsr   	functoolsr   typingr   r   r   r   r	   r
   numpyr  r7   torch.nn.functionalr   
functionalr   r   torchvision.models._apir   r   r   torchvision.models._metar   torchvision.models._utilsr   r   torchvision.ops.miscr   r    torchvision.ops.stochastic_depthr   torchvision.transforms._presetsr   r   torchvision.utilsr   __all__ra   r&   r0   rB   rv   rD   rz   r   r   r   r   r   r   r   ru   boolr9  r   rG  r   r$   r'   r%   <module>r`     sX    #  A A      H H 9 T H < R 1uS#X S RU `c hmnqsvnvhw sCx C DQVWZ\_W_Q`La # #S #U\\ #TRYY TnF299 FR
ryy 
bii 4		 <ebii eP^")) ^BR")) Rj^+RYY ^+Z &*'' I	'
 s)' !' ' ' k"' ' '  !'T{ 6 ,0@0N0N!OP6:T !"23 !d !]` !ek ! Q !r'   