
    kh
                       d Z ddlZddlmZ ddlmZmZmZ ddlm	Z	 ddl
mZmZmZmZmZmZmZmZ ddlZddlmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZmZm Z  ddlm!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddlm'Z'm(Z(m)Z)m*Z*m+Z+ ddlm,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9 ddl:m;Z;m<Z< g dZ=e G d d             Z>e G d d             Z?e G d d             Z@ G d dej                        ZB G d dej                        ZC G d d ej                        ZD G d! d"ej                        ZE G d# d$ej                        ZFdd&ej                  d'eGd(eGd)dfd*ZH G d+ d,ej                        ZIdd&ej                  d'eGd(eGd)dfd-ZJd.eeK   d/eKd)eKfd0ZL G d1 d2ej                        ZM G d3 d4ej                        ZNd5ej                  d6eeK   d)ej                  fd7ZPe6d8ej                  d6eeK   d9eeK   d)ej                  fd:       ZQd5ej                  d;eeK   d)ej                  fd<ZRe6d8ej                  d;eeK   d9eeK   d)ej                  fd=       ZSd>e>d6eeKeKf   d)ee   fd?ZT G d@ dAej                        ZU G dB dCej                        ZVd5ej                  d6eeK   d)ej                  fdDZWe6d8ej                  d6eeK   d9eeK   d)ej                  fdE       ZXd5ej                  d;eeK   d)ej                  fdFZYe6d8ej                  d;eeK   d9eeK   d)ej                  fdG       ZZ G dH dIej                        Z[ G dJ dKej                        Z\ G dL dMej                        Z] G dN dOej                        Z^ G dP dQej                        Z_d>e>d9eeKeKf   d)e>fdRZ`d>e@dSed)e@fdTZa G dU dVej                        Zb	 	 	 	 	 	 	 	 	 	 	 	 dd\eGd]eGd^ecd_ecd`eGdaeGdbecdceGddeGdeeed   dfeGdgeKd)eeGef   fdhZe	 	 	 	 	 	 	 	 	 	 	 	 dd\eGd]eGd^ecdjeddaeGdceGddeGd6eeeKeKf      dkeKdeeed   dfeGdgeKd)eeGef   fdlZf	 	 	 	 	 	 	 	 	 	 	 dd\eGd]eGdaeGdoeGdceGddeGd6eeeKeKf      dpecdeeedeededf   f   dfeGdgeKd)eeGef   fdqZgd)eeGef   fdrZh eidi ds e@ddtdudvdw efdYdxy      dz e@ddtd{dvdw efdWdYdx|      d} e@dd~ddvdw eedYdX      d e@dd~ddvdw eedidYdX      d e@dddddw eedid      d e@dddddw eediddm      d e@dd~ddvdw eedidYdXd      d e@ddtd{dvdw efdYdxdnd      d e@dd~ddvdw eedidn      d e@dd~ddvdw eeddYdXdnd      d e@dd~ddvdw eedidnd[      d e@dddddw eediddmdn      d e@dddddw eediddmdn      d e@ddtd{dvdd ee       d e@ddtd{dvdd egdZd      d e@d~dudd      d e@d~ddd      d e@dddd      d e@dddd      d e@dddd      d e@dddd      d e@dddddd ef       d e@ddtdddvd ef       d e@ddtdddvd ef       d e@ddtdddvd ef       d e@dddddd efdn      d e@ddtdddvd efdn      d e@ddtdddvd efdn      d e@dd~dddvd efdndm      d e@dd~dddvddƜ efdn      d e@ddtdddvddȜ eg       d e@ddtdddvd eg       d e@dd~dddd eg       d e@dd~dddddȜ egdYdZͫ      d e@dddddd egdYЫ      d e@dddddddƜ egdYЫ      d e@ddtddddYd[dל eh       d e@dd~ddddYddל eh       d e@dd~ddddYddל eh       d e@ddddddYddל eh       d e@ddddddYddל eh       ZjdeeGej                  f   dej                  d)eeGej                  f   fdބZkddeGdeeG   decdSed)ebf
dZlddeGdSed)eeGef   fdZm e;i d emd%      d emddd      d emdd      d emdd      d emd      d emd      d emd      d emddddd      d emddeed      d emdd d      d emd%      d emdd      d emdd      d emd%      d emd%      d emdd	d      d
 emdd      i d emdd      d emdd      d emdd      d emd%      d emd%      d emd%      d emd%      d emd%      d emd%      d emd%dd      d emdddd      d emdd      d emd%dd      d  emd%dd      d! emdd"dd      d# emdd$dd      d% emdd&dd      i d' emdd(d      d) emd%dd      d* emd      d+ emddddd      d, emdd      d- emdd.dd      d/ emd%dd      d0 emdd1dd      d2 emddd3      d4 emd      d5 emddddd      d6 emd%      d7 emdd      d8 emdee9      d: emddddd      d; emdd<d=dd      d> emdee9      i d? emddddd      d@ emdd<d=dd      dA emdee9      dB emddddd      dC emdd<d=dd      dD emdee9      dE emddddd      dF emdd<d=dd      dG emddH      dI emddddd      dJ emdd<d=dd      dK emddH      dL emddddd      dM emdd<ddN      dO emddH      dP emddddd      dQ emdd<d=dd            Zne<ddecdSed)ebfdR       Zoe<ddecdSed)ebfdS       Zpe<ddecdSed)ebfdT       Zqe<ddecdSed)ebfdU       Zre<ddecdSed)ebfdV       Zse<ddecdSed)ebfdW       Zte<ddecdSed)ebfdX       Zue<ddecdSed)ebfdY       Zve<ddecdSed)ebfdZ       Zwe<ddecdSed)ebfd[       Zxe<ddecdSed)ebfd\       Zye<ddecdSed)ebfd]       Zze<ddecdSed)ebfd^       Z{e<ddecdSed)ebfd_       Z|e<ddecdSed)ebfd`       Z}e<ddecdSed)ebfda       Z~e<ddecdSed)ebfdb       Ze<ddecdSed)ebfdc       Ze<ddecdSed)ebfdd       Ze<ddecdSed)ebfde       Ze<ddecdSed)ebfdf       Ze<ddecdSed)ebfdg       Ze<ddecdSed)ebfdh       Ze<ddecdSed)ebfdi       Ze<ddecdSed)ebfdj       Ze<ddecdSed)ebfdk       Ze<ddecdSed)ebfdl       Ze<ddecdSed)ebfdm       Ze<ddecdSed)ebfdn       Ze<ddecdSed)ebfdo       Ze<ddecdSed)ebfdp       Ze<ddecdSed)ebfdq       Ze<ddecdSed)ebfdr       Ze<ddecdSed)ebfds       Ze<ddecdSed)ebfdt       Ze<ddecdSed)ebfdu       Ze<ddecdSed)ebfdv       Ze<ddecdSed)ebfdw       Ze<ddecdSed)ebfdx       Ze<ddecdSed)ebfdy       Ze<ddecdSed)ebfdz       Ze<ddecdSed)ebfd{       Ze<ddecdSed)ebfd|       Ze<ddecdSed)ebfd}       Ze<ddecdSed)ebfd~       Ze<ddecdSed)ebfd       Ze<ddecdSed)ebfd       Ze<ddecdSed)ebfd       Ze<ddecdSed)ebfd       Ze<ddecdSed)ebfd       Ze<ddecdSed)ebfd       Ze<ddecdSed)ebfd       Ze<ddecdSed)ebfd       Ze<ddecdSed)ebfd       Ze<ddecdSed)ebfd       Ze<ddecdSed)ebfd       Zy(  a   MaxVit and CoAtNet Vision Transformer - CNN Hybrids in PyTorch

This is a from-scratch implementation of both CoAtNet and MaxVit in PyTorch.

99% of the implementation was done from papers, however last minute some adjustments were made
based on the (as yet unfinished?) public code release https://github.com/google-research/maxvit

There are multiple sets of models defined for both architectures. Typically, names with a
 `_rw` suffix are my own original configs prior to referencing https://github.com/google-research/maxvit.
These configs work well and appear to be a bit faster / lower resource than the paper.

The models without extra prefix / suffix' (coatnet_0_224, maxvit_tiny_224, etc), are intended to
match paper, BUT, without any official pretrained weights it's difficult to confirm a 100% match.

Papers:

MaxViT: Multi-Axis Vision Transformer - https://arxiv.org/abs/2204.01697
@article{tu2022maxvit,
  title={MaxViT: Multi-Axis Vision Transformer},
  author={Tu, Zhengzhong and Talebi, Hossein and Zhang, Han and Yang, Feng and Milanfar, Peyman and Bovik, Alan and Li, Yinxiao},
  journal={ECCV},
  year={2022},
}

CoAtNet: Marrying Convolution and Attention for All Data Sizes - https://arxiv.org/abs/2106.04803
@article{DBLP:journals/corr/abs-2106-04803,
  author    = {Zihang Dai and Hanxiao Liu and Quoc V. Le and Mingxing Tan},
  title     = {CoAtNet: Marrying Convolution and Attention for All Data Sizes},
  journal   = {CoRR},
  volume    = {abs/2106.04803},
  year      = {2021}
}

Hacked together by / Copyright 2022, Ross Wightman
    N)OrderedDict)	dataclassreplacefield)partial)AnyCallableDictListOptionalSetTupleUnion)nn)Final)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)MlpConvMlpDropPath	LayerNormClassifierHeadNormMlpClassifierHead)create_attnget_act_layerget_norm_layerget_norm_act_layercreate_conv2dcreate_pool2d)trunc_normal_tf_	to_2tupleextend_tuplemake_divisible_assert)	RelPosMlp
RelPosBiasRelPosBiasTfuse_fused_attnresize_rel_pos_bias_table   )build_model_with_cfg)feature_take_indices)register_notrace_function)named_applycheckpoint_seq)generate_default_cfgsregister_model)
MaxxVitCfgMaxxVitConvCfgMaxxVitTransformerCfgMaxxVitc                   d   e Zd ZU dZdZeed<   dZeed<   dZ	e
ed<   dZeed<   dZeed	<   dZeed
<   dZe
ed<   dZe
ed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeeeef      ed<   dZeeeef      ed<   dZeed<   dZeed<   dZee
   ed<   dZeed<   dZeed<   d Zeed!<   d"Ze
ed#<   d$ Z y)%r4   z-Configuration for MaxxVit transformer blocks.    dim_headT
head_first      @expand_ratioexpand_firstshortcut_bias	attn_bias        	attn_drop	proj_dropavg2	pool_typebiasrel_pos_type   rel_pos_dimpartition_ratioNwindow_size	grid_sizeFno_block_attnuse_nchw_attninit_valuesgelu	act_layerlayernorm2d
norm_layer	layernormnorm_layer_clư>norm_epsc                     | j                   t        | j                         | _         | j                  9t        | j                        | _        | j                   | j                  | _         y y y N)rJ   r!   rI   selfs    O/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/models/maxxvit.py__post_init__z#MaxxVitTransformerCfg.__post_init__V   s\    >>%&t~~6DN'()9)9:D~~%!%!1!1 & (    )!__name__
__module____qualname____doc__r8   int__annotations__r9   boolr;   floatr<   r=   r>   r@   rA   rC   strrE   rG   rH   rI   r   r   rJ   rK   rL   rM   rO   rQ   rS   rU   r[    r\   rZ   r4   r4   =   s    7HcJL%L$M4ItIuIuIsL#KOS-1K%S/*1+/Ixc3h(/M4M4#'K%'Is#J#$M3$He2r\   r4   c                   <   e Zd ZU dZdZeed<   dZeed<   dZ	e
ed<   dZeed	<   d
Zeed<   dZe
ed<   dZe
ed<   dZeed<   dZeed<   dZeed<   dZeed<   dZe
ed<   dZeed<   dZeed<   dZeed<   dZee   ed<   dZeed <   dZeed!<   dZeed"<   d#Zee   ed$<   d% Zy#)&r3   z-Configuration for MaxxVit convolution blocks.mbconv
block_typer:   r;   Texpand_output   kernel_sizer*   
group_sizeFpre_norm_actoutput_biasdwstride_moderB   rC   downsample_pool_type padding
attn_earlyse
attn_layersiluattn_act_layer      ?
attn_ratiorT   rM   rN   rO   rQ   rS   NrU   c                    | j                   dv sJ | j                   dk(  }| j                  s|rdnd| _        | j                  s	|sd| _        | j                  |rdnd| _        | j                  xs | j
                  | _        y )N)rh   convnextrh   batchnorm2drP   rR   h㈵>rT   )ri   rQ   rS   rU   rr   rC   )rY   
use_mbconvs     rZ   r[   zMaxxVitConvCfg.__post_init__w   st    "8888__0
/9m}DO!!*!,D== $.DDDM$($=$=$O!r\   )r]   r^   r_   r`   ri   re   rb   r;   rd   rj   rc   rl   ra   rm   rn   ro   rq   rC   rr   rt   ru   rw   ry   r{   rM   r   rO   rQ   rS   rU   r[   rf   r\   rZ   r3   r3   _   s    7JL%M4KJL$KKIs &#&GSJJ NC J#'K%'IsJM3 $Hhuo$
Pr\   r3   c                       e Zd ZU dZdZeedf   ed<   dZeedf   ed<   dZ	ee
eeedf   f   df   ed<   d	Ze
eeeef   f   ed
<   dZeed<    ee      Zeed<    ee      Zeed<   dZee   ed<   dZeed<   y)r2   z!Configuration for MaxxVit models.`           .	embed_dim   rk      r   depths)Cr   Tr   ri   @   
stem_widthF	stem_bias)default_factoryconv_cfgtransformer_cfgNhead_hidden_sizevit_effweight_init)r]   r^   r_   r`   r   r   ra   rb   r   ri   r   re   r   r   rc   r   r3   r   r4   r   r   r   r   rf   r\   rZ   r2   r2      s    +!4IuS#X4*FE#s(O*:NJeCsCx01367N.0Jc5c?*+0It$^DHnD-2CX-YO*Y&*hsm* K r\   r2   c                        e Zd ZU dZee   ed<   	 	 	 	 	 	 	 	 ddedee   dedededed	ee	   d
e
de
f fdZddej                  deej                     dej                  fdZ xZS )Attention2dz)Multi-head attention for 2D NCHW tensors.
fused_attndimdim_outr8   rD   r<   r9   rel_pos_clsr@   rA   c
                    t         |           |xs |}|r|n|}
|
|z  | _        || _        || _        |dz  | _        t               | _        t        j                  ||
dz  d|      | _
        |r || j                        nd| _        t        j                  |      | _        t        j                  |
|d|      | _        t        j                  |	      | _        y)  
        Args:
            dim: Input dimension.
            dim_out: Output dimension (defaults to input dimension).
            dim_head: Dimension per attention head.
            bias: Whether to use bias in qkv and projection.
            expand_first: Whether to expand channels before or after qkv.
            head_first: Whether heads are first in tensor layout.
            rel_pos_cls: Relative position class to use.
            attn_drop: Attention dropout rate.
            proj_drop: Projection dropout rate.
              rk   r*   rD   	num_headsN)super__init__r   r8   r9   scaler(   r   r   Conv2dqkvrel_posDropoutr@   projrA   rY   r   r   r8   rD   r<   r9   r   r@   rA   dim_attn	__class__s              rZ   r   zAttention2d.__init__   s    0 	.S*7!X- $%
(*99S(Q,=@K{T^^<QUI.IIh>	I.r\   xshared_rel_posreturnc                    |j                   \  }}}}| j                  rP| j                  |      j                  || j                  | j
                  dz  d      j                  dd      \  }}}	nK| j                  |      j                  |d| j                  | j
                  d      j                  d      \  }}}	| j                  rd }
| j                  | j                  j                         }
n||}
t        j                  j                  j                  |j!                  dd      j#                         |j!                  dd      j#                         |	j!                  dd      j#                         |
| j$                  r| j&                  j(                  nd      j!                  dd      j                  |d||      }n|| j*                  z  }|j!                  dd      |z  }| j                  | j                  |      }n|||z   }|j-                  d      }| j'                  |      }|	|j!                  dd      z  j                  |d||      }| j/                  |      }| j1                  |      }|S )	Nrk   r   r   r*   r?   	attn_mask	dropout_p)shaper9   r   viewr   r8   chunkreshapeunbindr   r   get_biastorchr   
functionalscaled_dot_product_attention	transpose
contiguoustrainingr@   pr   softmaxr   rA   )rY   r   r   Br   HWqkvr>   attns               rZ   forwardzAttention2d.forward   s   WW
1a??hhqk&&q$..$--!:KRPVVWX^_V`GAq!hhqk))!QrRYYZ[\GAq!??I||' LL113	+*	##@@B#..0B#..0B#..0#.2mm$..** A  iB2q! 4  DJJA;;r2&*D||'||D)+n,<<B<'D>>$'DT^^B++11!RA>AIIaLNN1r\   Nr7   TTTNr?   r?   rW   r]   r^   r_   r`   r   rc   rb   ra   r   r	   rd   r   r   Tensorr   __classcell__r   s   @rZ   r   r      s    3d
 &*!%#.2!!%/%/ c]%/ 	%/
 %/ %/ %/ "(+%/ %/ %/N# #x7M #Y^YeYe #r\   r   c                        e Zd ZU dZee   ed<   	 	 	 	 	 	 	 	 ddedee   dedededed	ee	   d
e
de
f fdZddej                  deej                     dej                  fdZ xZS )AttentionClz/Channels-last multi-head attention (B, ..., C).r   r   r   r8   rD   r<   r9   r   r@   rA   c
                    t         |           |xs |}|r||kD  r|n|}
|
|z  dk(  sJ d       |
|z  | _        || _        || _        |dz  | _        t               | _        t        j                  ||
dz  |      | _
        |r || j                        nd| _        t        j                  |      | _        t        j                  |
||      | _        t        j                  |	      | _        y)r   r   z(attn dim should be divisible by head_dimr   rk   r   r   N)r   r   r   r8   r9   r   r(   r   r   Linearr   r   r   r@   r   rA   r   s              rZ   r   zAttentionCl.__init__   s    0 	.S*w}7#("a'S)SS'!X- $%
(*99S(Q,T:@K{T^^<QUI.IIhd;	I.r\   r   r   r   c                 V   |j                   d   }|j                   d d }| j                  r`| j                  |      j                  |d| j                  | j
                  dz        j                  dd      j                  dd      \  }}}n[| j                  |      j                  |dd| j                  | j
                        j                  dd      j                  d      \  }}}| j                  r~d }| j                  | j                  j                         }n||}t        j                  j                  j!                  ||||| j"                  r| j$                  j&                  nd      }ns|| j(                  z  }||j                  d	d      z  }	| j                  | j                  |	|
      }	n||	|z   }	|	j+                  d      }	| j%                  |	      }	|	|z  }|j                  dd      j                  |dz         }| j-                  |      }| j/                  |      }|S )Nr   r   rk   r*   r   r   r?   r   r   r   )r   )r   r9   r   r   r   r8   r   r   r   r   r   r   r   r   r   r   r   r   r@   r   r   r   r   rA   )
rY   r   r   r   restore_shaper   r   r   r>   r   s
             rZ   r   zAttentionCl.forward  s   GGAJ??hhqk&&q"dnndmma>OPZZ[\^_`ffghnofpGAq!hhqk))!RDNNDMMR\\]^`abiijklGAq!??I||' LL113	+*	##@@1a#.2mm$..** A A DJJAq{{2r**D||'||D|H+n,<<B<'D>>$'DqAKK1%%me&;<IIaLNN1r\   r   rW   r   r   s   @rZ   r   r      s    9d
 &*!%#.2!!&/&/ c]&/ 	&/
 &/ &/ &/ "(+&/ &/ &/P# #x7M #Y^YeYe #r\   r   c                   j     e Zd ZdZd	dededef fdZdej                  dej                  fdZ
 xZS )

LayerScalezPer-channel scaling layer.r   rM   inplacec                     t         |           || _        t        j                  |t        j                  |      z        | _        yz
        Args:
            dim: Number of channels.
            init_values: Initial scaling value.
            inplace: Whether to perform inplace operations.
        Nr   r   r   r   	Parameterr   onesgammarY   r   rM   r   r   s       rZ   r   zLayerScale.__init__8  4     	\\+

3"?@
r\   r   r   c                 ^    | j                   }| j                  r|j                  |      S ||z  S rW   )r   r   mul_rY   r   r   s      rZ   r   zLayerScale.forwardC  s(    

 $qvve};!e);r\   r   Fr]   r^   r_   r`   ra   rd   rc   r   r   r   r   r   r   s   @rZ   r   r   5  s?    $	AC 	Ae 	AT 	A< <%,, <r\   r   c                   j     e Zd ZdZd	dededef fdZdej                  dej                  fdZ
 xZS )
LayerScale2dz)Per-channel scaling layer for 2D tensors.r   rM   r   c                     t         |           || _        t        j                  |t        j                  |      z        | _        yr   r   r   s       rZ   r   zLayerScale2d.__init__K  r   r\   r   r   c                     | j                   j                  dddd      }| j                  r|j                  |      S ||z  S )Nr*   r   )r   r   r   r   r   s      rZ   r   zLayerScale2d.forwardV  s7    

2q!, $qvve};!e);r\   r   r   r   s   @rZ   r   r   H  s?    3	AC 	Ae 	AT 	A< <%,, <r\   r   c                   x     e Zd ZdZ	 	 	 ddededededef
 fdZdej                  d	ej                  fd
Z
 xZS )Downsample2da5  A downsample pooling module supporting several maxpool and avgpool modes.

    * 'max' - MaxPool2d w/ kernel_size 3, stride 2, padding 1
    * 'max2' - MaxPool2d w/ kernel_size = stride = 2
    * 'avg' - AvgPool2d w/ kernel_size 3, stride 2, padding 1
    * 'avg2' - AvgPool2d w/ kernel_size = stride = 2
    r   r   rC   rt   rD   c                    t         |           |dv sJ |dk(  rt        ddd|xs d      | _        nS|dk(  rt        dd|xs d	      | _        n6|d
k(  rt        d
ddd|xs d      | _        nt        d
d|xs d	      | _        ||k7  rt	        j
                  ||d|      | _        yt	        j                         | _        y)z
        Args:
            dim: Input dimension.
            dim_out: Output dimension.
            pool_type: Type of pooling operation.
            padding: Padding mode.
            bias: Whether to use bias in expansion conv.
        )maxmax2avgrB   r   rk   r   r*   )rl   stridert   r   r   )rt   r   F)rl   r   count_include_padrt   r   N)r   r   r   poolr   r   expandIdentity)rY   r   r   rC   rt   rD   r   s         rZ   r   zDownsample2d.__init__d  s      	::::%e1glYZ[DI& %eQ1EDI%%1Q%QXQ]\]_DI &eQ1EDI'>))C!$?DK++-DKr\   r   r   c                 J    | j                  |      }| j                  |      }|S rW   )r   r   rY   r   s     rZ   r   zDownsample2d.forward  s!    IIaLKKNr\   )rB   rs   T)r]   r^   r_   r`   ra   re   rc   r   r   r   r   r   r   s   @rZ   r   r   [  sd     $(( ( 	(
 ( (B %,, r\   r   rs   modulenameschemer   c                    t        | t        j                  t        j                  f      r|dk(  rbt        j                  j                  | j                  d       | j                  *t        j                  j                  | j                         yy|dk(  rNt        | j                  d       | j                  *t        j                  j                  | j                         yy|dk(  r`t        j                  j                  | j                         | j                  *t        j                  j                  | j                         yyt        j                  j                  | j                         | j                  Zd|v r,t        j                  j                  | j                  d       yt        j                  j                  | j                         yyy)	z&Initialize transformer module weights.normal{Gz?stdNtrunc_normalxavier_normalmlprT   )
isinstancer   r   r   initnormal_weightrD   zeros_r    xavier_normal_xavier_uniform_)r   r   r   s      rZ   _init_transformerr    s5   &299bii01XGGOOFMMsO3{{&v{{+ '~%V]]4{{&v{{+ '&GG""6==1{{&v{{+ ' GG##FMM2{{&D=GGOOFKKTO:GGNN6;;/	 '! 2r\   c                        e Zd ZdZdd e       dfdedededee   d	ed
ef fdZ	dde
ddfdZddej                  deej                     dej                  fdZ xZS )TransformerBlock2daY  Transformer block with 2D downsampling.

    '2D' NCHW tensor layout

    Some gains can be seen on GPU using a 1D / CL block, BUT w/ the need to switch back/forth to NCHW
    for spatial pooling, the benefit is minimal so ended up using just this variant for CoAt configs.

    This impl was faster on TPU w/ PT XLA than the 1D experiment.
    r*   Nr?   r   r   r   r   cfg	drop_pathc                    t         	|           t        t        |j                        |j
                        }t        |j                        }|dk(  rnt        |||j                  |j                        | _        t        j                  t        d ||      fdt        |||j                        fg            | _        n-||k(  sJ t        j                          | _         ||      | _        t#        |||j$                  |j&                  |j(                  ||j*                  |j,                        | _        |j0                  rt3        ||j0                        nt        j                          | _        |d	kD  rt7        |      nt        j                          | _         ||      | _        t=        |t?        ||j@                  z        ||j,                  
      | _!        |j0                  rt3        ||j0                        nt        j                          | _"        |d	kD  rt7        |      | _#        yt        j                          | _#        y)a  
        Args:
            dim: Input dimension.
            dim_out: Output dimension.
            stride: Stride for downsampling.
            rel_pos_cls: Relative position class.
            cfg: Transformer block configuration.
            drop_path: Drop path rate.
        epsr   )rC   rD   normdownrC   )r8   r<   rD   r   r@   rA   rM   r?   in_featureshidden_featuresrO   dropN)$r   r   r   r   rQ   rU   r   rO   r   rC   r=   shortcutr   
Sequentialr   norm1r   r   r8   r<   r>   r@   rA   r   rM   r   ls1r   
drop_path1norm2r   ra   r;   r  ls2
drop_path2)
rY   r   r   r   r   r  r  rQ   rO   r   s
            rZ   r   zTransformerBlock2d.__init__  s   $ 	^CNN;N
!#--0	Q;(gUXUfUfgDM{C)c3#--HI4 ( DJ
 '>!>KKMDM#CDJ\\))#mmmm	
	 JM<S__E^`^i^i^k1:R(9-R[[](
#*:*: :;	 
 JM<S__E^`^i^i^k1:R(9-R[[]r\   r   r   c                 :    t        t        t        |      |        y Nr   )r.   r   r  rY   r   s     rZ   init_weightszTransformerBlock2d.init_weights  s    G-f=tDr\   r   r   c           
      ,   | j                  |      | j                  | j                  | j                  | j	                  |      |                  z   }|| j                  | j                  | j                  | j                  |                        z   }|S )Nr   )	r  r!  r   r   r  r$  r#  r  r"  )rY   r   r   s      rZ   r   zTransformerBlock2d.forward  so    MM!ttxx		$**Q-`n	8o/pqq$**Q-)@ ABBr\   rs   rW   )r]   r^   r_   r`   r4   ra   r   r	   rd   r   re   r)  r   r   r   r   r   s   @rZ   r  r    s     .2)>)@!5S5S 5S 	5S
 "(+5S '5S 5SnE3 E E x7M Y^YeYe r\   r  c                    t        | t        j                        r|dk(  rbt        j                  j	                  | j
                  d       | j                  *t        j                  j                  | j                         yy|dk(  rNt        | j
                  d       | j                  *t        j                  j                  | j                         yy|dk(  r`t        j                  j                  | j
                         | j                  *t        j                  j                  | j                         yy| j                  d   | j                  d   z  | j                  z  }|| j                  z  }t        j                  j	                  | j
                  dt        j                  d	|z               | j                  *t        j                  j                  | j                         yyy)
z&Initialize convolution module weights.r   r   r  Nr  r  r   r*   g       @)r  r   r   r  r  r	  rD   r
  r    r  rl   out_channelsgroupsmathsqrt)r   r   r   fan_outs       rZ   
_init_convr2    sU   &"))$XGGOOFMMsO3{{&v{{+ '~%V]]4{{&v{{+ '&GG""6==1{{&v{{+ ' ((+f.@.@.CCfFYFYYG%GGGOOFMM1diig.FG{{&v{{+ '% %r\   rm   channelsc                 &    | sy|| z  dk(  sJ || z  S )z3Calculate number of groups for grouped convolution.r*   r   rf   )rm   r3  s     rZ   
num_groupsr5    s(     *$))):%%r\   c                        e Zd ZdZdd e       dfdedededeeef   d	ed
ef fdZdde	ddfdZ
dej                  dej                  fdZ xZS )MbConvBlockzGPre-Norm Conv Block - 1x1 - kxk - 1x1, w/ inverted bottleneck (expand).r*   r*   r*   r?   in_chsout_chsr   dilationr  r  c           	         t         t        |           t        t	        |j
                  |j                        |j                        }t        |j                  r|n||j                  z        }t        |j                  |      }	|dk(  r4t        |||j                  |j                  |j                         | _        nt%        j&                         | _        |j(                  dv sJ d\  }
}}|j(                  dk(  r||d   }}
n|j(                  dk(  r||d   }}n||d	   }} |||j*                  
      | _        |
dkD  r)t        |||j.                  |j                         | _        nt%        j&                         | _        t3        ||d|      | _         ||      | _        t3        |||j8                  |||	|j                         | _        i }t=        |j>                  t@              rV|j>                  dk(  s|j>                  dk(  r8|jB                  |d<   tE        |jF                  |j                  r|n|z        |d<   |jH                  r1tK        |j>                  |fi || _&         ||      | _'        d| _(        n0d| _&         ||      | _'        tK        |j>                  |fi || _(        t3        ||d|j                        | _)        |dkD  rtU        |      | _+        yt%        j&                         | _+        y)a  
        Args:
            in_chs: Input channels.
            out_chs: Output channels.
            stride: Stride for conv.
            dilation: Dilation for conv.
            cfg: Convolution block configuration.
            drop_path: Drop path rate.
        r  r   )rC   rD   rt   )r   1x1rp   )r*   r*   r*   r   r*   r=  r   )	apply_act)rC   rt   )r   )r   r;  r.  rt   rv   ecarO   rd_channelsNr   r?   ),r   r7  r   r   r   rQ   rO   rU   r#   rj   r;   r5  rm   r   rC   ro   rt   r  r   r   rq   rn   pre_normrr   r  r   	conv1_1x1r  rl   	conv2_kxkr  rw   re   ry   ra   r{   ru   r   se_earlyr"  rv   	conv3_1x1r   r  )rY   r9  r:  r   r;  r  r  norm_act_layermid_chsr.  stride_poolstride_1stride_2
dilation_2attn_kwargsr   s                  rZ   r   zMbConvBlock.__init__  sy   $ 	k4)+ !3CNNCMM!RX[XdXde S->->'FcN^N^!^_CNNG4Q;(3==sX[XcXceDM KKMDM"7777*1'Xx??f$&,hqkK__%#)8A;jH#)8A;jH&v9I9IJ?$VVs?W?WadalalmDIDI&vw(K#G,
&WcoojV cnnc*~~%5)@+.+=+=K(-0cN_N_7el1m-nM* >>'O;ODM'0DJDG DM'0DJ!#..'I[IDG&wQ09B),BKKMr\   r   r   Nc                 :    t        t        t        |      |        y r&  r.   r   r2  r(  s     rZ   r)  zMbConvBlock.init_weights\      GJv6=r\   r   c                    | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }| j                  |      }| j                  | j                  |      }| j                  |      }| j                  | j                  |      }| j                  |      }| j                  |      |z   }|S rW   )r  rA  r  rB  r  rC  rD  r"  rv   rE  r  rY   r   r  s      rZ   r   zMbConvBlock.forward_  s    ==#MM!IIaL NN1JJqM NN1==$a AJJqM77
A NN1NN1(r\   r+  )r]   r^   r_   r`   r3   ra   r   rd   r   re   r)  r   r   r   r   r   s   @rZ   r7  r7    s    Q (."0"2!FRFR FR 	FR
 CHoFR  FR FRP>3 > > %,, r\   r7  c                        e Zd ZdZdddd e       ddfded	ee   d
ededeeef   dedede	f fdZ
dej                  dej                  fdZ xZS )ConvNeXtBlockzConvNeXt Block.N   r*   r8  Tr?   r9  r:  rl   r   r;  r  conv_mlpr  c	           	         t         |           |xs |}t        |j                        }	|r1t	        t        |j                        |j                        }
t        }nd|j                  v sJ t        }
t        }|| _        |dk(  rt        ||      | _        nG||k7  r)t        j                  ||d|j                         | _        nt        j"                         | _        |j$                  dv sJ d\  }}|j$                  dk(  r|}n|}|dk(  rt        |||j&                  	      | _        nt        j"                         | _        t+        |||||d   d
|j                         | _         |
|      | _         ||t1        |j2                  |z        |j                   |	      | _        |r<|j6                  rt9        ||j6                        nt        j"                         | _        n;|j6                  rt=        ||j6                        nt        j"                         | _        |dkD  rt?        |      | _         yt        j"                         | _         y)ay  
        Args:
            in_chs: Input channels.
            out_chs: Output channels.
            kernel_size: Kernel size for depthwise conv.
            stride: Stride for conv.
            dilation: Dilation for conv.
            cfg: Convolution block configuration.
            conv_mlp: Whether to use convolutional MLP.
            drop_path: Drop path rate.
        r  rR   r   r*   )rl   rD   )r   rp   r8  r   r  T)rl   r   r;  	depthwiserD   )rD   rO   r?   N)!r   r   r   rO   r   r   rQ   rU   r   r   r   use_conv_mlpr   r  r   r   ro   r   rq   rr   r  r   conv_dwr  ra   r;   r  rM   r   lsr   r   r  )rY   r9  r:  rl   r   r;  r  rU  r  rO   rQ   	mlp_layerrH  	stride_dwr   s                 rZ   r   zConvNeXtBlock.__init__y  s   , 	#V!#--0	 !?S\\RJI#..000"JI$Q;(9DMwIIfg13??[DMKKMDM.000!%Y??f$ KI!$VVs?W?WXDIDI$GYQYZ[Q\2 w'	Wc#*:*:W*D&ECOOgpq@Cl7COO<UWU`U`UbDG>Aooj#//:SUS^S^S`DG09B),BKKMr\   r   r   c                    | j                  |      }| j                  |      }| j                  |      }| j                  r4| j	                  |      }| j                  |      }| j                  |      }n[|j                  dddd      }| j	                  |      }| j                  |      }| j                  |      }|j                  dddd      }| j                  |      |z   }|S Nr   r   rk   r*   )	r  r  rY  rX  r  r  rZ  permuter  rQ  s      rZ   r   zConvNeXtBlock.forward  s    ==#IIaLLLO		!AA
A		!Q1%A		!AA
A		!Q1%ANN1(r\   )r]   r^   r_   r`   r3   ra   r   r   rc   rd   r   r   r   r   r   r   s   @rZ   rS  rS  v  s    
 &* (."0"2!!?R?R c]?R 	?R
 ?R CHo?R  ?R ?R ?RB %,, r\   rS  r   rI   c                 l   | j                   \  }}}}t        ||d   z  dk(  d| d|d    d       t        ||d   z  dk(  d| d|d    d       | j                  |||d   z  |d   ||d   z  |d   |      } | j                  ddddd	d
      j	                         j                  d|d   |d   |      }|S )z'Partition into non-overlapping windows.r   height () must be divisible by window ()r*   width (rk   r      r   r   r   r$   r   r_  r   )r   rI   r   r   r   r   windowss          rZ   window_partitionrh    s    JAq!QAA!#xs2QR]^_R`Qaab%cdAA!#wqc1PQ\]^Q_P``a%bc	q!{1~%{1~qKN7JKXYN\]^Aii1aAq)446;;BAP[\]P^`abGNr\   rg  img_sizec                     |\  }}| j                   d   }| j                  d||d   z  ||d   z  |d   |d   |      }|j                  dddddd      j                         j                  d|||      }|S )zReverse window partition.r   r   r*   rk   r   re  r   r   r   r_  r   rg  rI   ri  r   r   r   r   s          rZ   window_reverserm    s     DAqbARk!n,a;q>.A;q>S^_`SacdeA			!Q1a#..055b!QBAHr\   rJ   c           	      h   | j                   \  }}}}t        ||d   z  dk(  d| d|d           t        ||d   z  dk(  d| d|d           | j                  ||d   ||d   z  |d   ||d   z  |      } | j                  dddddd	      j	                         j                  d
|d   |d   |      }|S )z6Partition into overlapping windows with grid striding.r   height  must be divisible by grid r*   width r   re  rk   r   r   rf  )r   rJ   r   r   r   r   rg  s          rZ   grid_partitionrr    s    JAq!QA	!!WQC/J9UV<.#YZA	!!VA3.I)TU,#XY	q)A,Yq\ 19Q<iPQlARTUVAii1aAq)446;;B	!iXYl\]^GNr\   c                     |\  }}| j                   d   }| j                  d||d   z  ||d   z  |d   |d   |      }|j                  dddddd      j                         j                  d|||      }|S )zReverse grid partition.r   r   r*   rk   re  r   r   rk  rg  rJ   ri  r   r   r   r   s          rZ   grid_reverseru    s     DAqbARil*A1,=y|YWX\[\]A			!Q1a#..055b!QBAHr\   r  c                     d}| j                   dk(  rt        t        || j                        }|S | j                   dk(  rt        t        |      }|S | j                   dk(  rt        t
        |      }|S )z,Get relative position class based on config.Nr  )rI   
hidden_dimrD   )rI   bias_tf)rE   r   r%   rG   r&   r'   )r  rI   r   s      rZ   get_rel_pos_clsry    su    K
5 i[S__]
 	 
		V	#jkB  
		Y	&lDr\   c            	       R     e Zd ZdZd e       dfdedededef fdZd	 Z	d
 Z
 xZS )PartitionAttentionClzRGrid or Block partition + Attn + FFN.

    NxC 'channels last' tensor layout.
    blockr?   r   partition_typer  r  c           
         t         |           t        t        |j                        |j
                        }t        |j                        }|dk(  | _        t        | j                  r|j                  n|j                        | _        t        || j                        } ||      | _        t        |||j                   |j"                  |j$                  ||j&                  |j(                        | _        |j,                  rt/        ||j,                        nt1        j2                         | _        |dkD  rt7        |      nt1        j2                         | _         ||      | _        t=        |t?        ||j@                  z        ||j(                        | _!        |j,                  rt/        ||j,                        nt1        j2                         | _"        |dkD  rt7        |      | _#        y t1        j2                         | _#        y )Nr  r|  r8   rD   r9   r   r@   rA   r  r?   r  )$r   r   r   r   rS   rU   r   rO   partition_blockr!   rI   rJ   partition_sizery  r  r   r8   r>   r9   r@   rA   r   rM   r   r   r   r   r   r!  r"  r   ra   r;   r  r#  r$  	rY   r   r}  r  r  rQ   rO   r   r   s	           rZ   r   zPartitionAttentionCl.__init__  s|    	^C,=,=>CLLQ
!#--0	-8'4;O;OUXUbUbc%c4+>+>?_
\\~~#mmmm	
	 DG??:cs?XZXcXcXe1:R(9-R[[]_
c&6&6 67	 
 DG??:cs?XZXcXcXe1:R(9-R[[]r\   c                 0   |j                   dd }| j                  rt        || j                        }nt	        || j                        }| j                  |      }| j                  rt        || j                  |      }|S t        || j                  |      }|S )Nr*   rk   )r   r  rh  r  rr  r   rm  ru  rY   r   ri  partitioneds       rZ   _partition_attnz$PartitionAttentionCl._partition_attn-  s    771Q<*1d.A.ABK(D,?,?@Kii,{D,?,?JA  [$*=*=xHAr\   c           
      
   || j                  | j                  | j                  | j                  |                        z   }|| j	                  | j                  | j                  | j                  |                        z   }|S rW   r!  r   r  r  r$  r#  r  r"  r   s     rZ   r   zPartitionAttentionCl.forward<  c    )=)=djjm)L MNN$**Q-)@ ABBr\   )r]   r^   r_   r`   r4   ra   re   rd   r   r  r   r   r   s   @rZ   r{  r{    sQ     #*)>)@!$S$S  $S '	$S
 $SLr\   r{  c                        e Zd ZdZ e       dfdededef fdZdej                  dej                  fd	Z
dej                  dej                  fd
Z xZS )ParallelPartitionAttentionzQExperimental. Grid and Block partition + single FFN.

    NxC tensor layout.
    r?   r   r  r  c           
         t         |           |dz  dk(  sJ t        t        |j                        |j
                        }t        |j                        }|j                  |j                  k(  sJ t        |j                        | _        t        || j                        } ||      | _        t        ||dz  |j                  |j                   |j"                  ||j$                  |j&                        | _        t        ||dz  |j                  |j                   |j"                  ||j$                  |j&                        | _        |j,                  rt/        ||j,                        nt1        j2                         | _        |dkD  rt7        |      nt1        j2                         | _         ||      | _        t=        |t?        ||j@                  z        |||j&                        | _!        |j,                  rt/        ||j,                        nt1        j2                         | _"        |dkD  rt7        |      | _#        yt1        j2                         | _#        y)	z
        Args:
            dim: Input dimension.
            cfg: Transformer block configuration.
            drop_path: Drop path rate.
        r   r   r  r  r  r?   )r  r  out_featuresrO   r  N)$r   r   r   r   rS   rU   r   rO   rI   rJ   r!   r  ry  r  r   r8   r>   r9   r@   rA   
attn_block	attn_gridrM   r   r   r   r   r   r!  r"  r   ra   r;   r  r#  r$  )rY   r   r  r  rQ   rO   r   r   s          rZ   r   z#ParallelPartitionAttention.__init__H  s    	Qw!||^C,=,=>CLLQ
!#--0	#--///'8%c4+>+>?_
%1H\\~~#mmmm	
 %1H\\~~#mmmm	
 DG??:cs?XZXcXcXe1:R(9-R[[]_
c&6&6 67  DG??:cs?XZXcXcXe1:R(9-R[[]r\   r   r   c                 J   |j                   dd }t        || j                        }| j                  |      }t	        || j                  |      }t        || j                        }| j                  |      }t        || j                  |      }t        j                  ||gd      S )Nr*   rk   r   r   )
r   rh  r  r  rm  rr  r  ru  r   cat)rY   r   ri  partitioned_blockx_windowpartitioned_gridx_grids          rZ   r  z*ParallelPartitionAttention._partition_attn  s    771Q<,Q0C0CD OO,=>!"3T5H5H(S)!T-@-@A>>*:;.0C0CXNyy(F+44r\   c           
      
   || j                  | j                  | j                  | j                  |                        z   }|| j	                  | j                  | j                  | j                  |                        z   }|S rW   r  r   s     rZ   r   z"ParallelPartitionAttention.forward  r  r\   )r]   r^   r_   r`   r4   ra   rd   r   r   r   r  r   r   r   s   @rZ   r  r  B  so     *?)@!	5S5S '5S 	5Sn5 5%,, 5 %,, r\   r  c           	      l   | j                   \  }}}}t        ||d   z  dk(  d| d|d    d       t        ||d   z  dk(  d| d|d    d       | j                  ||||d   z  |d   ||d   z  |d         } | j                  ddddd	d
      j	                         j                  d||d   |d         }|S )z#Partition windows for NCHW tensors.r   ra  rb  rc  r*   rd  r   re  rk   r   r   rf  )r   rI   r   r   r   r   rg  s          rZ   window_partition_nchwr    s    JAq!QAA!#xs2QR]^_R`Qaab%cdAA!#wqc1PQ\]^Q_P``a%bc	q!Q+a.(+a.!{1~:M{[\~^Aii1aAq)446;;B;q>S^_`SabGNr\   c           	          |\  }}| j                   d   }| j                  d||d   z  ||d   z  ||d   |d         }|j                  dddddd      j                         j                  d|||      }|S )z*Reverse window partition for NCHW tensors.r*   r   r   rk   re  r   r   rk  rl  s          rZ   window_reverse_nchwr    s     DAqaARk!n,a;q>.A1kRSnVabcVdeA			!Q1a#..055b!QBAHr\   c           
      h   | j                   \  }}}}t        ||d   z  dk(  d| d|d           t        ||d   z  dk(  d| d|d           | j                  |||d   ||d   z  |d   ||d   z        } | j                  dddddd	      j	                         j                  d
||d   |d         }|S )z Grid partition for NCHW tensors.r   ro  rp  r*   rq  rk   r   r   re  r   rf  )r   rJ   r   r   r   r   rg  s          rZ   grid_partition_nchwr    s    JAq!QA	!!WQC/J9UV<.#YZA	!!VA3.I)TU,#XY	q!Yq\1	!#4ilASTDUVAii1aAq)446;;B9Q<QZ[\Q]^GNr\   c           	          |\  }}| j                   d   }| j                  d||d   z  ||d   z  ||d   |d         }|j                  dddddd      j                         j                  d|||      }|S )z(Reverse grid partition for NCHW tensors.r*   r   r   rk   re  r   r   rk  rt  s          rZ   grid_reverse_nchwr    s     DAqaARil*A1,=q)A,PYZ[P\]A			!Q1a#..055b!QBAHr\   c            	            e Zd ZdZd e       dfdedededef fdZd	e	j                  d
e	j                  fdZd	e	j                  d
e	j                  fdZ xZS )PartitionAttention2dzHGrid or Block partition + Attn + FFN.

    '2D' NCHW tensor layout.
    r|  r?   r   r}  r  r  c           
         t         |           t        t        |j                        |j
                        }t        |j                        }|dk(  | _        t        | j                  r|j                  n|j                        | _        t        || j                        } ||      | _        t        |||j                   |j"                  |j$                  ||j&                  |j(                        | _        |j,                  rt/        ||j,                        nt1        j2                         | _        |dkD  rt7        |      nt1        j2                         | _         ||      | _        t=        |t?        ||j@                  z        ||j(                        | _!        |j,                  rt/        ||j,                        nt1        j2                         | _"        |dkD  rt7        |      | _#        yt1        j2                         | _#        y)z
        Args:
            dim: Input dimension.
            partition_type: Partition type ('block' or 'grid').
            cfg: Transformer block configuration.
            drop_path: Drop path rate.
        r  r|  r  r  r?   r  N)$r   r   r   r   rQ   rU   r   rO   r  r!   rI   rJ   r  ry  r  r   r8   r>   r9   r@   rA   r   rM   r   r   r   r   r   r!  r"  r   ra   r;   r  r#  r$  r  s	           rZ   r   zPartitionAttention2d.__init__  sz    	^CNN;N
!#--0	-8'4;O;OUXUbUbc%c4+>+>?_
\\~~#mmmm	
	 FI__<AZ\ZeZeZg1:R(9-R[[]_
c&6&6 67	 
 FI__<AZ\ZeZeZg1:R(9-R[[]r\   r   r   c                 0   |j                   dd  }| j                  rt        || j                        }nt	        || j                        }| j                  |      }| j                  rt        || j                  |      }|S t        || j                  |      }|S )Nr   )r   r  r  r  r  r   r  r  r  s       rZ   r  z$PartitionAttention2d._partition_attn  s    7723</43F3FGK-a1D1DEKii,#K1D1DhOA  "+t/B/BHMAr\   c           
      
   || j                  | j                  | j                  | j                  |                        z   }|| j	                  | j                  | j                  | j                  |                        z   }|S rW   r  r   s     rZ   r   zPartitionAttention2d.forward  r  r\   )r]   r^   r_   r`   r4   ra   re   rd   r   r   r   r  r   r   r   s   @rZ   r  r    s}     #*)>)@!+S+S  +S '	+S
 +SZ %,,  %,, r\   r  c                   h     e Zd ZdZd e        e       dfdededededed	ef fd
ZddZ	d Z
 xZS )MaxxVitBlockz;MaxVit conv, window partition + FFN , grid partition + FFN.r*   r?   r   r   r   r   r   r  c                 >   t         
|           |j                  | _        |j                  dk(  rt
        nt        } ||||||      | _        t        |||      }| j                  rt        nt        }	|j                  rdn |	di || _         |	dddi|| _        y)a^  Initialize MaxxVitBlock.

        Args:
            dim: Input channel dimension.
            dim_out: Output channel dimension.
            stride: Stride for downsampling.
            conv_cfg: Configuration for convolutional blocks.
            transformer_cfg: Configuration for transformer blocks.
            drop_path: Drop path rate.
        r}   r   r  r  r   r  r  Nr}  gridrf   )r   r   rL   	nchw_attnri   rS  r7  convdictr  r{  rK   r  r  )rY   r   r   r   r   r   r  conv_clsrL  partition_layerr   s             rZ   r   zMaxxVitBlock.__init__  s    & 	(66$,$7$7:$E=;S'&hR[\	wOyQ26...FZ"1"?"?$_EcWbEc(NN+Nr\   c                     | j                   %t        t        t        |      | j                          t        t        t        |      | j                         t        t        t
        |      | j                         y r&  )r  r.   r   r  r  r2  r  r(  s     rZ   r)  zMaxxVitBlock.init_weights#  sM    ??& 1&A4??SG-f=t~~NGJv6		Br\   c                    | j                  |      }| j                  s|j                  dddd      }| j                  | j                  |      }| j	                  |      }| j                  s|j                  dddd      }|S r^  )r  r  r_  r  r  r   s     rZ   r   zMaxxVitBlock.forward)  sp    IIaL~~		!Q1%A??&"ANN1~~		!Q1%Ar\   r+  )r]   r^   r_   r`   r3   r4   ra   rd   r   r)  r   r   r   s   @rZ   r  r    sk    E '5'75J5L!OO O 	O
 %O 3O O<Cr\   r  c                        e Zd ZdZdd e        e       dfdedededed	ed
edef fdZdde	ddfdZ
dej                  dej                  fdZ xZS )ParallelMaxxVitBlockzYMaxVit block with parallel cat(window + grid), one FF.

    Experimental timm block.
    r*   r   r?   r   r   r   num_convr   r   r  c                 $   t         
|           |j                  dk(  rt        nt        }|dkD  r< ||||||      g}	|	 |||||      g|dz
  z  z  }	t        j                  |	 | _        n ||||||      | _        t        |||      | _	        y)aa  
        Args:
            dim: Input dimension.
            dim_out: Output dimension.
            stride: Stride for first conv block.
            num_conv: Number of convolution blocks.
            conv_cfg: Convolution block configuration.
            transformer_cfg: Transformer block configuration.
            drop_path: Drop path rate.
        r}   r*   r  )r  r  r  N)
r   r   ri   rS  r7  r   r  r  r  r   )rY   r   r   r   r  r   r   r  r  convsr   s             rZ   r   zParallelMaxxVitBlock.__init__=  s    ( 	$,$7$7:$E=;a<c76xS\]^EhwXSTX`cdXdeeEu-DI gf(V_`DI.7[de	r\   r   r   Nc                     t        t        t        |      | j                         t        t        t        |      | j
                         y r&  )r.   r   r  r   r2  r  r(  s     rZ   r)  z!ParallelMaxxVitBlock.init_weights\  s-    G-f=tyyIGJv6		Br\   r   c                     | j                  |      }|j                  dddd      }| j                  |      }|j                  dddd      }|S r^  )r  r_  r   r   s     rZ   r   zParallelMaxxVitBlock.forward`  sI    IIaLIIaAq!IIaLIIaAq!r\   r+  )r]   r^   r_   r`   r3   r4   ra   rd   r   re   r)  r   r   r   r   r   s   @rZ   r  r  7  s     '5'75J5L!ff f 	f
 f %f 3f f>C3 C C %,, r\   r  c                        e Zd ZdZdddd e        e       dfdeded	ed
edeeef   dee	ee	   f   dededee
ee
   f   f fdZdej                  dej                  fdZ xZS )MaxxVitStagezEMaxxVit stage consisting of mixed convolution and transformer blocks.r   re  )   r  r   r?   r9  r:  r   depth	feat_sizeblock_typesr   r   r  c
                    t         |           d| _        t        ||      }g }
t	        |      D ]  \  }}|dk(  r|nd}|dv sJ |dk(  r0|j
                  dk(  rt        nt        }|
 ||||||	|         gz  }
ne|dk(  r%t        ||      }|
t        ||||||	|   	      gz  }
n;|d
k(  r|
t        ||||||	|         gz  }
n|dk(  r|
t        ||||||	|         gz  }
|} t        j                  |
 | _        y)a  
        Args:
            in_chs: Input channels.
            out_chs: Output channels.
            stride: Stride for first block.
            depth: Number of blocks in stage.
            feat_size: Feature map size.
            block_types: Block types ('C' for conv, 'T' for transformer, etc).
            transformer_cfg: Transformer block configuration.
            conv_cfg: Convolution block configuration.
            drop_path: Drop path rate(s).
        Fr   r*   )r   r   MPMr   r}   r  r   )r   r   r  r  r  )r   r   r   r  r  N)r   r   grad_checkpointingr"   	enumerateri   rS  r7  ry  r  r  r  r   r  blocks)rY   r9  r:  r   r  r  r  r   r   r  r  itblock_strider  r   r   s                   rZ   r   zMaxxVitStage.__init__k  sj   0 	"'";6k* (	DAq%&!V6L----Cx,4,?,?:,M=S^8' 'l   c-oyI-' +''l   c<'%$3'l   d/'%$3'l   FQ(	R mmV,r\   r   r   c                     | j                   r6t        j                  j                         st	        | j
                  |      }|S | j                  |      }|S rW   )r  r   jitis_scriptingr/   r  r   s     rZ   r   zMaxxVitStage.forward  sE    ""599+A+A+Ct{{A.A  AAr\   )r]   r^   r_   r`   r4   r3   ra   r   r   re   rd   r   r   r   r   r   r   r   s   @rZ   r  r  h  s    O )1255J5L'5'735F-F- F- 	F-
 F- S#XF- sE#J/F- 3F- %F- UDK/0F-P %,, r\   r  c                        e Zd ZdZ	 	 	 	 	 	 ddededededededed	ef fd
ZddeddfdZ	de
j                  de
j                  fdZ xZS )Stemz"Stem layer for feature extraction.r9  r:  rl   rt   rD   rO   rQ   rU   c	                 >   t         
|           t        |t        t        f      st        |      }t        t        ||      |      }	|d   | _        d| _	        t        ||d   |d||      | _         |	|d         | _        t        |d   |d   |d||      | _        y)ae  
        Args:
            in_chs: Input channels.
            out_chs: Output channels.
            kernel_size: Kernel size for convolutions.
            padding: Padding mode.
            bias: Whether to use bias.
            act_layer: Activation layer.
            norm_layer: Normalization layer.
            norm_eps: Normalization epsilon.
        r  r   r   r   )r   rt   rD   r*   N)r   r   r  listtupler!   r   r   r:  r   r   conv1r  conv2)rY   r9  r:  rl   rt   rD   rO   rQ   rU   rF  r   s             rZ   r   zStem.__init__  s    , 	'D%=1(G !3J	!JPXYr{"671:{1V]dhi
#GAJ/
"71:wqz;qZahlm
r\   r   r   Nc                 :    t        t        t        |      |        y r&  rN  r(  s     rZ   r)  zStem.init_weights  rO  r\   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rW   )r  r  r  r   s     rZ   r   zStem.forward  s.    JJqMJJqMJJqMr\   )rk   rs   FrN   r~   r   r+  )r]   r^   r_   r`   ra   re   rc   rd   r   r)  r   r   r   r   r   s   @rZ   r  r    s    ,  !#+" n n  n 	 n
  n  n  n  n  nD>3 > > %,, r\   r  c                     | j                   | j                  sJ | S |d   | j                  z  |d   | j                  z  f}t        | ||      } | S )z>Configure window size based on image size and partition ratio.r   r*   )rI   rJ   )rI   rJ   rH   r   )r  ri  r  s      rZ   cfg_window_sizer    sX    
"}}}
a[C$7$77!H[H[9[[N
#>^
LCJr\   kwargsc           	      V   i }i }i }|j                         D ]X  \  }}|j                  d      r|||j                  dd      <   -|j                  d      r|||j                  dd      <   T|||<   Z t        | ft        | j                  fi |t        | j                  fi |d|} | S )z-Overlay keyword arguments onto configuration.transformer_rs   conv_)r   r   )items
startswithr   r   r   )r  r  transformer_kwargsconv_kwargsbase_kwargsr   r   s          rZ   _overlay_kwargsr    s    KK 1<<'@Aqyy<=\\'"23K		'2./KN  3 3J7IJ55 	C Jr\   c                   v    e Zd ZdZ	 	 	 	 	 	 d'dedeeeeef   f   dededede	de	d	e
f fd
Zd(dej                  dededdfdZej                   j"                  dee   fd       Zej                   j"                  d)dedeee
f   fd       Zej                   j"                  d*deddfd       Zej                   j"                  dej                  fd       Zd+dedee   ddfdZ	 	 	 	 	 d,dej6                  deeeee   f      dededededeeej6                     eej6                  eej6                     f   f   fdZ	 	 	 d-deeee   f   ded edeed!f   fd"Zdej6                  dej6                  fd#Zd)dej6                  d$edej6                  fd%Z dej6                  dej6                  fd&Z! xZ"S ).r5   z{CoaTNet + MaxVit base model.

    Highly configurable for different block compositions, tensor layouts, pooling types.
    r  ri  in_chansnum_classesglobal_pool	drop_ratedrop_path_rater  c                 >   t         |           t        |      }|rt        |fi |}t	        |j
                  |      }	|| _        || _        |j                  d   x| _	        | _        || _
        d| _        g | _        t        ||j                  |j                  j                   |j"                  |j                  j$                  |j                  j&                  |j                  j(                        | _        | j*                  j,                  }
| xj                  t/        | j*                  j0                  dd      gz  c_        t3        t5        |t        |
            D cg c]
  \  }}||z   c}}      }t7        |j                        }t7        |j8                        |k(  sJ t;        j<                  d|t?        |j8                              jA                  |j8                        D cg c]  }|jC                          }}| j*                  j0                  }g }tE        |      D ]  }d}|j                  |   }t3        |D cg c]  }|dz
  |z  dz    c}      }|tG        |||j8                  |   |jH                  |   |j                  |	|||   	      gz  }|
|z  }
|}| xj                  t/        ||
d
|       gz  c_         tK        jL                  | | _'        tQ        tS        |j
                  j&                        |j
                  j(                        }|jT                  rUtK        jV                         | _,        |jT                  | _*        t[        | j                  || jT                  |||      | _.        nF| j                  | _*         || j                        | _,        t_        | j                  |||      | _.        |j`                  dv sJ |j`                  r,tc        tQ        | jd                  |j`                        |        yyc c}}w c c}w c c}w )a  
        Args:
            cfg: Model configuration.
            img_size: Input image size.
            in_chans: Number of input channels.
            num_classes: Number of classification classes.
            global_pool: Global pooling type.
            drop_rate: Dropout rate.
            drop_path_rate: Drop path rate.
            **kwargs: Additional keyword arguments to overlay on config.
        r   F)r9  r:  rt   rD   rO   rQ   rU   r   stem)num_chs	reductionr   r   r*   )r  r  r   r   r  r  zstages.r  )hidden_sizerC   r  rQ   )rC   r  )rs   r   r  r  r   r'  N)3r   r   r!   r  r  r   r  r  r   num_featuresr  r  feature_infor  r   r   rt   r   rO   rQ   rU   r  r   r  r:  r  ziplenr   r   linspacesumsplittolistranger  ri   r   r  stagesr   r   r   r   r  r   headr   r   r.   _init_weights)rY   r  ri  r  r  r  r  r  r  r   r   r  sr  
num_stagesr   dprr9  r  stage_strider:  rfinal_norm_layerr   s                          rZ   r   zMaxxVit.__init__  s   , 	X&!#00C)#*=*=xH&&-0]]2->>DN""'NNLL((ll,,||..\\**
	 !!d499+<+<RXYZZc(If<M.NOda16OP	'
3::*,,,#(>>!^S_#U#[#[\_\f\f#ghaqxxzhh""z" 	aALmmA&GINqA,6:NOI|jjmNN1- /#a&	 	 	F l"FF$w&SZ[\Z]Q^"_!``!	a" mmV,">#2E2E2P2P#QWZWjWjWsWstDI$'$8$8D!-!! 11%#+DI %)$5$5D!():):;DI&t'8'8+Q\hqrDI "\\\\?? 2 23??KTR ] P i Os   !P
,PP
r   r   r   r   Nc                     t        |d      r	 |j                  |       y y # t        $ r |j                          Y y w xY w)Nr)  r'  )hasattrr)  	TypeError)rY   r   r   r   s       rZ   r  zMaxxVit._init_weightsm  sD    6>*&##6#2 +  &##%&s   " >>c                     | j                         D ch c]  \  }t        fddD              r c}}S c c}}w )Nc              3   &   K   | ]  }|v  
 y wrW   rf   ).0nr   s     rZ   	<genexpr>z*MaxxVit.no_weight_decay.<locals>.<genexpr>x  s     Sa16Ss   )relative_position_bias_tablezrel_pos.mlp)named_parametersany)rY   r   _s    ` rZ   no_weight_decayzMaxxVit.no_weight_decayt  sK     //1U U!QS#RSS U 	U Us    :coarsec                 $    t        dddg      }|S )Nz^stem)z^stages\.(\d+)N)z^norm)i )r  r  )r  )rY   r	  matchers      rZ   group_matcherzMaxxVit.group_matcherz  s    -/CD
 r\   enablec                 4    | j                   D ]	  }||_         y rW   )r  r  )rY   r  r  s      rZ   set_grad_checkpointingzMaxxVit.set_grad_checkpointing  s     	*A#)A 	*r\   c                 .    | j                   j                  S rW   )r  fcrX   s    rZ   get_classifierzMaxxVit.get_classifier  s    yy||r\   c                 J    || _         | j                  j                  ||       y rW   )r  r  reset)rY   r  r  s      rZ   reset_classifierzMaxxVit.reset_classifier  s    &		[1r\   r   indicesr  
stop_early
output_fmtintermediates_onlyc                    |dv sJ d       g }t        t        | j                        dz   |      \  }}	d}
| j                  |      }|
|v r|j	                  |       t        | j                        }t
        j                  j                         s|s| j                  }n| j                  d|	 }|D ]@  }|
dz  }
 ||      }|
|v s|r|
|k(  r| j                  |      }n|}|j	                  |       B |r|S |
|k(  r| j                  |      }||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )NCHWzOutput shape must be NCHW.r*   r   N)	r,   r  r  r  appendr   r  r  r  )rY   r   r  r  r  r  r  intermediatestake_indices	max_indexfeat_idxlast_idxr  stagex_inters                  rZ   forward_intermediateszMaxxVit.forward_intermediates  s   * Y&D(DD&"6s4;;7G!7KW"Ui IIaL|#  #t{{#99!!#:[[F[[),F 	.EMHaA<'H0"iilGG$$W-	.   x		!A-r\   
prune_norm
prune_head.c                     t        t        | j                        dz   |      \  }}| j                  d| | _        |rt        j                         | _        |r| j                  dd      | _        |S )z6Prune layers not required for specified intermediates.r*   Nr   rs   )r,   r  r  r   r   r  r  r  )rY   r  r%  r&  r  r  s         rZ   prune_intermediate_layersz!MaxxVit.prune_intermediate_layers  sb     #7s4;;7G!7KW"Uikk*9-DI--a4DIr\   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rW   )r  r  r  r   s     rZ   forward_featureszMaxxVit.forward_features  s.    IIaLKKNIIaLr\   
pre_logitsc                 N    |r| j                  ||      S | j                  |      S )N)r+  )r  )rY   r   r+  s      rZ   forward_headzMaxxVit.forward_head  s%    6@tyyzy2RdiiPQlRr\   c                 J    | j                  |      }| j                  |      }|S rW   )r*  r-  r   s     rZ   r   zMaxxVit.forward  s'    !!!$a r\   )   rk     r   r?   r?   r+  F)TrW   )NFFr  F)r*   FT)#r]   r^   r_   r`   r2   r   ra   r   re   rd   r   r   r   Moduler  r   r  ignorer   r  rc   r
   r  r  r  r   r  r   r   r$  r(  r*  r-  r   r   r   s   @rZ   r5   r5   	  s    58#$!$&\S\S CsCx01\S 	\S
 \S \S \S "\S \S|&BII &S &# &t & YYUS U U
 YYD T#s(^   YY*T *T * * YY		  2C 2hsm 2W[ 2 8<$$',4 ||4  eCcN344  	4 
 4  4  !%4  
tELL!5tELL7I)I#JJ	K4 p ./$#	3S	>*  	
 
sCx%,, 5<< Sell S S S %,, r\   r5   r   FTrD   rF   rq   rC   conv_output_biasconv_attn_earlyconv_attn_act_layerconv_norm_layertransformer_shortcut_biastransformer_norm_layertransformer_norm_layer_clrM   rE   rG   c                 b    t        t        | |dd|||d|	      t        d|||	|||
|            S )as  RW variant configuration for CoAtNet models.

    These models were created and trained before seeing https://github.com/google-research/maxvit

    Common differences for initial timm models:
      - pre-norm layer in MZBConv included an activation after norm
      - mbconv expansion calculated from input instead of output chs
      - mbconv shortcut and final 1x1 conv did not have a bias
      - SE act layer was relu, not silu
      - mbconv uses silu in timm, not gelu
      - expansion in attention block done via output proj, not input proj

    Variable differences (evolved over training initial models):
      - avg pool with kernel_size=2 favoured downsampling (instead of maxpool for coat)
      - SE attention was between conv2 and norm/act
      - default to avg pool for mbconv downsample instead of 1x1 or dw conv
      - transformer block shortcut has no bias
    TFrx   )	rq   rC   rn   rj   ro   ru   ry   rO   rQ   )r<   r=   rC   rM   rQ   rS   rE   rG   r   r   r  r3   r4   )rq   rC   r4  r5  r6  r7  r8  r9  r:  rM   rE   rG   s               rZ   _rw_coat_cfgr>    sW    @ #(&.&

 .3#-3%#	
 r\   rp   conv_attn_ratior8   c                 `    t        t        | |d||d|      t        d||||	|||
|	            S )a  RW variant configuration for MaxViT models.

    These models were created and trained before seeing https://github.com/google-research/maxvit

    Differences of initial timm models:
      - mbconv expansion calculated from input instead of output chs
      - mbconv shortcut and final 1x1 conv did not have a bias
      - mbconv uses silu in timm, not gelu
      - expansion in attention block done via output proj, not input proj
    Frx   )rq   rC   rj   ro   r{   rO   rQ   )	r<   rC   r8   rI   rM   rQ   rS   rE   rG   r<  r=  )rq   rC   r4  r?  r7  r9  r:  rI   r8   rM   rE   rG   s               rZ   _rw_max_cfgrA    sS    0 #(&&
 .##-3%#

 r\   rT   r  conv_norm_layer_clrK   c                     t        |      }t        t        d| |d|d   ||      t        d||||d   |||	|
	            S )z=Configuration for experimental ConvNeXt-based MaxxViT models.r}   Fr   )ri   rq   rC   rj   rM   rQ   rS   r*   )	r<   rC   rI   rK   rM   rQ   rS   rE   rG   r<  )r!   r  r3   r4   )rq   rC   r7  rB  r9  r:  rI   rK   rM   rE   rG   s              rZ   	_next_cfgrD  L  se     K(K!##A&,
 .#'#A-3%#

 r\   c            	      N    t        t        ddd      t        dddd      	      S )
z0Configuration matching TensorFlow MaxViT models.gMbP?	gelu_tanhsame)rU   rO   rt   r   Frx  )rU   rO   r9   rE   r<  r=  rf   r\   rZ   _tf_cfgrH  s  s6    !

 .!"	
 r\   coatnet_pico_rw)r         rF   r   )r7   r   )r   r   r   rz   )r4  r?  coatnet_nano_rw)rk   re     rk   )rq   r4  r?  coatnet_0_rwr   )r   rk   rT  r   )r5  r8  coatnet_1_rw)r   rM  r  r   )rq   r5  r8  coatnet_2_rw)rJ  rK  rF      )r   rJ  rx   )rq   r6  coatnet_3_rw)r   r   r      )r   r   )rq   r6  rM   coatnet_bn_0_rwr~   )rq   r5  r8  r9  coatnet_rmlp_nano_rwr   )r4  r?  rE   rG   coatnet_rmlp_0_rw)rq   rE   coatnet_rmlp_1_rwr   )rC   r5  r8  rE   rG   coatnet_rmlp_1_rw2)rq   rE   rG   coatnet_rmlp_2_rw)rq   r6  rM   rE   coatnet_rmlp_3_rwcoatnet_nano_cc)r   r   r   r   r\  )r   r   r   ri   coatnext_nano_rwr   )r   r   r   r   )r   N)rE   rM   	coatnet_0r   r   )r   r   r   r   	coatnet_1	coatnet_2rJ  rQ  	coatnet_3r   rS  	coatnet_4)r         r   	coatnet_5)rK  rF         rg  maxvit_pico_rw)r7   r   rJ  rK  )r   r   r   r   )r  r  r  r  )   r7   )r   r   ri   r   maxvit_nano_rw)r*   r   rk   r*   maxvit_tiny_rwmaxvit_tiny_pm)r  r  r  r  maxvit_rmlp_pico_rw)rE   maxvit_rmlp_nano_rwmaxvit_rmlp_tiny_rwmaxvit_rmlp_small_rwmaxvit_rmlp_base_rw)r   r   ri   r   r   maxxvit_rmlp_nano_rw)r   r   ri   r   r   maxxvit_rmlp_tiny_rwmaxxvit_rmlp_small_rw)0   r   maxxvitv2_nano_rw)rK   rE   maxxvitv2_rmlp_base_rw)r   rM  rc  r   )rK   maxxvitv2_rmlp_large_rw)   i@  i  rf  )r   rM     r   )P   ry  rf  maxvit_tiny_tf)r   r   ri   r   r   r   maxvit_small_tfmaxvit_base_tfmaxvit_large_tfmaxvit_xlarge_tf
state_dictmodelc                 r   |j                         }i }| j                         D ]  \  }}|j                  d      r|j                  |dd       }|j                  |j
                  j                  k7  s|j                  d   |j                  d   k7  r,t        ||j                  |j
                  j                        }||v rn|j                  ||   j                  k7  rR|j                         ||   j                         k(  r.|j                  dv sJ |j                  ||   j                        }|||<    |S )z/Filter checkpoint state dict for compatibility.r  Nir   r*   )new_window_sizenew_bias_shape)r   re  )r  r  endswithget_submoduler   r  rI   r)   ndimnumelr   )r  r  model_state_dictout_dictr   r   ms          rZ   checkpoint_filter_fnr    s)   '')H  " 1::45##AdsG,Aww!88>>>!--PQBRVWVcVcdeVfBf-$%MM#$#A#A#G#G   QVV/?/B/G/G%GAGGIYijkYlYrYrYtLt66V###		*1-334A Or\   variantcfg_variant
pretrainedc                     |.| t         v r| }n#dj                  | j                  d      dd       }t        t        | |ft         |   t        d      t        d|S )zCreate a MaxxVit model variant.Nr  r   T)flatten_sequential)	model_cfgfeature_cfgpretrained_filter_fn)
model_cfgsjoinr  r+   r5   r  r  )r  r  r  r  s       rZ   _create_maxxvitr    si    j !K((7==#5cr#:;K*[)D11	
  r\   urlc                 "    | dddddddddd	d
|S )z$Create a default configuration dict.r0  )rk   r/  r/  )rT  rT  ffffff?bicubic)      ?r  r  z
stem.conv1zhead.fcT)r  r  
input_size	pool_sizecrop_pctinterpolationmeanr  
first_conv
classifierfixed_input_sizerf   )r  r  s     rZ   _cfgr    s4     4}SY9")   r\   zcoatnet_pico_rw_224.untrained)r  zcoatnet_nano_rw_224.sw_in1kztimm/zyhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_nano_rw_224_sw-f53093b4.pthg?)	hf_hub_idr  r  zcoatnet_0_rw_224.sw_in1kzvhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_0_rw_224_sw-a6439706.pth)r  r  zcoatnet_1_rw_224.sw_in1kzvhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_1_rw_224_sw-5cae1ea8.pthz!coatnet_2_rw_224.sw_in12k_ft_in1k)r  z'coatnet_rmlp_1_rw2_224.sw_in12k_ft_in1kz&coatnet_rmlp_2_rw_224.sw_in12k_ft_in1kz&coatnet_rmlp_2_rw_384.sw_in12k_ft_in1k)rk   r   r   )rc  rc  g      ?squash)r  r  r  r  	crop_modezcoatnet_bn_0_rw_224.sw_in1kzyhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_bn_0_rw_224_sw-c228e218.pthr  )r  r  r  r  r  z coatnet_rmlp_nano_rw_224.sw_in1kz~https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_rmlp_nano_rw_224_sw-bd1d51b3.pthzcoatnet_rmlp_0_rw_224.untrainedzcoatnet_rmlp_1_rw_224.sw_in1kz{https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_rmlp_1_rw_224_sw-9051e6c3.pthzcoatnet_rmlp_2_rw_224.sw_in1kz{https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_rmlp_2_rw_224_sw-5ccfac55.pthzcoatnet_rmlp_3_rw_224.untrainedzcoatnet_nano_cc_224.untrainedzcoatnext_nano_rw_224.sw_in1kzzhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnext_nano_rw_224_ad-22cb71c2.pthzcoatnet_2_rw_224.sw_in12ki-.  )r  r  zcoatnet_3_rw_224.sw_in12kzcoatnet_rmlp_1_rw2_224.sw_in12kzcoatnet_rmlp_2_rw_224.sw_in12kzcoatnet_0_224.untrainedzcoatnet_1_224.untrainedzcoatnet_2_224.untrainedzcoatnet_3_224.untrainedzcoatnet_4_224.untrainedzcoatnet_5_224.untrainedzmaxvit_pico_rw_256.untrained)rk   rK  rK  )   r  )r  r  r  zmaxvit_nano_rw_256.sw_in1kzxhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_nano_rw_256_sw-fb127241.pth)r  r  r  r  zmaxvit_tiny_rw_224.sw_in1kzxhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_tiny_rw_224_sw-7d0dffeb.pthzmaxvit_tiny_rw_256.untrainedzmaxvit_tiny_pm_256.untrainedzmaxvit_rmlp_pico_rw_256.sw_in1kz}https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_pico_rw_256_sw-8d82f2c6.pthzmaxvit_rmlp_nano_rw_256.sw_in1kz}https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_nano_rw_256_sw-c17bb0d6.pthzmaxvit_rmlp_tiny_rw_256.sw_in1kz}https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_tiny_rw_256_sw-bbef0ff5.pthz maxvit_rmlp_small_rw_224.sw_in1kz~https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_small_rw_224_sw-6ef0ae4f.pthz"maxvit_rmlp_small_rw_256.untrainedz(maxvit_rmlp_base_rw_224.sw_in12k_ft_in1kz(maxvit_rmlp_base_rw_384.sw_in12k_ft_in1kz maxvit_rmlp_base_rw_224.sw_in12kz maxxvit_rmlp_nano_rw_256.sw_in1kz~https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxxvit_rmlp_nano_rw_256_sw-0325d459.pthz"maxxvit_rmlp_tiny_rw_256.untrainedz!maxxvit_rmlp_small_rw_256.sw_in1kzhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxxvit_rmlp_small_rw_256_sw-37e217ff.pthzmaxxvitv2_nano_rw_256.sw_in1k)r  r  r  z+maxxvitv2_rmlp_base_rw_224.sw_in12k_ft_in1kz+maxxvitv2_rmlp_base_rw_384.sw_in12k_ft_in1kz%maxxvitv2_rmlp_large_rw_224.untrainedz#maxxvitv2_rmlp_base_rw_224.sw_in12kzmaxvit_tiny_tf_224.in1k)r  r  r  zmaxvit_tiny_tf_384.in1kzmaxvit_tiny_tf_512.in1k)rk   rF   rF   )rz  rz  zmaxvit_small_tf_224.in1kzmaxvit_small_tf_384.in1kzmaxvit_small_tf_512.in1kzmaxvit_base_tf_224.in1kzmaxvit_base_tf_384.in1kzmaxvit_base_tf_512.in1kzmaxvit_large_tf_224.in1kzmaxvit_large_tf_384.in1kzmaxvit_large_tf_512.in1kzmaxvit_base_tf_224.in21kiSU  z maxvit_base_tf_384.in21k_ft_in1kz maxvit_base_tf_512.in21k_ft_in1kzmaxvit_large_tf_224.in21kz!maxvit_large_tf_384.in21k_ft_in1kz!maxvit_large_tf_512.in21k_ft_in1k)r  r  r  r  zmaxvit_xlarge_tf_224.in21kz"maxvit_xlarge_tf_384.in21k_ft_in1kz"maxvit_xlarge_tf_512.in21k_ft_in1kc                     t        dd| i|S )z)CoatNet Pico model with RW configuration.r  )coatnet_pico_rw_224r  r  r  s     rZ   r  r  	       RZR6RRr\   c                     t        dd| i|S )z)CoatNet Nano model with RW configuration.r  )coatnet_nano_rw_224r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z&CoatNet-0 model with RW configuration.r  )coatnet_0_rw_224r  r  s     rZ   r  r  	       O*OOOr\   c                     t        dd| i|S )z&CoatNet-1 model with RW configuration.r  )coatnet_1_rw_224r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z&CoatNet-2 model with RW configuration.r  )coatnet_2_rw_224r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z&CoatNet-3 model with RW configuration.r  )coatnet_3_rw_224r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z4CoatNet-0 model with BatchNorm and RW configuration.r  )coatnet_bn_0_rw_224r  r  s     rZ   r  r  %	  r  r\   c                     t        dd| i|S )z.CoatNet Nano model with Relative Position MLP.r  )coatnet_rmlp_nano_rw_224r  r  s     rZ   r  r  +	       W*WPVWWr\   c                     t        dd| i|S )z+CoatNet-0 model with Relative Position MLP.r  )coatnet_rmlp_0_rw_224r  r  s     rZ   r  r  1	       TzTVTTr\   c                     t        dd| i|S )z+CoatNet-1 model with Relative Position MLP.r  )coatnet_rmlp_1_rw_224r  r  s     rZ   r  r  7	  r  r\   c                     t        dd| i|S )z.CoatNet-1 model with Relative Position MLP v2.r  )coatnet_rmlp_1_rw2_224r  r  s     rZ   r  r  =	  s     U
UfUUr\   c                     t        dd| i|S )z+CoatNet-2 model with Relative Position MLP.r  )coatnet_rmlp_2_rw_224r  r  s     rZ   r  r  C	  r  r\   c                     t        dd| i|S )z6CoatNet-2 model with Relative Position MLP at 384x384.r  )coatnet_rmlp_2_rw_384r  r  s     rZ   r  r  I	  r  r\   c                     t        dd| i|S )z+CoatNet-3 model with Relative Position MLP.r  )coatnet_rmlp_3_rw_224r  r  s     rZ   r  r  O	  r  r\   c                     t        dd| i|S )z(CoatNet Nano model with ConvNeXt blocks.r  )coatnet_nano_cc_224r  r  s     rZ   r  r  U	  r  r\   c                     t        dd| i|S )z*CoAtNeXt Nano model with RW configuration.r  )coatnext_nano_rw_224r  r  s     rZ   r  r  [	  s     SjSFSSr\   c                     t        dd| i|S )zCoatNet-0 model.r  )coatnet_0_224r  r  s     rZ   r  r  a	       LzLVLLr\   c                     t        dd| i|S )zCoatNet-1 model.r  )coatnet_1_224r  r  s     rZ   r  r  g	  r  r\   c                     t        dd| i|S )zCoatNet-2 model.r  )coatnet_2_224r  r  s     rZ   r  r  m	  r  r\   c                     t        dd| i|S )zCoatNet-3 model.r  )coatnet_3_224r  r  s     rZ   r  r  s	  r  r\   c                     t        dd| i|S )zCoatNet-4 model.r  )coatnet_4_224r  r  s     rZ   r  r  y	  r  r\   c                     t        dd| i|S )zCoatNet-5 model.r  )coatnet_5_224r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z(MaxViT Pico model with RW configuration.r  )maxvit_pico_rw_256r  r  s     rZ   r  r  	       QJQ&QQr\   c                     t        dd| i|S )z(MaxViT Nano model with RW configuration.r  )maxvit_nano_rw_256r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z(MaxViT Tiny model with RW configuration.r  )maxvit_tiny_rw_224r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z3MaxViT Tiny model with RW configuration at 256x256.r  )maxvit_tiny_rw_256r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z3MaxViT Relative Position MLP Pico RW 256x256 model.r  )maxvit_rmlp_pico_rw_256r  r  s     rZ   r  r  	       VVvVVr\   c                     t        dd| i|S )z3MaxViT Relative Position MLP Nano RW 256x256 model.r  )maxvit_rmlp_nano_rw_256r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z3MaxViT Relative Position MLP Tiny RW 256x256 model.r  )maxvit_rmlp_tiny_rw_256r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z4MaxViT Relative Position MLP Small RW 224x224 model.r  )maxvit_rmlp_small_rw_224r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z9MaxViT Small model with Relative Position MLP at 256x256.r  )maxvit_rmlp_small_rw_256r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z-MaxViT Base model with Relative Position MLP.r  )maxvit_rmlp_base_rw_224r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z8MaxViT Base model with Relative Position MLP at 384x384.r  )maxvit_rmlp_base_rw_384r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z'MaxViT Tiny model with parallel blocks.r  )maxvit_tiny_pm_256r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z4MaxxViT Relative Position MLP Nano RW 256x256 model.r  )maxxvit_rmlp_nano_rw_256r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z.MaxxViT Tiny model with Relative Position MLP.r  )maxxvit_rmlp_tiny_rw_256r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z/MaxxViT Small model with Relative Position MLP.r  )maxxvit_rmlp_small_rw_256r  r  s     rZ   r  r  	  s     X:XQWXXr\   c                     t        dd| i|S )zMaxxViT-V2 Nano model.r  )maxxvitv2_nano_rw_256r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z1MaxxViT-V2 Base model with Relative Position MLP.r  )maxxvitv2_rmlp_base_rw_224r  r  s     rZ   r  r  	       YJYRXYYr\   c                     t        dd| i|S )z<MaxxViT-V2 Base model with Relative Position MLP at 384x384.r  )maxxvitv2_rmlp_base_rw_384r  r  s     rZ   r  r  	  r  r\   c                     t        dd| i|S )z2MaxxViT-V2 Large model with Relative Position MLP.r  )maxxvitv2_rmlp_large_rw_224r  r  s     rZ   r  r  	  s     ZZZSYZZr\   c                     t        dd| i|S )z"MaxViT Tiny model from TensorFlow.r  )maxvit_tiny_tf_224r|  r  r  s     rZ   r  r  	       cjc\bccr\   c                     t        dd| i|S )z-MaxViT Tiny model from TensorFlow at 384x384.r  )maxvit_tiny_tf_384r|  r  r  s     rZ   r  r  	  r	  r\   c                     t        dd| i|S )z-MaxViT Tiny model from TensorFlow at 512x512.r  )maxvit_tiny_tf_512r|  r  r  s     rZ   r  r  
  r	  r\   c                     t        dd| i|S )z#MaxViT Small model from TensorFlow.r  )maxvit_small_tf_224r}  r  r  s     rZ   r  r  	
       ePZe^deer\   c                     t        dd| i|S )z.MaxViT Small model from TensorFlow at 384x384.r  )maxvit_small_tf_384r}  r  r  s     rZ   r  r  
  r  r\   c                     t        dd| i|S )z.MaxViT Small model from TensorFlow at 512x512.r  )maxvit_small_tf_512r}  r  r  s     rZ   r  r  
  r  r\   c                     t        dd| i|S )z"MaxViT Base model from TensorFlow.r  )maxvit_base_tf_224r~  r  r  s     rZ   r  r  
  r	  r\   c                     t        dd| i|S )z-MaxViT Base model from TensorFlow at 384x384.r  )maxvit_base_tf_384r~  r  r  s     rZ   r  r  !
  r	  r\   c                     t        dd| i|S )z-MaxViT Base model from TensorFlow at 512x512.r  )maxvit_base_tf_512r~  r  r  s     rZ   r  r  '
  r	  r\   c                     t        dd| i|S )z#MaxViT Large model from TensorFlow.r  )maxvit_large_tf_224r  r  r  s     rZ   r  r  -
  r  r\   c                     t        dd| i|S )z.MaxViT Large model from TensorFlow at 384x384.r  )maxvit_large_tf_384r  r  r  s     rZ   r  r  3
  r  r\   c                     t        dd| i|S )z.MaxViT Large model from TensorFlow at 512x512.r  )maxvit_large_tf_512r  r  r  s     rZ   r   r   9
  r  r\   c                     t        dd| i|S )z$MaxViT XLarge model from TensorFlow.r  )maxvit_xlarge_tf_224r  r  r  s     rZ   r"  r"  ?
       gR\g`fggr\   c                     t        dd| i|S )z/MaxViT XLarge model from TensorFlow at 384x384.r  )maxvit_xlarge_tf_384r  r  r  s     rZ   r%  r%  E
  r#  r\   c                     t        dd| i|S )z/MaxViT XLarge model from TensorFlow at 512x512.r  )maxvit_xlarge_tf_512r  r  r  s     rZ   r'  r'  K
  r#  r\   r+  )r   rB   FFrelurs   TrP   rR   NrD   rF   )rp   rB   Fg      ?rs   rP   rR   Nr7   NrD   rF   )rp   rB   rP   rR   rP   rR   NFrT   r  rF   rf   )NFr1  )r`   r/  collectionsr   dataclassesr   r   r   	functoolsr   typingr   r	   r
   r   r   r   r   r   r   r   	torch.jitr   	timm.datar   r   timm.layersr   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   _builderr+   	_featuresr,   _features_fxr-   _manipulater.   r/   	_registryr0   r1   __all__r4   r3   r2   r2  r   r   r   r   r   re   r  r  r2  ra   r5  r7  rS  r   rh  rm  rr  ru  ry  r{  r  r  r  r  r  r  r  r  r  r  r  r  r5   rc   rd   r>  rA  rD  rH  r  r  r  r  r  default_cfgsr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r"  r%  r'  rf   r\   rZ   <module>r7     s   "H  # 1 1  I I I    A ` ` t t Z Z f f * + 3 4 <
N 2 2 2B !P !P !PH 
! 
! 
!N")) NbO")) Od< <&<299 <&-299 -`0bii 0s 0C 0 02H HV,ryy , ,S ,$ ,0&8C= &C &C &b")) bJTBII Tn 49   ELL tCy DQTI Z_ZfZf  ell tCy U\\  %,, 49 S	 V[VbVb  	. 	U38_ 	QYZbQc 	>299 >BM M`U\\ S	 ell   DI QUVYQZ _d_k_k  5<< DI %,,  u|| S	 TRUY [`[g[g  E299 EP2299 2j.299 .bP299 Pf,299 ,^. %S/ Nc  s z *Wbii Wv "!& %#)!*.&3)4'+"666 6 	6
 !6 6 $(6 !$6 $'6 e_6 6 6 
#s(^6t  !&!'!&3)415'+"--- - 	-
 - !$- $'- eCHo.- - e_- - - 
#s(^-b  ,"-&3)415#9=!$$$ $  	$
 !$$ $'$ eCHo.$ $ 5%u"556$ $ $ 
#s(^$Nc3h "  n % ! 
	n  	%	 ! 
		n*  %  &+
	+n<  	%	  &+
		=nP  	'	  &
		Qnd  	'	  &
		en|  
%
  &+#0	
	
}nR $ 
%
 ! 	
	
Snh ! % 
	inz ! %  &+
	{nR " 	%	 
		Snf ! 
'
  &	
	
gn| ! 
'
  &	
	
}nT  %5	
 .Unb   	%		
 $
	cnz %	{nF %	GnR '	Sn^ '	_nj '	knv (	wnF  $	
 -GnT  %	
 -Unb  %	
 -cnp  %	
 -qn@ # $	
 5
)AnN # %	
 5
)On\ # %	
 5
)]nj $ 	%		
 
	kn~ # 	%	 
	nT $ % +Und $ %	
 +enr % %	
 +snB	 ! 
%
 

C	nX	 & '	
 
Y	nj	 ' 	'	 
	k	nB
  % )C
nT
  % )U
nf
  % )g
nx
  ' )y
nJ   ' )Kn
bT#u||*;%< RYY SWX[]b]i]iXiSj ,S x} QU il qx 	c 	# 	$sCx. 	 % X&#Tb\X& "4 H$X&  E!FX&  E!X&  (*!X&* .t0+X&. -d//X&2 -d Hsh/X3X&< "4 H"(<	$=X&F ' M)GX&N &t|OX&P $T J&KQX&V $T J&KWX&\ &t|]X&^ $Tb\_X&` #D I%aX&l  "mX&r  "sX&x &t(yX&~ %d'X&H t|IX&J t|KX&L t|MX&N t|OX&P t|QX&R t|SX&X #DRMU[$\YX&Z !$ G F#4[X&b !$ G#HcX&h #D F%4iX&n #DRMU[$\oX&t &t L F(4uX&| &t L F(4}X&D &t L F(4EX&L ' M)MX&V )$ F+4WX&` /1aX&f / Hsh1XgX&p ')qX&| ' M F)4}X&D )$2-[a*bEX&F ( N F*4GX&R $T F&4SX&X 244YX&\ 24 Hsh4X]X&b ,Tb\cX&f *4,gX&p t"(< >qX&v t Hsh XwX&| t Hsh X}X&B "(<!>CX&H  Hsh!XIX&N  Hsh!XOX&T t"(< >UX&Z t Hsh X[X&` t Hsh XaX&f "(<!>gX&l  Hsh!XmX&r  Hsh!XsX&z !{X&@ ' Hsh)XAX&F ' Hsh)XGX&L  "MX&R ( Hsh*XSX&X ( 3(*DYX&^ !$#_X&d )$ Hsh+XeX&j )$ Hsh+XkX& Xv SD SC SG S S
 SD SC SG S S
 P P P P P
 P P P P P
 P P P P P
 P P P P P
 SD SC SG S S
 X X X X X
 Ud Uc Ug U U
 Ud Uc Ug U U
 Vt Vs Vw V V
 Ud Uc Ug U U
 Ud Uc Ug U U
 Ud Uc Ug U U
 SD SC SG S S
 TT TS TW T T
 Md Mc Mg M M
 Md Mc Mg M M
 Md Mc Mg M M
 Md Mc Mg M M
 Md Mc Mg M M
 Md Mc Mg M M
 R4 R3 R7 R R
 R4 R3 R7 R R
 R4 R3 R7 R R
 R4 R3 R7 R R
 W W W W W
 W W W W W
 W W W W W
 X X X X X
 X X X X X
 W W W W W
 W W W W W
 R4 R3 R7 R R
 X X X X X
 X X X X X
 Y$ Y# Y' Y Y
 Ud Uc Ug U U
 Z4 Z3 Z7 Z Z
 Z4 Z3 Z7 Z Z
 [D [C [G [ [
 d4 d3 d7 d d
 d4 d3 d7 d d
 d4 d3 d7 d d
 fD fC fG f f
 fD fC fG f f
 fD fC fG f f
 d4 d3 d7 d d
 d4 d3 d7 d d
 d4 d3 d7 d d
 fD fC fG f f
 fD fC fG f f
 fD fC fG f f
 hT hS hW h h
 hT hS hW h h
 hT hS hW h hr\   