""" Swin Transformer V2

A PyTorch impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution`
    - https://arxiv.org/pdf/2111.09883

Code adapted from https://github.com/ChristophReich1996/Swin-Transformer-V2, original copyright/license info below

This implementation is experimental and subject to change in manners that will break weight compat:
* Size of the pos embed MLP is not spelled out in the paper in terms of dim; fixed for all models? vary with num_heads?
  * currently dim is fixed, I feel it may make sense to scale with num_heads (dim per head)
* The specifics of the memory saving 'sequential attention' are not detailed, Christoph Reich has an impl at
  GitHub link above. It needs further investigation as throughput vs mem tradeoff doesn't appear beneficial.
* num_heads per stage is not detailed for Huge and Giant model variants
* 'Giant' is 3B params in paper but ~2.6B here despite matching paper dim + block counts
* experiments are ongoing w.r.t. 'main branch' norm layer use and weight init scheme

Noteworthy additions over official Swin v1:
* MLP relative position embedding is looking promising and adapts to different image/window sizes
* This impl has been designed to allow easy change of image size with matching window size changes
* Non-square image size and window size are supported

Modifications and additions for timm hacked together by / Copyright 2022, Ross Wightman
"""
import logging
import math
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union

import torch
import torch.nn as nn
import torch.nn.functional as F

from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from timm.layers import DropPath, Mlp, ClassifierHead, to_2tuple, _assert, ndgrid
from ._builder import build_model_with_cfg
from ._features import feature_take_indices
from ._features_fx import register_notrace_function
from ._manipulate import named_apply, checkpoint
from ._registry import generate_default_cfgs, register_model

__all__ = ['SwinTransformerV2Cr']  # model_registry will add each entrypoint fn to this

_logger = logging.getLogger(__name__)


def bchw_to_bhwc(x: torch.Tensor) -> torch.Tensor:
    """Permutes a tensor from the shape (B, C, H, W) to (B, H, W, C)."""
    return x.permute(0, 2, 3, 1)


def bhwc_to_bchw(x: torch.Tensor) -> torch.Tensor:
    """Permutes a tensor from the shape (B, H, W, C) to (B, C, H, W)."""
    return x.permute(0, 3, 1, 2)


def window_partition(x: torch.Tensor, window_size: Tuple[int, int]) -> torch.Tensor:
    """Partition into non-overlapping windows.

    Args:
        x: Input tensor of shape (B, H, W, C).
        window_size: Window size (height, width).

    Returns:
        Windows tensor of shape (num_windows*B, window_size[0], window_size[1], C).
    """
    B, H, W, C = x.shape
    x = x.view(B, H // window_size[0], window_size[0], W // window_size[1], window_size[1], C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size[0], window_size[1], C)
    return windows


@register_notrace_function
def window_reverse(windows: torch.Tensor, window_size: Tuple[int, int], img_size: Tuple[int, int]) -> torch.Tensor:
    """Merge windows back to feature map.

    Args:
        windows: Windows tensor of shape (num_windows * B, window_size[0], window_size[1], C).
        window_size: Window size (height, width).
        img_size: Image size (height, width).

    Returns:
        Feature map tensor of shape (B, H, W, C).
    """
    H, W = img_size
    C = windows.shape[-1]
    x = windows.view(-1, H // window_size[0], W // window_size[1], window_size[0], window_size[1], C)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, H, W, C)
    return x
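

# A minimal sketch added for illustration (not part of the original timm module):
# partitioning and reversing above are exact inverses. The helper name
# `_demo_window_roundtrip` is hypothetical.
def _demo_window_roundtrip() -> None:
    x = torch.randn(2, 28, 28, 96)              # (B, H, W, C) feature map
    wins = window_partition(x, (7, 7))          # (2 * 4 * 4, 7, 7, 96)
    y = window_reverse(wins, (7, 7), (28, 28))  # back to (2, 28, 28, 96)
    assert torch.equal(x, y)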


class WindowMultiHeadAttention(nn.Module):
    r"""This class implements window-based Multi-Head-Attention with log-spaced continuous position bias.

    Args:
        dim (int): Number of input features
        window_size (int): Window size
        num_heads (int): Number of attention heads
        drop_attn (float): Dropout rate of attention map
        drop_proj (float): Dropout rate after projection
        meta_hidden_dim (int): Number of hidden features in the two layer MLP meta network
        sequential_attn (bool): If true sequential self-attention is performed
    """

    def __init__(
            self,
            dim: int,
            num_heads: int,
            window_size: Tuple[int, int],
            drop_attn: float = 0.0,
            drop_proj: float = 0.0,
            meta_hidden_dim: int = 384,
            sequential_attn: bool = False,
    ) -> None:
        super(WindowMultiHeadAttention, self).__init__()
        assert dim % num_heads == 0, \
            'The number of input features (in_features) is not divisible by the number of heads (num_heads).'
        self.in_features: int = dim
        self.window_size: Tuple[int, int] = to_2tuple(window_size)
        self.num_heads: int = num_heads
        self.sequential_attn: bool = sequential_attn

        self.qkv = nn.Linear(in_features=dim, out_features=dim * 3, bias=True)
        self.attn_drop = nn.Dropout(drop_attn)
        self.proj = nn.Linear(in_features=dim, out_features=dim, bias=True)
        self.proj_drop = nn.Dropout(drop_proj)
        # meta network for positional encodings
        self.meta_mlp = Mlp(
            2,  # x, y
            hidden_features=meta_hidden_dim,
            out_features=num_heads,
            act_layer=nn.ReLU,
            drop=(0.125, 0.),  # FIXME should there be stochasticity, appears to 'overfit' without?
        )
        self.logit_scale = nn.Parameter(torch.log(10 * torch.ones(num_heads)))
        self._make_pair_wise_relative_positions()

    def _make_pair_wise_relative_positions(self) -> None:
        """Initialize the pair-wise relative positions to compute the positional biases."""
        device = self.logit_scale.device
        coordinates = torch.stack(ndgrid(
            torch.arange(self.window_size[0], device=device),
            torch.arange(self.window_size[1], device=device),
        ), dim=0).flatten(1)
        relative_coordinates = coordinates[:, :, None] - coordinates[:, None, :]
        relative_coordinates = relative_coordinates.permute(1, 2, 0).reshape(-1, 2).float()
        # log-spaced coordinates as described in the Swin V2 paper
        relative_coordinates_log = torch.sign(relative_coordinates) * torch.log(1.0 + relative_coordinates.abs())
        self.register_buffer('relative_coordinates_log', relative_coordinates_log, persistent=False)

    def set_window_size(self, window_size: Tuple[int, int]) -> None:
        """Update window size and regenerate relative position coordinates.

        Args:
            window_size: New window size.
        """
        window_size = to_2tuple(window_size)
        if window_size != self.window_size:
            self.window_size = window_size
            self._make_pair_wise_relative_positions()

    def _relative_positional_encodings(self) -> torch.Tensor:
        """Compute the relative positional encodings.

        Returns:
            Relative positional encodings of shape (1, num_heads, window_size**2, window_size**2).
        """
        window_area = self.window_size[0] * self.window_size[1]
        relative_position_bias = self.meta_mlp(self.relative_coordinates_log)
        relative_position_bias = relative_position_bias.transpose(1, 0).reshape(
            self.num_heads, window_area, window_area)
        relative_position_bias = relative_position_bias.unsqueeze(0)
        return relative_position_bias

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Forward pass of window multi-head self-attention.

        Args:
            x: Input tensor of shape (B * windows, N, C).
            mask: Attention mask for the shift case.

        Returns:
            Output tensor of shape (B * windows, N, C).
        """
        Bw, L, C = x.shape

        qkv = self.qkv(x).view(Bw, L, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        query, key, value = qkv.unbind(0)

        # compute attention map with scaled cosine attention
        attn = F.normalize(query, dim=-1) @ F.normalize(key, dim=-1).transpose(-2, -1)
        logit_scale = torch.clamp(self.logit_scale.reshape(1, self.num_heads, 1, 1), max=math.log(1. / 0.01)).exp()
        attn = attn * logit_scale
        attn = attn + self._relative_positional_encodings()

        if mask is not None:
            # apply mask to the windowed (shifted) attention
            num_win: int = mask.shape[0]
            attn = attn.view(Bw // num_win, num_win, self.num_heads, L, L)
            attn = attn + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, L, L)
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ value).transpose(1, 2).reshape(Bw, L, -1)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
                  fdeded	eeef   d
eeef   deeef   dedede	de
e	   de	de	de	dededeej                     f fdZdeeef   deeeef   eeef   f   fdZd!de
ej                      de
ej                      fdZd Zd	eeef   d
eeef   ddfdZd Zdej                   dej                   fd Z xZS )"SwinTransformerV2CrBlocka5  This class implements the Swin transformer block.

    Args:
        dim (int): Number of input channels
        num_heads (int): Number of attention heads to be utilized
        feat_size (Tuple[int, int]): Input resolution
        window_size (Tuple[int, int]): Window size to be utilized
        shift_size (int): Shifting size to be used
        mlp_ratio (int): Ratio of the hidden dimension in the FFN to the input channels
        proj_drop (float): Dropout in input mapping
        drop_attn (float): Dropout rate of attention map
        drop_path (float): Dropout in main path
        extra_norm (bool): Insert extra norm on 'main' branch if True
        sequential_attn (bool): If true sequential self-attention is performed
        norm_layer (Type[nn.Module]): Type of normalization layer to be utilized
    """

    def __init__(
            self,
            dim: int,
            num_heads: int,
            feat_size: Tuple[int, int],
            window_size: Tuple[int, int],
            shift_size: Tuple[int, int] = (0, 0),
            always_partition: bool = False,
            dynamic_mask: bool = False,
            mlp_ratio: float = 4.0,
            init_values: Optional[float] = 0,
            proj_drop: float = 0.0,
            drop_attn: float = 0.0,
            drop_path: float = 0.0,
            extra_norm: bool = False,
            sequential_attn: bool = False,
            norm_layer: Type[nn.Module] = nn.LayerNorm,
    ):
        super(SwinTransformerV2CrBlock, self).__init__()
        self.dim: int = dim
        self.feat_size: Tuple[int, int] = feat_size
        self.target_shift_size: Tuple[int, int] = to_2tuple(shift_size)
        self.always_partition = always_partition
        self.dynamic_mask = dynamic_mask
        self.window_size, self.shift_size = self._calc_window_shift(window_size)
        self.window_area = self.window_size[0] * self.window_size[1]
        self.init_values: Optional[float] = init_values

        # attn branch
        self.attn = WindowMultiHeadAttention(
            dim=dim,
            num_heads=num_heads,
            window_size=self.window_size,
            drop_attn=drop_attn,
            drop_proj=proj_drop,
            sequential_attn=sequential_attn,
        )
        self.norm1 = norm_layer(dim)
        self.drop_path1 = DropPath(drop_prob=drop_path) if drop_path > 0.0 else nn.Identity()

        # mlp branch
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            drop=proj_drop,
            out_features=dim,
        )
        self.norm2 = norm_layer(dim)
        self.drop_path2 = DropPath(drop_prob=drop_path) if drop_path > 0.0 else nn.Identity()

        # extra norm layer mentioned for Huge/Giant models in V2 paper, used as final network norm
        # and optional stage-ending norm while still in a channels-last format
        self.norm3 = norm_layer(dim) if extra_norm else nn.Identity()

        self.register_buffer(
            'attn_mask',
            None if self.dynamic_mask else self.get_attn_mask(),
            persistent=False,
        )
        self.init_weights()

    def _calc_window_shift(
            self,
            target_window_size: Union[int, Tuple[int, int]],
    ) -> Tuple[Tuple[int, int], Tuple[int, int]]:
        target_window_size = to_2tuple(target_window_size)
        target_shift_size = self.target_shift_size
        if any(target_shift_size):
            # if non-zero, recalculate shift from current window size in case window size has changed
            target_shift_size = (target_window_size[0] // 2, target_window_size[1] // 2)

        if self.always_partition:
            return target_window_size, target_shift_size

        window_size = [f if f <= w else w for f, w in zip(self.feat_size, target_window_size)]
        shift_size = [0 if f <= w else s for f, w, s in zip(self.feat_size, window_size, target_shift_size)]
        return tuple(window_size), tuple(shift_size)

    def get_attn_mask(self, x: Optional[torch.Tensor] = None) -> Optional[torch.Tensor]:
        """Method generates the attention mask used in shift case."""
        if any(self.shift_size):
            # calculate attention mask for SW-MSA
            if x is None:
                img_mask = torch.zeros((1, *self.feat_size, 1))  # 1 H W 1
            else:
                img_mask = torch.zeros((1, x.shape[1], x.shape[2], 1), dtype=x.dtype, device=x.device)
            cnt = 0
            for h in (
                    (0, -self.window_size[0]),
                    (-self.window_size[0], -self.shift_size[0]),
                    (-self.shift_size[0], None),
            ):
                for w in (
                        (0, -self.window_size[1]),
                        (-self.window_size[1], -self.shift_size[1]),
                        (-self.shift_size[1], None),
                ):
                    img_mask[:, h[0]:h[1], w[0]:w[1], :] = cnt
                    cnt += 1
            mask_windows = window_partition(img_mask, self.window_size)
            mask_windows = mask_windows.view(-1, self.window_area)
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            attn_mask = None
        return attn_mask

    def init_weights(self):
        # extra, module specific weight init
        if self.init_values is not None:
            nn.init.constant_(self.norm1.weight, self.init_values)
            nn.init.constant_(self.norm2.weight, self.init_values)

    def set_input_size(self, feat_size: Tuple[int, int], window_size: Tuple[int, int]) -> None:
        """Method updates the image resolution to be processed and window size and so the pair-wise relative positions.

        Args:
            feat_size (Tuple[int, int]): New input resolution
            window_size (int): New window size
        """
        self.feat_size: Tuple[int, int] = feat_size
        self.window_size, self.shift_size = self._calc_window_shift(to_2tuple(window_size))
        self.window_area = self.window_size[0] * self.window_size[1]
        self.attn.set_window_size(self.window_size)
        self.register_buffer(
            'attn_mask',
            None if self.dynamic_mask else self.get_attn_mask(),
            persistent=False,
        )

    def _shifted_window_attn(self, x: torch.Tensor) -> torch.Tensor:
        B, H, W, C = x.shape

        # cyclic shift
        sh, sw = self.shift_size
        do_shift: bool = any(self.shift_size)
        if do_shift:
            x = torch.roll(x, shifts=(-sh, -sw), dims=(1, 2))

        # pad feature map so it divides evenly into windows
        pad_h = (self.window_size[0] - H % self.window_size[0]) % self.window_size[0]
        pad_w = (self.window_size[1] - W % self.window_size[1]) % self.window_size[1]
        x = torch.nn.functional.pad(x, (0, 0, 0, pad_w, 0, pad_h))
        _, Hp, Wp, _ = x.shape

        # partition windows
        x_windows = window_partition(x, self.window_size)  # num_windows * B, window_size[0], window_size[1], C
        x_windows = x_windows.view(-1, self.window_size[0] * self.window_size[1], C)

        # W-MSA / SW-MSA
        if getattr(self, 'dynamic_mask', False):
            attn_mask = self.get_attn_mask(x)
        else:
            attn_mask = self.attn_mask
        attn_windows = self.attn(x_windows, mask=attn_mask)

        # merge windows
        attn_windows = attn_windows.view(-1, self.window_size[0], self.window_size[1], C)
        x = window_reverse(attn_windows, self.window_size, (Hp, Wp))  # B H' W' C
        x = x[:, :H, :W, :].contiguous()

        # reverse cyclic shift
        if do_shift:
            x = torch.roll(x, shifts=(sh, sw), dims=(1, 2))

        return x

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass of Swin Transformer V2 block.

        Args:
            x: Input tensor of shape [B, H, W, C].

        Returns:
            Output tensor of shape [B, H, W, C].
        """
        # post-norm branches (op -> norm -> drop)
        x = x + self.drop_path1(self.norm1(self._shifted_window_attn(x)))

        B, H, W, C = x.shape
        x = x.reshape(B, -1, C)
        x = x + self.drop_path2(self.norm2(self.mlp(x)))
        x = self.norm3(x)  # main-branch norm enabled for some blocks / stages (every 6 for Huge/Giant)
        x = x.reshape(B, H, W, C)
        return x
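

# Illustrative sketch only (not part of the original file): one shifted-window block
# over a 56x56 grid in channels-last layout; `_demo_block` is a hypothetical name.
def _demo_block() -> None:
    block = SwinTransformerV2CrBlock(
        dim=96, num_heads=3, feat_size=(56, 56), window_size=(7, 7), shift_size=(3, 3))
    x = torch.randn(1, 56, 56, 96)  # blocks operate on (B, H, W, C) inside a stage
    assert block(x).shape == x.shape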
                  fdedeej                     ddf fdZ	de
j                  de
j                  fdZ xZS )	PatchMergingz|Patch merging layer.

    This class implements the patch merging as a strided convolution with a normalization before.
    """

    def __init__(self, dim: int, norm_layer: Type[nn.Module] = nn.LayerNorm) -> None:
        """Initialize patch merging layer.

        Args:
            dim: Number of input channels.
            norm_layer: Type of normalization layer to be utilized.
        """
        super(PatchMerging, self).__init__()
        self.norm = norm_layer(4 * dim)
        self.reduction = nn.Linear(in_features=4 * dim, out_features=2 * dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass of patch merging.

        Args:
            x: Input tensor of shape [B, H, W, C].

        Returns:
            Output tensor of shape [B, H // 2, W // 2, 2 * C].
        """
        B, H, W, C = x.shape
        # pad H and W to an even size
        pad_values = (0, 0, 0, W % 2, 0, H % 2)
        x = nn.functional.pad(x, pad_values)
        _, H, W, _ = x.shape
        # 2x2 space-to-depth, then norm + linear channel reduction (4C -> 2C)
        x = x.reshape(B, H // 2, 2, W // 2, 2, C).permute(0, 1, 3, 4, 2, 5).flatten(3)
        x = self.norm(x)
        x = self.reduction(x)
        return x
 @B|| 	ZC 	ZT"))_ 	ZPT 	Z %,, r(   r   c                        e Zd ZdZ	 	 	 	 	 	 ddeeeeef   f   deeeeef   f   dededee   de	d	df fd
Z
deeef   d	dfdZdej                  d	ej                  fdZ xZS )
PatchEmbedz2D Image to Patch Embedding.Nr:   
patch_sizein_chans	embed_dimr   strict_img_sizer   c                    t         |           t        |      }t        |      }|| _        || _        |d   |d   z  |d   |d   z  f| _        | j
                  d   | j
                  d   z  | _        || _        t        j                  ||||      | _
        |r ||      | _        yt        j                         | _        y)a@  Initialize patch embedding.

        Args:
            img_size: Input image size.
            patch_size: Patch size.
            in_chans: Number of input channels.
            embed_dim: Embedding dimension.
            norm_layer: Normalization layer.
            strict_img_size: Enforce strict image size.
        """
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        self.strict_img_size = strict_img_size

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def set_input_size(self, img_size: Tuple[int, int]) -> None:
        """Update input image size.

        Args:
            img_size: New image size.
        """
        img_size = to_2tuple(img_size)
        if img_size != self.img_size:
            self.img_size = img_size
            self.grid_size = (img_size[0] // self.patch_size[0], img_size[1] // self.patch_size[1])
            self.num_patches = self.grid_size[0] * self.grid_size[1]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass of patch embedding.

        Args:
            x: Input tensor of shape [B, C, H, W].

        Returns:
            Output tensor of shape [B, C', H', W'].
        """
        B, C, H, W = x.shape
        if self.strict_img_size:
            _assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).")
            _assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).")
        x = self.proj(x)
        x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
        return x


class SwinTransformerV2CrStage(nn.Module):
    r"""This class implements a stage of the Swin transformer including multiple layers.

    Args:
        embed_dim (int): Number of input channels
        depth (int): Depth of the stage (number of layers)
        downscale (bool): If true input is downsampled (see Fig. 3 or V1 paper)
        feat_size (Tuple[int, int]): input feature map size (H, W)
        num_heads (int): Number of attention heads to be utilized
        window_size (int): Window size to be utilized
        mlp_ratio (int): Ratio of the hidden dimension in the FFN to the input channels
        proj_drop (float): Dropout in input mapping
        drop_attn (float): Dropout rate of attention map
        drop_path (float): Dropout in main path
        norm_layer (Type[nn.Module]): Type of normalization layer to be utilized. Default: nn.LayerNorm
        extra_norm_period (int): Insert extra norm layer on main branch every N (period) blocks
        extra_norm_stage (bool): End each stage with an extra norm layer in main branch
        sequential_attn (bool): If true sequential self-attention is performed
    """

    def __init__(
            self,
            embed_dim: int,
            depth: int,
            downscale: bool,
            num_heads: int,
            feat_size: Tuple[int, int],
            window_size: Tuple[int, int],
            always_partition: bool = False,
            dynamic_mask: bool = False,
            mlp_ratio: float = 4.0,
            init_values: Optional[float] = 0.0,
            proj_drop: float = 0.0,
            drop_attn: float = 0.0,
            drop_path: Union[List[float], float] = 0.0,
            norm_layer: Type[nn.Module] = nn.LayerNorm,
            extra_norm_period: int = 0,
            extra_norm_stage: bool = False,
            sequential_attn: bool = False,
    ):
        super(SwinTransformerV2CrStage, self).__init__()
        self.downscale: bool = downscale
        self.grad_checkpointing: bool = False
        self.feat_size: Tuple[int, int] = (feat_size[0] // 2, feat_size[1] // 2) if downscale else feat_size

        if downscale:
            self.downsample = PatchMerging(embed_dim, norm_layer=norm_layer)
            embed_dim = embed_dim * 2
        else:
            self.downsample = nn.Identity()

        def _extra_norm(index):
            i = index + 1
            if extra_norm_period and i % extra_norm_period == 0:
                return True
            return i == depth if extra_norm_stage else False

        self.blocks = nn.Sequential(*[
            SwinTransformerV2CrBlock(
                dim=embed_dim,
                num_heads=num_heads,
                feat_size=self.feat_size,
                window_size=window_size,
                always_partition=always_partition,
                dynamic_mask=dynamic_mask,
                shift_size=tuple([0 if (index % 2) == 0 else w // 2 for w in window_size]),
                mlp_ratio=mlp_ratio,
                init_values=init_values,
                proj_drop=proj_drop,
                drop_attn=drop_attn,
                drop_path=drop_path[index] if isinstance(drop_path, list) else drop_path,
                extra_norm=_extra_norm(index),
                sequential_attn=sequential_attn,
                norm_layer=norm_layer,
            )
            for index in range(depth)])

    def set_input_size(
            self,
            feat_size: Tuple[int, int],
            window_size: int,
            always_partition: Optional[bool] = None,
    ) -> None:
        """ Updates the resolution to utilize and the window size and so the pair-wise relative positions.

        Args:
            window_size (int): New window size
            feat_size (Tuple[int, int]): New input resolution
        """
        self.feat_size = (feat_size[0] // 2, feat_size[1] // 2) if self.downscale else feat_size
        for block in self.blocks:
            block.set_input_size(feat_size=self.feat_size, window_size=window_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass.
        Args:
            x (torch.Tensor): Input tensor of the shape [B, C, H, W] or [B, L, C]
        Returns:
            output (torch.Tensor): Output tensor of the shape [B, 2 * C, H // 2, W // 2]
        """
        x = bchw_to_bhwc(x)
        x = self.downsample(x)
        for block in self.blocks:
            # perform checkpointing if utilized
            if self.grad_checkpointing and not torch.jit.is_scripting():
                x = checkpoint(block, x)
            else:
                x = block(x)
        x = bhwc_to_bchw(x)
        return x


class SwinTransformerV2Cr(nn.Module):
    r""" Swin Transformer V2
        A PyTorch impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution`  -
          https://arxiv.org/pdf/2111.09883

    Args:
        img_size: Input resolution.
        window_size: Window size. If None, grid_size // window_div
        window_ratio: Window size to patch grid ratio.
        patch_size: Patch size.
        in_chans: Number of input channels.
        depths: Depth of the stage (number of layers).
        num_heads: Number of attention heads to be utilized.
        embed_dim: Patch embedding dimension.
        num_classes: Number of output classes.
        mlp_ratio:  Ratio of the hidden dimension in the FFN to the input channels.
        drop_rate: Dropout rate.
        proj_drop_rate: Projection dropout rate.
        attn_drop_rate: Dropout rate of attention map.
        drop_path_rate: Stochastic depth rate.
        norm_layer: Type of normalization layer to be utilized.
        extra_norm_period: Insert extra norm layer on main branch every N (period) blocks in stage
        extra_norm_stage: End each stage with an extra norm layer in main branch
        sequential_attn: If true sequential self-attention is performed.
    """

    def __init__(
            self,
            img_size: Tuple[int, int] = (224, 224),
            patch_size: int = 4,
            window_size: Optional[int] = None,
            window_ratio: int = 8,
            always_partition: bool = False,
            strict_img_size: bool = True,
            in_chans: int = 3,
            num_classes: int = 1000,
            embed_dim: int = 96,
            depths: Tuple[int, ...] = (2, 2, 6, 2),
            num_heads: Tuple[int, ...] = (3, 6, 12, 24),
            mlp_ratio: float = 4.0,
            init_values: Optional[float] = 0.,
            drop_rate: float = 0.0,
            proj_drop_rate: float = 0.0,
            attn_drop_rate: float = 0.0,
            drop_path_rate: float = 0.0,
            norm_layer: Type[nn.Module] = nn.LayerNorm,
            extra_norm_period: int = 0,
            extra_norm_stage: bool = False,
            sequential_attn: bool = False,
            global_pool: str = 'avg',
            weight_init: str = 'skip',
            **kwargs: Any,
    ) -> None:
        super(SwinTransformerV2Cr, self).__init__()
        img_size = to_2tuple(img_size)
        self.num_classes: int = num_classes
        self.patch_size: int = patch_size
        self.img_size: Tuple[int, int] = img_size
        self.num_features = self.head_hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
        self.feature_info = []

        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer,
            strict_img_size=strict_img_size,
        )
        grid_size = self.patch_embed.grid_size
        if window_size is None:
            self.window_size = tuple([s // window_ratio for s in grid_size])
        else:
            self.window_size = to_2tuple(window_size)

        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
        stages = []
        in_dim = embed_dim
        in_scale = 1
        for stage_idx, (depth, num_heads) in enumerate(zip(depths, num_heads)):
            stages += [SwinTransformerV2CrStage(
                embed_dim=in_dim,
                depth=depth,
                downscale=stage_idx != 0,
                feat_size=(grid_size[0] // in_scale, grid_size[1] // in_scale),
                num_heads=num_heads,
                window_size=self.window_size,
                always_partition=always_partition,
                dynamic_mask=not strict_img_size,
                mlp_ratio=mlp_ratio,
                init_values=init_values,
                proj_drop=proj_drop_rate,
                drop_attn=attn_drop_rate,
                drop_path=dpr[stage_idx],
                extra_norm_period=extra_norm_period,
                extra_norm_stage=extra_norm_stage or (stage_idx + 1) == len(depths),  # last stage ends w/ norm
                sequential_attn=sequential_attn,
                norm_layer=norm_layer,
            )]
            if stage_idx != 0:
                in_dim *= 2
                in_scale *= 2
            self.feature_info += [dict(num_chs=in_dim, reduction=4 * in_scale, module=f'stages.{stage_idx}')]
        self.stages = nn.Sequential(*stages)

        self.head = ClassifierHead(
            self.num_features,
            num_classes,
            pool_type=global_pool,
            drop_rate=drop_rate,
        )

        # current weight init skips custom init and uses pytorch layer defaults, seems to work well
        # FIXME more experiments needed
        if weight_init != 'skip':
            named_apply(init_weights, self)

    def set_input_size(
            self,
            img_size: Optional[Tuple[int, int]] = None,
            window_size: Optional[Tuple[int, int]] = None,
            window_ratio: int = 8,
            always_partition: Optional[bool] = None,
    ) -> None:
        """Updates the image resolution, window size and so the pair-wise relative positions.

        Args:
            img_size (Optional[Tuple[int, int]]): New input resolution, if None current resolution is used
            window_size (Optional[int]): New window size, if None based on new_img_size // window_div
            window_ratio (int): divisor for calculating window size from patch grid size
            always_partition: always partition / shift windows even if feat size is < window
        """
        if img_size is not None:
            self.patch_embed.set_input_size(img_size=img_size)
        grid_size = self.patch_embed.grid_size

        if window_size is None and window_ratio is not None:
            window_size = tuple([s // window_ratio for s in grid_size])

        for index, stage in enumerate(self.stages):
            stage_scale = 2 ** max(index - 1, 0)
            stage.set_input_size(
                feat_size=(grid_size[0] // stage_scale, grid_size[1] // stage_scale),
                window_size=window_size,
                always_partition=always_partition,
            )

    @torch.jit.ignore
    def group_matcher(self, coarse: bool = False) -> Dict[str, Any]:
        return dict(
            stem=r'^patch_embed',  # stem and embed
            blocks=r'^stages\.(\d+)' if coarse else [
                (r'^stages\.(\d+).downsample', (0,)),
                (r'^stages\.(\d+)\.\w+\.(\d+)', None),
            ]
        )

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable: bool = True) -> None:
        for s in self.stages:
            s.grad_checkpointing = enable

    @torch.jit.ignore
    def get_classifier(self) -> nn.Module:
        """Method returns the classification head of the model.
        Returns:
            head (nn.Module): Current classification head
        """
        return self.head.fc

    def reset_classifier(self, num_classes: int, global_pool: Optional[str] = None) -> None:
        """Method resets the classification head

        Args:
            num_classes (int): Number of classes to be predicted
            global_pool (str): Unused
        """
        self.num_classes = num_classes
        self.head.reset(num_classes, global_pool)

    def forward_intermediates(
            self,
            x: torch.Tensor,
            indices: Optional[Union[int, List[int]]] = None,
            norm: bool = False,
            stop_early: bool = False,
            output_fmt: str = 'NCHW',
            intermediates_only: bool = False,
    ) -> Union[List[torch.Tensor], Tuple[torch.Tensor, List[torch.Tensor]]]:
        """ Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        """
        assert output_fmt in ('NCHW',), 'Output shape must be NCHW.'
        intermediates = []
        take_indices, max_index = feature_take_indices(len(self.stages), indices)

        # forward pass
        x = self.patch_embed(x)

        if torch.jit.is_scripting() or not stop_early:  # can't slice blocks in torchscript
            stages = self.stages
        else:
            stages = self.stages[:max_index + 1]
        for i, stage in enumerate(stages):
            x = stage(x)
            if i in take_indices:
                intermediates.append(x)

        if intermediates_only:
            return intermediates

        return x, intermediates

    def prune_intermediate_layers(
            self,
            indices: Union[int, List[int]] = 1,
            prune_norm: bool = False,
            prune_head: bool = True,
    ):
        """ Prune layers not required for specified intermediates.
        """
        take_indices, max_index = feature_take_indices(len(self.stages), indices)
        self.stages = self.stages[:max_index + 1]  # truncate blocks
        if prune_head:
            self.reset_classifier(0, '')
        return take_indices

    def forward_features(self, x: torch.Tensor) -> torch.Tensor:
        x = self.patch_embed(x)
        x = self.stages(x)
        return x

    def forward_head(self, x: torch.Tensor, pre_logits: bool = False) -> torch.Tensor:
        return self.head(x, pre_logits=True) if pre_logits else self.head(x)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x


def init_weights(module: nn.Module, name: str = ''):
    # FIXME WIP determining if there's a better weight init
    if isinstance(module, nn.Linear):
        if 'qkv' in name:
            # treat the weights of Q, K, V separately
            val = math.sqrt(6. / float(module.weight.shape[0] // 3 + module.weight.shape[1]))
            nn.init.uniform_(module.weight, -val, val)
        elif 'head' in name:
            nn.init.zeros_(module.weight)
        else:
            nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    elif hasattr(module, 'init_weights'):
        module.init_weights()


def checkpoint_filter_fn(state_dict: Dict[str, Any], model: nn.Module) -> Dict[str, Any]:
    """ convert patch embedding weight from manual patchify + linear proj to conv"""
    state_dict = state_dict.get('model', state_dict)
    state_dict = state_dict.get('state_dict', state_dict)
    if 'head.fc.weight' in state_dict:
        return state_dict
    out_dict = {}
    for k, v in state_dict.items():
        if 'tau' in k:
            # convert old tau based checkpoints -> logit_scale (inverse)
            v = torch.log(1 / v)
            k = k.replace('tau', 'logit_scale')
        k = k.replace('head.', 'head.fc.')
        out_dict[k] = v
    return out_dict


def _create_swin_transformer_v2_cr(variant: str, pretrained: bool = False, **kwargs) -> SwinTransformerV2Cr:
    default_out_indices = tuple(i for i, _ in enumerate(kwargs.get('depths', (1, 1, 1, 1))))
    out_indices = kwargs.pop('out_indices', default_out_indices)
    model = build_model_with_cfg(
        SwinTransformerV2Cr, variant, pretrained,
        pretrained_filter_fn=checkpoint_filter_fn,
        feature_cfg=dict(flatten_sequential=True, out_indices=out_indices),
        **kwargs,
    )
    return model


def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
    """Create a default configuration dictionary.

    Args:
        url: Model weights URL.
        **kwargs: Additional configuration parameters.

    Returns:
        Configuration dictionary.
    """
    return {
        'url': url,
        'num_classes': 1000,
        'input_size': (3, 224, 224),
        'pool_size': (7, 7),
        'crop_pct': 0.9,
        'interpolation': 'bicubic',
        'fixed_input_size': True,
        'mean': IMAGENET_DEFAULT_MEAN,
        'std': IMAGENET_DEFAULT_STD,
        'first_conv': 'patch_embed.proj',
        'classifier': 'head.fc',
        **kwargs,
    }


default_cfgs = generate_default_cfgs({
    'swinv2_cr_tiny_384.untrained': _cfg(
        url='', input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)),
    'swinv2_cr_tiny_224.untrained': _cfg(
        url='', input_size=(3, 224, 224), crop_pct=0.9),
    'swinv2_cr_tiny_ns_224.sw_in1k': _cfg(
        hf_hub_id='timm/',
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-swinv2/swin_v2_cr_tiny_ns_224-ba8166c6.pth',
        input_size=(3, 224, 224), crop_pct=0.9),
    'swinv2_cr_small_384.untrained': _cfg(
        url='', input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)),
    'swinv2_cr_small_224.sw_in1k': _cfg(
        hf_hub_id='timm/',
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-swinv2/swin_v2_cr_small_224-0813c165.pth',
        input_size=(3, 224, 224), crop_pct=0.9),
    'swinv2_cr_small_ns_224.sw_in1k': _cfg(
        hf_hub_id='timm/',
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-swinv2/swin_v2_cr_small_ns_224_iv-2ce90f8e.pth',
        input_size=(3, 224, 224), crop_pct=0.9),
    'swinv2_cr_small_ns_256.untrained': _cfg(
        url='', input_size=(3, 256, 256), crop_pct=1.0, pool_size=(8, 8)),
    'swinv2_cr_base_384.untrained': _cfg(
        url='', input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)),
    'swinv2_cr_base_224.untrained': _cfg(
        url='', input_size=(3, 224, 224), crop_pct=0.9),
    'swinv2_cr_base_ns_224.untrained': _cfg(
        url='', input_size=(3, 224, 224), crop_pct=0.9),
    'swinv2_cr_large_384.untrained': _cfg(
        url='', input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)),
    'swinv2_cr_large_224.untrained': _cfg(
        url='', input_size=(3, 224, 224), crop_pct=0.9),
    'swinv2_cr_huge_384.untrained': _cfg(
        url='', input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)),
    'swinv2_cr_huge_224.untrained': _cfg(
        url='', input_size=(3, 224, 224), crop_pct=0.9),
    'swinv2_cr_giant_384.untrained': _cfg(
        url='', input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)),
    'swinv2_cr_giant_224.untrained': _cfg(
        url='', input_size=(3, 224, 224), crop_pct=0.9),
})


@register_model
def swinv2_cr_tiny_384(pretrained: bool = False, **kwargs) -> SwinTransformerV2Cr:
    """Swin-T V2 CR @ 384x384, trained ImageNet-1k."""
    model_args = dict(embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24))
    return _create_swin_transformer_v2_cr('swinv2_cr_tiny_384', pretrained=pretrained, **dict(model_args, **kwargs))


@register_model
def swinv2_cr_tiny_224(pretrained: bool = False, **kwargs) -> SwinTransformerV2Cr:
    """Swin-T V2 CR @ 224x224, trained ImageNet-1k."""
    model_args = dict(embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24))
    return _create_swin_transformer_v2_cr('swinv2_cr_tiny_224', pretrained=pretrained, **dict(model_args, **kwargs))


@register_model
def swinv2_cr_tiny_ns_224(pretrained: bool = False, **kwargs) -> SwinTransformerV2Cr:
    """Swin-T V2 CR @ 224x224, trained ImageNet-1k w/ extra stage norms.

    ** Experimental, may make default if results are improved. **
    """
    model_args = dict(embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), extra_norm_stage=True)
    return _create_swin_transformer_v2_cr('swinv2_cr_tiny_ns_224', pretrained=pretrained, **dict(model_args, **kwargs))


@register_model
def swinv2_cr_small_384(pretrained: bool = False, **kwargs) -> SwinTransformerV2Cr:
    """Swin-S V2 CR @ 384x384, trained ImageNet-1k."""
    model_args = dict(embed_dim=96, depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24))
    return _create_swin_transformer_v2_cr('swinv2_cr_small_384', pretrained=pretrained, **dict(model_args, **kwargs))


@register_model
def swinv2_cr_small_224(pretrained: bool = False, **kwargs) -> SwinTransformerV2Cr:
    """Swin-S V2 CR @ 224x224, trained ImageNet-1k."""
    model_args = dict(embed_dim=96, depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24))
    return _create_swin_transformer_v2_cr('swinv2_cr_small_224', pretrained=pretrained, **dict(model_args, **kwargs))


@register_model
def swinv2_cr_small_ns_224(pretrained: bool = False, **kwargs) -> SwinTransformerV2Cr:
    """Swin-S V2 CR @ 224x224, trained ImageNet-1k."""
    model_args = dict(embed_dim=96, depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24), extra_norm_stage=True)
    return _create_swin_transformer_v2_cr('swinv2_cr_small_ns_224', pretrained=pretrained, **dict(model_args, **kwargs))


@register_model
def swinv2_cr_small_ns_256(pretrained: bool = False, **kwargs) -> SwinTransformerV2Cr:
    """Swin-S V2 CR @ 256x256, trained ImageNet-1k."""
    model_args = dict(embed_dim=96, depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24), extra_norm_stage=True)
    return _create_swin_transformer_v2_cr('swinv2_cr_small_ns_256', pretrained=pretrained, **dict(model_args, **kwargs))


@register_model
def swinv2_cr_base_384(pretrained: bool = False, **kwargs) -> SwinTransformerV2Cr:
    """Swin-B V2 CR @ 384x384, trained ImageNet-1k."""
    model_args = dict(embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32))
    return _create_swin_transformer_v2_cr('swinv2_cr_base_384', pretrained=pretrained, **dict(model_args, **kwargs))


@register_model
def swinv2_cr_base_224(pretrained: bool = False, **kwargs) -> SwinTransformerV2Cr:
    """Swin-B V2 CR @ 224x224, trained ImageNet-1k."""
    model_args = dict(embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32))
    return _create_swin_transformer_v2_cr('swinv2_cr_base_224', pretrained=pretrained, **dict(model_args, **kwargs))


@register_model
def swinv2_cr_base_ns_224(pretrained: bool = False, **kwargs) -> SwinTransformerV2Cr:
    """Swin-B V2 CR @ 224x224, trained ImageNet-1k."""
    model_args = dict(embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), extra_norm_stage=True)
    return _create_swin_transformer_v2_cr('swinv2_cr_base_ns_224', pretrained=pretrained, **dict(model_args, **kwargs))


@register_model
def swinv2_cr_large_384(pretrained: bool = False, **kwargs) -> SwinTransformerV2Cr:
    """Swin-L V2 CR @ 384x384, trained ImageNet-1k."""
    model_args = dict(embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48))
    return _create_swin_transformer_v2_cr('swinv2_cr_large_384', pretrained=pretrained, **dict(model_args, **kwargs))


@register_model
def swinv2_cr_large_224(pretrained: bool = False, **kwargs) -> SwinTransformerV2Cr:
    """Swin-L V2 CR @ 224x224, trained ImageNet-1k."""
    model_args = dict(embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48))
    return _create_swin_transformer_v2_cr('swinv2_cr_large_224', pretrained=pretrained, **dict(model_args, **kwargs))


@register_model
def swinv2_cr_huge_384(pretrained: bool = False, **kwargs) -> SwinTransformerV2Cr:
    """Swin-H V2 CR @ 384x384, trained ImageNet-1k."""
    model_args = dict(embed_dim=352, depths=(2, 2, 18, 2), num_heads=(11, 22, 44, 88), extra_norm_period=6)
    return _create_swin_transformer_v2_cr('swinv2_cr_huge_384', pretrained=pretrained, **dict(model_args, **kwargs))


@register_model
def swinv2_cr_huge_224(pretrained: bool = False, **kwargs) -> SwinTransformerV2Cr:
    """Swin-H V2 CR @ 224x224, trained ImageNet-1k."""
    model_args = dict(embed_dim=352, depths=(2, 2, 18, 2), num_heads=(8, 16, 32, 64), extra_norm_period=6)
    return _create_swin_transformer_v2_cr('swinv2_cr_huge_224', pretrained=pretrained, **dict(model_args, **kwargs))


@register_model
def swinv2_cr_giant_384(pretrained: bool = False, **kwargs) -> SwinTransformerV2Cr:
    """Swin-G V2 CR @ 384x384, trained ImageNet-1k."""
    model_args = dict(embed_dim=512, depths=(2, 2, 42, 2), num_heads=(16, 32, 64, 128), extra_norm_period=6)
    return _create_swin_transformer_v2_cr('swinv2_cr_giant_384', pretrained=pretrained, **dict(model_args, **kwargs))


@register_model
def swinv2_cr_giant_224(pretrained: bool = False, **kwargs) -> SwinTransformerV2Cr:
    """Swin-G V2 CR @ 224x224, trained ImageNet-1k."""
    model_args = dict(embed_dim=512, depths=(2, 2, 42, 2), num_heads=(16, 32, 64, 128), extra_norm_period=6)
    return _create_swin_transformer_v2_cr('swinv2_cr_giant_224', pretrained=pretrained, **dict(model_args, **kwargs))
'

H
%!ELL !U\\ !
!ELL !U\\ !
 5c? u||   ELL uS#X RWX[]`X`Ra fkfrfr  $vryy vrQryy Qh#299 #L< <~qryy qhw")) wt # ""
c T#s(^ 4 % '&"D=3(%L'& #D=3%8'&
 $T J 3&0'& $T=3(&L'& "4 H 3$0'& %d N 3'0'&& '=3&)J''&* #D=3(%L+'&. #D=3%8/'&2 &t=3(83'&6 $T=3(&L7'&: $T=3&8;'&> #D=3(%L?'&B #D=3%8C'&F $T=3(&LG'&J $T=3&8K'& 'T u4 u>Q u u u4 u>Q u u xd xAT x x vD v?R v v vD v?R v v yt yBU y y yt yBU y y u4 u>Q u u u4 u>Q u u xd xAT x x vD v?R v v vD v?R v v u4 u>Q u u u4 u>Q u u vD v?R v v vD v?R v vr(   