
    kh̿                     n   d Z ddlZddlmZmZmZmZmZmZm	Z	m
Z
mZ ddlZddlmZ ddlmc mZ ddlmZmZ ddlmZmZmZmZmZmZmZmZmZmZ ddl m!Z! ddl"m#Z# dd	l$m%Z% dd
l&m'Z' ddl(m)Z)m*Z*m+Z+ dgZ,ee-e	e-e-f   f   Z.dej^                  de	e-e-f   dej^                  fdZ0e%dej^                  de	e-e-f   de	e-e-f   dej^                  fd       Z1 G d dejd                        Z3 G d dejd                        Z4 G d dejd                        Z5 G d dejd                        Z6 G d dejd                        Z7dee8ej^                  f   dejd                  dee8ej^                  f   fdZ9dNd e8d!e:de7fd"Z;dOd#Z< e) e<d$d%&       e<d$d'd(d)d*+       e<d$d,&       e<d$d-d(d)d*+       e<d$d.&       e<d$d/&       e<d$d0&       e<d$d1&       e<d$d2&       e<d$d3&       e<d$d4d5d6d78       e<d$d9d5d6d78      d:      Z=e*dNd!e:de7fd;       Z>e*dNd!e:de7fd<       Z?e*dNd!e:de7fd=       Z@e*dNd!e:de7fd>       ZAe*dNd!e:de7fd?       ZBe*dNd!e:de7fd@       ZCe*dNd!e:de7fdA       ZDe*dNd!e:de7fdB       ZEe*dNd!e:de7fdC       ZFe*dNd!e:de7fdD       ZGe*dNd!e:de7fdE       ZHe*dNd!e:de7fdF       ZI e+eJdGdHdIdJdKdLdM       y)PaK   Swin Transformer V2
A PyTorch impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution`
    - https://arxiv.org/abs/2111.09883

Code/weights from https://github.com/microsoft/Swin-Transformer, original copyright/license info below

Modifications and additions for timm hacked together by / Copyright 2022, Ross Wightman
    N)	AnyCallableDictListOptionalSetTupleTypeUnionIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)

PatchEmbedMlpDropPath	to_2tupletrunc_normal_ClassifierHeadresample_patch_embedndgridget_act_layer	LayerType   )build_model_with_cfg)feature_take_indices)register_notrace_function)
checkpoint)generate_default_cfgsregister_modelregister_model_deprecationsSwinTransformerV2xwindow_sizereturnc                     | j                   \  }}}}| j                  |||d   z  |d   ||d   z  |d   |      } | j                  dddddd      j                         j                  d|d   |d   |      }|S )zPartition into non-overlapping windows.

    Args:
        x: Input tensor of shape (B, H, W, C).
        window_size: Window size (height, width).

    Returns:
        Windows tensor of shape (num_windows*B, window_size[0], window_size[1], C).
    r   r               shapeviewpermute
contiguous)r"   r#   BHWCwindowss          [/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/models/swin_transformer_v2.pywindow_partitionr6   $   s     JAq!Q	q!{1~%{1~qKN7JKXYN\]^Aii1aAq)446;;BAP[\]P^`abGN    r4   img_sizec                     |\  }}| j                   d   }| j                  d||d   z  ||d   z  |d   |d   |      }|j                  dddddd      j                         j                  d|||      }|S )a1  Merge windows back to feature map.

    Args:
        windows: Windows tensor of shape (num_windows * B, window_size[0], window_size[1], C).
        window_size: Window size (height, width).
        img_size: Image size (height, width).

    Returns:
        Feature map tensor of shape (B, H, W, C).
    r*   r   r   r&   r'   r(   r)   r+   )r4   r#   r8   r1   r2   r3   r"   s          r5   window_reverser:   4   s     DAqbARk!n,a;q>.A;q>S^_`SacdeA			!Q1a#..055b!QBAHr7   c                        e Zd ZdZ	 	 	 	 	 ddedeeef   dededededed	eeef   d
df fdZddZ	deeef   d
dfdZ
ddej                  deej                     d
ej                  fdZ xZS )WindowAttentionzWindow based multi-head self attention (W-MSA) module with relative position bias.

    Supports both shifted and non-shifted window attention with continuous relative
    position bias and cosine attention.
    dimr#   	num_headsqkv_biasqkv_bias_separate	attn_drop	proj_droppretrained_window_sizer$   Nc	           
      $   t         	|           || _        || _        t	        |      | _        || _        || _        t        j                  t        j                  dt        j                  |ddf      z              | _        t        j                  t        j                  ddd      t        j                   d      t        j                  d|d            | _        t        j                  ||d	z  d      | _        |rt        j                  t        j&                  |            | _        | j+                  d
t        j&                  |      d       t        j                  t        j&                  |            | _        nd| _        d| _        d| _        t        j0                  |      | _        t        j                  ||      | _        t        j0                  |      | _        t        j8                  d      | _        | j=                          y)a4  Initialize window attention module.

        Args:
            dim: Number of input channels.
            window_size: The height and width of the window.
            num_heads: Number of attention heads.
            qkv_bias: If True, add a learnable bias to query, key, value.
            qkv_bias_separate: If True, use separate bias for q, k, v projections.
            attn_drop: Dropout ratio of attention weight.
            proj_drop: Dropout ratio of output.
            pretrained_window_size: The height and width of the window in pre-training.
        
   r   r'   i   Tbias)inplaceFr&   k_bias
persistentNr*   r=   )super__init__r=   r#   r   rC   r>   r@   nn	Parametertorchlogoneslogit_scale
SequentialLinearReLUcpb_mlpqkvzerosq_biasregister_bufferv_biasrI   DropoutrA   projrB   Softmaxsoftmax"_make_pair_wise_relative_positions)
selfr=   r#   r>   r?   r@   rA   rB   rC   	__class__s
            r5   rN   zWindowAttention.__init__N   sz   . 	&&/0F&G#"!2<<		"uzz9aQRBS7T2T(UV }}IIa4(GGD!IIc951
 99S#'6,,u{{3'78DK  5;;s+; N,,u{{3'78DKDKDKDKI.IIc3'	I.zzb)//1r7   c                    t        j                  | j                  d   dz
   | j                  d         j                  t         j                        }t        j                  | j                  d   dz
   | j                  d         j                  t         j                        }t        j
                  t        ||            }|j                  ddd      j                         j                  d      }| j                  d   dkD  rO|dddddddfxx   | j                  d   dz
  z  cc<   |dddddddfxx   | j                  d   dz
  z  cc<   nN|dddddddfxx   | j                  d   dz
  z  cc<   |dddddddfxx   | j                  d   dz
  z  cc<   |dz  }t        j                  |      t        j                  t        j                  |      dz         z  t        j                  d      z  }| j                  d|d	       t        j                  | j                  d         }t        j                  | j                  d         }t        j
                  t        ||            }t        j                   |d      }|dddddf   |dddddf   z
  }|j                  ddd      j                         }|dddddfxx   | j                  d   dz
  z  cc<   |dddddfxx   | j                  d   dz
  z  cc<   |dddddfxx   d| j                  d   z  dz
  z  cc<   |j#                  d
      }	| j                  d|	d	       y)z?Create pair-wise relative position index and coordinates table.r   r   r'   N         ?relative_coords_tableFrJ   r*   relative_position_index)rQ   aranger#   tofloat32stackr   r.   r/   	unsqueezerC   signlog2absmathr\   flattensum)
rc   relative_coords_hrelative_coords_wrh   coords_hcoords_wcoordscoords_flattenrelative_coordsri   s
             r5   rb   z2WindowAttention._make_pair_wise_relative_positions   s    "LL4+;+;A+>+B)CTEUEUVWEXY\\]b]j]jk!LL4+;+;A+>+B)CTEUEUVWEXY\\]b]j]jk %F3DFW,X Y 5 = =aA F Q Q S ] ]^_ `&&q)A-!!Q1*-$2M2Ma2PST2TU-!!Q1*-$2M2Ma2PST2TU-!!Q1*-$2B2B12E2IJ-!!Q1*-$2B2B12E2IJ-" %

+@ AEJJII+,s2E4 !46:iil!C46KX]^ << 0 0 34<< 0 0 34VHh78vq1(At4~aqj7QQ)11!Q:EEG1a D$4$4Q$7!$;; 1a D$4$4Q$7!$;; 1a A(8(8(;$;a$?? "1"5"5b"968O\abr7   c                 h    t        |      }|| j                  k7  r|| _        | j                          yy)zUpdate window size and regenerate relative position tables.

        Args:
            window_size: New window size (height, width).
        N)r   r#   rb   )rc   r#   s     r5   set_window_sizezWindowAttention.set_window_size   s5      ,$****D335 +r7   r"   maskc                    |j                   \  }}}| j                  | j                  |      }nt        j                  | j                  | j
                  | j                  f      }| j                  r| j                  |      }||z  }n,t        j                  || j                  j                  |      }|j                  ||d| j                  d      j                  ddddd      }|j                  d      \  }}	}
t        j                  |d      t        j                  |	d      j!                  d	d      z  }t        j"                  | j$                  t'        j(                  d
            j+                         }||z  }| j-                  | j.                        j1                  d| j                        }|| j2                  j1                  d         j1                  | j4                  d   | j4                  d   z  | j4                  d   | j4                  d   z  d      }|j                  ddd      j7                         }dt        j8                  |      z  }||j;                  d      z   }||j                   d   }|j1                  d|| j                  ||      |j;                  d      j;                  d      z   }|j1                  d| j                  ||      }| j=                  |      }n| j=                  |      }| j?                  |      }||
z  j!                  dd      j                  |||      }| jA                  |      }| jC                  |      }|S )a#  Forward pass of window attention.

        Args:
            x: Input features with shape of (num_windows*B, N, C).
            mask: Attention mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None.

        Returns:
            Output features with shape of (num_windows*B, N, C).
        )weightrG   r&   r*   r'   r   r   r(   rL   g      Y@)max   )"r,   r[   rY   rQ   catrI   r]   r@   Flinearr   reshaper>   r.   unbind	normalize	transposeclamprT   rr   rR   exprX   rh   r-   ri   r#   r/   sigmoidrn   ra   rA   r_   rB   )rc   r"   r~   B_Nr3   rY   r?   qkvattnrT   relative_position_bias_tablerelative_position_biasnum_wins                   r5   forwardzWindowAttention.forward   s    77Aq;;((1+Cyy$++t{{DKK!HIH%%hhqkxhhqxHkk"aDNNB7??1aAN**Q-1a A2&QB)?)I)I"b)QQkk$"2"28KLPPRk!'+||D4N4N'O'T'TUWY]YgYg'h$!=d>Z>Z>_>_`b>c!d!i!iQ$"2"21"55t7G7G7JTM]M]^_M`7`bd"f!7!?!?1a!H!S!S!U!#emm4J&K!K,66q99jjmG99R$..!Q?$..QRBSB]B]^_B``D99RA6D<<%D<<%D~~d#AX  A&..r1a8IIaLNN1r7   )TF        r   )r   r   r$   NN)__name__
__module____qualname____doc__intr	   boolfloatrN   rb   r}   rQ   Tensorr   r   __classcell__rd   s   @r5   r<   r<   G   s     "&+!!6<5252 sCx52 	52
 52  $52 52 52 %*#s(O52 
52nc>	65c? 	6t 	61 1Xell-C 1u|| 1r7   r<   c                       e Zd ZdZddddddddddej
                  dfd	ed
edededededede	dede	de	de	de
deej                     def fdZd$deej                      deej                      fdZ	 d$dedee   deeeef   eeef   f   fdZ	 d$d eeef   deeef   dee   ddfd!Zdej                   dej                   fd"Zdej                   dej                   fd#Z xZS )%SwinTransformerV2BlockzSwin Transformer V2 Block.

    A standard transformer block with window attention and shifted window attention
    for modeling long-range dependencies efficiently.
       r   F      @Tr   gelur=   input_resolutionr>   r#   
shift_sizealways_partitiondynamic_mask	mlp_ratior?   rB   rA   	drop_path	act_layer
norm_layerrC   c                 "   t         |           || _        t        |      | _        || _        t        |      | _        || _        || _        | j                  ||      \  | _
        | _        | j                  d   | j                  d   z  | _        || _        t        |      }t        |t        | j                        ||	||
t        |            | _         ||      | _        |dkD  rt%        |      nt'        j(                         | _        t-        |t/        ||z        ||
      | _         ||      | _        |dkD  rt%        |      nt'        j(                         | _        | j7                  d| j                  rdn| j9                         d	       y)
a  
        Args:
            dim: Number of input channels.
            input_resolution: Input resolution.
            num_heads: Number of attention heads.
            window_size: Window size.
            shift_size: Shift size for SW-MSA.
            always_partition: Always partition into full windows and shift
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            proj_drop: Dropout rate.
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth rate.
            act_layer: Activation layer.
            norm_layer: Normalization layer.
            pretrained_window_size: Window size in pretraining.
        r   r   )r#   r>   r?   rA   rB   rC   r   )in_featureshidden_featuresr   drop	attn_maskNFrJ   )rM   rN   r=   r   r   r>   target_shift_sizer   r   _calc_window_shiftr#   r   window_arear   r   r<   r   norm1r   rO   Identity
drop_path1r   r   mlpnorm2
drop_path2r\   get_attn_mask)rc   r=   r   r>   r#   r   r   r   r   r?   rB   rA   r   r   r   rC   rd   s                   r5   rN   zSwinTransformerV2Block.__init__   sk   F 	 )*: ;"!*:!6 0(,0,C,CKQ[,\)$/++A.1A1A!1DD"!),	#!$"2"23#,-C#D
	  _
1:R(9-R[[]i0	
  _
1:R(9-R[[]%%D4+=+=+? 	 	
r7   Nr"   r$   c           	         t        | j                        r|&t        j                  dg| j                  d      }nJt        j                  d|j
                  d   |j
                  d   df|j                  |j                        }d}d| j                  d    f| j                  d    | j                  d    f| j                  d    dffD ]l  }d| j                  d    f| j                  d    | j                  d    f| j                  d    dffD ]$  }||dd|d   |d   |d   |d   ddf<   |dz  }& n t        || j                        }|j                  d| j                        }|j                  d      |j                  d      z
  }|j                  |dk7  t        d            j                  |dk(  t        d            }|S d}|S )	zGenerate attention mask for shifted window attention.

        Args:
            x: Input tensor for dynamic shape calculation.

        Returns:
            Attention mask or None if no shift.
        Nr   r'   )dtypedevicer   r*   g      Yr   )anyr   rQ   rZ   r   r,   r   r   r#   r6   r-   r   rn   masked_fillr   )rc   r"   img_maskcnthwmask_windowsr   s           r5   r   z$SwinTransformerV2Block.get_attn_mask4  s    ty ;;'ED,A,A'E1'EF ;;1771:qwwqz1'EQWW]^]e]efC))!,,-&&q))DOOA,>+>?ooa(($/  T--a001**1--0B/BC//!,,d3 A
 <?HQ!QqT	1Q4!9a781HC ,Hd6F6FGL',,R1A1ABL$..q1L4J4J14MMI!--i1neFmLXXYbfgYginoristI  Ir7   target_window_sizer   c                    t        |      }|(| j                  }t        |      r|d   dz  |d   dz  f}nt        |      }| j                  r||fS t        |      }t        |      }t	        | j
                  |      D cg c]  \  }}||k  r|n| }}}t	        | j
                  ||      D cg c]  \  }}}||k  rdn| }}}}t        |      t        |      fS c c}}w c c}}}w )a  Calculate window size and shift size based on input resolution.

        Args:
            target_window_size: Target window size.
            target_shift_size: Target shift size.

        Returns:
            Tuple of (adjusted_window_size, adjusted_shift_size).
        r   r'   r   )r   r   r   r   zipr   tuple)rc   r   r   rr   r#   sr   s           r5   r   z)SwinTransformerV2Block._calc_window_shiftX  s    ''9:$ $ 6 6$%%7%:a%?ASTUAVZ[A[$\! )*; <  %'888&'9:%&7869$:O:OQc6dedaAFq)ee8;D<Q<QS^`q8rssWQ116aq(s
s[!5#444 fss    C1C%	feat_sizec                 d   || _         ||| _        | j                  t        |            \  | _        | _        | j                  d   | j                  d   z  | _        | j                  j                  | j                         | j                  d| j                  rdn| j                         d       y)zSet input size and update window configuration.

        Args:
            feat_size: New feature map size.
            window_size: New window size.
            always_partition: Override always_partition setting.
        Nr   r   r   FrJ   )r   r   r   r   r#   r   r   r   r}   r\   r   r   )rc   r   r#   r   s       r5   set_input_sizez%SwinTransformerV2Block.set_input_sizey  s     !*'$4D!,0,C,CIkDZ,[)$/++A.1A1A!1DD		!!$"2"23%%D4+=+=+? 	 	
r7   c           	         |j                   \  }}}}t        | j                        }|r7t        j                  || j                  d    | j                  d    fd      }n|}| j
                  d   || j
                  d   z  z
  | j
                  d   z  }| j
                  d   || j
                  d   z  z
  | j
                  d   z  }	t        j                  j                  j                  |ddd|	d|f      }|j                   \  }
}}}
t        || j
                        }|j                  d| j                  |      }t        | dd      r| j                  |      }n| j                  }| j                  ||      }|j                  d| j
                  d   | j
                  d   |      }t!        || j
                  ||f      }|d	d	d	|d	|d	d	f   j#                         }|r$t        j                  || j                  d      }|S |}|S )
zApply windowed attention with optional shift.

        Args:
            x: Input tensor of shape (B, H, W, C).

        Returns:
            Output tensor of shape (B, H, W, C).
        r   r   )r   r'   )shiftsdimsr*   r   F)r~   N)r,   r   r   rQ   rollr#   rO   
functionalpadr6   r-   r   getattrr   r   r   r:   r/   )rc   r"   r0   r1   r2   r3   	has_shift	shifted_xpad_hpad_w_HpWp	x_windowsr   attn_windowss                   r5   _attnzSwinTransformerV2Block._attn  s    WW
1a (	

1tq/A.ADOOTUDVCV-W^deII!!!$q4+;+;A+>'>>$BRBRSTBUU!!!$q4+;+;A+>'>>$BRBRSTBUUHH''++I1a57QR	 2r1 %Y0@0@A	NN2t'7'7;	 4/**95IIyyy; $((T-=-=a-@$BRBRSTBUWXY"<1A1AB8L	a!RaRl+668	 

9T__6JA  Ar7   c                 >   |j                   \  }}}}|| j                  | j                  | j                  |                  z   }|j	                  |d|      }|| j                  | j                  | j                  |                  z   }|j	                  ||||      }|S )Nr*   )r,   r   r   r   r   r   r   r   )rc   r"   r0   r1   r2   r3   s         r5   r   zSwinTransformerV2Block.forward  s    WW
1a

4::a= 9::IIaQ

488A; 788IIaAq!r7   r   )r   r   r   r   rO   	LayerNormr   _int_or_tuple_2_tr   r   r   r
   ModulerN   r   rQ   r   r   r	   r   r   r   r   r   r   s   @r5   r   r      s    ./,-%*!&!!!!!#)*,,,89!H
H
 0H
 	H

 +H
 *H
 #H
 H
 H
 H
 H
 H
 H
 !H
 RYYH
  %6!H
T"x5 "%,,AW "N >B5 15  ((9:5 
uS#Xc3h/	0	5J 04	
S#X
 sCx
 'tn	

 

4,u|| , ,\ %,, r7   r   c                        e Zd ZdZdej
                  fdedee   deej                     f fdZ
dej                  dej                  fd	Z xZS )
PatchMergingzPatch Merging Layer.

    Merges 2x2 neighboring patches and projects to higher dimension,
    effectively downsampling the feature maps.
    Nr=   out_dimr   c                     t         |           || _        |xs d|z  | _        t	        j
                  d|z  | j                  d      | _         || j                        | _        y)z
        Args:
            dim (int): Number of input channels.
            out_dim (int): Number of output channels (or 2 * dim if None)
            norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
        r'   r(   FrF   N)rM   rN   r=   r   rO   rV   	reductionnorm)rc   r=   r   r   rd   s       r5   rN   zPatchMerging.__init__  sS     	)!c'1s7DLLuEt||,	r7   r"   r$   c                 h   |j                   \  }}}}ddd|dz  d|dz  f}t        j                  j                  ||      }|j                   \  }}}}|j	                  ||dz  d|dz  d|      j                  dddddd      j                  d      }| j                  |      }| j                  |      }|S )Nr   r'   r   r&   r(   r)   )	r,   rO   r   r   r   r.   rs   r   r   )rc   r"   r0   r1   r2   r3   
pad_valuesr   s           r5   r   zPatchMerging.forward  s    WW
1aAq1uaQ/
MMa,WW
1aIIaaAFAq199!Q1aKSSTUVNN1IIaLr7   )r   r   r   r   rO   r   r   r   r
   r   rN   rQ   r   r   r   r   s   @r5   r   r     s\     &**,,,	-- c]- RYY	-$
 
%,, 
r7   r   c            '       .    e Zd ZdZdddddddddej
                  ddfded	ed
ededededededede	dede	de	de	de
eef   deej                     dededdf& fdZ	 d"deeef   dedee   ddfdZdej(                  dej(                  fd Zd#d!Z xZS )$SwinTransformerV2StagezA Swin Transformer V2 Stage.

    A single stage consisting of multiple Swin Transformer blocks with
    optional downsampling at the beginning.
    Fr   Tr   r   r   r=   r   r   depthr>   r#   r   r   
downsampler   r?   rB   rA   r   r   r   rC   output_nchwr$   Nc                 P   t         |           || _        || _        |	rt	        d |D              n|| _        || _        || _        d| _        t        |      }t	        |D cg c]  }|dz  	 c}      }|	rt        |||      | _        n ||k(  sJ t        j                         | _        t        j                  t        |      D cg c]E  }t!        || j
                  |||dz  dk(  rdn||||
|||t#        |t$              r||   n||||      G c}      | _        yc c}w c c}w )a  
        Args:
            dim: Number of input channels.
            out_dim: Number of output channels.
            input_resolution: Input resolution.
            depth: Number of blocks.
            num_heads: Number of attention heads.
            window_size: Local window size.
            always_partition: Always partition into full windows and shift
            dynamic_mask: Create attention mask in forward based on current input size
            downsample: Use downsample layer at start of the block.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            proj_drop: Projection dropout rate
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth rate.
            act_layer: Activation layer type.
            norm_layer: Normalization layer.
            pretrained_window_size: Local window size in pretraining.
            output_nchw: Output tensors on NCHW format instead of NHWC.
        c              3   &   K   | ]	  }|d z    ywr'   N .0is     r5   	<genexpr>z2SwinTransformerV2Stage.__init__.<locals>.<genexpr>$  s     &H!qAv&H   Fr'   )r=   r   r   r   )r=   r   r>   r#   r   r   r   r   r?   rB   rA   r   r   r   rC   N)rM   rN   r=   r   r   output_resolutionr   r   grad_checkpointingr   r   r   rO   r   
ModuleListranger   
isinstancelistblocks)rc   r=   r   r   r   r>   r#   r   r   r   r   r?   rB   rA   r   r   r   rC   r   r   r   r   rd   s                         r5   rN   zSwinTransformerV2Stage.__init__  s/   T 	 0LV&H7G&H!H\l
&"',K8qAF89
 *sGPZ[DO'>!> kkmDO mm$ 5\%%#$ # #!%!7!7#'!"Q!1*!1)#!##*4Y*E)A,9#%'=%# $ 9%#s   "DA
D#r   c                 .   || _         t        | j                  t        j                        r|| _        n3t        | j                  t              sJ t        d |D              | _        | j                  D ]   }|j                  | j
                  ||       " y)zUpdate resolution, window size and relative positions.

        Args:
            feat_size: New input (feature) resolution.
            window_size: New window size.
            always_partition: Always partition / shift the window.
        c              3   &   K   | ]	  }|d z    ywr   r   r   s     r5   r   z8SwinTransformerV2Stage.set_input_size.<locals>.<genexpr>Y  s     *Ea16*Er   r   r#   r   N)
r   r  r   rO   r   r   r   r   r  r   )rc   r   r#   r   blocks        r5   r   z%SwinTransformerV2Stage.set_input_sizeG  s     !*door{{3%.D"doo|<<<%**E9*E%ED"[[ 	E  00'!1 ! 	r7   r"   c                     | j                  |      }| j                  D ]A  }| j                  r+t        j                  j                         st        ||      }: ||      }C |S )zForward pass through the stage.

        Args:
            x: Input tensor of shape (B, H, W, C).

        Returns:
            Output tensor of shape (B, H', W', C').
        )r   r  r   rQ   jitis_scriptingr   )rc   r"   blks      r5   r   zSwinTransformerV2Stage.forwarda  sY     OOA;; 	C&&uyy/E/E/GsA&F		
 r7   c                    | j                   D ]  }t        j                  j                  |j                  j
                  d       t        j                  j                  |j                  j                  d       t        j                  j                  |j                  j
                  d       t        j                  j                  |j                  j                  d        y)z/Initialize residual post-normalization weights.r   N)r  rO   init	constant_r   rG   r   r   )rc   r  s     r5   _init_respostnormz(SwinTransformerV2Stage._init_respostnorms  s    ;; 	3CGGciinna0GGcii..2GGciinna0GGcii..2		3r7   r   r   )r   r   r   r   rO   r   r   r   r   r   r   strr   r
   r   rN   r	   r   r   rQ   r   r   r  r   r   s   @r5   r   r     su    &+!&$!!!!!.4*,,,89 %'N$N$ N$ 0	N$
 N$ N$ +N$ #N$ N$ N$ N$ N$ N$ N$ N$  S(]+!N$" RYY#N$$ %6%N$& 'N$( 
)N$h 04	S#X  'tn	
 
4 %,, $3r7   r   c            +       6    e Zd ZdZdddddddd	d
dddddddddej
                  dfdedededededede	edf   de	edf   dede
de
dede
d ed!ed"ed#ed$eeef   d%ed&e	edf   f( fd'Zd(ej                  d)d*fd+Z	 	 	 	 	 dCdee	eef      dee	eef      dee	eef      d,ee   dee
   f
d-Zej(                  j*                  d)ee   fd.       Zej(                  j*                  dDd/e
d)eeef   fd0       Zej(                  j*                  dEd1e
d)d*fd2       Zej(                  j*                  d)ej                  fd3       ZdFdedee   d)d*fd4Z	 	 	 	 	 dGd5ej<                  d6eeeee   f      d7e
d8e
d9ed:e
d)eeej<                     e	ej<                  eej<                     f   f   fd;Z 	 	 	 dHd6eeee   f   d<e
d=e
fd>Z!d5ej<                  d)ej<                  fd?Z"dDd5ej<                  d@e
d)ej<                  fdAZ#d5ej<                  d)ej<                  fdBZ$ xZ%S )Ir!   a   Swin Transformer V2.

    A hierarchical vision transformer using shifted windows for efficient
    self-attention computation with continuous position bias.

    A PyTorch impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution`
        - https://arxiv.org/abs/2111.09883
       r(   r&     avg`   r'   r'      r'   r&   r        r   FTr   r   g?r   )r   r   r   r   r8   
patch_sizein_chansnum_classesglobal_pool	embed_dimdepths.r>   r#   r   strict_img_sizer   r?   	drop_rateproj_drop_rateattn_drop_ratedrop_path_rater   r   pretrained_window_sizesc           
         t         |           || _        |dv sJ || _        d| _        t        |      | _        || _        t        |d| j                  dz
  z  z        x| _	        | _
        g | _        t        |t        t        f      s1t        | j                        D cg c]  }t        |d|z  z         }}t!        ||||d   ||d      | _        | j"                  j$                  }t'        j(                  d|t+        |            j-                  |      D cg c]  }|j/                          }}g }|d   }d}t        | j                        D ]  }||   }|t1        di d|d|d	|d   |z  |d   |z  fd
||   d|dkD  d||   d|	d|
d| d|d|d|d|d||   d|d|d||   gz  }|}|dkD  r|dz  }| xj                  t3        |d|z  d|       gz  c_         t5        j6                  | | _         || j                        | _        t=        | j                  |||| j                        | _        | jA                  | jB                         | j8                  D ]  }|jE                           yc c}w c c}w )a]  
        Args:
            img_size: Input image size.
            patch_size: Patch size.
            in_chans: Number of input image channels.
            num_classes: Number of classes for classification head.
            embed_dim: Patch embedding dimension.
            depths: Depth of each Swin Transformer stage (layer).
            num_heads: Number of attention heads in different layers.
            window_size: Window size.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            drop_rate: Head dropout rate.
            proj_drop_rate: Projection dropout rate.
            attn_drop_rate: Attention dropout rate.
            drop_path_rate: Stochastic depth rate.
            norm_layer: Normalization layer.
            act_layer: Activation layer type.
            patch_norm: If True, add normalization after patch embedding.
            pretrained_window_sizes: Pretrained window sizes of each layer.
            output_fmt: Output tensor format if not None, otherwise output 'NHWC' by default.
        ) r  NHWCr'   r   r   )r8   r  r  r!  r   r#  
output_fmtr=   r   r   r   r   r>   r#   r   r   r   r?   rB   rA   r   r   r   rC   r(   layers.)num_chsr   module)	pool_typer$  	input_fmtNr   )#rM   rN   r  r   r,  len
num_layersr!  r   num_featureshead_hidden_sizefeature_infor  r   r  r  r   patch_embed	grid_sizerQ   linspacert   splittolistr   dictrO   rU   layersr   r   headapply_init_weightsr  ) rc   r8   r  r  r  r   r!  r"  r>   r#   r   r#  r   r?   r$  r%  r&  r'  r   r   r(  kwargsr   r8  r"   dprr=  in_dimscaler   blyrd   s                                   r5   rN   zSwinTransformerV2.__init__  s   \ 	&k)))& f+"47	A$//\]J]D^8^4__D1)eT]3:?:PQQYa/0QIQ &!l!+
 $$..	#(>>!^S[#Q#W#WX^#_`aqxxz``1t' 	dAlG-  #,A,%"719N!O Qi	
 q5 $A, ( "2 "10 $ " ) ) a& $  &!" (?q'A#  F& F1u
$w!e)V]^_]`Ta"b!cc1	d4 mmV,t001	"!oo
	 	

4%%&;; 	$C!!#	$o R as    JJmr$   Nc                    t        |t        j                        rjt        |j                  d       t        |t        j                        r8|j
                  +t        j                  j                  |j
                  d       yyyy)zVInitialize model weights.

        Args:
            m: Module to initialize.
        g{Gz?)stdNr   )r  rO   rV   r   r   rG   r  r  )rc   rF  s     r5   r@  zSwinTransformerV2._init_weights  s\     a#!((,!RYY'AFF,>!!!&&!, -?' $r7   window_ratioc                 ^   ||3| j                   j                  ||       | j                   j                  }||t        D cg c]  }||z  	 c}      }t	        | j
                        D ]9  \  }}	dt        |dz
  d      z  }
|	j                  d   |
z  |d   |
z  f||       ; yc c}w )aT  Updates the image resolution, window size, and so the pair-wise relative positions.

        Args:
            img_size (Optional[Tuple[int, int]]): New input resolution, if None current resolution is used
            patch_size (Optional[Tuple[int, int]): New patch size, if None use current patch size
            window_size (Optional[int]): New window size, if None based on new_img_size // window_div
            window_ratio (int): divisor for calculating window size from patch grid size
            always_partition: always partition / shift windows even if feat size is < window
        N)r8   r  r'   r   r   r  )r7  r   r8  r   	enumerater=  r   )rc   r8   r  r#   rI  r   r8  r   indexstagestage_scales              r5   r   z SwinTransformerV2.set_input_size  s    " :#9++X*+U((22I<#;I Fql!2 FGK%dkk2 	LE5s519a00K  $Q<;6	!8ST'!1 ! 	 !Gs   B*c                     t               }| j                         D ]2  \  }}t        dD cg c]  }||v  c}      s"|j                  |       4 |S c c}w )zGet parameter names that should not use weight decay.

        Returns:
            Set of parameter names to exclude from weight decay.
        )rX   rT   )setnamed_modulesr   add)rc   nodnrF  kws        r5   no_weight_decayz!SwinTransformerV2.no_weight_decay%  sW     e&&( 	DAq&@AB!GAB
	 
 Bs   A
coarsec                 2    t        d|rd      S g d      S )zCreate parameter group matcher for optimizer parameter groups.

        Args:
            coarse: If True, use coarse grouping.

        Returns:
            Dictionary mapping group names to regex patterns.
        z^absolute_pos_embed|patch_embedz^layers\.(\d+)))z^layers\.(\d+).downsample)r   )z^layers\.(\d+)\.\w+\.(\d+)N)z^norm)i )stemr  )r<  )rc   rW  s     r5   group_matcherzSwinTransformerV2.group_matcher2  s)     3(.$
 	
5
 	
r7   enablec                 4    | j                   D ]	  }||_         y)z}Enable or disable gradient checkpointing.

        Args:
            enable: If True, enable gradient checkpointing.
        N)r=  r   )rc   r[  ls      r5   set_grad_checkpointingz(SwinTransformerV2.set_grad_checkpointingE  s      	*A#)A 	*r7   c                 .    | j                   j                  S )z_Get the classifier head.

        Returns:
            The classification head module.
        )r>  fc)rc   s    r5   get_classifierz SwinTransformerV2.get_classifierO  s     yy||r7   c                 J    || _         | j                  j                  ||       y)zReset the classification head.

        Args:
            num_classes: Number of classes for new head.
            global_pool: Global pooling type.
        N)r  r>  reset)rc   r  r   s      r5   reset_classifierz"SwinTransformerV2.reset_classifierX  s     '		[1r7   r"   indicesr   
stop_earlyr,  intermediates_onlyc                 >   |dv sJ d       g }t        t        | j                        |      \  }}	| j                  |      }t        | j                        }
t        j
                  j                         s|s| j                  }n| j                  d|	dz    }t        |      D ]c  \  }} ||      }||v s|r||
dz
  k(  r| j                  |      }n|}|j                  dddd      j                         }|j                  |       e |r|S | j                  |      }||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )NCHWzOutput shape must be NCHW.Nr   r   r&   r'   )r   r2  r=  r7  rQ   r  r  rK  r   r.   r/   append)rc   r"   re  r   rf  r,  rg  intermediatestake_indices	max_index
num_stagesstagesr   rM  x_inters                  r5   forward_intermediatesz'SwinTransformerV2.forward_intermediatesb  s   * Y&D(DD&"6s4;;7G"Qi Q%
99!!#:[[F[[)a-0F!&) 	.HAuaAL Aa/"iilGG!//!Q15@@B$$W-	.   IIaL-r7   
prune_norm
prune_headc                     t        t        | j                        |      \  }}| j                  d|dz    | _        |rt        j                         | _        |r| j                  dd       |S )z@ Prune layers not required for specified intermediates.
        Nr   r   r*  )r   r2  r=  rO   r   r   rd  )rc   re  rr  rs  rl  rm  s         r5   prune_intermediate_layersz+SwinTransformerV2.prune_intermediate_layers  s]     #7s4;;7G"Qikk.9q=1DI!!!R(r7   c                 l    | j                  |      }| j                  |      }| j                  |      }|S )zForward pass through feature extraction layers.

        Args:
            x: Input tensor of shape (B, C, H, W).

        Returns:
            Feature tensor of shape (B, H', W', C).
        )r7  r=  r   rc   r"   s     r5   forward_featuresz"SwinTransformerV2.forward_features  s3     QKKNIIaLr7   
pre_logitsc                 N    |r| j                  |d      S | j                  |      S )a  Forward pass through classification head.

        Args:
            x: Feature tensor of shape (B, H, W, C).
            pre_logits: If True, return features before final linear layer.

        Returns:
            Logits tensor of shape (B, num_classes) or pre-logits.
        T)ry  )r>  )rc   r"   ry  s      r5   forward_headzSwinTransformerV2.forward_head  s&     1;tyyty,L		!Lr7   c                 J    | j                  |      }| j                  |      }|S )zForward pass through the model.

        Args:
            x: Input tensor of shape (B, C, H, W).

        Returns:
            Logits tensor of shape (B, num_classes).
        )rx  r{  rw  s     r5   r   zSwinTransformerV2.forward  s)     !!!$a r7   )NNNrf   NF)Tr   )NFFri  F)r   FT)&r   r   r   r   rO   r   r   r   r  r	   r   r   r   r   rN   r   r@  r   r   rQ   r  ignorer   rV  r   r   rZ  r^  ra  rd  r   r   rq  ru  rx  r{  r   r   r   s   @r5   r!   r!   |  s    +.#$&2)7-.%*$(!!!$&$&$'.4#%<<7C+r$'r$ r$ 	r$
 r$ r$ r$ #s(Or$ S#Xr$ +r$ #r$ "r$ r$ r$ r$  "!r$" "#r$$ "%r$& S(]+'r$( !)r$* &+38_+r$h	-ryy 	-T 	- 374859*+/3uS#X/ !sCx1 "%S/2	
 #3- 'tn@ YY
S 
 
 YY
D 
T#s(^ 
 
$ YY*T *T * * YY		  2C 2hsm 2W[ 2 8<$$',0 ||0  eCcN340  	0 
 0  0  !%0  
tELL!5tELL7I)I#JJ	K0 h ./$#	3S	>*  	 %,, 5<< 
Mell 
M 
M 
M %,, r7   
state_dictmodelc                     | j                  d|       } | j                  d|       } d| v }i }ddl}| j                         D ]  \  }}t        dD cg c]  }||v  c}      r"d|v re|j                  j
                  j                  j                  \  }}}	}
|j                  d   |	k7  s|j                  d	   |
k7  rt        ||	|
fd
dd      }|s&|j                  dd |      }|j                  dd      }|||<    |S c c}w )aM  Filter and process checkpoint state dict for loading.

    Handles resizing of patch embeddings and relative position tables
    when model size differs from checkpoint.

    Args:
        state_dict: Checkpoint state dictionary.
        model: Target model to load weights into.

    Returns:
        Filtered state dictionary.
    r  r  zhead.fc.weightr   N)ri   rh   r   zpatch_embed.proj.weightr   r*   bicubicT)interpolation	antialiasverbosezlayers.(\d+).downsamplec                 D    dt        | j                  d            dz    dS )Nr-  r   z.downsample)r   group)r"   s    r5   <lambda>z&checkpoint_filter_fn.<locals>.<lambda>  s$    ws177ST:YZGZF[[f=g r7   zhead.zhead.fc.)getreitemsr   r7  r_   r   r,   r   subreplace)r  r  native_checkpointout_dictr  r   r   rT  r   r1   r2   s              r5   checkpoint_filter_fnr    s    4Jj9J(J6H  " 1 ab1Qbc$)**//66<<JAq!Qwwr{a1772;!#3(F"+"  !13gijkA		':.A'* O) cs   C;
variant
pretrainedc           	          t        d t        |j                  dd            D              }|j                  d|      }t	        t
        | |ft        t        d|      d|}|S )zCreate a Swin Transformer V2 model.

    Args:
        variant: Model variant name.
        pretrained: If True, load pretrained weights.
        **kwargs: Additional model arguments.

    Returns:
        SwinTransformerV2 model instance.
    c              3   &   K   | ]	  \  }}|  y wr   r   )r   r   r   s      r5   r   z._create_swin_transformer_v2.<locals>.<genexpr>  s     \da\r   r"  )r   r   r   r   out_indicesT)flatten_sequentialr  )pretrained_filter_fnfeature_cfg)r   rK  r  popr   r!   r  r<  )r  r  rA  default_out_indicesr  r  s         r5   _create_swin_transformer_v2r    sj      \i

8\8Z.[\\**],?@K 7J1DkJ 	E
 Lr7   c                 4    | ddddddt         t        ddd	d
|S )Nr  )r&      r  )rf   rf   g?r  Tzpatch_embed.projzhead.fcmit)urlr  
input_size	pool_sizecrop_pctr  fixed_input_sizemeanrH  
first_conv
classifierlicenser   )r  rA  s     r5   _cfgr    s5    =v%.B(	 # r7   ztimm/z{https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12to16_192to256_22kto1k_ft.pth)	hf_hub_idr  z{https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12to24_192to384_22kto1k_ft.pth)r&     r  )r  r  rg   )r  r  r  r  r  z|https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12to16_192to256_22kto1k_ft.pthz|https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12to24_192to384_22kto1k_ft.pthzfhttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_tiny_patch4_window8_256.pthzghttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_tiny_patch4_window16_256.pthzghttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_small_patch4_window8_256.pthzhhttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_small_patch4_window16_256.pthzfhttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window8_256.pthzghttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window16_256.pthzkhttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12_192_22k.pthiQU  )r&      r  )r  r  )r  r  r  r  r  zlhttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12_192_22k.pth)2swinv2_base_window12to16_192to256.ms_in22k_ft_in1k2swinv2_base_window12to24_192to384.ms_in22k_ft_in1k3swinv2_large_window12to16_192to256.ms_in22k_ft_in1k3swinv2_large_window12to24_192to384.ms_in22k_ft_in1kzswinv2_tiny_window8_256.ms_in1kz swinv2_tiny_window16_256.ms_in1kz swinv2_small_window8_256.ms_in1kz!swinv2_small_window16_256.ms_in1kzswinv2_base_window8_256.ms_in1kz swinv2_base_window16_256.ms_in1k!swinv2_base_window12_192.ms_in22k"swinv2_large_window12_192.ms_in22kc           	      P    t        dddd      }t        	 dd| it        |fi |S )z"Swin-T V2 @ 256x256, window 16x16.r   r  r  r  r#   r!  r"  r>   r  )swinv2_tiny_window16_256r<  r  r  rA  
model_argss      r5   r  r  Q  sD     "<SabJ&"Y/9Y=A*=WPV=WY Yr7   c           	      P    t        dddd      }t        	 dd| it        |fi |S )z Swin-T V2 @ 256x256, window 8x8.rf   r  r  r  r  r  )swinv2_tiny_window8_256r  r  s      r5   r  r  Y  sC     !r,R`aJ&!X.8X<@<Vv<VX Xr7   c           	      P    t        dddd      }t        	 dd| it        |fi |S )z"Swin-S V2 @ 256x256, window 16x16.r   r  r'   r'      r'   r  r  r  )swinv2_small_window16_256r  r  s      r5   r  r  a  sD     "=TbcJ&#Z0:Z>B:>XQW>XZ Zr7   c           	      P    t        dddd      }t        	 dd| it        |fi |S )z Swin-S V2 @ 256x256, window 8x8.rf   r  r  r  r  r  )swinv2_small_window8_256r  r  s      r5   r  r  i  sD     !r-SabJ&"Y/9Y=A*=WPV=WY Yr7   c           	      P    t        dddd      }t        	 dd| it        |fi |S )z"Swin-B V2 @ 256x256, window 16x16.r      r  r(   rf   r       r  r  )swinv2_base_window16_256r  r  s      r5   r  r  q  D     "MUcdJ&"Y/9Y=A*=WPV=WY Yr7   c           	      P    t        dddd      }t        	 dd| it        |fi |S )z Swin-B V2 @ 256x256, window 8x8.rf   r  r  r  r  r  )swinv2_base_window8_256r  r  s      r5   r  r  y  sC     !s=TbcJ&!X.8X<@<Vv<VX Xr7   c           	      P    t        dddd      }t        	 dd| it        |fi |S )z"Swin-B V2 @ 192x192, window 12x12.r  r  r  r  r  r  )swinv2_base_window12_192r  r  s      r5   r  r    r  r7   c           	      R    t        ddddd      }t        	 dd| it        |fi |S )	zQSwin-B V2 @ 192x192, trained at window 12x12, fine-tuned to 256x256 window 16x16.r   r  r  r  r  r  r  r  r#   r!  r"  r>   r(  r  )!swinv2_base_window12to16_192to256r  r  s      r5   r  r    K     #m~ /1J '+b8BbFJ:F`Y_F`b br7   c           	      R    t        ddddd      }t        	 dd| it        |fi |S )	zQSwin-B V2 @ 192x192, trained at window 12x12, fine-tuned to 384x384 window 24x24.r  r  r  r  r  r  r  )!swinv2_base_window12to24_192to384r  r  s      r5   r  r    r  r7   c           	      P    t        dddd      }t        	 dd| it        |fi |S )z"Swin-L V2 @ 192x192, window 12x12.r  r  r  r  r  r  0   r  r  )swinv2_large_window12_192r  r  s      r5   r  r    sD     "MUdeJ&#Z0:Z>B:>XQW>XZ Zr7   c           	      R    t        ddddd      }t        	 dd| it        |fi |S )	zQSwin-L V2 @ 192x192, trained at window 12x12, fine-tuned to 256x256 window 16x16.r   r  r  r  r  r  r  )"swinv2_large_window12to16_192to256r  r  s      r5   r  r    K     #m /1J ',c9CcGKJGaZ`Gac cr7   c           	      R    t        ddddd      }t        	 dd| it        |fi |S )	zQSwin-L V2 @ 192x192, trained at window 12x12, fine-tuned to 384x384 window 24x24.r  r  r  r  r  r  r  )"swinv2_large_window12to24_192to384r  r  s      r5   r  r    r  r7   r  r  r  r  r  r  )swinv2_base_window12_192_22k)swinv2_base_window12to16_192to256_22kft1k)swinv2_base_window12to24_192to384_22kft1kswinv2_large_window12_192_22k*swinv2_large_window12to16_192to256_22kft1k*swinv2_large_window12to24_192to384_22kft1kr}  )r*  )Kr   rr   typingr   r   r   r   r   r   r	   r
   r   rQ   torch.nnrO   torch.nn.functionalr   r   	timm.datar   r   timm.layersr   r   r   r   r   r   r   r   r   r   _builderr   	_featuresr   _features_fxr   _manipulater   	_registryr   r   r    __all__r   r   r   r6   r:   r   r<   r   r   r   r!   r  r  r   r  r  default_cfgsr  r  r  r  r  r  r  r  r  r  r  r  r   r   r7   r5   <module>r     s    O O O     A; ; ; * + 3 # Y Y
#uS#X./  5c? u||   ELL uS#X RWX[]`X`Ra fkfrfr  $Ybii YxdRYY dN#299 #LI3RYY I3XM		 M`
'T#u||*;%< 'RYY 'SWX[]b]i]iXiSj 'T $ Uf , %:> J; ;? J Hs;
 <@ K< <@ K Hs< (,t( )-u) )-u) *.v* (,t( )-u)
 *.ymv*
 +/zmv+e7& 7t Y YDU Y Y X XCT X X Z$ ZEV Z Z Y YDU Y Y Y YDU Y Y X XCT X X Y YDU Y Y b$ bM^ b b b$ bM^ b b Z$ ZEV Z Z c4 cN_ c c c4 cN_ c c H$G1e1e%I2g2g' r7   