
    kh                        d Z ddlZddlZddlmZmZmZmZmZm	Z	m
Z
mZ ddlZddlmZ ddlmZmZ ddlmZmZmZmZmZmZmZmZmZmZmZ ddlmZ ddl m!Z! dd	l"m#Z# dd
l$m%Z%m&Z& ddl'm(Z(m)Z)m*Z* ddl+m,Z, dgZ- ej\                  e/      Z0ee1e
e1e1f   f   Z2dejf                  de
e1e1f   dejf                  fdZ4e#dejf                  de
e1e1f   de1de1dejf                  f
d       Z5de1de1dejf                  fdZ6 G d dejn                        Z8 G d dejn                        Z9 G d dejn                        Z: G d d ejn                        Z; G d! dejn                        Z<d"e=d#ejn                  dee>ejf                  f   fd$Z?dbd%e>d&e@de<fd'ZAdcd(e>dee>ef   fd)ZB e(i d* eBd+d,-      d. eBd+d/-      d0 eBd+d1d2d3d45      d6 eBd+d7-      d8 eBd+d9d2d3d45      d: eBd+d;-      d< eBd+d=-      d> eBd+d?-      d@ eBd+dAd2d3d45      dB eBd+dC-      dD eBd+dEdFG      dH eBd+dIdFG      dJ eBd+dKdFG      dL eBd+dMd2d3d4dFN      dO eBd+dPdFG      dQ eBd+dRd2d3d4dFN      dS eBd+dT-       eBd+dU-       eBd+dV-      dW      ZCe)dbde<fdX       ZDe)dbde<fdY       ZEe)dbde<fdZ       ZFe)dbde<fd[       ZGe)dbde<fd\       ZHe)dbde<fd]       ZIe)dbde<fd^       ZJe)dbde<fd_       ZKe)dbde<fd`       ZL e*e/dJdLdOdQda       y)da   Swin Transformer
A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`
    - https://arxiv.org/pdf/2103.14030

Code/weights from https://github.com/microsoft/Swin-Transformer, original copyright/license info below

S3 (AutoFormerV2, https://arxiv.org/abs/2111.14725) Swin weights from
    - https://github.com/microsoft/Cream/tree/main/AutoFormerV2

Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman
    N)AnyDictCallableListOptionalSetTupleUnionIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)
PatchEmbedMlpDropPathClassifierHead	to_2tuple	to_ntupletrunc_normal_use_fused_attnresize_rel_pos_bias_tableresample_patch_embedndgrid   )build_model_with_cfg)feature_take_indices)register_notrace_function)checkpoint_seqnamed_apply)generate_default_cfgsregister_modelregister_model_deprecations)get_init_weights_vitSwinTransformerxwindow_sizereturnc                     | j                   \  }}}}| j                  |||d   z  |d   ||d   z  |d   |      } | j                  dddddd      j                         j                  d|d   |d   |      }|S )zPartition into non-overlapping windows.

    Args:
        x: Input tokens with shape [B, H, W, C].
        window_size: Window size.

    Returns:
        Windows after partition with shape [B * num_windows, window_size, window_size, C].
    r   r               shapeviewpermute
contiguous)r$   r%   BHWCwindowss          X/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/models/swin_transformer.pywindow_partitionr8   *   s     JAq!Q	q!{1~%{1~qKN7JKXYN\]^Aii1aAq)446;;BAP[\]P^`abGN    r6   r3   r4   c                     | j                   d   }| j                  d||d   z  ||d   z  |d   |d   |      }|j                  dddddd      j                         j                  d|||      }|S )a
  Reverse window partition.

    Args:
        windows: Windows with shape (num_windows*B, window_size, window_size, C).
        window_size: Window size.
        H: Height of image.
        W: Width of image.

    Returns:
        Tensor with shape (B, H, W, C).
    r,   r   r   r(   r)   r*   r+   r-   )r6   r%   r3   r4   r5   r$   s         r7   window_reverser;   =   s     	bARk!n,a;q>.A;q>S^_`SacdeA			!Q1a#..055b!QBAHr9   win_hwin_wc                    t        j                  t        t        j                  |       t        j                  |                  }t        j                  |d      }|dddddf   |dddddf   z
  }|j                  ddd      j                         }|dddddfxx   | dz
  z  cc<   |dddddfxx   |dz
  z  cc<   |dddddfxx   d|z  dz
  z  cc<   |j                  d      S )zGet pair-wise relative position index for each token inside the window.

    Args:
        win_h: Window height.
        win_w: Window width.

    Returns:
        Relative position index tensor.
    r   Nr)   r   r,   )torchstackr   arangeflattenr0   r1   sum)r<   r=   coordscoords_flattenrelative_coordss        r7   get_relative_position_indexrG   P   s     [[U 3U\\%5HIJF]]61-N$Q4Z0>!T1*3MMO%--aA6AACOAq!G	)Aq!G	)Aq!GE	A-r""r9   c                       e Zd ZU dZej
                  j                  e   ed<   	 	 	 	 	 dde	de	de
e	   deded	ed
ef fdZdee	e	f   ddfdZdej                   fdZddej                   de
ej                      dej                   fdZ xZS )WindowAttentionzWindow based multi-head self attention (W-MSA) module with relative position bias.

    Supports both shifted and non-shifted windows.
    
fused_attnNdim	num_headshead_dimr%   qkv_bias	attn_drop	proj_dropc                    t         |           || _        t        |      | _        | j                  \  }}	||	z  | _        || _        |xs ||z  }||z  }
|dz  | _        t        d      | _	        t        j                  t        j                  d|z  dz
  d|	z  dz
  z  |            | _        | j                  dt!        ||	      d       t        j"                  ||
d	z  |
      | _        t        j&                  |      | _        t        j"                  |
|      | _        t        j&                  |      | _        t/        | j                  d       t        j0                  d      | _        y)a  
        Args:
            dim: Number of input channels.
            num_heads: Number of attention heads.
            head_dim: Number of channels per head (dim // num_heads if not set)
            window_size: The height and width of the window.
            qkv_bias:  If True, add a learnable bias to query, key, value.
            attn_drop: Dropout ratio of attention weight.
            proj_drop: Dropout ratio of output.
        g      T)experimentalr)   r   relative_position_indexF
persistentr(   biasg{Gz?)stdr,   )rK   N)super__init__rK   r   r%   window_arearL   scaler   rJ   nn	Parameterr?   zerosrelative_position_bias_tableregister_bufferrG   LinearqkvDropoutrO   projrP   r   Softmaxsoftmax)selfrK   rL   rM   r%   rN   rO   rP   r<   r=   attn_dim	__class__s              r7   rZ   zWindowAttention.__init__l   s>   ( 	$[1''u 5="/si/i'%
(d; -/LLa%iRSmXY\aXadeXeEfhq9r,s) 	68STY[`8anst99S(Q,X>I.IIh,	I.d77SAzzb)r9   r&   c                    t        |      }|| j                  k(  ry|| _        | j                  \  }}||z  | _        t        j                         5  d|z  dz
  d|z  dz
  z  | j
                  f}t        j                  t        | j                  | j                  |            | _	        | j                  dt        ||      d       ddd       y# 1 sw Y   yxY w)zzUpdate window size & interpolate position embeddings
        Args:
            window_size (int): New window size
        Nr)   r   new_window_sizenew_bias_shaperS   FrT   )r   r%   r[   r?   no_gradrL   r]   r^   r   r`   ra   rG   )rh   r%   r<   r=   rn   s        r7   set_window_sizezWindowAttention.set_window_size   s    
  ,$***&''u 5=]]_ 	y%i!mE	A>NN02)55$($4$4#11D-   !:<WX]_d<erw x	y 	y 	ys   A6CCc                     | j                   | j                  j                  d         j                  | j                  | j                  d      }|j	                  ddd      j                         }|j                  d      S )Nr,   r)   r   r   )r`   rS   r/   r[   r0   r1   	unsqueeze)rh   relative_position_biass     r7   _get_rel_pos_biasz!WindowAttention._get_rel_pos_bias   ss    !%!B!B((--b1"33748H8H$JZJZ\^3_ 	!7!?!?1a!H!S!S!U%//22r9   r$   maskc                    |j                   \  }}}| j                  |      j                  ||d| j                  d      j	                  ddddd      }|j                  d      \  }}}	| j                  r| j                         }
|e|j                   d   }|j                  d|d||      j                  ||z  d| j                  dd      }|
|j                  d| j                  ||      z   }
t        j                  j                  j                  |||	|
| j                  r| j                  j                   nd      }n|| j"                  z  }||j%                  d	d      z  }|| j                         z   }|m|j                   d   }|j                  d|| j                  ||      |j'                  d      j'                  d      z   }|j                  d| j                  ||      }| j)                  |      }| j                  |      }||	z  }|j%                  dd      j                  ||d      }| j+                  |      }| j-                  |      }|S )
a  Forward pass.

        Args:
            x: Input features with shape of (num_windows*B, N, C).
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None.

        Returns:
            Output features with shape of (num_windows*B, N, C).
        r(   r,   r)   r   r   r*           )	attn_mask	dropout_p)r.   rc   reshaperL   r0   unbindrJ   rt   r/   expandr?   r]   
functionalscaled_dot_product_attentiontrainingrO   pr\   	transposerr   rg   re   rP   )rh   r$   ru   B_Nr5   rc   qkvrx   num_winattns                r7   forwardzWindowAttention.forward   s    77Aqhhqk!!"aDNNB?GG1aQRTUV**Q-1a??..0I**Q-yyGQ15<<R7]BPTP^P^`bdfg%RA(NN	##@@1a#.2mm$..** A A DJJAq{{2r**D$0022D**Q-yyWdnnaCdnnUVFWFaFabcFddyyT^^Q:<<%D>>$'DqAKK1%%b!R0IIaLNN1r9   )N   Trw   rw   N)__name__
__module____qualname____doc__r?   jitFinalbool__annotations__intr   _int_or_tuple_2_tfloatrZ   r	   rp   Tensorrt   r   __classcell__rj   s   @r7   rI   rI   e   s     		%% '+-.!!!+*+* +* sm	+*
 ++* +* +* +*Zy5c? yt y*35<< 3( (Xell-C (u|| (r9   rI   c                       e Zd ZdZddddddddd	d	d	ej
                  ej                  fd
ededede	e   dedede
de
dede
dededededef fdZd$de	ej                     de	ej                     fdZ	 d$deeeeef   f   de	eeeeef   f      deeeef   eeef   f   fdZ	 d$d eeef   deeef   de	e
   fd!Zd" Zdej                  dej                  fd#Z xZS )%SwinTransformerBlockzkSwin Transformer Block.

    A transformer block with window-based self-attention and shifted windows.
    r*   Nr   r   F      @Trw   rK   input_resolutionrL   rM   r%   
shift_sizealways_partitiondynamic_mask	mlp_ratiorN   rP   rO   	drop_path	act_layer
norm_layerc           	         t         |           || _        || _        t	        |      | _        || _        || _        | j                  ||      \  | _	        | _
        | j                  d   | j                  d   z  | _        |	| _         ||      | _        t        |||| j                  |
||      | _        |dkD  rt!        |      nt#        j$                         | _         ||      | _        t+        |t-        ||	z        ||      | _        |dkD  rt!        |      nt#        j$                         | _        | j3                  d| j                  rdn| j5                         d	       y)
a  
        Args:
            dim: Number of input channels.
            input_resolution: Input resolution.
            window_size: Window size.
            num_heads: Number of attention heads.
            head_dim: Enforce the number of channels per head
            shift_size: Shift size for SW-MSA.
            always_partition: Always partition into full windows and shift
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            proj_drop: Dropout rate.
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth rate.
            act_layer: Activation layer.
            norm_layer: Normalization layer.
        r   r   )rL   rM   r%   rN   rO   rP   rw   )in_featureshidden_featuresr   droprx   NFrT   )rY   rZ   rK   r   r   target_shift_sizer   r   _calc_window_shiftr%   r   r[   r   norm1rI   r   r   r]   Identity
drop_path1norm2r   r   mlp
drop_path2ra   get_attn_mask)rh   rK   r   rL   rM   r%   r   r   r   r   rN   rP   rO   r   r   r   rj   s                   r7   rZ   zSwinTransformerBlock.__init__   sI   F 	 0!*:!6 0(,0,C,CKQ[,\)$/++A.1A1A!1DD"_
#((
	 2;R(9-R[[]_
i0	
 2;R(9-R[[]%%D4+=+=+? 	 	
r9   r$   r&   c           	      d   t        | j                        r|7|j                  d   |j                  d   }}|j                  }|j                  }n| j
                  \  }}d }d }t        j                  || j                  d   z        | j                  d   z  }t        j                  || j                  d   z        | j                  d   z  }t        j                  d||df||      }d}d| j                  d    f| j                  d    | j                  d    f| j                  d    d ffD ]l  }d| j                  d    f| j                  d    | j                  d    f| j                  d    d ffD ]$  }	||d d |d   |d   |	d   |	d   d d f<   |dz  }& n t        || j                        }
|
j                  d| j                        }
|
j                  d      |
j                  d      z
  }|j                  |dk7  t!        d            j                  |dk(  t!        d            }|S d }|S )Nr   r)   r   )dtypedevicer,   g      Yrw   )anyr   r.   r   r   r   mathceilr%   r?   r_   r8   r/   r[   rr   masked_fillr   )rh   r$   r3   r4   r   r   img_maskcnthwmask_windowsrx   s               r7   r   z"SwinTransformerBlock.get_attn_mask-  sZ   t}wwqz1771:1,,1		!d..q112T5E5Ea5HHA		!d..q112T5E5Ea5HHA{{Aq!Q<uVLHC))!,,-&&q))DOOA,>+>?ooa(($/  T--a001**1--0B/BC//!,,d3 A
 <?HQ!QqT	1Q4!9a781HC ,Hd6F6FGL',,R1A1ABL$..q1L4J4J14MMI!--i1neFmLXXYbfgYginoristI  Ir9   target_window_sizer   c                    t        |      }|(| j                  }t        |      r|d   dz  |d   dz  f}nt        |      }| j                  r||fS t	        | j
                  |      D cg c]  \  }}||k  r|n| }}}t	        | j
                  ||      D cg c]  \  }}}||k  rdn| }}}}t        |      t        |      fS c c}}w c c}}}w )Nr   r)   r   )r   r   r   r   zipr   tuple)rh   r   r   rr   r%   sr   s           r7   r   z'SwinTransformerBlock._calc_window_shiftP  s    
 ''9:$ $ 6 6$%%7%:a%?ASTUAVZ[A[$\! )*; <  %'88869$:O:OQc6dedaAFq)ee8;D<Q<QS^`q8rssWQ116aq(s
s[!5#444 fss   *C	C	feat_sizec                 R   || _         ||| _        | j                  |      \  | _        | _        | j                  d   | j                  d   z  | _        | j                  j                  | j                         | j                  d| j                  rdn| j                         d       y)z
        Args:
            feat_size: New input resolution
            window_size: New window size
            always_partition: Change always_partition attribute if not None
        Nr   r   rx   FrT   )r   r   r   r%   r   r[   r   rp   ra   r   r   )rh   r   r%   r   s       r7   set_input_sizez#SwinTransformerBlock.set_input_sizee  s     !*'$4D!,0,C,CK,P)$/++A.1A1A!1DD		!!$"2"23%%D4+=+=+? 	 	
r9   c           	         |j                   \  }}}}t        | j                        }|r7t        j                  || j                  d    | j                  d    fd      }n|}| j
                  d   || j
                  d   z  z
  | j
                  d   z  }| j
                  d   || j
                  d   z  z
  | j
                  d   z  }	t        j                  j                  j                  |ddd|	d|f      }|j                   \  }
}}}
t        || j
                        }|j                  d| j                  |      }t        | dd      r| j                  |      }n| j                  }| j                  ||      }|j                  d| j
                  d   | j
                  d   |      }t!        || j
                  ||      }|d d d |d |d d f   j#                         }|r$t        j                  || j                  d      }|S |}|S )	Nr   r   )r   r)   )shiftsdimsr,   r   F)ru   )r.   r   r   r?   rollr%   r]   r~   padr8   r/   r[   getattrr   rx   r   r;   r1   )rh   r$   r2   r3   r4   r5   	has_shift	shifted_xpad_hpad_w_HpWp	x_windowsrx   attn_windowss                   r7   _attnzSwinTransformerBlock._attn}  s    WW
1a (	

1tq/A.ADOOTUDVCV-W^deII !!!$q4+;+;A+>'>>$BRBRSTBUU!!!$q4+;+;A+>'>>$BRBRSTBUUHH''++I1a57QR	 2r1 %Y0@0@A	NN2t'7'7;	 4/**95IIyyy; $((T-=-=a-@$BRBRSTBUWXY"<1A1A2rJ	a!RaRl+668	 

9T__6JA  Ar9   c                 >   |j                   \  }}}}|| j                  | j                  | j                  |                  z   }|j	                  |d|      }|| j                  | j                  | j                  |                  z   }|j	                  ||||      }|S )zForward pass.

        Args:
            x: Input features with shape (B, H, W, C).

        Returns:
            Output features with shape (B, H, W, C).
        r,   )r.   r   r   r   r{   r   r   r   )rh   r$   r2   r3   r4   r5   s         r7   r   zSwinTransformerBlock.forward  s     WW
1a

4::a= 9::IIaQA 788IIaAq!r9   r   )r   r   r   r   r]   GELU	LayerNormr   r   r   r   r   r   rZ   r?   r   r   r
   r	   r   r   r   r   r   r   s   @r7   r   r      s    &*-.%*!&!!!!!"$''#%<<!F
F
 0F
 	F

 smF
 +F
 F
 #F
 F
 F
 F
 F
 F
 F
  F
  !!F
P!x5 !%,,AW !L HL5 %c5c?&: ;5  (c5c?.B(CD5 
uS#Xc3h/	0	52 04	
S#X
 sCx
 'tn	
0%N %,, r9   r   c                        e Zd ZdZdej
                  fdedee   def fdZ	de
j                  de
j                  fd	Z xZS )
PatchMergingzVPatch Merging Layer.

    Downsample features by merging 2x2 neighboring patches.
    NrK   out_dimr   c                     t         |           || _        |xs d|z  | _         |d|z        | _        t        j                  d|z  | j                  d      | _        y)z
        Args:
            dim: Number of input channels.
            out_dim: Number of output channels (or 2 * dim if None)
            norm_layer: Normalization layer.
        r)   r*   FrV   N)rY   rZ   rK   r   normr]   rb   	reduction)rh   rK   r   r   rj   s       r7   rZ   zPatchMerging.__init__  sS     	)!c'q3w'	1s7DLLuEr9   r$   r&   c                 h   |j                   \  }}}}ddd|dz  d|dz  f}t        j                  j                  ||      }|j                   \  }}}}|j	                  ||dz  d|dz  d|      j                  dddddd      j                  d      }| j                  |      }| j                  |      }|S )zForward pass.

        Args:
            x: Input features with shape (B, H, W, C).

        Returns:
            Output features with shape (B, H//2, W//2, out_dim).
        r   r)   r   r(   r*   r+   )	r.   r]   r~   r   r{   r0   rB   r   r   )rh   r$   r2   r3   r4   r5   
pad_valuesr   s           r7   r   zPatchMerging.forward  s     WW
1aAq1uaQ/
MMa,WW
1aIIaaAFAq199!Q1aKSSTUVIIaLNN1r9   )r   r   r   r   r]   r   r   r   r   rZ   r?   r   r   r   r   s   @r7   r   r     sX     &*#%<<	FF c]F !	F$ %,, r9   r   c            !           e Zd ZdZdddddddddddej
                  fd	ed
edeeef   dededede	e   de
dededededededeee   ef   def  fdZ	 ddeeef   dede	e   fdZdej$                  dej$                  fdZ xZS ) SwinTransformerStagez|A basic Swin Transformer layer for one stage.

    Contains multiple Swin Transformer blocks and optional downsampling.
    Tr*   Nr   Fr   rw   rK   r   r   depth
downsamplerL   rM   r%   r   r   r   rN   rP   rO   r   r   c                 :   t         |           || _        || _        |rt	        d |D              n|| _        || _        d| _        t        |      }t	        |D cg c]  }|dz  	 c}      }|rt        |||      | _
        n ||k(  sJ t        j                         | _
        t        j                  t        |      D cg c]D  }t        || j
                  ||||dz  dk(  rdn||	|
||||t!        |t"              r||   n||      F c} | _        yc c}w c c}w )a  
        Args:
            dim: Number of input channels.
            out_dim: Number of output channels.
            input_resolution: Input resolution.
            depth: Number of blocks.
            downsample: Downsample layer at the end of the layer.
            num_heads: Number of attention heads.
            head_dim: Channels per head (dim // num_heads if not set)
            window_size: Local window size.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            proj_drop: Projection dropout rate.
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth rate.
            norm_layer: Normalization layer.
        c              3   &   K   | ]	  }|d z    ywr)   N .0is     r7   	<genexpr>z0SwinTransformerStage.__init__.<locals>.<genexpr>  s     &H!qAv&H   Fr)   )rK   r   r   r   )rK   r   rL   rM   r%   r   r   r   r   rN   rP   rO   r   r   N)rY   rZ   rK   r   r   output_resolutionr   grad_checkpointingr   r   r   r]   r   
Sequentialranger   
isinstancelistblocks)rh   rK   r   r   r   r   rL   rM   r%   r   r   r   rN   rP   rO   r   r   r   r   r   rj   s                       r7   rZ   zSwinTransformerStage.__init__  s*   H 	 0LV&H7G&H!H\l
"',K8qAF89
 *%DO '>!> kkmDO mm" 5\#&#" ! !!%!7!7#!'!"Q!1*!1)#!##*4Y*E)A,9%&# $ 9&#s   D A	Dr   c                     || _         t        | j                  t        j                        r|| _        nt        d |D              | _        | j                  D ]   }|j                  | j
                  ||       " y)a   Updates the resolution, window size and so the pair-wise relative positions.

        Args:
            feat_size: New input (feature) resolution
            window_size: New window size
            always_partition: Always partition / shift the window
        c              3   &   K   | ]	  }|d z    ywr   r   r   s     r7   r   z6SwinTransformerStage.set_input_size.<locals>.<genexpr>E  s     *Ea16*Er   r   r%   r   N)	r   r   r   r]   r   r   r   r   r   )rh   r   r%   r   blocks        r7   r   z#SwinTransformerStage.set_input_size4  sn     !*door{{3%.D"%**E9*E%ED"[[ 	E  00'!1 ! 	r9   r$   r&   c                     | j                  |      }| j                  r6t        j                  j	                         st        | j                  |      }|S | j                  |      }|S )zsForward pass.

        Args:
            x: Input features.

        Returns:
            Output features.
        )r   r   r?   r   is_scriptingr   r   rh   r$   s     r7   r   zSwinTransformerStage.forwardM  sU     OOA""599+A+A+Ct{{A.A  AAr9   r   )r   r   r   r   r]   r   r   r	   r   r   r   r   r
   r   r   rZ   r   r?   r   r   r   r   s   @r7   r   r     sN     $&*-.%*!&!!!!35#%<<#J$J$ J$ $CHo	J$
 J$ J$ J$ smJ$ +J$ #J$ J$ J$ J$ J$ J$  T%[%/0!J$" !#J$` 04	S#X  'tn	2 %,, r9   r   c            +       `    e Zd ZdZdddddddd	d
dddddddddeej                  dfdedededede	dede
edf   de
edf   dee   dedededed ed!ed"ed#ed$ed%ed&ee	ef   d'e	f* fd(Zej$                  j&                  dCd)e	d*d
fd+       Zej$                  j&                  d*ee	   fd,       Z	 	 	 	 	 dDdee
eef      dee
eef      dee
eef      d-edee   d*d
fd.Zej$                  j&                  dEd/ed*ee	ef   fd0       Zej$                  j&                  dFd1ed*d
fd2       Zej$                  j&                  d*ej8                  fd3       ZdGdedee	   d*d
fd4Z	 	 	 	 	 dHd5ej>                  d6eeee e   f      d7ed8ed9e	d:ed*ee ej>                     e
ej>                  e ej>                     f   f   fd;Z!	 	 	 dId6eee e   f   d<ed=ed*e e   fd>Z"d5ej>                  d*ej>                  fd?Z#dEd5ej>                  d@ed*ej>                  fdAZ$d5ej>                  d*ej>                  fdBZ% xZ&S )Jr#   zSwin Transformer.

    A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
          https://arxiv.org/pdf/2103.14030
       r*   r(     avg`   r)   r)      r)   r(   r        Nr   FTr   rw   g? img_size
patch_sizein_chansnum_classesglobal_pool	embed_dimdepths.rL   rM   r%   r   strict_img_sizer   rN   	drop_rateproj_drop_rateattn_drop_ratedrop_path_rateembed_layerr   weight_initc           
         t         |           |dv sJ || _        || _        d| _        t        |      | _        || _        t        |d| j                  dz
  z  z        x| _	        | _
        g | _        t        |t        t        f      s1t        | j                        D cg c]  }t        |d|z  z         }} |||||d   ||d      | _        | j                   j"                  } t%        | j                        |	      }	t        |
t        t        f      s t%        | j                        |
      }
nt        |
      dk(  r|
f| j                  z  }
t        |
      | j                  k(  sJ  t%        | j                        |      }t'        j(                  d|t+        |            j-                  |      D cg c]  }|j/                          }}g }|d   }d}t        | j                        D ]  }||   }|t1        di d|d|d	|d   |z  |d   |z  fd
||   d|dkD  d||   d|	|   d|
|   d|d| d||   d|d|d|d||   d|gz  }|}|dkD  r|dz  }| xj                  t3        |||z  d|       gz  c_         t5        j6                  | | _         || j                        | _        t=        | j                  |||| j                        | _        |dk7  r| jA                  |       yyc c}w c c}w )a~  
        Args:
            img_size: Input image size.
            patch_size: Patch size.
            in_chans: Number of input image channels.
            num_classes: Number of classes for classification head.
            embed_dim: Patch embedding dimension.
            depths: Depth of each Swin Transformer layer.
            num_heads: Number of attention heads in different layers.
            head_dim: Dimension of self-attention heads.
            window_size: Window size.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            drop_rate: Dropout rate.
            attn_drop_rate (float): Attention dropout rate.
            drop_path_rate (float): Stochastic depth rate.
            embed_layer: Patch embedding layer.
            norm_layer (nn.Module): Normalization layer.
        )r  r  NHWCr)   r   r   )r	  r
  r  r  r   r  
output_fmtrK   r   r   r   r   rL   rM   r%   r   r   r   rN   rP   rO   r   r   layers.)num_chsr   module)	pool_typer  	input_fmtskipNr   )!rY   rZ   r  r  r  len
num_layersr  r   num_featureshead_hidden_sizefeature_infor   r   r   r   patch_embed	grid_sizer   r?   linspacerC   splittolistr   dictr]   r   layersr   r   headinit_weights) rh   r	  r
  r  r  r  r  r  rL   rM   r%   r   r  r   rN   r  r  r  r  r  r   r  kwargsr   
patch_gridr$   dprr+  in_dimr\   r   rj   s                                   r7   rZ   zSwinTransformer.__init__f  s}   X 	k)))&& f+"47	A$//\]J]D^8^4__D1)eT]3:?:PQQYa/0QIQ '!l!+
 %%//
 .9T__-h7+e}54)DOO4[AK"&.4??:K;4??222.Idoo.y9	#(>>!^S[#Q#W#WX^#_`aqxxz``1t' 	mAlG+  qMU*qMU*" Qi q5 $A, "! (N "2 "10 $A, "  )!" )#$ a&%& &'  F* F1u
$w*uBT_fghfi]j"k!ll5	m6 mmV,t001	"!oo
	 & k* ! R, as    K=8Lmoder&   c                     |dv sJ d|v r t        j                  | j                         nd}t        t	        ||      |        y)zInitialize model weights.

        Args:
            mode: Weight initialization mode ('jax', 'jax_nlhb', 'moco', or '').
        )jaxjax_nlhbmocor  nlhbrw   )	head_biasN)r   logr  r   r"   )rh   r2  r8  s      r7   r-  zSwinTransformer.init_weights  sD     666639T>TXXd..//r	(CTJr9   c                 v    t               }| j                         D ]  \  }}d|v s|j                  |        |S )z,Parameters that should not use weight decay.r`   )setnamed_parametersadd)rh   nwdnr   s       r7   no_weight_decayzSwinTransformer.no_weight_decay  sA     e))+ 	DAq-2
	 
r9   window_ratioc                 Z   ||3| j                   j                  ||       | j                   j                  }|t        D cg c]  }||z  	 c}      }t	        | j
                        D ]9  \  }}	dt        |dz
  d      z  }
|	j                  d   |
z  |d   |
z  f||       ; yc c}w )a  Update the image resolution and window size.

        Args:
            img_size: New input resolution, if None current resolution is used.
            patch_size: New patch size, if None use current patch size.
            window_size: New window size, if None based on new_img_size // window_div.
            window_ratio: Divisor for calculating window size from grid size.
            always_partition: Always partition into windows and shift (even if window size < feat size).
        N)r	  r
  r)   r   r   r   )r%  r   r&  r   	enumerater+  max)rh   r	  r
  r%   rA  r   r/  pgindexstagestage_scales              r7   r   zSwinTransformer.set_input_size  s    " :#9++X*+U))33Jj I|!3 IJK%dkk2 	LE5s519a00K  %a=K7A+9UV'!1 ! 	 !Js   B(coarsec                 2    t        d|rd      S g d      S )z"Group parameters for optimization.z^patch_embedz^layers\.(\d+)))z^layers\.(\d+).downsample)r   )z^layers\.(\d+)\.\w+\.(\d+)N)z^norm)i )stemr   )r*  )rh   rI  s     r7   group_matcherzSwinTransformer.group_matcher  s)      (.$
 	
5
 	
r9   enablec                 4    | j                   D ]	  }||_         y)z)Enable or disable gradient checkpointing.N)r+  r   )rh   rM  ls      r7   set_grad_checkpointingz&SwinTransformer.set_grad_checkpointing   s      	*A#)A 	*r9   c                 .    | j                   j                  S )zGet the classifier head.)r,  fc)rh   s    r7   get_classifierzSwinTransformer.get_classifier&  s     yy||r9   c                 L    || _         | j                  j                  ||       y)zReset the classifier head.

        Args:
            num_classes: Number of classes for new classifier.
            global_pool: Global pooling type.
        )r  N)r  r,  reset)rh   r  r  s      r7   reset_classifierz SwinTransformer.reset_classifier+  s      '		{;r9   r$   indicesr   
stop_earlyr  intermediates_onlyc                 >   |dv sJ d       g }t        t        | j                        |      \  }}	| j                  |      }t        | j                        }
t        j
                  j                         s|s| j                  }n| j                  d|	dz    }t        |      D ]c  \  }} ||      }||v s|r||
dz
  k(  r| j                  |      }n|}|j                  dddd      j                         }|j                  |       e |r|S | j                  |      }||fS )aK  Forward features that returns intermediates.

        Args:
            x: Input image tensor.
            indices: Take last n blocks if int, all if None, select matching indices if sequence.
            norm: Apply norm layer to compatible intermediates.
            stop_early: Stop iterating over blocks when last desired intermediate hit.
            output_fmt: Shape of intermediate feature outputs.
            intermediates_only: Only return intermediate features.

        Returns:
            List of intermediate features or tuple of (final features, intermediates).
        )NCHWzOutput shape must be NCHW.Nr   r   r(   r)   )r   r   r+  r%  r?   r   r   rC  r   r0   r1   append)rh   r$   rW  r   rX  r  rY  intermediatestake_indices	max_index
num_stagesstagesr   rG  x_inters                  r7   forward_intermediatesz%SwinTransformer.forward_intermediates5  s   , Y&D(DD&"6s4;;7G"Qi Q%
99!!#:[[F[[)a-0F!&) 	.HAuaAL Aa/"iilGG!//!Q15@@B$$W-	.   IIaL-r9   
prune_norm
prune_headc                     t        t        | j                        |      \  }}| j                  d|dz    | _        |rt        j                         | _        |r| j                  dd       |S )aE  Prune layers not required for specified intermediates.

        Args:
            indices: Indices of intermediate layers to keep.
            prune_norm: Whether to prune normalization layer.
            prune_head: Whether to prune the classifier head.

        Returns:
            List of indices that were kept.
        Nr   r   r  )r   r   r+  r]   r   r   rV  )rh   rW  rd  re  r^  r_  s         r7   prune_intermediate_layersz)SwinTransformer.prune_intermediate_layersh  s]      #7s4;;7G"Qikk.9q=1DI!!!R(r9   c                 l    | j                  |      }| j                  |      }| j                  |      }|S )z/Forward pass through feature extraction layers.)r%  r+  r   r   s     r7   forward_featuresz SwinTransformer.forward_features  s1    QKKNIIaLr9   
pre_logitsc                 N    |r| j                  |d      S | j                  |      S )zForward pass through classifier head.

        Args:
            x: Feature tensor.
            pre_logits: Return features before final classifier.

        Returns:
            Output tensor.
        T)rj  )r,  )rh   r$   rj  s      r7   forward_headzSwinTransformer.forward_head  s&     1;tyyty,L		!Lr9   c                 J    | j                  |      }| j                  |      }|S )zoForward pass.

        Args:
            x: Input tensor.

        Returns:
            Output logits.
        )ri  rl  r   s     r7   r   zSwinTransformer.forward  s)     !!!$a r9   r  )NNN   NF)Tr   )NFFr[  F)r   FT)'r   r   r   r   r   r]   r   r   r   strr	   r   r   r   r   r
   rZ   r?   r   ignorer-  r   r@  r   r   r   rL  rP  ModulerS  rV  r   r   rc  rg  ri  rl  r   r   r   s   @r7   r#   r#   _  s    +.#$&2)7&*-.%*$(!!!$&$&$'$./1||!-x+'x+ x+ 	x+
 x+ x+ x+ #s(Ox+ S#Xx+ smx+ +x+ #x+ "x+ x+ x+  !x+" "#x+$ "%x+& "'x+( ")x+* c8m,+x+, -x+t YYK Kd K K YYS   374859 !/3uS#X/ !sCx1 "%S/2	
  'tn 
@ YY	
D 	
T#s(^ 	
 	
 YY*T *T * *
 YY		  <C <hsm <W[ < 8<$$',1 ||1  eCcN341  	1 
 1  1  !%1  
tELL!5tELL7I)I#JJ	K1 j ./$#	3S	>*  	
 
c0%,, 5<< 
Mell 
M 
M 
M %,, r9   
state_dictmodelc                 2   d}d| v rd}ddl }i }| j                  d|       } | j                  d|       } | j                         D ]K  \  }}t        dD cg c]  }||v  c}      r#d	|v re|j                  j
                  j                  j                  \  }}}	}
|j                  d
   |	k7  s|j                  d   |
k7  rt        ||	|
fddd      }|j                  d      r|j                  |dd       }|j                  |j                  j                  k7  s|j                  d   |j                  d   k7  r,t        ||j                  |j                  j                        }|r&|j                  dd |      }|j                  dd      }|||<   N |S c c}w )zConvert patch embedding weight from manual patchify + linear proj to conv.

    Args:
        state_dict: State dictionary from checkpoint.
        model: Model instance.

    Returns:
        Filtered state dictionary.
    Tzhead.fc.weightFr   Nru  rt  )rS   rx   zpatch_embed.proj.weightrz   r,   bicubic)interpolation	antialiasverboser`   ir   rl   zlayers.(\d+).downsamplec                 D    dt        | j                  d            dz    dS )Nr  r   z.downsample)r   group)r$   s    r7   <lambda>z&checkpoint_filter_fn.<locals>.<lambda>  s$    ws177ST:YZGZF[[f=g r9   zhead.zhead.fc.)regetitemsr   r%  re   weightr.   r   endswithget_submoduler`   r%   r   subreplace)rt  ru  old_weightsr~  out_dictr   r   r?  r   r3   r4   ms               r7   checkpoint_filter_fnr    s    K:%H4Jj9J  " 1 HI1QIJ$)**//66<<JAq!Qwwr{a1772;!#3(F"+"  ::45##AdsG,Aww!88>>>!--PQBRVWVcVcdeVfBf-$%MM#$#A#A#G#G 13gijkA		':.A9: O9 Js   F
variant
pretrainedc           	          t        d t        |j                  dd            D              }|j                  d|      }t	        t
        | |ft        t        d|      d|}|S )zCreate a Swin Transformer model.

    Args:
        variant: Model variant name.
        pretrained: Load pretrained weights.
        **kwargs: Additional model arguments.

    Returns:
        SwinTransformer model instance.
    c              3   &   K   | ]	  \  }}|  y wr   r   )r   r   r   s      r7   r   z+_create_swin_transformer.<locals>.<genexpr>  s     \da\r   r  )r   r   r(   r   out_indicesT)flatten_sequentialr  )pretrained_filter_fnfeature_cfg)r   rC  r  popr   r#   r  r*  )r  r  r.  default_out_indicesr  ru  s         r7   _create_swin_transformerr    sj      \i

8\8Z.[\\**],?@K *1DkJ 	E Lr9   urlc                 4    | ddddddt         t        ddd	d
|S )z9Create default configuration for Swin Transformer models.r   )r(   r   r   )r   r   g?rw  Tzpatch_embed.projzhead.fcmit)r  r  
input_size	pool_sizecrop_pctrx  fixed_input_sizemeanrX   
first_conv
classifierlicenser   )r  r.  s     r7   _cfgr    s7     =v%.B(	 # r9   z.swin_small_patch4_window7_224.ms_in22k_ft_in1kztimm/zvhttps://github.com/SwinTransformer/storage/releases/download/v1.0.8/swin_small_patch4_window7_224_22kto1k_finetune.pth)	hf_hub_idr  z-swin_base_patch4_window7_224.ms_in22k_ft_in1kzlhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224_22kto1k.pthz.swin_base_patch4_window12_384.ms_in22k_ft_in1kzmhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22kto1k.pth)r(     r  )r  r  g      ?)r  r  r  r  r  z.swin_large_patch4_window7_224.ms_in22k_ft_in1kzmhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window7_224_22kto1k.pthz/swin_large_patch4_window12_384.ms_in22k_ft_in1kznhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22kto1k.pthz$swin_tiny_patch4_window7_224.ms_in1kzdhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pthz%swin_small_patch4_window7_224.ms_in1kzehttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pthz$swin_base_patch4_window7_224.ms_in1kzdhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224.pthz%swin_base_patch4_window12_384.ms_in1kzehttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384.pthz-swin_tiny_patch4_window7_224.ms_in22k_ft_in1kzuhttps://github.com/SwinTransformer/storage/releases/download/v1.0.8/swin_tiny_patch4_window7_224_22kto1k_finetune.pthz%swin_tiny_patch4_window7_224.ms_in22kzhhttps://github.com/SwinTransformer/storage/releases/download/v1.0.8/swin_tiny_patch4_window7_224_22k.pthiQU  )r  r  r  z&swin_small_patch4_window7_224.ms_in22kzihttps://github.com/SwinTransformer/storage/releases/download/v1.0.8/swin_small_patch4_window7_224_22k.pthz%swin_base_patch4_window7_224.ms_in22kzhhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224_22k.pthz&swin_base_patch4_window12_384.ms_in22kzihttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth)r  r  r  r  r  r  z&swin_large_patch4_window7_224.ms_in22kzihttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window7_224_22k.pthz'swin_large_patch4_window12_384.ms_in22kzjhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pthzswin_s3_tiny_224.ms_in1kzbhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/s3_t-1d53f6a8.pthzbhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/s3_s-3bb4c69d.pthzbhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/s3_b-a1e95db4.pth)zswin_s3_small_224.ms_in1kzswin_s3_base_224.ms_in1kc           	      R    t        ddddd      }t        	 dd| it        |fi |S )	z+ Swin-T @ 224x224, trained ImageNet-1k
    r*   r   r  r  r  r
  r%   r  r  rL   r  )swin_tiny_patch4_window7_224r*  r  r  r.  
model_argss      r7   r  r  @  sF     R`noJ#&]3=]AEjA[TZA[] ]r9   c           	      R    t        ddddd      }t        	 dd| it        |fi |S )	z Swin-S @ 224x224
    r*   r   r  r)   r)      r)   r  r  r  )swin_small_patch4_window7_224r  r  s      r7   r  r  I  sF     RaopJ#'^4>^BFzB\U[B\^ ^r9   c           	      R    t        ddddd      }t        	 dd| it        |fi |S )	z Swin-B @ 224x224
    r*   r      r  r*   ro         r  r  )swin_base_patch4_window7_224r  r  s      r7   r  r  R  sF     SbpqJ#&]3=]AEjA[TZA[] ]r9   c           	      R    t        ddddd      }t        	 dd| it        |fi |S )	z Swin-B @ 384x384
    r*   r  r  r  r  r  r  )swin_base_patch4_window12_384r  r  s      r7   r  r  [  sF     c-cqrJ#'^4>^BFzB\U[B\^ ^r9   c           	      R    t        ddddd      }t        	 dd| it        |fi |S )	z Swin-L @ 224x224
    r*   r      r  r  r  r  0   r  r  )swin_large_patch4_window7_224r  r  s      r7   r  r  d  sF     SbqrJ#'^4>^BFzB\U[B\^ ^r9   c           	      R    t        ddddd      }t        	 dd| it        |fi |S )	z Swin-L @ 384x384
    r*   r  r  r  r  r  r  )swin_large_patch4_window12_384r  r  s      r7   r  r  m  sF     c-crsJ#(_5?_CG
C]V\C]_ _r9   c           	      P    t        ddddd      }t        dd| it        |fi |S )	z; Swin-S3-T @ 224x224, https://arxiv.org/abs/2111.14725
    r*   r   r      r   r  r  r  r  r  )swin_s3_tiny_224r  r  s      r7   r  r  v  s;     -2l^lnJ#l:lQUV`QkdjQkllr9   c           	      P    t        ddddd      }t        dd| it        |fi |S )	z; Swin-S3-S @ 224x224, https://arxiv.org/abs/2111.14725
    r*   )r  r  r  r   r  r  r  r  r  )swin_s3_small_224r  r  s      r7   r  r    s;     /RaoqJ#mJmRVWaRlekRlmmr9   c           	      P    t        ddddd      }t        dd| it        |fi |S )	z; Swin-S3-B @ 224x224, https://arxiv.org/abs/2111.14725
    r*   r  r  )r)   r)      r)   r  r  r  )swin_s3_base_224r  r  s      r7   r  r    s;     -2m_moJ#l:lQUV`QkdjQkllr9   )"swin_base_patch4_window7_224_in22k#swin_base_patch4_window12_384_in22k#swin_large_patch4_window7_224_in22k$swin_large_patch4_window12_384_in22krp  rn  )Mr   loggingr   typingr   r   r   r   r   r   r	   r
   r?   torch.nnr]   	timm.datar   r   timm.layersr   r   r   r   r   r   r   r   r   r   r   _builderr   	_featuresr   _features_fxr   _manipulater   r   	_registryr   r    r!   vision_transformerr"   __all__	getLoggerr   _loggerr   r   r   r8   r;   rG   rs  rI   r   r   r   r#   r*  rq  r  r   r  r  default_cfgsr  r  r  r  r  r  r  r  r  r   r9   r7   <module>r     s2  
"   I I I   AL L L L * + 3 4 Y Y 4

'

H
%#uS#X./ <<38_ \\& ELL uS#X 3 SV [`[g[g  $#s #3 #5<< #*wbii wtS299 Sl*299 *Zz299 zzbii D
.T .")) .S%,,EV@W .bc t Ra .	c 	T#s(^ 	 % H&4d E7HH& 4Tz6}	H& 5d{ Hs7DH& 5d{7~H& 6t| Hs8DH&& +Dr-u'H&, ,Ts.v-H&2 +Dr-u3H&8 ,Ts Hs.D9H&D 4T D6FEH&L ,Tv.MH&T -dw/UH&\ ,Tv.]H&d -dw HsPU/WeH&l -dw/mH&t .tx HsPU0WuH&~ p!rH&D "&p"r !%p!rKH& HV ] ] ] ^ ^ ^ ] ] ] ^ ^ ^ ^ ^ ^ _/ _ _ mO m m n_ n n mO m m H*Q+S+S,U	' r9   