
""" Nested Transformer (NesT) in PyTorch

A PyTorch implementation of Aggregating Nested Transformers as described in:

'Aggregating Nested Transformers'
    - https://arxiv.org/abs/2105.12723

The official Jax code is released and available at https://github.com/google-research/nested-transformer. The weights
have been converted with convert/convert_nest_flax.py

Acknowledgments:
* The paper authors for sharing their research, code, and model weights
* Ross Wightman's existing code off which I based this

Copyright 2021 Alexander Soare
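
Example (illustrative sketch, assuming the standard `timm.create_model` factory API):
    >>> import torch, timm
    >>> model = timm.create_model('nest_tiny_jx', pretrained=False)
    >>> logits = model(torch.randn(1, 3, 224, 224))  # -> (1, 1000)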
"""
import collections.abc
import logging
import math
from functools import partial
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import nn

from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from timm.layers import PatchEmbed, Mlp, DropPath, create_classifier, trunc_normal_, _assert
from timm.layers import create_conv2d, create_pool2d, to_ntuple, use_fused_attn, LayerNorm
from ._builder import build_model_with_cfg
from ._features import feature_take_indices
from ._features_fx import register_notrace_function
from ._manipulate import checkpoint_seq, named_apply
from ._registry import register_model, generate_default_cfgs, register_model_deprecations

__all__ = ['Nest']  # model_registry will add each entrypoint fn to this

_logger = logging.getLogger(__name__)


class Attention(nn.Module):
    """
    This is much like `.vision_transformer.Attention` but uses *localised* self attention by accepting an input with
     an extra "image block" dim
    """
    fused_attn: torch.jit.Final[bool]

    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5
        self.fused_attn = use_fused_attn()

        self.qkv = nn.Linear(dim, 3 * dim, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        """
        x is shape: B (batch_size), T (image blocks), N (seq length per image block), C (embed dim)
        """
        B, T, N, C = x.shape
        # result of next line is (qkv, B, num (H)eads, T, N, (C')hannels per head)
        qkv = self.qkv(x).reshape(B, T, N, 3, self.num_heads, C // self.num_heads).permute(3, 0, 4, 1, 2, 5)
        q, k, v = qkv.unbind(0)

        if self.fused_attn:
            x = F.scaled_dot_product_attention(
                q, k, v,
                dropout_p=self.attn_drop.p if self.training else 0.,
            )
        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)  # (B, H, T, N, N)
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            x = attn @ v

        # (B, H, T, N, C'), permute -> (B, T, N, C', H)
        x = x.permute(0, 2, 3, 4, 1).reshape(B, T, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class TransformerLayer(nn.Module):
    """
    This is much like `.vision_transformer.Block` but:
        - Called TransformerLayer here to allow for "block" as defined in the paper ("non-overlapping image blocks")
        - Uses modified Attention layer that handles the "block" dimension
    """
    def __init__(
            self,
            dim,
            num_heads,
            mlp_ratio=4.,
            qkv_bias=False,
            proj_drop=0.,
            attn_drop=0.,
            drop_path=0.,
            act_layer=nn.GELU,
            norm_layer=nn.LayerNorm,
    ):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
        )
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=proj_drop,
        )

    def forward(self, x):
        y = self.norm1(x)
        x = x + self.drop_path(self.attn(y))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x


class ConvPool(nn.Module):
    def __init__(self, in_channels, out_channels, norm_layer, pad_type=''):
        super().__init__()
        self.conv = create_conv2d(in_channels, out_channels, kernel_size=3, padding=pad_type, bias=True)
        self.norm = norm_layer(out_channels)
        self.pool = create_pool2d('max', kernel_size=3, stride=2, padding=pad_type)

    def forward(self, x):
        """
        x is expected to have shape (B, C, H, W)
        """
        _assert(x.shape[-2] % 2 == 0, 'BlockAggregation requires even input spatial dims')
        _assert(x.shape[-1] % 2 == 0, 'BlockAggregation requires even input spatial dims')
        x = self.conv(x)
        # Layer norm done over channel dim only
        x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
        x = self.pool(x)
        return x  # (B, C, H//2, W//2)


def blockify(x, block_size: int):
    """image to blocks
    Args:
        x (Tensor): with shape (B, H, W, C)
        block_size (int): edge length of a single square block in units of H, W
    """
    B, H, W, C = x.shape
    _assert(H % block_size == 0, '`block_size` must divide input height evenly')
    _assert(W % block_size == 0, '`block_size` must divide input width evenly')
    grid_height = H // block_size
    grid_width = W // block_size
    x = x.reshape(B, grid_height, block_size, grid_width, block_size, C)
    x = x.transpose(2, 3).reshape(B, grid_height * grid_width, -1, C)
    return x  # (B, T, N, C)


@register_notrace_function  # reason: int argument is a Proxy
def deblockify(x, block_size: int):
    """blocks to image
    Args:
        x (Tensor): with shape (B, T, N, C) where T is number of blocks and N is sequence size per block
        block_size (int): edge length of a single square block in units of desired H, W
    """
    B, T, _, C = x.shape
    grid_size = int(math.sqrt(T))
    height = width = grid_size * block_size
    x = x.reshape(B, grid_size, grid_size, block_size, block_size, C)
    x = x.transpose(2, 3).reshape(B, height, width, C)
    return x  # (B, H, W, C)
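

# Illustrative sketch (assumes only the two helpers above): `blockify` and `deblockify` are inverses.
# For a hypothetical (2, 8, 8, 3) NHWC tensor and block_size=4, blockify yields (2, 4, 16, 3),
# i.e. T=4 blocks of N=16 patches each, and deblockify restores the (2, 8, 8, 3) layout:
#
#   t = torch.randn(2, 8, 8, 3)
#   assert deblockify(blockify(t, 4), 4).shape == t.shape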


class NestLevel(nn.Module):
    """ Single hierarchical level of a Nested Transformer
    """
    def __init__(
            self,
            num_blocks,
            block_size,
            seq_length,
            num_heads,
            depth,
            embed_dim,
            prev_embed_dim=None,
            mlp_ratio=4.,
            qkv_bias=True,
            proj_drop=0.,
            attn_drop=0.,
            drop_path=[],
            norm_layer=None,
            act_layer=None,
            pad_type='',
    ):
        super().__init__()
        self.block_size = block_size
        self.grad_checkpointing = False

        self.pos_embed = nn.Parameter(torch.zeros(1, num_blocks, seq_length, embed_dim))

        if prev_embed_dim is not None:
            self.pool = ConvPool(prev_embed_dim, embed_dim, norm_layer=norm_layer, pad_type=pad_type)
        else:
            self.pool = nn.Identity()

        # Transformer encoder
        if len(drop_path):
            assert len(drop_path) == depth, 'Must provide as many drop path rates as there are transformer layers'
        self.transformer_encoder = nn.Sequential(*[
            TransformerLayer(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                proj_drop=proj_drop,
                attn_drop=attn_drop,
                drop_path=drop_path[i],
                norm_layer=norm_layer,
                act_layer=act_layer,
            )
            for i in range(depth)])

    def forward(self, x):
        """
        expects x as (B, C, H, W)
        """
        x = self.pool(x)
        x = x.permute(0, 2, 3, 1)  # (B, H', W', C), switch to channels last for transformer
        x = blockify(x, self.block_size)  # (B, T, N, C')
        x = x + self.pos_embed
        if self.grad_checkpointing and not torch.jit.is_scripting():
            x = checkpoint_seq(self.transformer_encoder, x)
        else:
            x = self.transformer_encoder(x)  # (B, T, N, C')
        x = deblockify(x, self.block_size)  # (B, H', W', C')
        # Channel-first for block aggregation, and generally to replicate convnet feature map at each stage
        return x.permute(0, 3, 1, 2)


class Nest(nn.Module):
    """ Nested Transformer (NesT)

    A PyTorch impl of : `Aggregating Nested Transformers`
        - https://arxiv.org/abs/2105.12723
    """

    def __init__(
            self,
            img_size=224,
            in_chans=3,
            patch_size=4,
            num_levels=3,
            embed_dims=(128, 256, 512),
            num_heads=(4, 8, 16),
            depths=(2, 2, 20),
            num_classes=1000,
            mlp_ratio=4.,
            qkv_bias=True,
            drop_rate=0.,
            proj_drop_rate=0.,
            attn_drop_rate=0.,
            drop_path_rate=0.5,
            norm_layer=None,
            act_layer=None,
            pad_type='',
            weight_init='',
            global_pool='avg',
    ):
        """
        Args:
            img_size (int, tuple): input image size
            in_chans (int): number of input channels
            patch_size (int): patch size
            num_levels (int): number of block hierarchies (T_d in the paper)
            embed_dims (int, tuple): embedding dimensions of each level
            num_heads (int, tuple): number of attention heads for each level
            depths (int, tuple): number of transformer layers for each level
            num_classes (int): number of classes for classification head
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim for MLP of transformer layers
            qkv_bias (bool): enable bias for qkv if True
            drop_rate (float): dropout rate for MLP of transformer layers, MSA final projection layer, and classifier
            attn_drop_rate (float): attention dropout rate
            drop_path_rate (float): stochastic depth rate
            norm_layer: (nn.Module): normalization layer for transformer layers
            act_layer: (nn.Module): activation layer in MLP of transformer layers
            pad_type: str: Type of padding to use '' for PyTorch symmetric, 'same' for TF SAME
            weight_init: (str): weight init scheme
            global_pool: (str): type of pooling operation to apply to final feature map

        Notes:
            - Default values follow NesT-B from the original Jax code.
            - `embed_dims`, `num_heads`, `depths` should be ints or tuples with length `num_levels`.
            - For those following the paper, Table A1 may have errors!
                - https://github.com/google-research/nested-transformer/issues/2
        """
        super().__init__()

        for param_name in ['embed_dims', 'num_heads', 'depths']:
            param_value = locals()[param_name]
            if isinstance(param_value, collections.abc.Sequence):
                assert len(param_value) == num_levels, f'Require `len({param_name}) == num_levels`'

        embed_dims = to_ntuple(num_levels)(embed_dims)
        num_heads = to_ntuple(num_levels)(num_heads)
        depths = to_ntuple(num_levels)(depths)
        self.num_classes = num_classes
        self.num_features = self.head_hidden_size = embed_dims[-1]
        self.feature_info = []
        norm_layer = norm_layer or LayerNorm
        act_layer = act_layer or nn.GELU
        self.drop_rate = drop_rate
        self.num_levels = num_levels
        if isinstance(img_size, collections.abc.Sequence):
            assert img_size[0] == img_size[1], 'Model only handles square inputs'
            img_size = img_size[0]
        assert img_size % patch_size == 0, '`patch_size` must divide `img_size` evenly'
        self.patch_size = patch_size

        # Number of blocks at each level
        self.num_blocks = (4 ** torch.arange(num_levels)).flip(0).tolist()
        assert (img_size // patch_size) % math.sqrt(self.num_blocks[0]) == 0, \
            'First level blocks don\'t fit evenly. Check `img_size`, `patch_size`, and `num_levels`'

        # Block edge size in units of patches
        # Hint: (img_size // patch_size) gives number of patches along edge of image. sqrt(self.num_blocks[0]) is the
        #  number of blocks along edge of image
        self.block_size = int((img_size // patch_size) // math.sqrt(self.num_blocks[0]))

        # Patch embedding
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dims[0],
            flatten=False,
        )
        self.num_patches = self.patch_embed.num_patches
        self.seq_length = self.num_patches // self.num_blocks[0]

        # Build up each hierarchical level
        levels = []
        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
        prev_dim = None
        curr_stride = 4
        for i in range(len(self.num_blocks)):
            dim = embed_dims[i]
            levels.append(NestLevel(
                self.num_blocks[i],
                self.block_size,
                self.seq_length,
                num_heads[i],
                depths[i],
                dim,
                prev_dim,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                proj_drop=proj_drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dp_rates[i],
                norm_layer=norm_layer,
                act_layer=act_layer,
                pad_type=pad_type,
            ))
            self.feature_info += [dict(num_chs=dim, reduction=curr_stride, module=f'levels.{i}')]
            prev_dim = dim
            curr_stride *= 2
        self.levels = nn.Sequential(*levels)

        # Final normalization layer
        self.norm = norm_layer(embed_dims[-1])

        # Classifier
        global_pool, head = create_classifier(self.num_features, self.num_classes, pool_type=global_pool)
        self.global_pool = global_pool
        self.head_drop = nn.Dropout(drop_rate)
        self.head = head

        self.init_weights(weight_init)

    @torch.jit.ignore
    def init_weights(self, mode=''):
        assert mode in ('nlhb', '')
        head_bias = -math.log(self.num_classes) if 'nlhb' in mode else 0.
        for level in self.levels:
            trunc_normal_(level.pos_embed, std=.02, a=-2, b=2)
        named_apply(partial(_init_nest_weights, head_bias=head_bias), self)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {f'level.{i}.pos_embed' for i in range(len(self.levels))}

    @torch.jit.ignore
    def group_matcher(self, coarse=False):
        matcher = dict(
            stem=r'^patch_embed',  # stem and embed
            blocks=[
                (r'^levels\.(\d+)' if coarse else r'^levels\.(\d+)\.transformer_encoder\.(\d+)', None),
                (r'^levels\.(\d+)\.(?:pool|pos_embed)', (0,)),
                (r'^norm', (99999,)),
            ]
        )
        return matcher

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        for l in self.levels:
            l.grad_checkpointing = enable

    @torch.jit.ignore
    def get_classifier(self) -> nn.Module:
        return self.head

    def reset_classifier(self, num_classes: int, global_pool: str = 'avg'):
        self.num_classes = num_classes
        self.global_pool, self.head = create_classifier(
            self.num_features, self.num_classes, pool_type=global_pool)

    def forward_intermediates(
            self,
            x: torch.Tensor,
            indices: Optional[Union[int, List[int]]] = None,
            norm: bool = False,
            stop_early: bool = False,
            output_fmt: str = 'NCHW',
            intermediates_only: bool = False,
    ) -> Union[List[torch.Tensor], Tuple[torch.Tensor, List[torch.Tensor]]]:
        """ Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:
            List of intermediate features, or tuple of (final features, intermediates)
        """
        assert output_fmt in ('NCHW',), 'Output shape must be NCHW.'
        intermediates = []
        take_indices, max_index = feature_take_indices(len(self.levels), indices)

        # forward pass
        x = self.patch_embed(x)

        last_idx = len(self.levels) - 1
        if torch.jit.is_scripting() or not stop_early:  # can't slice blocks in torchscript
            stages = self.levels
        else:
            stages = self.levels[:max_index + 1]
        for feat_idx, stage in enumerate(stages):
            x = stage(x)
            if feat_idx in take_indices:
                if norm and feat_idx == last_idx:
                    x_inter = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
                    intermediates.append(x_inter)
                else:
                    intermediates.append(x)

        if intermediates_only:
            return intermediates

        if feat_idx == last_idx:
            x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)

        return x, intermediates

    def prune_intermediate_layers(
            self,
            indices: Union[int, List[int]] = 1,
            prune_norm: bool = False,
            prune_head: bool = True,
    ):
        """ Prune layers not required for specified intermediates.
        """
        take_indices, max_index = feature_take_indices(len(self.levels), indices)
        self.levels = self.levels[:max_index + 1]  # truncate blocks
        if prune_norm:
            self.norm = nn.Identity()
        if prune_head:
            self.reset_classifier(0, '')
        return take_indices

    def forward_features(self, x):
        x = self.patch_embed(x)
        x = self.levels(x)
        # Layer norm done over channel dim only (to NHWC and back)
        x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
        return x

    def forward_head(self, x, pre_logits: bool = False):
        x = self.global_pool(x)
        x = self.head_drop(x)
        return x if pre_logits else self.head(x)

    def forward(self, x):
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x
                  ddd       t        j                  j                  | j                  |       yt	        | j
                  ddd       | j                  *t        j                  j                  | j                         yyt        | t        j                        rPt	        | j
                  ddd       | j                  *t        j                  j                  | j                         yyy)zn NesT weight initialization
    Can replicate Jax implementation. Otherwise follows vision_transformer.py
    """
    if isinstance(module, nn.Linear):
        if name.startswith('head'):
            trunc_normal_(module.weight, std=.02, a=-2, b=2)
            nn.init.constant_(module.bias, head_bias)
        else:
            trunc_normal_(module.weight, std=.02, a=-2, b=2)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Conv2d):
        trunc_normal_(module.weight, std=.02, a=-2, b=2)
        if module.bias is not None:
            nn.init.zeros_(module.bias)


def resize_pos_embed(posemb, posemb_new):
    """
    Rescale the grid of position embeddings when loading from state_dict
    Expected shape of position embeddings is (1, T, N, C), and considers only square images
    """
    _logger.info('Resized position embedding: %s to %s', posemb.shape, posemb_new.shape)
    seq_length_old = posemb.shape[2]
    num_blocks_new, seq_length_new = posemb_new.shape[1:3]
    size_new = int(math.sqrt(num_blocks_new * seq_length_new))
    # First change to (1, C, H, W)
    posemb = deblockify(posemb, int(math.sqrt(seq_length_old))).permute(0, 3, 1, 2)
    posemb = F.interpolate(posemb, size=[size_new, size_new], mode='bicubic', align_corners=False)
    # Now change back to (1, T, N, C)
    posemb = blockify(posemb.permute(0, 2, 3, 1), int(math.sqrt(seq_length_new)))
    return posemb


def checkpoint_filter_fn(state_dict, model):
    """ resize positional embeddings of pretrained weights """
    pos_embed_keys = [k for k in state_dict.keys() if k.startswith('pos_embed_')]
    for k in pos_embed_keys:
        if state_dict[k].shape != getattr(model, k).shape:
            state_dict[k] = resize_pos_embed(state_dict[k], getattr(model, k))
    return state_dict


def _create_nest(variant, pretrained=False, **kwargs):
    model = build_model_with_cfg(
        Nest,
        variant,
        pretrained,
        feature_cfg=dict(out_indices=(0, 1, 2), flatten_sequential=True),
        pretrained_filter_fn=checkpoint_filter_fn,
        **kwargs,
    )
    return model


def _cfg(url='', **kwargs):
    return {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': [14, 14],
        'crop_pct': .875, 'interpolation': 'bicubic', 'fixed_input_size': True,
        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
        'first_conv': 'patch_embed.proj', 'classifier': 'head',
        **kwargs,
    }


default_cfgs = generate_default_cfgs({
    'nest_base.untrained': _cfg(),
    'nest_small.untrained': _cfg(),
    'nest_tiny.untrained': _cfg(),
    # (weights from official Google JAX impl, require 'same' padding)
    'nest_base_jx.goog_in1k': _cfg(hf_hub_id='timm/'),
    'nest_small_jx.goog_in1k': _cfg(hf_hub_id='timm/'),
    'nest_tiny_jx.goog_in1k': _cfg(hf_hub_id='timm/'),
})


@register_model
def nest_base(pretrained=False, **kwargs) -> Nest:
    """ Nest-B @ 224x224
    """
    model_kwargs = dict(
        embed_dims=(128, 256, 512), num_heads=(4, 8, 16), depths=(2, 2, 20), **kwargs)
    model = _create_nest('nest_base', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def nest_small(pretrained=False, **kwargs) -> Nest:
    """ Nest-S @ 224x224
    """
    model_kwargs = dict(embed_dims=(96, 192, 384), num_heads=(3, 6, 12), depths=(2, 2, 20), **kwargs)
    model = _create_nest('nest_small', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def nest_tiny(pretrained=False, **kwargs) -> Nest:
    """ Nest-T @ 224x224
    """
    model_kwargs = dict(embed_dims=(96, 192, 384), num_heads=(3, 6, 12), depths=(2, 2, 8), **kwargs)
    model = _create_nest('nest_tiny', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def nest_base_jx(pretrained=False, **kwargs) -> Nest:
    """ Nest-B @ 224x224
    """
    kwargs.setdefault('pad_type', 'same')
    model_kwargs = dict(
        embed_dims=(128, 256, 512), num_heads=(4, 8, 16), depths=(2, 2, 20), **kwargs)
    model = _create_nest('nest_base_jx', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def nest_small_jx(pretrained=False, **kwargs) -> Nest:
    """ Nest-S @ 224x224
    """
    kwargs.setdefault('pad_type', 'same')
    model_kwargs = dict(embed_dims=(96, 192, 384), num_heads=(3, 6, 12), depths=(2, 2, 20), **kwargs)
    model = _create_nest('nest_small_jx', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def nest_tiny_jx(pretrained=False, **kwargs) -> Nest:
    """ Nest-T @ 224x224
    """
    kwargs.setdefault('pad_type', 'same')
    model_kwargs = dict(embed_dims=(96, 192, 384), num_heads=(3, 6, 12), depths=(2, 2, 8), **kwargs)
    model = _create_nest('nest_tiny_jx', pretrained=pretrained, **model_kwargs)
    return model


register_model_deprecations(__name__, {
    'jx_nest_base': 'nest_base_jx',
    'jx_nest_small': 'nest_small_jx',
    'jx_nest_tiny': 'nest_tiny_jx',
})