
    khD                        d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
 ddlZddlmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ dgZ G d dej:                        Z G d dej>                        Z  G d dej>                        Z! G d dej>                        Z" G d dej>                        Z#d Z$d(dZ%d)dZ& e e&d       e&d       e&d       e&d       e&dd       e&dd       e&dd       e&dd      d      Z'ed(de#fd        Z(ed(de#fd!       Z)ed(de#fd"       Z*ed(de#fd#       Z+ed(de#fd$       Z,ed(de#fd%       Z-ed(de#fd&       Z.ed(de#fd'       Z/y)*a   Pooling-based Vision Transformer (PiT) in PyTorch

A PyTorch implement of Pooling-based Vision Transformers as described in
'Rethinking Spatial Dimensions of Vision Transformers' - https://arxiv.org/abs/2103.16302

This code was adapted from the original version at https://github.com/naver-ai/pit, original copyright below.

Modifications for timm by / Copyright 2020 Ross Wightman
    N)partial)ListOptionalSequenceTupleUnion)nnIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)trunc_normal_	to_2tuple   )build_model_with_cfg)feature_take_indices)register_modelgenerate_default_cfgs)BlockPoolingVisionTransformerc                        e Zd ZdZ fdZdeej                  ej                  f   deej                  ej                  f   fdZ xZ	S )SequentialTuplezI This module exists to work around torchscript typing issues list -> listc                 &    t        t        | 
  |  y N)superr   __init__)selfargs	__class__s     K/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/models/pit.pyr   zSequentialTuple.__init__#   s    ot-t4    xreturnc                 $    | D ]
  } ||      } |S r    )r   r!   modules      r   forwardzSequentialTuple.forward&   s     	Fq	A	r    )
__name__
__module____qualname____doc__r   r   torchTensorr&   __classcell__r   s   @r   r   r   !   sE    S5u||U\\9: uU\\SXS_S_E_?` r    r   c                        e Zd Z	 	 	 	 	 d fd	Zdeej                  ej                  f   deej                  ej                  f   fdZ xZS )Transformerc
                 H   t         t        |           ||z  }
|| _        |	r |	|
      nt	        j
                         | _        t	        j                  t        |      D cg c]1  }t        |
||d||||   t        t        j                  d            3 c} | _        y c c}w )NTư>eps)dim	num_heads	mlp_ratioqkv_bias	proj_drop	attn_drop	drop_path
norm_layer)r   r0   r   poolr	   Identitynorm
Sequentialranger   r   	LayerNormblocks)r   base_dimdepthheadsr7   r=   r9   r:   drop_path_probr<   	embed_dimir   s               r   r   zTransformer.__init__-   s     	k4)+u$		-7Jy)R[[]	mm 5\&#  ###(+"2<<T:	&# $ &#s   6Br!   r"   c                    |\  }}|j                   d   }| j                  | j                  ||      \  }}|j                   \  }}}}|j                  d      j                  dd      }t	        j
                  ||fd      }| j                  |      }| j                  |      }|d d d |f   }|d d |d f   }|j                  dd      j                  ||||      }||fS )Nr      )r5   )	shaper=   flatten	transposer+   catr?   rC   reshape)r   r!   
cls_tokenstoken_lengthBCHWs           r   r&   zTransformer.forwardK   s    :!''*99  IIa4MAzWW
1aIIaL""1a(IIz1o1-IIaLKKNq-<-'(
aKK1%%aAq1*}r    )N        rW   NN	r'   r(   r)   r   r   r+   r,   r&   r-   r.   s   @r   r0   r0   ,   sS     $<u||U\\9: uU\\SXS_S_E_?` r    r0   c                   ^     e Zd Zd fd	Zdeej                  ej                  f   fdZ xZS )Poolingc           	          t         t        |           t        j                  |||dz   |dz  |||      | _        t        j                  ||      | _        y )Nr   rK   )kernel_sizepaddingstridepadding_modegroups)r   rZ   r   r	   Conv2dconvLinearfc)r   
in_featureout_featurer^   r_   r   s        r   r   zPooling.__init__`   sQ    gt%'II
aK%
	 ))J4r    r"   c                 N    | j                  |      }| j                  |      }||fS r   )rb   rd   )r   r!   	cls_tokens      r   r&   zPooling.forwardn   s'    IIaLGGI&	)|r    )zerosrX   r.   s   @r   rZ   rZ   _   s&    5uU\\5<<-G'H r    rZ   c            	       @     e Zd Z	 	 	 	 ddedededef fdZd Z xZS )ConvEmbeddingimg_size
patch_sizer^   r]   c                    t         t        |           |}t        |      | _        t        |      | _        t        j                  | j                  d   d|z  z   | j
                  d   z
  |z  dz         | _        t        j                  | j                  d   d|z  z   | j
                  d   z
  |z  dz         | _	        | j                  | j                  f| _
        t        j                  |||||d      | _        y )Nr   rK   r   T)r\   r^   r]   bias)r   rk   r   r   rl   rm   mathfloorheightwidth	grid_sizer	   ra   rb   )r   in_channelsout_channelsrl   rm   r^   r]   r   s          r   r   zConvEmbedding.__init__u   s     	mT+-!(+#J/jj$--"2Q["@4??STCU"UY_!_bc!cdZZq!1AK!?$//RSBT!TX^ ^ab bc
++tzz2II:77	r    c                 (    | j                  |      }|S r   )rb   r   r!   s     r   r&   zConvEmbedding.forward   s    IIaLr    )         r   )r'   r(   r)   intr   r&   r-   r.   s   @r   rk   rk   t   sC    
   7 	7
 7 7 7*r    rk   c                   x    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d"dededededee   dee   dee   d	ef fd
Zd Z	e
j                  j                  d        Ze
j                  j                  d#d       Ze
j                  j                  d#d       Zdej"                  fdZd$dedee   fdZ	 	 	 	 	 d%de
j*                  deeeee   f      dededededeee
j*                     ee
j*                  ee
j*                     f   f   fdZ	 	 	 d&deeee   f   dedefdZd Zd'dede
j*                  fd Zd! Z xZS )(r   z Pooling-based Vision Transformer

    A PyTorch implement of 'Rethinking Spatial Dimensions of Vision Transformers'
        - https://arxiv.org/abs/2103.16302
    rl   rm   r^   	stem_type	base_dimsrE   rF   r7   c                 f   t         t        |           |dv sJ || _        || _        |d   |d   z  }|	| _        || _        |rdnd| _        g | _        t        |
||||      | _
        t        j                  t        j                  d|| j                  j                  | j                  j                               | _        t        j                  t        j                  d| j                  |            | _        t        j&                  |      | _        g }t        j*                  d|t-        |            j/                  |      D cg c]  }|j1                          }}|}t3        t5        |            D ]u  }d }||   ||   z  }|dkD  rt7        ||d      }|t9        ||   ||   ||   ||||||         gz  }|}| xj                  t;        ||dz
  d|z  z  d| 	      gz  c_        w t=        | | _        t        j@                  |d
   |d
   z  d      | _!        |x| _"        x| _#        | _$        t        j&                  |      | _%        |	dkD  r t        jL                  | jH                  |	      nt        jN                         | _(        d | _)        |rH|	dkD  r*t        jL                  | jH                  | j
                        nt        jN                         | _)        d| _*        tW        | j"                  d       tW        | j$                  d       | jY                  | jZ                         y c c}w )N)tokenr   rK   r   )p)r^   )r=   r9   r:   rG   transformers.)num_chs	reductionr%   r2   r3   Fg{Gz?)std).r   r   r   r   rF   num_classesglobal_pool
num_tokensfeature_infork   patch_embedr	   	Parameterr+   randnrr   rs   	pos_embedrh   Dropoutpos_droplinspacesumsplittolistrA   lenrZ   r0   dictr   transformersrB   r?   num_featureshead_hidden_sizerH   	head_droprc   r>   head	head_distdistilled_trainingr   apply_init_weights)r   rl   rm   r^   r~   r   rE   rF   r7   r   in_chansr   	distilled	drop_ratepos_drop_drateproj_drop_rateattn_drop_ratedrop_path_raterH   r   r!   dprprev_dimrI   r=   r   s                            r   r   z!PoolingVisionTransformer.__init__   s   ( 	&68j((("
aL58+	&&(!a(9h
TZ[ekk!Y@P@P@W@WY]YiYiYoYo&pqekk!T__i&PQ

^4#(>>!^SZ#P#V#VW\#]^aqxxz^^s5z" 	uAD!!uQx/I1u
 [!aa(("1v	 	 	L !H$xFQJRSUVRVCVanopnq_r"s!tt)	u, ,\:LL2r!:E	ENNND1DN I.>IAoBIIdnnk:SUS^S^S`	LWZ[ORYYt~~t7G7GHacalalanDN"'dnn#.dnn#.

4%%&M _s   L.c                     t        |t        j                        rUt        j                  j	                  |j
                  d       t        j                  j	                  |j                  d       y y )Nr   g      ?)
isinstancer	   rB   init	constant_ro   weight)r   ms     r   r   z&PoolingVisionTransformer._init_weights   sE    a&GGaffa(GGahh, 'r    c                 
    ddhS )Nr   rh   r$   r   s    r   no_weight_decayz(PoolingVisionTransformer.no_weight_decay   s    [))r    c                     || _         y r   )r   r   enables     r   set_distilled_trainingz/PoolingVisionTransformer.set_distilled_training   s
    "(r    c                     |rJ d       y )Nz$gradient checkpointing not supportedr$   r   s     r   set_grad_checkpointingz/PoolingVisionTransformer.set_grad_checkpointing   s    AAAz6r    r"   c                 b    | j                   | j                  | j                   fS | j                  S r   )r   r   r   s    r   get_classifierz'PoolingVisionTransformer.get_classifier   s)    >>%99dnn,,99r    r   r   c                 J   || _         ||| _        |dkD  r t        j                  | j                  |      nt        j
                         | _        | j                  I|dkD  r*t        j                  | j                  | j                         nt        j
                         | _        y y )Nr   )r   r   r	   rc   rH   r>   r   r   )r   r   r   s      r   reset_classifierz)PoolingVisionTransformer.reset_classifier   s~    &"*D>IAoBIIdnnk:SUS^S^S`	>>%LWZ[ORYYt~~t7G7GHacalalanDN &r    r!   indicesr?   
stop_early
output_fmtintermediates_onlyc                 h   |dv sJ d       g }t        t        | j                        |      \  }}	| j                  |      }| j	                  || j
                  z         }| j                  j                  |j                  d   dd      }
t        | j                        dz
  }t        j                  j                         s|s| j                  }n| j                  d|	dz    }t        |      D ](  \  }} |||
f      \  }}
||v s|j                  |       * |r|S |k(  r| j                  |
      }
|
|fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )NCHWzOutput shape must be NCHW.r   r   r   N)r   r   r   r   r   r   rh   expandrL   r+   jitis_scripting	enumerateappendr?   )r   r!   r   r?   r   r   r   intermediatestake_indices	max_indexrQ   last_idxstagesfeat_idxstages                  r   forward_intermediatesz.PoolingVisionTransformer.forward_intermediates  s6   * Y&D(DD&"6s4;L;L7Mw"Wi QMM!dnn,-^^**1771:r2>
t(()A-99!!#:&&F&&~	A6F(0 	(OHe!1j/2MAz<'$$Q'	(
   x:.J=((r    
prune_norm
prune_headc                     t        t        | j                        |      \  }}| j                  d|dz    | _        |rt        j                         | _        |r| j                  dd       |S )z@ Prune layers not required for specified intermediates.
        Nr   r    )r   r   r   r	   r>   r?   r   )r   r   r   r   r   r   s         r   prune_intermediate_layersz2PoolingVisionTransformer.prune_intermediate_layers3  sb     #7s4;L;L7Mw"Wi --ny1}=DI!!!R(r    c                    | j                  |      }| j                  || j                  z         }| j                  j	                  |j
                  d   dd      }| j                  ||f      \  }}| j                  |      }|S )Nr   r   )r   r   r   rh   r   rL   r   r?   )r   r!   rQ   s      r   forward_featuresz)PoolingVisionTransformer.forward_featuresC  su    QMM!dnn,-^^**1771:r2>
))1j/::YYz*
r    
pre_logitsc                    | j                   | j                  dk(  sJ |d d df   |d d df   }}| j                  |      }| j                  |      }|s"| j                  |      }| j                  |      }| j                  r.| j
                  r"t        j                  j                         s||fS ||z   dz  S | j                  dk(  r	|d d df   }| j                  |      }|s| j                  |      }|S )Nr   r   r   rK   )	r   r   r   r   r   trainingr+   r   r   )r   r!   r   x_dists       r   forward_headz%PoolingVisionTransformer.forward_headK  s    >>%##w...!Q$1a4vAq!A^^A&FIIaL/&&4==AWAWAY&y  F
a''7*adGq!AIIaLHr    c                 J    | j                  |      }| j                  |      }|S r   )r   r   rx   s     r   r&   z PoolingVisionTransformer.forwardb  s'    !!!$a r    )ry   rz   r{   overlap0   r   r   rK         rK   r   r{   r        r   FrW   rW   rW   rW   rW   )Tr   )NFFr   F)r   FTF) r'   r(   r)   r*   r|   strr   floatr   r   r+   r   ignorer   r   r   r	   Moduler   r   r   r,   r   r   boolr   r   r   r   r   r&   r-   r.   s   @r   r   r      s&      &'3#,#, %L'L' L' 	L'
 L'  }L' C=L' C=L' L'\-
 YY* * YY) ) YYB B		 oC ohsm o 8<$$',/)||/) eCcN34/) 	/)
 /) /) !%/) 
tELL!5tELL7I)I#JJ	K/)f ./$#	3S	>*  	 $ 5<< .r    c                     i }t        j                  d      }| j                         D ]  \  }}|j                  d |      }|||<    |S )z preprocess checkpoints zpools\.(\d)\.c                 D    dt        | j                  d            dz    dS )Nr   r   z.pool.)r|   group)exps    r   <lambda>z&checkpoint_filter_fn.<locals>.<lambda>q  s"    }S15F5J4K6%R r    )recompileitemssub)
state_dictmodelout_dictp_blockskvs         r   checkpoint_filter_fnr   h  sV    Hzz*+H  " 1
 LLRTUV Or    c                     t        t        d            }|j                  d|      }t        t        | |ft
        t        d|      d|}|S )Nr   out_indiceshook)feature_clsr   )pretrained_filter_fnfeature_cfg)tuplerA   popr   r   r   r   )variant
pretrainedkwargsdefault_out_indicesr   r   s         r   _create_pitr  v  sY    a/**],?@K   2VE E Lr    c                 2    | ddd dddt         t        ddd|S )	Nr   )r   ry   ry   g?bicubicTzpatch_embed.convr   )urlr   
input_size	pool_sizecrop_pctinterpolationfixed_input_sizemeanr   
first_conv
classifierr
   )r  r  s     r   _cfgr    s2    =t%.B(  r    ztimm/)	hf_hub_id)r   r   )r  r  )zpit_ti_224.in1kzpit_xs_224.in1kzpit_s_224.in1kzpit_b_224.in1kzpit_ti_distilled_224.in1kzpit_xs_distilled_224.in1kzpit_s_distilled_224.in1kzpit_b_distilled_224.in1kr"   c           	      ^    t        ddg dg dg dd      }t        d| fi t        |fi |S )	N      @   r  r  r   r   r   r   r{   rz   r   rm   r^   r   rE   rF   r7   	pit_b_224r   r  r  r  
model_argss      r   r  r    <    J {JM$z2LV2LMMr    c           	      ^    t        ddg dg dg dd      }t        d| fi t        |fi |S )	Nrz   r{   r   r   r   r      r   r  	pit_s_224r  r   s      r   r&  r&    r"  r    c           	      ^    t        ddg dg dg dd      }t        d| fi t        |fi |S )	Nrz   r{   r   r   r   r   r  
pit_xs_224r  r   s      r   r(  r(    <    J |ZN4
3Mf3MNNr    c           	      ^    t        ddg dg dg dd      }t        d| fi t        |fi |S )	Nrz   r{       r,  r,  r   r   r   r  
pit_ti_224r  r   s      r   r-  r-    r)  r    c           	      `    t        ddg dg dg ddd      }t        d	| fi t        |fi |S )
Nr  r  r  r  r  r   Trm   r^   r   rE   rF   r7   r   pit_b_distilled_224r  r   s      r   r0  r0    @    J ,jWD<Vv<VWWr    c           	      `    t        ddg dg dg ddd      }t        d	| fi t        |fi |S )
Nrz   r{   r   r   r$  r   Tr/  pit_s_distilled_224r  r   s      r   r3  r3    r1  r    c           	      `    t        ddg dg dg ddd      }t        d	| fi t        |fi |S )
Nrz   r{   r   r   r   r   Tr/  pit_xs_distilled_224r  r   s      r   r5  r5    A    J -zXT*=WPV=WXXr    c           	      `    t        ddg dg dg ddd      }t        d	| fi t        |fi |S )
Nrz   r{   r+  r   r   r   Tr/  pit_ti_distilled_224r  r   s      r   r8  r8    r6  r    r   )r   )0r*   rp   r   	functoolsr   typingr   r   r   r   r   r+   r	   	timm.datar   r   timm.layersr   r   _builderr   	_featuresr   	_registryr   r   vision_transformerr   __all__r@   r   r   r0   rZ   rk   r   r   r  r  default_cfgsr  r&  r(  r-  r0  r3  r5  r8  r$   r    r   <module>rC     s    	  9 9   A 0 * + < % &
&bmm 0")) 0fbii *BII 6Vryy Vr %g.g.W-W-!%("* "&("* !%(!* !%(!*& * 	N-E 	N 	N 	N-E 	N 	N 	O.F 	O 	O 	O.F 	O 	O 
X7O 
X 
X 
X7O 
X 
X 
Y8P 
Y 
Y 
Y8P 
Y 
Yr    