
    khd                     8   d Z ddlZddlmZmZmZ ddlZddlmc m	Z
 ddlmZ ddlmZmZmZmZmZmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZmZmZmZm Z  ddl!m"Z# g Z$dgdZ%dhdZ&didZ'djdZ( e) e e%dddd       e%dddd       e&ddddddd       e&ddddddd       e&ddddddd      fddddd d!"       e e%dd#d$       e%ddd$       e&dddddd%       e&dddd&dd%       e&dddd'dd%      fddddd d("       e e%dd#d$       e%ddd$       e&dddd'dd%       e&dd)dd*dd%       e&dd+dd,dd%      fddddd d-"       e e%dd#d$       e%ddd$       e&dddd'dd%       e&dd)dd*dd%       e&dd+dd,dd%      fddddd. e)d/0      d-1       e(d       e(d2       e(d3       e(d       e(d4       e(d5       e(d      6      Z*e G d7 d8ejV                               Z, G d9 d:ejV                        Z- G d; d<ejV                        Z.e G d= d>ejV                               Z/ ed?e,        ed@e/       dkdAZ0dkdBZ1dldCZ2 ei dD e2dEF      dG e2dEF      dH e2dEF      dI e2dEdJK      dL e2dEdJK      dM e2dEdJK      dN e2dEdJK      dO e2dEdJK      dP e2dEdJK      dQ e2dEdJK      dR e2dEdJK      dS e2dEdJK      dT e2dEdJK      dU e2dEdVdWdX      dY e2dEdVdWdX      dZ e2dEdVdWdX            Z3edmd[efd\       Z4edmd[efd]       Z5edmd[efd^       Z6edmd[efd_       Z7edmd[efd`       Z8edmd[efda       Z9edmd[efdb       Z:edmd[efdc       Z;edmd[efdd       Z<edmd[efde       Z= ee>dRdSdTdUdYdZdf       y)na   MobileViT

Paper:
V1: `MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer` - https://arxiv.org/abs/2110.02178
V2: `Separable Self-attention for Mobile Vision Transformers` - https://arxiv.org/abs/2206.02680

MobileVitBlock and checkpoints adapted from https://github.com/apple/ml-cvnets (original copyright below)
License: https://github.com/apple/ml-cvnets/blob/main/LICENSE (Apple open source)

Rest of code, ByobNet, and Transformer block hacked together by / Copyright 2022, Ross Wightman
    N)CallableTupleOptional)nn)	to_2tuplemake_divisible
GroupNorm1ConvMlpDropPathis_exportable   )build_model_with_cfg)register_notrace_module)register_modelgenerate_default_cfgsregister_model_deprecations)register_blockByoBlockCfgByoModelCfgByobNetLayerFn
num_groups)Blockc                 <    t        d| ||d|t        dd            S )Nbottler   T)	bottle_in
linear_out)typedcsgsbrblock_kwargs)r   dictr   r    r!   r#   s       Q/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/models/mobilevit.py_inverted_residual_blockr(   !   s&    a1rDT:< <       c                 X    t        | |||      t        dd|dt        |||            fS )Nr&   	mobilevitr   )transformer_dimtransformer_depth
patch_size)r   r   r    r!   r$   r(   r   r%   r   r    r!   r-   r.   r/   r#   s          r'   _mobilevit_blockr2   (   s>     	!1Q26Q! /"3%'	
	 	r)             @      ?c                 Z    t        | |||      t        dd|d|dt        ||            fS )Nr&   
mobilevit2r   )r.   r/   )r   r   r    r!   r#   r"   r$   r0   )r   r    r!   r.   r/   r#   transformer_brs          r'   _mobilevitv2_blockr9   6   s?     	!1Q26a1A"3%'	
 r)         ?c                 L   d}| dk7  r%t        |D cg c]  }t        || z         c}      }t        t        d|d   dd      t        d|d   dd      t	        d|d   dd      t	        d|d	   dd
      t	        d|d
   dd	      ft        d| z        dddd      }|S c c}w )N)@           i   r:   r   r   r4   r&   r3   )r   r    r!   r.      r*       3x3 silu)blocksstem_chs	stem_type	stem_pool
downsample	act_layer)tupleintr   r(   r9   )
multiplierchsr    cfgs       r'   _mobilevitv2_cfgrP   C   s    
"CS#6QSZ(67
$qCFaC@$qCFaC@c!fQGc!fQGc!fQG
 R*_%C J 7s   B!   r&   r@      0   r<   r1   P   `   rB   rC   rD   i@  )rE   rF   rG   rH   rI   rJ   num_featuresrA   )r   r    r!   )r   r    r!   r-   r.   r/   x      r?   r=            i  seg      ?)rd_ratio)rE   rF   rG   rH   rI   
attn_layerattn_kwargsrV   g      ?g      ?g      ?g      ?)mobilevit_xxsmobilevit_xsmobilevit_ssemobilevit_smobilevitv2_050mobilevitv2_075mobilevitv2_125mobilevitv2_100mobilevitv2_150mobilevitv2_175mobilevitv2_200c            %            e Zd ZdZdddddddddd	d
dddddej
                  fdedee   dedededee   de	eef   dedee   dededededede
dededef$ fdZd ej                  d!ej                  fd"Z xZS )#MobileVitBlockzS MobileViT block
        Paper: https://arxiv.org/abs/2110.02178?context=cs.LG
    Nr@   r   r:   r   r   r4   r3      r*           Fin_chsout_chskernel_sizestridebottle_ratio
group_sizedilation	mlp_ratior-   r.   r/   	num_heads	attn_dropdrop	no_fusiondrop_path_ratelayerstransformer_norm_layerc                    t         t        |           |xs
 t               }t	        ||      }|xs |}|	xs t        ||z        }	|j                  ||||||d         | _        t        j                  ||	dd      | _
        t        j                  t        |
      D cg c]   }t        |	||d||||j                  |	      " c} | _         ||	      | _        |j                  |	|dd      | _        |rd | _        n|j                  ||z   ||d      | _        t'        |      | _        | j(                  d   | j(                  d   z  | _        y c c}w )	Nr   rr   rs   groupsrv   r   Frr   biasT)rw   rx   qkv_biasry   	proj_drop	drop_pathrJ   
norm_layerrr   rs   )superrl   __init__r   r   r   conv_norm_actconv_kxkr   Conv2dconv_1x1
SequentialrangeTransformerBlockacttransformernorm	conv_projconv_fusionr   r/   
patch_area)selfrp   rq   rr   rs   rt   ru   rv   rw   r-   r.   r/   rx   ry   rz   r{   r|   r}   r~   kwargsr   _	__class__s                         r'   r   zMobileVitBlock.__init__   sg   , 	nd,.$79J/#V)R^L6<Q-R,,F&8A; - @ 		&/quU== ,-+
  ###( **1
+
  +?;	--owTU^_-`#D%33FW4Dg[fop3qD#J///!,tq/AA1+
s   %Exreturnc                 ^   |}| j                  |      }| j                  |      }| j                  \  }}|j                  \  }}}}t	        j
                  ||z        |z  t	        j
                  ||z        |z  }
}	|	|z  |
|z  }}||z  }d}|	|k7  s|
|k7  rt        j                  ||	|
fdd      }d}|j                  ||z  |z  |||      j                  dd      }|j                  |||| j                        j                  dd      j                  || j                  z  |d      }| j                  |      }| j                  |      }|j                         j                  || j                  |d      }|j                  dd      j                  ||z  |z  |||      }|j                  dd      j                  ||||z  ||z        }|rt        j                  |||fdd      }| j                  |      }| j                   (| j!                  t#        j$                  ||fd	            }|S )
NFbilinearsizemodealign_cornersTr   r3   r@   dim)r   r   r/   shapemathceilFinterpolatereshape	transposer   r   r   
contiguousviewr   r   torchcat)r   r   shortcutpatch_hpatch_wBCHWnew_hnew_wnum_patch_hnum_patch_wnum_patchesr   s                  r'   forwardzMobileVitBlock.forward   s#    MM!MM!  ??WW
1ayyW-71w;9ORY9Yu#(G#3Ug5E[!K/A:!auen:UZ[AK IIa!ek)7KISSTUWXYIIaK9CCAqIQQRSVZVeVeRegrtvw QIIaL LLN4??KDKK1%%a!ek&9;QXYKK1%%aK',A;QXCXYaq!f:USANN1'  Ha=a!@AAr)   )__name__
__module____qualname____doc__r   	LayerNormrL   r   floatr   boolr   r   r   r   Tensorr   __classcell__r   s   @r'   rl   rl      s;    &* "%(,(."-1%&!#$&"/1||':B:B c]:B 	:B
 :B  :B !:B CHo:B :B &c]:B  #:B :B :B :B :B  !:B" "#:B$ %:B& %-':Bx( (%,, (r)   rl   c                   b    e Zd ZdZ	 	 	 ddededededdf
 fdZd	ej                  dej                  fd
Z
ej                  j                         dd	ej                  deej                     dej                  fd       Zdd	ej                  deej                     dej                  fdZ xZS )LinearSelfAttentiona  
    This layer applies a self-attention with linear complexity, as described in `https://arxiv.org/abs/2206.02680`
    This layer can be used for self- as well as cross-attention.
    Args:
        embed_dim (int): :math:`C` from an expected input of size :math:`(N, C, H, W)`
        attn_drop (float): Dropout value for context scores. Default: 0.0
        bias (bool): Use bias in learnable layers. Default: True
    Shape:
        - Input: :math:`(N, C, P, N)` where :math:`N` is the batch size, :math:`C` is the input channels,
        :math:`P` is the number of pixels in the patch, and :math:`N` is the number of patches
        - Output: same as the input
    .. note::
        For MobileViTv2, we unfold the feature map [B, C, H, W] into [B, C, P, N] where P is the number of pixels
        in a patch and N is the number of patches. Because channel is the first dimension in this unfolded tensor,
        we use point-wise convolution (instead of a linear layer). This avoids a transpose operation (which may be
        expensive on resource-constrained devices) that may be required to convert the unfolded tensor from
        channel-first to channel-last format in case of a linear layer.
    	embed_dimry   r   r   r   Nc                    t         |           || _        t        j                  |dd|z  z   |d      | _        t        j                  |      | _        t        j                  |||d      | _        t        j                  |      | _	        y )Nr   r3   )in_channelsout_channelsr   rr   )
r   r   r   r   r   qkv_projDropoutry   out_projout_drop)r   r   ry   r   r   r   s        r'   r   zLinearSelfAttention.__init__$  s|     	"		!a)m,	
 I.		!"	
 

9-r)   r   c                    | j                  |      }|j                  d| j                  | j                  gd      \  }}}t        j                  |d      }| j                  |      }||z  j                  dd      }t        j                  |      |j                  |      z  }| j                  |      }| j                  |      }|S )Nr   r   r   Tr   keepdim)r   splitr   r   softmaxry   sumrelu	expand_asr   r   )	r   r   qkvquerykeyvaluecontext_scorescontext_vectorouts	            r'   _forward_self_attnz&LinearSelfAttention._forward_self_attn=  s    mmA
  IIq$..$..&IqIQsE 5b17 .33D3I ffUmn66u==mmC mmC 
r)   x_prevc                 f   |j                   \  }}}}|j                   dd  \  }}||k(  sJ d       t        j                  || j                  j                  d | j
                  dz    | j                  j                  d | j
                  dz          }	|	j                  d| j
                  gd      \  }
}t        j                  || j                  j                  | j
                  dz      | j                  j                  &| j                  j                  | j
                  dz      nd       }t        j                  |
d      }| j                  |      }||z  j                  dd      }t        j                  |      |j                  |      z  }| j                  |      }| j                  |      }|S )	NzJThe number of pixels in a patch for query and key_value should be the samer   )weightr   r   r   Tr   )r   r   conv2dr   r   r   r   r   r   ry   r   r   r   r   r   )r   r   r   
batch_sizein_dimkv_patch_areakv_num_patchesq_patch_areaq_num_patchesqkr   r   r   r   r   r   s                   r'   _forward_cross_attnz'LinearSelfAttention._forward_cross_attnU  s    =>GG9
FM>&'ggbcl#m \)	XW	X)
 XX==''(;!);<##$7T^^a%78
 XXq$..1qX9
s==''(:;;?==;M;M;Y##DNNQ$67_c
 5b17 .33D3I ffUmn66u==mmC mmC 
r)   c                 N    || j                  |      S | j                  ||      S )N)r   )r   r   )r   r   r   s      r'   r   zLinearSelfAttention.forward  s.    >**1--++Af+==r)   )ro   ro   TN)r   r   r   r   rL   r   r   r   r   r   r   jitignorer   r   r   r   r   s   @r'   r   r     s    , .. . 	.
 . 
.2ELL U\\ 0 YY(U\\ (8ELL;Q (]b]i]i ( (T> >x/E >QVQ]Q] >r)   r   c                        e Zd ZdZ	 	 	 	 	 	 ddedededededdf fd	Zdd
ej                  de	ej                     dej                  fdZ
 xZS )LinearTransformerBlockaF  
    This class defines the pre-norm transformer encoder with linear self-attention in `MobileViTv2 paper <>`_
    Args:
        embed_dim (int): :math:`C_{in}` from an expected input of size :math:`(B, C_{in}, P, N)`
        mlp_ratio (float): Inner dimension ratio of the FFN relative to embed_dim
        drop (float): Dropout rate. Default: 0.0
        attn_drop (float): Dropout rate for attention in multi-head attention. Default: 0.0
        drop_path (float): Stochastic depth rate Default: 0.0
        norm_layer (Callable): Normalization layer. Default: layer_norm_2d
    Shape:
        - Input: :math:`(B, C_{in}, P, N)` where :math:`B` is batch size, :math:`C_{in}` is input embedding dim,
            :math:`P` is number of pixels in a patch, and :math:`N` is number of patches,
        - Output: same shape as the input
    Nr   rw   rz   ry   r   r   c                 8   t         |           |xs t        j                  }|xs t        } ||      | _        t        |||      | _        t        |      | _	         ||      | _
        t        |t        ||z        ||      | _        t        |      | _        y )N)r   ry   r   )in_featureshidden_featuresrJ   rz   )r   r   r   SiLUr	   norm1r   attnr   
drop_path1norm2r
   rL   mlp
drop_path2)	r   r   rw   rz   ry   r   rJ   r   r   s	           r'   r   zLinearTransformerBlock.__init__  s     	(	-:
	*
')y\`a	"9-	*
!	I 56	
 #9-r)   r   r   c                 F   |3|| j                  | j                  | j                  |                  z   }n9|}| j                  |      }| j                  ||      }| j                  |      |z   }|| j                  | j	                  | j                  |                  z   }|S r   )r   r   r   r   r   r   )r   r   r   ress       r'   r   zLinearTransformerBlock.forward  s    >DOODIIdjjm$<==A C

1A		!V$A"S(A A 788r)   )r4   ro   ro   ro   NNr   )r   r   r   r   rL   r   r   r   r   r   r   r   r   s   @r'   r   r     s    $ .. . 	.
 . . 
.4 x/E QVQ]Q] r)   r   c                        e Zd ZdZddddddddd	d
d
d
defdedee   dededee   deeef   dedee   dededededede	de
f fdZdej                  dej                  fdZ xZS )MobileVitV2Blockz8
    This class defines the `MobileViTv2 block <>`_
    Nr@   r:   r   rm   r4   r3   rn   ro   rp   rq   rr   rt   ru   rv   rw   r-   r.   r/   ry   rz   r|   r}   r~   c                 x   t         t        |           |xs
 t               }t	        ||      }|xs |}|xs t        ||z        }|j                  |||d||d         | _        t        j                  ||dd      | _
        t        j                  t        |	      D cg c]  }t        ||||||j                  |        c} | _         ||      | _        |j                  ||ddd      | _        t%        |
      | _        | j&                  d   | j&                  d   z  | _        t+               | _        y c c}w )Nr   r   r   Fr   )rw   ry   rz   r   rJ   r   )rr   rs   	apply_act)r   r  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r/   r   r   coreml_exportable)r   rp   rq   rr   rt   ru   rv   rw   r-   r.   r/   ry   rz   r|   r}   r~   r   r   r   r   s                      r'   r   zMobileVitV2Block.__init__  s@   & 	.0$79J/#V)R^L6<Q-R,,FVhqk - ; 		&/quU== ,-+
  ###( **1+
  +?;	--owTU^_kp-q#J///!,tq/AA!.%+
s   #D7r   r   c                    |j                   \  }}}}| j                  \  }}t        j                  ||z        |z  t        j                  ||z        |z  }	}||z  |	|z  }}
|
|z  }||k7  s|	|k7  rt	        j
                  |||	fdd      }| j                  |      }| j                  |      }|j                   d   }| j                  rt	        j                  |||f||f      }n*|j                  |||
|||      j                  ddddd	d
      }|j                  ||d|      }| j                  |      }| j                  |      }| j                  r2|j                  |||z  |z  |
|      }t	        j                  ||      }nD|j                  |||||
|      j                  ddd
d	dd      }|j                  |||
|z  ||z        }| j                  |      }|S )Nr   Tr   r   r   r   r@      r3   r*   r   )upscale_factor)r   r/   r   r   r   r   r   r   r  unfoldr   permuter   r   pixel_shuffler   )r   r   r   r   r   r   r   r   r   r   r   r   r   s                r'   r   zMobileVitV2Block.forward  s   WW
1a??yyW-71w;9ORY9Yu#(G#3Ug5E[!K/A:!auen:UYZA MM!MM! GGAJ!!'(:GWCUVA		!QWk7KSSTUWXZ[]^`acdeAIIaB, QIIaL !!		!Q[72KMA':A		!Q+{KSSTUWXZ[]^`acdeA		!Qg 5{W7LMANN1r)   )r   r   r   r   r	   rL   r   r   r   r   r   r   r   r   r   r   r   s   @r'   r  r    s     "&!$%$*)-!" "+5!0101 #01 	01
 01 SM01 S/01 01 "#01 01 01 01 01 01 01  !)!01d# #%,, #r)   r  r,   r7   c                 d    t        t        | |f|s	t        |    nt        |   t        d      d|S NT)flatten_sequential)	model_cfgfeature_cfgr   r   
model_cfgsr%   variantcfg_variant
pretrainedr   s       r'   _create_mobilevitr  #  >    *-8*W%j>UD1 	 r)   c                 d    t        t        | |f|s	t        |    nt        |   t        d      d|S r  r  r  s       r'   _create_mobilevit2r  +  r  r)   c                 "    | ddddddddd	d
d|S )Ni  )r@   r>   r>   )rn   rn   g?bicubic)ro   ro   ro   )r:   r:   r:   z	stem.convzhead.fcF)urlnum_classes
input_size	pool_sizecrop_pctinterpolationmeanstd
first_conv
classifierfixed_input_size )r  r   s     r'   _cfgr*  3  s2    4}SY)\!!  r)   zmobilevit_xxs.cvnets_in1kztimm/)	hf_hub_idzmobilevit_xs.cvnets_in1kzmobilevit_s.cvnets_in1kzmobilevitv2_050.cvnets_in1kg"~j?)r+  r"  zmobilevitv2_075.cvnets_in1kzmobilevitv2_100.cvnets_in1kzmobilevitv2_125.cvnets_in1kzmobilevitv2_150.cvnets_in1kzmobilevitv2_175.cvnets_in1kzmobilevitv2_200.cvnets_in1kz$mobilevitv2_150.cvnets_in22k_ft_in1kz$mobilevitv2_175.cvnets_in22k_ft_in1kz$mobilevitv2_200.cvnets_in22k_ft_in1kz(mobilevitv2_150.cvnets_in22k_ft_in1k_384)r@   r?   r?   )   r,  )r+  r   r!  r"  z(mobilevitv2_175.cvnets_in22k_ft_in1k_384z(mobilevitv2_200.cvnets_in22k_ft_in1k_384r   c                     t        dd| i|S )Nr  )r`   r  r  r   s     r'   r`   r`   o  s    NNvNNr)   c                     t        dd| i|S )Nr  )ra   r.  r/  s     r'   ra   ra   t  s    M
MfMMr)   c                     t        dd| i|S )Nr  )rb   r.  r/  s     r'   rb   rb   y  s    LzLVLLr)   c                     t        dd| i|S )Nr  )rd   r.  r/  s     r'   rd   rd   ~      P:PPPr)   c                     t        dd| i|S )Nr  )re   r.  r/  s     r'   re   re     r3  r)   c                     t        dd| i|S )Nr  )rg   r.  r/  s     r'   rg   rg     r3  r)   c                     t        dd| i|S )Nr  )rf   r.  r/  s     r'   rf   rf     r3  r)   c                     t        dd| i|S )Nr  )rh   r.  r/  s     r'   rh   rh     r3  r)   c                     t        dd| i|S )Nr  )ri   r.  r/  s     r'   ri   ri     r3  r)   c                     t        dd| i|S )Nr  )rj   r.  r/  s     r'   rj   rj     r3  r)   )mobilevitv2_150_in22ft1kmobilevitv2_175_in22ft1kmobilevitv2_200_in22ft1kmobilevitv2_150_384_in22ft1kmobilevitv2_175_384_in22ft1kmobilevitv2_200_384_in22ft1k)      @)r*   r@  )r3   r4   r5   )r:   )NF)rC   )F)?r   r   typingr   r   r   r   torch.nn.functionalr   
functionalr   timm.layersr   r   r	   r
   r   r   _builderr   _features_fxr   	_registryr   r   r   byobnetr   r   r   r   r   r   vision_transformerr   r   __all__r(   r2   r9   rP   r%   r  Modulerl   r   r   r  r  r  r*  default_cfgsr`   ra   rb   rd   re   rg   rf   rh   ri   rj   r   r)  r)   r'   <module>rM     s  
  , ,     _ _ * 1 Y Y [ [ 9
<
* $qB!<$qB!<qB!RSTabgjkqB!RSTabgjkqB!RSTabgjk
   $qB!4$qB!4qB!RSTabcqB!STUbcdqB!STUbcd
   $qB!4$qB!4qB!STUbcdqC1cUVcdeqC1cUVcde
   $qB!4$qB!4qB!STUbcdqC1cUVcdeqC1cUVcde
 #&" %S)$S)$T*$S)$S)$T*$S)QI
X hRYY h hVt>")) t>n7RYY 7t Zryy Z Zz {N + |- . % .&!8.&w 7.& tg6.&
 "4$.& "4$.& "4$.& "4$.&" "4$#.&( "4$).&. "4$/.&6 +D-7.&< +D-=.&B +D-C.&J / Hs1DK.&P / Hs1DQ.&V / Hs1DW.& .b O O O N N N Mw M M Q7 Q Q Q7 Q Q Q7 Q Q Q7 Q Q Q7 Q Q Q7 Q Q Q7 Q Q H F F F$N$N$N' r)   