""" Vision Transformer (ViT) in PyTorch

A PyTorch implementation of Vision Transformers as described in:

'Exploring Plain Vision Transformer Backbones for Object Detection'
    - https://arxiv.org/abs/2203.16527

'Segment Anything Model (SAM)'
    - https://github.com/facebookresearch/segment-anything/

    N)partial)CallableListOptionalTupleUnion)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STDIMAGENET_INCEPTION_MEANIMAGENET_INCEPTION_STD)
PatchEmbedMlpDropPathPatchDropoutLayerNorm2dClassifierHeadNormMlpClassifierHeadFormatresample_abs_pos_embed_nhwcRotaryEmbeddingCatapply_rot_embed_cat	to_2tupleuse_fused_attn)Final   )build_model_with_cfg)feature_take_indices)register_notrace_function)
checkpointcheckpoint_seq)generate_default_cfgsregister_modelVisionTransformerSAMq_sizek_sizerel_posreturnc                 @   t        dt        | |      z  dz
        }|j                  d   |k7  rjt        j                  |j                  d|j                  d   d      j                  ddd      |d      }|j                  d|      j                  dd      }n|}t        j                  |       dddf   t        || z  d      z  }t        j                  |      dddf   t        | |z  d      z  }||z
  |dz
  t        | |z  d      z  z   }||j                            S )	a\  
    Get relative positional embeddings according to the relative positions of
        query and key sizes.
    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Extracted positional embeddings according to relative positions.
       r   r   linear)sizemodeN      ?)
intmaxshapeFinterpolatereshapepermutetorcharangelong)r$   r%   r&   max_rel_distrel_pos_resizedq_coordsk_coordsrelative_coordss           ^/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/models/vision_transformer_sam.pyget_rel_posr?   %   s    q3vv..23L}}Q<'--OOAw}}Q/4<<Q1E

 *11"lCKKAqQ! ||F#AtG,s6F?C/HHH||F#D!G,s6F?C/HHH(*vzS&RU=V.VVO?//122    q	rel_pos_h	rel_pos_wc                 j   |\  }}|\  }}t        |||      }	t        |||      }
| j                  \  }}}| j                  ||||      }t        j                  d||	      }t        j                  d||
      }|dddddddddf   |dddddddddf   z   }|j                  d||z  ||z        S )a  
    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py
    Args:
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).

    Returns:
        bias (Tensor): attention bias to add to attention map
    zbhwc,hkc->bhwkzbhwc,wkc->bhwkNr*   )r?   r1   r4   r6   einsum)rA   rB   rC   r$   r%   q_hq_wk_hk_wRhRwB_dimr_qrel_hrel_w	attn_biass                     r>   get_decomposed_rel_pos_biasrS   H   s    ( HCHC	S#y	)B	S#y	)BIAq#
))AsC
%CLL)33ELL)33EaAq$&'%1aq0@*AAIRsC#I66r@   c            	            e Zd ZU ee   ed<   dddddej                  dddf	dedee	e
e
f      d	eej                     f fd
Zd Z xZS )	Attention
fused_attn   TF        Nuse_rel_pos
input_sizeropec                    t         |           ||z  dk(  sJ d       || _        ||z  | _        | j                  dz  | _        t               | _        t        j                  ||dz  |      | _	        |r || j                        nt        j                         | _        |r || j                        nt        j                         | _        t        j                  |      | _        t        j                  ||      | _        t        j                  |      | _        || _        | j"                  r|
J |	J d       t        j$                  t'        j(                  d|	d   z  dz
  | j                              | _        t        j$                  t'        j(                  d|	d   z  dz
  | j                              | _        |
| _        y )	Nr   z$dim should be divisible by num_headsg         )biaszBInput size must be provided if using relative positional encoding.r)   r   )super__init__	num_headshead_dimscaler   rV   nnLinearqkvIdentityq_normk_normDropout	attn_dropproj	proj_droprY   	Parameterr6   zerosrB   rC   r[   )selfrN   ra   qkv_biasqk_normrk   rm   
norm_layerrY   rZ   r[   	__class__s              r>   r`   zAttention.__init__m   ss    	Y!#K%KK#"y(]]d*
(*99S#'93:j/3:j/I.IIc3'	I.&<<&TST&  \\%++JqM!A%t}}+6 7DN\\%++JqM!A%t}}+6 7DN	r@   c                    |j                   \  }}}}||z  }|j                  ||d      }| j                  |      j                  ||d| j                  d      j                  ddddd      }|j                  d|| j                  z  |d      j                  d      \  }}	}
| j                  |      | j                  |	      }	}| j                  r(t        || j                  | j                  ||f||f      }n^d }| j                  P| j                  j                         }t        ||      j!                  |
      }t        |	|      j!                  |
      }	| j"                  rQt$        j&                  j(                  j+                  ||	|
|| j,                  r| j.                  j0                  nd      }nS|| j2                  z  }||	j5                  d	d      z  }|||z   }|j7                  d
      }| j/                  |      }||
z  }|j                  || j                  |d      j5                  dd      j                  ||d      }| j9                  |      }| j;                  |      }|j                  |||d      }|S )Nr*   r]   r)   r   r      rX   )	attn_mask	dropout_p)rN   )r1   r4   rf   viewra   r5   unbindrh   ri   rY   rS   rB   rC   r[   	get_embedr   type_asrV   r6   rd   
functionalscaled_dot_product_attentiontrainingrk   prc   	transposesoftmaxrl   rm   )rp   xrL   HWrM   Nrf   rA   kvrR   r[   attns                 r>   forwardzAttention.forward   s6   WW
1aEIIaBhhqkq!Q;CCAq!QPQR++aT^^!3Q;BB1E1a{{1~t{{1~13At~~t~~XY[\W]`acd_efIIyy$yy**,'4088;'4088;??##@@1a#.2mm$..** A A DJJAq{{2r**D$i'<<B<'D>>$'DqAFF1dnna,66q!<DDQ2NIIaLNN1FF1aBr@   )__name__
__module____qualname__r   bool__annotations__rd   	LayerNormr   r   r/   Moduler`   r   __classcell__rt   s   @r>   rU   rU   j   sm    d
 || %48(,% % !sCx1% 299%%N&r@   rU   c                   &     e Zd Zd fd	Zd Z xZS )


class LayerScale(nn.Module):
    def __init__(self, dim, init_values=1e-5, inplace=False):
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x):
        return x.mul_(self.gamma) if self.inplace else x * self.gamma


class Block(nn.Module):

    def __init__(
            self,
            dim,
            num_heads,
            mlp_ratio=4.,
            qkv_bias=True,
            qk_norm=False,
            proj_drop=0.,
            attn_drop=0.,
            init_values=None,
            drop_path=0.,
            act_layer=nn.GELU,
            norm_layer=nn.LayerNorm,
            mlp_layer=Mlp,
            use_rel_pos=False,
            window_size=0,
            input_size=None,
            rope=None,
    ):
        super().__init__()
        self.window_size = window_size
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
            norm_layer=norm_layer,
            use_rel_pos=use_rel_pos,
            input_size=input_size if window_size == 0 else (window_size, window_size),
            rope=rope,
        )
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

        self.norm2 = norm_layer(dim)
        self.mlp = mlp_layer(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=proj_drop,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

    def forward(self, x):
        B, H, W, _ = x.shape

        shortcut = x
        x = self.norm1(x)
        # Window partition
        pad_hw: Optional[Tuple[int, int]] = None
        if self.window_size > 0:
            x, pad_hw = window_partition(x, self.window_size)

        x = self.drop_path1(self.ls1(self.attn(x)))

        # Reverse window partition
        if self.window_size > 0:
            x = window_unpartition(x, self.window_size, (H, W), pad_hw)

        x = shortcut + x

        x = x.reshape(B, H * W, -1)  # MLP is a faster matmul in non-spatial (B, N, C) shape
        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
        x = x.reshape(B, H, W, -1)

        return x


def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
    """
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.

    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    """
    B, H, W, C = x.shape

    pad_h = (window_size - H % window_size) % window_size
    pad_w = (window_size - W % window_size) % window_size
    x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
    Hp, Wp = H + pad_h, W + pad_w

    x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows, (Hp, Wp)
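

# Minimal sketch of the partition/unpartition round-trip used by the windowed
# blocks: a 10x10 grid with window 7 pads to 14x14, giving 2x2 windows per image,
# and unpartition crops back to the original size. Sizes are illustrative
# assumptions; this helper is not part of the timm API.
def _demo_window_roundtrip():
    x = torch.randn(2, 10, 10, 32)
    windows, pad_hw = window_partition(x, window_size=7)
    assert pad_hw == (14, 14)
    assert windows.shape == (2 * 2 * 2, 7, 7, 32)  # B * num_windows window tiles
    y = window_unpartition(windows, window_size=7, hw=(10, 10), pad_hw=pad_hw)
    assert torch.equal(x, y)  # pad + crop is lossless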


def window_unpartition(
        windows: torch.Tensor,
        window_size: int,
        hw: Tuple[int, int],
        pad_hw: Optional[Tuple[int, int]] = None,
) -> torch.Tensor:
    """
    Window unpartition into original sequences and removing padding.
    Args:
        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        hw (Tuple): original height and width (H, W) before padding.
        pad_hw (Tuple): padded height and width (Hp, Wp); defaults to hw if None.

    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    Hp, Wp = pad_hw if pad_hw is not None else hw
    H, W = hw
    B = windows.shape[0] // (Hp * Wp // window_size // window_size)
    x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
    x = x[:, :H, :W, :].contiguous()
    return x


class VisionTransformerSAM(nn.Module):
    """ Vision Transformer for Segment-Anything Model (SAM)

    A PyTorch impl of: `Exploring Plain Vision Transformer Backbones for Object Detection`
        - https://arxiv.org/abs/2203.16527
    and `Segment Anything Model (SAM)`
        - https://github.com/facebookresearch/segment-anything/
    """

    def __init__(
            self,
            img_size: int = 1024,
            patch_size: int = 16,
            in_chans: int = 3,
            num_classes: int = 768,
            embed_dim: int = 768,
            depth: int = 12,
            num_heads: int = 12,
            mlp_ratio: float = 4.,
            qkv_bias: bool = True,
            qk_norm: bool = False,
            init_values: Optional[float] = None,
            pre_norm: bool = False,
            drop_rate: float = 0.,
            pos_drop_rate: float = 0.,
            patch_drop_rate: float = 0.,
            proj_drop_rate: float = 0.,
            attn_drop_rate: float = 0.,
            drop_path_rate: float = 0.,
            weight_init: str = '',
            embed_layer: Callable = partial(PatchEmbed, output_fmt=Format.NHWC, strict_img_size=False),
            norm_layer: Optional[Callable] = nn.LayerNorm,
            act_layer: Optional[Callable] = nn.GELU,
            block_fn: Callable = Block,
            mlp_layer: Callable = Mlp,
            use_abs_pos: bool = True,
            use_rel_pos: bool = False,
            use_rope: bool = False,
            window_size: int = 14,
            global_attn_indexes: Tuple[int, ...] = (),
            neck_chans: int = 256,
            global_pool: str = 'avg',
            head_hidden_size: Optional[int] = None,
            ref_feat_shape: Optional[Tuple[Tuple[int, int], Tuple[int, int]]] = None,
    ):
        """
        Args:
            img_size: Input image size.
            patch_size: Patch size.
            in_chans: Number of image input channels.
            num_classes: Number of classes for classification head.
            global_pool: Type of global pooling for final sequence (default: 'avg').
            embed_dim: Transformer embedding dimension.
            depth: Depth of transformer.
            num_heads: Number of attention heads.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: Enable bias for qkv projections if True.
            init_values: Layer-scale init values (layer-scale enabled if not None).
            drop_rate: Head dropout rate.
            pos_drop_rate: Position embedding dropout rate.
            attn_drop_rate: Attention dropout rate.
            drop_path_rate: Stochastic depth rate.
            weight_init: Weight initialization scheme.
            embed_layer: Patch embedding layer.
            norm_layer: Normalization layer.
            act_layer: MLP activation layer.
            block_fn: Transformer block layer.
            use_abs_pos: If True, use absolute positional embeddings.
            use_rel_pos: If True, add relative positional embeddings to the attention map.
            use_rope: If True, add rotary position embeddings to q/k in attention block.
            window_size: Window size for window attention blocks. If 0, window attention is not used.
            global_attn_indexes: Indexes for blocks using global attention. Used when window_size > 0.
            neck_chans: Number of channels in the convolutional neck. If 0 or None, the neck is skipped.
            head_hidden_size: If set, use NormMlpClassifierHead for the classifier.
            ref_feat_shape: Tuple of reference feature shapes for ROPE, (global, local).
        """
        super().__init__()
        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        act_layer = act_layer or nn.GELU

        self.num_classes = num_classes
        self.global_pool = global_pool
        self.num_features = self.head_hidden_size = self.embed_dim = embed_dim  # for consistency with other models
        self.grad_checkpointing = False

        self.patch_embed = embed_layer(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            bias=not pre_norm,  # disable bias if pre-norm is used
        )
        grid_size = self.patch_embed.grid_size
        r = self.patch_embed.feat_ratio() if hasattr(self.patch_embed, 'feat_ratio') else patch_size

        if use_abs_pos:
            # Initialize absolute positional embedding with pretrain image size.
            self.pos_embed = nn.Parameter(torch.zeros(1, grid_size[0], grid_size[1], embed_dim))
        else:
            self.pos_embed = None
        self.pos_drop = nn.Dropout(p=pos_drop_rate)
        if patch_drop_rate > 0:
            self.patch_drop = PatchDropout(
                patch_drop_rate,
                num_prefix_tokens=0,
            )
        else:
            self.patch_drop = nn.Identity()
        self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity()

        if use_rope:
            assert not use_rel_pos, "ROPE and relative pos embeddings should not be enabled at same time"
            if ref_feat_shape is not None:
                assert len(ref_feat_shape) == 2
                ref_feat_shape_global = to_2tuple(ref_feat_shape[0])
                ref_feat_shape_window = to_2tuple(ref_feat_shape[1])
            else:
                ref_feat_shape_global = ref_feat_shape_window = None
            self.rope_global = RotaryEmbeddingCat(
                embed_dim // num_heads,
                in_pixels=False,
                feat_shape=grid_size,
                ref_feat_shape=ref_feat_shape_global,
            )
            self.rope_window = RotaryEmbeddingCat(
                embed_dim // num_heads,
                in_pixels=False,
                feat_shape=to_2tuple(window_size),
                ref_feat_shape=ref_feat_shape_window,
            )
        else:
            self.rope_global = None
            self.rope_window = None

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
        self.blocks = nn.Sequential(*[
            block_fn(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_norm=qk_norm,
                init_values=init_values,
                proj_drop=proj_drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                act_layer=act_layer,
                mlp_layer=mlp_layer,
                use_rel_pos=use_rel_pos,
                window_size=window_size if i not in global_attn_indexes else 0,
                input_size=grid_size,
                rope=self.rope_window if i not in global_attn_indexes else self.rope_global,
            )
            for i in range(depth)])
        self.feature_info = [
            dict(module=f'blocks.{i}', num_chs=embed_dim, reduction=r) for i in range(depth)]

        if neck_chans:
            self.neck = nn.Sequential(
                nn.Conv2d(
                    embed_dim,
                    neck_chans,
                    kernel_size=1,
                    bias=False,
                ),
                LayerNorm2d(neck_chans),
                nn.Conv2d(
                    neck_chans,
                    neck_chans,
                    kernel_size=3,
                    padding=1,
                    bias=False,
                ),
                LayerNorm2d(neck_chans),
            )
            self.num_features = neck_chans
        else:
            if head_hidden_size:
                self.neck = nn.Identity()
            else:
                # should have a final norm with standard ClassifierHead
                self.neck = LayerNorm2d(embed_dim)
            neck_chans = embed_dim

        if head_hidden_size:
            self.head = NormMlpClassifierHead(
                neck_chans,
                num_classes,
                hidden_size=head_hidden_size,
                pool_type=global_pool,
                drop_rate=drop_rate,
            )
        else:
            self.head = ClassifierHead(
                neck_chans,
                num_classes,
                pool_type=global_pool,
                drop_rate=drop_rate,
            )

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed', 'dist_token'}

    @torch.jit.ignore
    def group_matcher(self, coarse=False):
        return dict(
            stem=r'^pos_embed|patch_embed',  # stem and embed
            blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999,))],
        )

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        self.grad_checkpointing = enable

    @torch.jit.ignore
    def get_classifier(self) -> nn.Module:
        return self.head

    def reset_classifier(self, num_classes: int, global_pool: Optional[str] = None):
        self.num_classes = num_classes
        self.head.reset(num_classes, global_pool)

    def forward_intermediates(
            self,
            x: torch.Tensor,
            indices: Optional[Union[int, List[int]]] = None,
            norm: bool = False,
            stop_early: bool = False,
            output_fmt: str = 'NCHW',
            intermediates_only: bool = False,
    ) -> Union[List[torch.Tensor], Tuple[torch.Tensor, List[torch.Tensor]]]:
        """ Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to all intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:
            List of intermediate features, or tuple of (final features, intermediates).
        """
        assert output_fmt == 'NCHW', 'Output shape for ViT-SAM must be NCHW.'
        intermediates = []
        take_indices, max_index = feature_take_indices(len(self.blocks), indices)

        # forward pass
        x = self.patch_embed(x)
        if self.pos_embed is not None:
            # dynamically resize abs pos embedding if needed
            x = x + resample_abs_pos_embed_nhwc(self.pos_embed, x.shape[1:3])
        x = self.pos_drop(x)
        x = self.patch_drop(x)
        x = self.norm_pre(x)
        if torch.jit.is_scripting() or not stop_early:  # can't slice blocks in torchscript
            blocks = self.blocks
        else:
            blocks = self.blocks[:max_index + 1]
        for i, blk in enumerate(blocks):
            if self.grad_checkpointing and not torch.jit.is_scripting():
                x = checkpoint(blk, x)
            else:
                x = blk(x)
            if i in take_indices:
                # make output BCHW
                if norm:
                    # norm is intertwined with neck convs so apply both, changes the dim
                    intermediates.append(self.neck(x.permute(0, 3, 1, 2)))
                else:
                    intermediates.append(x.permute(0, 3, 1, 2))

        if intermediates_only:
            return intermediates

        x = self.neck(x.permute(0, 3, 1, 2))

        return x, intermediates

    def prune_intermediate_layers(
            self,
            indices: Optional[Union[int, List[int]]] = None,
            prune_norm: bool = False,
            prune_head: bool = True,
    ):
        """ Prune layers not required for specified intermediates.
        """
        take_indices, max_index = feature_take_indices(len(self.blocks), indices)
        self.blocks = self.blocks[:max_index + 1]  # truncate blocks
        if prune_norm:
            # neck is being treated as equivalent to final norm here
            self.neck = nn.Identity()
        if prune_head:
            self.reset_classifier(0, '')
        return take_indices

    def forward_features(self, x):
        x = self.patch_embed(x)
        if self.pos_embed is not None:
            # dynamically resize abs pos embedding if needed
            x = x + resample_abs_pos_embed_nhwc(self.pos_embed, x.shape[1:3])
        x = self.pos_drop(x)
        x = self.patch_drop(x)
        x = self.norm_pre(x)
        if self.grad_checkpointing and not torch.jit.is_scripting():
            x = checkpoint_seq(self.blocks, x)
        else:
            x = self.blocks(x)
        x = self.neck(x.permute(0, 3, 1, 2))
        return x

    def forward_head(self, x, pre_logits: bool = False):
        return self.head(x, pre_logits=True) if pre_logits else self.head(x)

    def forward(self, x):
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x


def checkpoint_filter_fn(
        state_dict,
        model,
):
    """ Remap SAM checkpoints -> timm """
    sam_checkpoint = 'image_encoder.patch_embed.proj.weight' in state_dict
    out_dict = {}
    for k, v in state_dict.items():
        if k.startswith('image_encoder.'):
            k = k[14:]  # strip 'image_encoder.' prefix
            k = k.replace('mlp.lin', 'mlp.fc')
        else:
            if sam_checkpoint:
                continue
        out_dict[k] = v
    return out_dict


def _cfg(url='', **kwargs):
    return {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 1024, 1024), 'pool_size': None,
        'crop_pct': 0.9, 'interpolation': 'bicubic', 'fixed_input_size': True,
        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
        'first_conv': 'patch_embed.proj', 'classifier': 'head.fc',
        **kwargs
    }


default_cfgs = generate_default_cfgs({

    # Segment-Anything Model (SAM) pretrained - https://github.com/facebookresearch/segment-anything
    # (no classifier head, for fine-tune / feature extraction only)
    'samvit_base_patch16.sa1b': _cfg(
        url='https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth',
        hf_hub_id='timm/',
        license='apache-2.0',
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0,
        input_size=(3, 1024, 1024), crop_pct=1.0),
    'samvit_large_patch16.sa1b': _cfg(
        url='https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth',
        hf_hub_id='timm/',
        license='apache-2.0',
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0,
        input_size=(3, 1024, 1024), crop_pct=1.0),
    'samvit_huge_patch16.sa1b': _cfg(
        url='https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth',
        hf_hub_id='timm/',
        license='apache-2.0',
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0,
        input_size=(3, 1024, 1024), crop_pct=1.0),

    'samvit_base_patch16_224': _cfg(
        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
        num_classes=1000,
        input_size=(3, 224, 224), crop_pct=0.9),
})


def _create_vision_transformer(variant, pretrained=False, **kwargs):
    out_indices = kwargs.pop('out_indices', 3)
    return build_model_with_cfg(
        VisionTransformerSAM,
        variant,
        pretrained,
        pretrained_filter_fn=checkpoint_filter_fn,
        feature_cfg=dict(out_indices=out_indices, feature_cls='getter'),
        **kwargs,
    )


@register_model
def samvit_base_patch16(pretrained=False, **kwargs) -> VisionTransformerSAM:
    """ ViT-B/16 for Segment-Anything
    """
    model_args = dict(
        patch_size=16, embed_dim=768, depth=12, num_heads=12, global_attn_indexes=[2, 5, 8, 11],
        window_size=14, use_rel_pos=True, img_size=1024,
    )
    model = _create_vision_transformer(
        'samvit_base_patch16', pretrained=pretrained, **dict(model_args, **kwargs))
    return model


@register_model
def samvit_large_patch16(pretrained=False, **kwargs) -> VisionTransformerSAM:
    """ ViT-L/16 for Segment-Anything
    """
    model_args = dict(
        patch_size=16, embed_dim=1024, depth=24, num_heads=16, global_attn_indexes=[5, 11, 17, 23],
        window_size=14, use_rel_pos=True, img_size=1024,
    )
    model = _create_vision_transformer(
        'samvit_large_patch16', pretrained=pretrained, **dict(model_args, **kwargs))
    return model


@register_model
def samvit_huge_patch16(pretrained=False, **kwargs) -> VisionTransformerSAM:
    """ ViT-H/16 for Segment-Anything
    """
    model_args = dict(
        patch_size=16, embed_dim=1280, depth=32, num_heads=16, global_attn_indexes=[7, 15, 23, 31],
        window_size=14, use_rel_pos=True, img_size=1024,
    )
    model = _create_vision_transformer(
        'samvit_huge_patch16', pretrained=pretrained, **dict(model_args, **kwargs))
    return model


@register_model
def samvit_base_patch16_224(pretrained=False, **kwargs) -> VisionTransformerSAM:
    """ ViT-B/16 based on samvit arch
    """
    model_args = dict(
        patch_size=16, embed_dim=768, depth=12, num_heads=12, global_attn_indexes=[2, 5, 8, 11],
        window_size=14, use_rel_pos=True, use_abs_pos=False, img_size=224, neck_chans=None,
    )
    model = _create_vision_transformer(
        'samvit_base_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
    return model
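

if __name__ == '__main__':
    # Smoke-test sketch: build the base SAM ViT without pretrained weights and
    # check the neck output. The 224x224 input relies on the dynamic pos-embed
    # resampling above; the expected shape assumes patch_size=16 and neck_chans=256.
    _model = samvit_base_patch16()
    _out = _model.forward_features(torch.randn(1, 3, 224, 224))
    print(_out.shape)  # expected: torch.Size([1, 256, 14, 14])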