
    kh12                         d dl mZmZmZmZ d dlZd dlmZ d dlmZ	 ddl
mZ ddlmZ ddlmZ dd	lmZ  G d
 dej"                        Z G d dej"                        Z G d dej"                        Zy)    )ListOptionalTypeUnionN)nn)
functional   )use_fused_attn)create_conv2d)	to_2tuple)create_pool2dc                        e Zd ZdZ	 	 	 	 	 	 ddedee   dededededef fd	Zd
 Zddee	j                     fdZ xZS )MultiQueryAttentionV2a  Multi Query Attention.

    Fast Transformer Decoding: One Write-Head is All You Need
    https://arxiv.org/pdf/1911.02150.pdf

    This is an acceletor optimized version - removing multiple unnecessary
    tensor transpose by re-arranging indices according to the following rules: 1)
    contracted indices are at the end, 2) other indices have the same order in the
    input and output tensores.

    Compared to V1, this gives 3x speed up.
    dimdim_out	num_headskey_dim	value_dim	attn_drop	proj_dropc                    t         |           |xs |}|| _        || _        || _        |dz  | _        t        j                  t        j                  | j                  | j                  |g            | _
        t        j                  t        j                  || j                  g            | _        t        j                  t        j                  || j                  g            | _        t        j                  |      | _        t        j                  t        j                  || j                  | j                  g            | _        t        j                  |      | _        y)zInitializer.      N)super__init__r   r   r   scaler   	Parametertorchrandn
query_projkey_proj
value_projDropoutr   out_projr   )	selfr   r   r   r   r   r   r   	__class__s	           S/var/www/teggl/fontify/venv/lib/python3.12/site-packages/timm/layers/attention2d.pyr   zMultiQueryAttentionV2.__init__   s     	.S""_
,,u{{DNNDLLRU3V'WXU[[#t||1D%EF,,u{{C3H'IJI.U[['4>>4>>1Z%[\I.    c                 l    |j                   }|j                  |d   |d   d      j                  dd      S )zBReshapes a tensor to three dimensions, keeping the first and last.r   r	      )shapereshape	transposer$   tss      r&   _reshape_inputz$MultiQueryAttentionV2._reshape_input4   s5    GG yy1qtR(221a88r'   mc                    |j                   \  }}}}||n|}| j                  |      }| j                  |      }t        j                  d|| j                        }	t        j                  d|| j
                        }
t        j                  d|	|
      | j                  z  }|j                  d      }| j                  |      }t        j                  d|| j                        }t        j                  d||      }t        j                  d|| j                        }| j                  |      }|j                  |d||      S )	Run layer computation.zbnd,hkd->bnhkzbmd,dk->bmkzbnhk,bmk->bnhmr)   r   zbmd,dv->bmvzbnhm,bmv->bnhvzbnhv,dhv->bdn)r+   r1   r   einsumr   r    r   softmaxr   r!   r#   r   r,   )r$   xr2   b_hw
reshaped_x
reshaped_mqkattnvoresults                  r&   forwardzMultiQueryAttentionV2.forward<   s   WW
1aAA((+
((+
LL*dooFLL
DMMB||,a3djj@|||#~~d#LL
DOODLL)43oq$--@'~~aQ**r'   )N   @   rG           rH   N)__name__
__module____qualname____doc__intr   floatr   r1   r   TensorrE   __classcell__r%   s   @r&   r   r      s      &*!!// c]/ 	/
 / / / /29+HU\\2 +r'   r   c                       e Zd ZU dZej
                  j                  e   ed<   ddddddddddde	j                  d	fd
edee   dedee   dee   dededededeeeee   f   dededee	j$                     def fdZd Zdej*                  fdZdej*                  dedefdZdej*                  dededefdZd"d eej*                     fd!Z xZS )#MultiQueryAttention2da  Multi Query Attention with spatial downsampling.

     3 parameters are introduced for the spatial downsampling:
     1. kv_stride: downsampling factor on Key and Values only.
     2. query_strides: horizontal & vertical strides on Query only.

    This is an optimized version.
    1. Projections in Attention is explicit written out as 1x1 Conv2D.
    2. Additional reshapes are introduced to bring a up to 3x speed up.
    
fused_attnNrF   r	       rH   Fr   r   r   r   r   query_strides	kv_stridedw_kernel_sizedilationpaddingr   r   
norm_layeruse_biasc                    t         |           |xs |}|| _        |xs ||z  | _        |xs ||z  | _        t        |      | _        || _        t        | j                  D cg c]  }|dkD  	 c}      | _	        | j                  dz  | _
        t               | _        || _        t        j                         | _        | j                  r|
dk(  r3| j                   j#                  dt%        d| j                  d             n0| j                   j#                  dt        j&                  |             | j                   j#                  d ||             | j                   j#                  d	t)        || j                  | j                  z  d|
             t        j                         | _        |dkD  rN| j*                  j#                  dt)        |||||	|
d             | j*                  j#                  d ||             | j*                  j#                  d	t)        || j                  d|
|             t        j                         | _        |dkD  rN| j,                  j#                  dt)        |||||	|
d             | j,                  j#                  d ||             | j,                  j#                  d	t)        || j                  d|
             t        j.                  |      | _        t        j                         | _        | j                  r<| j2                  j#                  dt        j4                  | j                  dd             | j2                  j#                  d	t)        | j                  | j                  z  |d|
             | j2                  j#                  dt        j.                  |             d| _        yc c}w )a{  Initializer.

        Args:
          num_heads: Number of attention heads.
          key_dim: Size of the attention key dimension.
          value_dim: Size of the attention value dimension.
          query_strides: Vertical stride size for query only.
          kv_stride: Key and value stride size.
          dw_kernel_size: Spatial dimension of the depthwise kernel.
        r	   r   same	down_poolavg)kernel_sizer\   )rc   normproj)rc   bias	down_convT)rc   strider[   r\   	depthwise)rc   r\   rf   upsamplebilinearF)scale_factormodealign_cornersdropN)r   r   r   r   r   r   rX   rY   anyhas_query_stridesr   r
   rU   ro   r   
Sequentialquery
add_moduler   	AvgPool2dr   keyvaluer"   r   outputUpsampler6   )r$   r   r   r   r   r   rX   rY   rZ   r[   r\   r   r   r]   r^   r0   r%   s                   r&   r   zMultiQueryAttention2d.__init___   s!   6 	.S"2#"2"6cY&6&}5"!$T5G5G%Ha!e%H!I\\T)
(*	]]_
!!& 

%%k=$($6$6 &4  

%%k2<<M3Z[JJ!!&*S/:

fmNNT\\)	'
 	 ==?q=HH]* !.  HH
38FMLL%
 	 ]]_
q=JJ!!+}* !0  JJ!!&*S/:

fmNN	'
 	 I.mmo!!KK"":r{{HZHZak  |A  0B  Cv}NNT^^+	(
 	 	v

9(=>[ &Is   #Oc                    t         j                  j                  | j                  j                  j
                         t         j                  j                  | j                  j                  j
                         t         j                  j                  | j                  j                  j
                         | j                  dkD  rzt         j                  j                  | j                  j                  j
                         t         j                  j                  | j                  j                  j
                         t         j                  j                  | j                  j                  j
                         y )Nr	   )r   initxavier_uniform_rs   re   weightrv   rw   rY   rg   rx   )r$   s    r&   init_weightsz"MultiQueryAttention2d.init_weights   s    


 6 67
 4 45


 6 67>>AGG##DHH$6$6$=$=>GG##DJJ$8$8$?$?@
 0 0 7 78r'   r/   c                     |j                   }|j                  |d   |d   d      j                  dd      }| j                  r|S |j	                  d      j                         S )zFReshapes a tensor to three dimensions, keeping the batch and channels.r   r	   r)   r*   )r+   r,   r-   r6   	unsqueeze
contiguousr.   s      r&   r1   z$MultiQueryAttention2d._reshape_input   sU    GGIIadAaD"%//15;;H;;q>,,..r'   c                     |j                   }|j                  |d   ||d      }| j                  r"|j                  dddd      j	                         S |j                  dd      j	                         S )z?Reshapes projected query: [b, n, n, h x k] -> [b, n x n, h, k].r   r)   rV   r	   r*   )r+   r,   r6   permuter   r-   )r$   r/   r   r   r0   s        r&   _reshape_projected_queryz.MultiQueryAttention2d._reshape_projected_query   sb    GGIIadIw3;;99Q1a(3355;;r2&1133r'   h_pxw_pxc                     |j                   }|d   |z  }| j                  s|j                  dd      }|j                  |d   |||      j	                  dddd      j                         S )z2Reshape output:[b, n x n x h, k] -> [b, n, n, hk].r)   r	   r*   r   rV   )r+   r6   r-   r,   r   r   )r$   r/   r   r   r   r0   feat_dims          r&   _reshape_outputz%MultiQueryAttention2d._reshape_output   sc    GGR59${{Aq!Ayy1tT84<<Q1aHSSUUr'   	attn_maskc                    |j                   x\  }}}}}| j                  |      }| j                  || j                  | j                        }| j                  |      }	| j                  |	      }	| j                  |      }
| j                  |
      }
| j                  rft        j                  d||	      | j                  z  }|||z   }|j                  d      }| j                  |      }t        j                  d||
      }n| j                  r=t        j                  ||	|
|| j                   r| j                  j"                  nd      }nS|| j                  z  }||	j%                  dd      z  }|||z   }|j                  d      }| j                  |      }||
z  }| j'                  || j                  || j(                  d   z  || j(                  d	   z        }| j+                  |      }|S )
r4   zblhk,bpk->blhpr)   r5   zblhp,bpk->blhkrH   r   	dropout_pr   r   r	   )r+   rs   r   r   r   rv   r1   rw   r6   r   r   r7   r   rU   Fscaled_dot_product_attentiontrainingpr-   r   rX   rx   )r$   r8   r   BCHWr0   r?   r@   rB   rA   rC   s                r&   rE   zMultiQueryAttention2d.forward   s    
1aQJJqM))!T^^T\\JHHQK"JJqM"
 ;;<< 0!Q7$**DD$i'<<B<'D>>$'D-tQ7A22q!'26--dnn..R 

N1;;r2..()+D|||+~~d+1H   DNNA9K9KA9N4NPQUYUgUghiUjPjkKKNr'   rI   )rJ   rK   rL   rM   r   jitFinalbool__annotations__r   BatchNorm2drN   r   r   strr   rO   r   Moduler   r~   rP   r1   r   r   rE   rQ   rR   s   @r&   rT   rT   R   s   	 		%%
 &*%)'+!""#24!!*,.."oo c]o 	o
 c]o  }o o o  o o 3T#Y./o o o RYYo ob9/ /4%,, 43 4QT 4V V# VS VPS V/HU\\$: /r'   rT   c                        e Zd ZU ej                  j
                  e   ed<   	 	 	 	 	 	 	 	 ddede	e   dedededede
d	e
f fd
Zdde	ej                     fdZ xZS )Attention2drU   r   r   r   rf   expand_first
head_firstr   r   c	                 n   t         
|           |xs |}|r|n|}	|| _        |	|z  | _        || _        t               | _        t        j                  ||	dz  d|      | _	        t        j                  |      | _        t        j                  |	|d|      | _        t        j                  |      | _        y )NrV   r	   )rf   )r   r   r   dim_headr   r
   rU   r   Conv2dqkvr"   r   re   r   )r$   r   r   r   rf   r   r   r   r   dim_attnr%   s             r&   r   zAttention2d.__init__*  s     	.S*7" I-$(*99S(Q,=I.IIh>	I.r'   r   c                    |j                   \  }}}}| j                  rP| j                  |      j                  || j                  | j
                  dz  d      j                  dd      \  }}}	nK| j                  |      j                  |d| j                  | j
                  d      j                  d      \  }}}	| j                  rt        j                  j                  j                  |j                  dd      j                         |j                  dd      j                         |	j                  dd      j                         || j                   r| j"                  j$                  nd      j                  dd      j                  |d||      }n|j                  dd      }|	j                  dd      }	||z  |j'                  d      d	z  z  }
||
|z   }
|
j)                  d      }
| j#                  |
      }
|
|	z  j                  dd      j                  |d||      }| j+                  |      }| j-                  |      }|S )
NrV   r)   r*   r5   r	   r   rH   r   r   )r+   r   r   viewr   r   chunkr,   unbindrU   r   r   r   r   r-   r   r   r   r   sizer7   re   r   )r$   r8   r   r   r   r   r   r?   r@   rB   rA   s              r&   rE   zAttention2d.forwardB  s   WW
1a??hhqk&&q$..$--!:KRPVVWX^_V`GAq!hhqk))!QrRYYZ[\GAq!??##@@B#..0B#..0B#..0#.2mm$..** A  iB2q! 4  B#AB#Aq5166":--D$i'<<B<'D>>$'D$$R,44QAqAAIIaLNN1r'   )N    TFFrH   rH   rI   )rJ   rK   rL   r   r   r   r   r   rN   r   rO   r   rP   rE   rQ   rR   s   @r&   r   r   &  s    		%%3 &*!&$!!// c]/ 	/
 / / / / /0HU\\$: r'   r   )typingr   r   r   r   r   r   torch.nnr   r   configr
   r   helpersr   pool2d_samer   r   r   rT   r    r'   r&   <module>r      sV    . .   $ " (  &B+BII B+JQBII Qh9")) 9r'   